
Source Code for Module mvpa.datasets.misc

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Misc function performing operations on datasets. 
 10   
 11  TODO: shouldn't it be gone under mvpa.misc.signal? or may be smth like 
 12  mvpa.misc.stats? Or may be we should have mvpa.processing to store this 
 13  bastards? 
 14   
 15  """ 

__docformat__ = 'restructuredtext'

import numpy as N
from sets import Set
from mvpa.datasets import Dataset


def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None, targetdtype='float64'):
    """Z-score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done in place. Data is upcast automatically into
    `targetdtype` if necessary.

    If `baselinelabels` is provided and `mean` or `std` are not, the
    corresponding measure is computed only from the samples whose labels are
    in `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated per chunk.
    """
    # cast to a floating point datatype if necessary
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal helper performing the actual z-scoring."""

        if statsamples is None:
            # if nothing is provided -- compute mean/std on all samples
            statsamples = samples

        # calculate mean if necessary ('is None' check, since mean may be
        # a zero scalar or an array)
        if mean is None:
            mean = statsamples.mean(axis=0)

        # calculate standard deviation if necessary
        if std is None:
            std = statsamples.std(axis=0)

        # do the z-scoring, skipping features with zero variance
        samples -= mean
        samples[:, std != 0] /= std[std != 0]

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
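
As a quick illustration (not part of the module source), here is a minimal
usage sketch of `zscore`. It assumes the `Dataset(samples=..., labels=...,
chunks=...)` constructor, as used by `aggregateFeatures` below, and the
Python 2 era NumPy imported above; names such as `ds` are illustrative only::

    import numpy as N
    from mvpa.datasets import Dataset
    from mvpa.datasets.misc import zscore

    # 20 samples x 4 features, two chunks of 10 samples each
    samples = N.random.normal(loc=5.0, scale=2.0, size=(20, 4))
    ds = Dataset(samples=samples,
                 labels=[0, 1] * 10,
                 chunks=[0] * 10 + [1] * 10)

    # z-score in place, separately within each chunk
    zscore(ds, perchunk=True)

    # per-chunk means are now ~0 and standard deviations ~1
    print ds.samples[:10].mean(axis=0)
    print ds.samples[:10].std(axis=0)
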

def aggregateFeatures(dataset, fx):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in
    the way NumPy uses it (e.g. numpy.mean, numpy.var).

    Returns a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)
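
A similarly hedged sketch for `aggregateFeatures`, under the same assumptions
as the example above: each sample (row) is collapsed into a single aggregated
feature by passing an axis-aware functor such as `N.mean`::

    import numpy as N
    from mvpa.datasets import Dataset
    from mvpa.datasets.misc import aggregateFeatures

    ds = Dataset(samples=N.arange(12).reshape(4, 3).astype('float64'),
                 labels=[0, 0, 1, 1],
                 chunks=[0, 1, 0, 1])

    # collapse each sample (row) into its mean across features
    mean_ds = aggregateFeatures(ds, N.mean)

    print mean_ds.samples          # shape (4, 1), values 1., 4., 7., 10.
    print mean_ds.labels, mean_ds.chunks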