
Source Code for Module mvpa.datasets.miscfx

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Misc function performing operations on datasets. 
 10   
 11  All the functions defined in this module must accept dataset as the 
 12  first argument since they are bound to Dataset class in the trailer. 
 13  """ 

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
38 """Z-Score the samples of a `Dataset` (in-place). 39 40 `mean` and `std` can be used to pass custom values to the z-scoring. 41 Both may be scalars or arrays. 42 43 All computations are done *in place*. Data upcasting is done 44 automatically if necessary into `targetdtype` 45 46 If `baselinelabels` provided, and `mean` or `std` aren't provided, it would 47 compute the corresponding measure based only on labels in `baselinelabels` 48 49 If `perchunk` is True samples within the same chunk are z-scored independent 50 of samples from other chunks, e.i. mean and standard deviation are 51 calculated individually. 52 """ 53 54 if __debug__ and perchunk \ 55 and N.array(dataset.samplesperchunk.values()).min() < 2: 56 warning("Z-scoring chunk-wise and one chunk with less than two " \ 57 "samples will set features in these samples to zero.") 58 59 # cast to floating point datatype if necessary 60 if str(dataset.samples.dtype).startswith('uint') \ 61 or str(dataset.samples.dtype).startswith('int'): 62 dataset.setSamplesDType(targetdtype) 63 64 def doit(samples, mean, std, statsamples=None): 65 """Internal method.""" 66 67 if statsamples is None: 68 # if nothing provided -- mean/std on all samples 69 statsamples = samples 70 71 if pervoxel: 72 axisarg = {'axis':0} 73 else: 74 axisarg = {} 75 76 # calculate mean if necessary 77 if mean is None: 78 mean = statsamples.mean(**axisarg) 79 80 # de-mean 81 samples -= mean 82 83 # calculate std-deviation if necessary 84 if std is None: 85 std = statsamples.std(**axisarg) 86 87 # do the z-scoring 88 if pervoxel: 89 samples[:, std != 0] /= std[std != 0] 90 else: 91 samples /= std 92 93 return samples
    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
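
A minimal usage sketch for `zscore` (illustration only, not part of miscfx.py); the toy values below are made up:

# --- usage sketch: zscore ---
import numpy as N
from mvpa.datasets.base import Dataset
from mvpa.datasets.miscfx import zscore

ds = Dataset(samples=N.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]]),
             labels=[0, 1, 0, 1],
             chunks=[0, 0, 1, 1])
zscore(ds, perchunk=True)    # z-scores ds.samples in place, chunk by chunk
# since the function is bound to Dataset via @datasetmethod, the equivalent
# method call ds.zscore(perchunk=True) should also be available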


@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
123 """Apply a function to each row of the samples matrix of a dataset. 124 125 The functor given as `fx` has to honour an `axis` keyword argument in the 126 way that NumPy used it (e.g. NumPy.mean, var). 127 128 :Returns: 129 a new `Dataset` object with the aggregated feature(s). 130 """ 131 agg = fx(dataset.samples, axis=1) 132 133 return Dataset(samples=N.array(agg, ndmin=2).T, 134 labels=dataset.labels, 135 chunks=dataset.chunks)


@datasetmethod
def removeInvariantFeatures(dataset):
140 """Returns a new dataset with all invariant features removed. 141 """ 142 return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])


@datasetmethod
def coarsenChunks(source, nchunks=4):
147 """Change chunking of the dataset 148 149 Group chunks into groups to match desired number of chunks. Makes 150 sense if originally there were no strong groupping into chunks or 151 each sample was independent, thus belonged to its own chunk 152 153 :Parameters: 154 source : Dataset or list of chunk ids 155 dataset or list of chunk ids to operate on. If Dataset, then its chunks 156 get modified 157 nchunks : int 158 desired number of chunks 159 """ 160 161 if isinstance(source, Dataset): 162 chunks = source.chunks 163 else: 164 chunks = source 165 chunks_unique = N.unique(chunks) 166 nchunks_orig = len(chunks_unique) 167 168 if nchunks_orig < nchunks: 169 raise ValueError, \ 170 "Original number of chunks is %d. Cannot coarse them " \ 171 "to get %d chunks" % (nchunks_orig, nchunks) 172 173 # figure out number of samples per each chunk 174 counts = dict(zip(chunks_unique, [ 0 ] * len(chunks_unique))) 175 for c in chunks: 176 counts[c] += 1 177 178 # now we need to group chunks to get more or less equalized number 179 # of samples per chunk. No sophistication is done -- just 180 # consecutively group to get close to desired number of samples 181 # per chunk 182 avg_chunk_size = N.sum(counts.values())*1.0/nchunks 183 chunks_groups = [] 184 cur_chunk = [] 185 nchunks = 0 186 cur_chunk_nsamples = 0 187 samples_counted = 0 188 for i,c in enumerate(chunks_unique): 189 cc = counts[c] 190 191 cur_chunk += [c] 192 cur_chunk_nsamples += cc 193 194 # time to get a new chunk? 195 if (samples_counted + cur_chunk_nsamples 196 >= (nchunks+1)*avg_chunk_size) or i==nchunks_orig-1: 197 chunks_groups.append(cur_chunk) 198 samples_counted += cur_chunk_nsamples 199 cur_chunk_nsamples = 0 200 cur_chunk = [] 201 nchunks += 1 202 203 if len(chunks_groups) != nchunks: 204 warning("Apparently logic in coarseChunks is wrong. " 205 "It was desired to get %d chunks, got %d" 206 % (nchunks, len(chunks_groups))) 207 208 # remap using groups 209 # create dictionary 210 chunks_map = {} 211 for i, group in enumerate(chunks_groups): 212 for c in group: 213 chunks_map[c] = i 214 215 chunks_new = [chunks_map[x] for x in chunks] 216 217 if __debug__: 218 debug("DS_", "Using dictionary %s to remap old chunks %s into new %s" 219 % (chunks_map, chunks, chunks_new)) 220 221 if isinstance(source, Dataset): 222 if __debug__: 223 debug("DS", "Coarsing %d chunks into %d chunks for %s" 224 %(nchunks_orig, len(chunks_new), source)) 225 source.chunks = chunks_new 226 return 227 else: 228 return chunks_new


@datasetmethod
def getSamplesPerChunkLabel(dataset):
233 """Returns an array with the number of samples per label in each chunk. 234 235 Array shape is (chunks x labels). 236 237 :Parameters: 238 dataset: Dataset 239 Source dataset. 240 """ 241 ul = dataset.uniquelabels 242 uc = dataset.uniquechunks 243 244 count = N.zeros((len(uc), len(ul)), dtype='uint') 245 246 for cc, c in enumerate(uc): 247 for lc, l in enumerate(ul): 248 count[cc, lc] = N.sum(N.logical_and(dataset.labels == l, 249 dataset.chunks == c)) 250 251 return count