
Source Code for Module mvpa.datasets.miscfx

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Misc function performing operations on datasets. 
 10   
 11  All the functions defined in this module must accept dataset as the 
 12  first argument since they are bound to Dataset class in the trailer. 
 13  """ 

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.base.dochelpers import table2string
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
39 """Z-Score the samples of a `Dataset` (in-place). 40 41 `mean` and `std` can be used to pass custom values to the z-scoring. 42 Both may be scalars or arrays. 43 44 All computations are done *in place*. Data upcasting is done 45 automatically if necessary into `targetdtype` 46 47 If `baselinelabels` provided, and `mean` or `std` aren't provided, it would 48 compute the corresponding measure based only on labels in `baselinelabels` 49 50 If `perchunk` is True samples within the same chunk are z-scored independent 51 of samples from other chunks, e.i. mean and standard deviation are 52 calculated individually. 53 """ 54 55 if __debug__ and perchunk \ 56 and N.array(dataset.samplesperchunk.values()).min() <= 2: 57 warning("Z-scoring chunk-wise and one chunk with less than three " 58 "samples will set features in these samples to either zero " 59 "(with 1 sample in a chunk) " 60 "or -1/+1 (with 2 samples in a chunk).") 61 62 # cast to floating point datatype if necessary 63 if str(dataset.samples.dtype).startswith('uint') \ 64 or str(dataset.samples.dtype).startswith('int'): 65 dataset.setSamplesDType(targetdtype) 66 67 def doit(samples, mean, std, statsamples=None): 68 """Internal method.""" 69 70 if statsamples is None: 71 # if nothing provided -- mean/std on all samples 72 statsamples = samples 73 74 if pervoxel: 75 axisarg = {'axis':0} 76 else: 77 axisarg = {} 78 79 # calculate mean if necessary 80 if mean is None: 81 mean = statsamples.mean(**axisarg) 82 83 # de-mean 84 samples -= mean 85 86 # calculate std-deviation if necessary 87 # XXX YOH: would that be actually what we want? 88 # may be we want actually estimate of deviation from the mean, 89 # which per se might be not statsamples.mean (see above)? 90 # if logic to be changed -- adjust ZScoreMapper as well 91 if std is None: 92 std = statsamples.std(**axisarg) 93 94 # do the z-scoring 95 if pervoxel: 96 samples[:, std != 0] /= std[std != 0] 97 else: 98 samples /= std 99 100 return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if not statids is None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])

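# Usage sketch: z-scoring a dataset per chunk, using samples labelled 0 as the
# baseline for the mean/std estimates. The dataset below is an assumed,
# illustrative example only.
#
#   >>> ds = Dataset(samples=N.random.randn(20, 5),
#   ...              labels=[0, 1, 2, 3] * 5,
#   ...              chunks=[0] * 10 + [1] * 10)
#   >>> zscore(ds, perchunk=True, baselinelabels=[0])
#   >>> # bound via @datasetmethod, so ds.zscore(...) is available as well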

@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
130 """Apply a function to each row of the samples matrix of a dataset. 131 132 The functor given as `fx` has to honour an `axis` keyword argument in the 133 way that NumPy used it (e.g. NumPy.mean, var). 134 135 :Returns: 136 a new `Dataset` object with the aggregated feature(s). 137 """ 138 agg = fx(dataset.samples, axis=1) 139 140 return Dataset(samples=N.array(agg, ndmin=2).T, 141 labels=dataset.labels, 142 chunks=dataset.chunks)
143
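# Usage sketch (assumed, illustrative example): collapsing all features of
# each sample into their mean yields a dataset with a single feature.
#
#   >>> ds = Dataset(samples=N.arange(12.).reshape(4, 3),
#   ...              labels=[0, 1, 0, 1], chunks=[0, 0, 1, 1])
#   >>> aggregateFeatures(ds, N.mean).nfeatures
#   1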

@datasetmethod
def removeInvariantFeatures(dataset):
    """Returns a new dataset with all invariant features removed.
    """
    return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])

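# Usage sketch (assumed, illustrative example): a feature that is constant
# across all samples has zero standard deviation and is removed.
#
#   >>> samples = N.random.randn(10, 3)
#   >>> samples[:, 1] = 5.0            # make the second feature invariant
#   >>> ds = Dataset(samples=samples, labels=range(10), chunks=range(10))
#   >>> removeInvariantFeatures(ds).nfeatures
#   2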

@datasetmethod
def coarsenChunks(source, nchunks=4):
154 """Change chunking of the dataset 155 156 Group chunks into groups to match desired number of chunks. Makes 157 sense if originally there were no strong groupping into chunks or 158 each sample was independent, thus belonged to its own chunk 159 160 :Parameters: 161 source : Dataset or list of chunk ids 162 dataset or list of chunk ids to operate on. If Dataset, then its chunks 163 get modified 164 nchunks : int 165 desired number of chunks 166 """ 167 168 if isinstance(source, Dataset): 169 chunks = source.chunks 170 else: 171 chunks = source 172 chunks_unique = N.unique(chunks) 173 nchunks_orig = len(chunks_unique) 174 175 if nchunks_orig < nchunks: 176 raise ValueError, \ 177 "Original number of chunks is %d. Cannot coarse them " \ 178 "to get %d chunks" % (nchunks_orig, nchunks) 179 180 # figure out number of samples per each chunk 181 counts = dict(zip(chunks_unique, [ 0 ] * len(chunks_unique))) 182 for c in chunks: 183 counts[c] += 1 184 185 # now we need to group chunks to get more or less equalized number 186 # of samples per chunk. No sophistication is done -- just 187 # consecutively group to get close to desired number of samples 188 # per chunk 189 avg_chunk_size = N.sum(counts.values())*1.0/nchunks 190 chunks_groups = [] 191 cur_chunk = [] 192 nchunks = 0 193 cur_chunk_nsamples = 0 194 samples_counted = 0 195 for i, c in enumerate(chunks_unique): 196 cc = counts[c] 197 198 cur_chunk += [c] 199 cur_chunk_nsamples += cc 200 201 # time to get a new chunk? 202 if (samples_counted + cur_chunk_nsamples 203 >= (nchunks+1)*avg_chunk_size) or i==nchunks_orig-1: 204 chunks_groups.append(cur_chunk) 205 samples_counted += cur_chunk_nsamples 206 cur_chunk_nsamples = 0 207 cur_chunk = [] 208 nchunks += 1 209 210 if len(chunks_groups) != nchunks: 211 warning("Apparently logic in coarseChunks is wrong. " 212 "It was desired to get %d chunks, got %d" 213 % (nchunks, len(chunks_groups))) 214 215 # remap using groups 216 # create dictionary 217 chunks_map = {} 218 for i, group in enumerate(chunks_groups): 219 for c in group: 220 chunks_map[c] = i 221 222 chunks_new = [chunks_map[x] for x in chunks] 223 224 if __debug__: 225 debug("DS_", "Using dictionary %s to remap old chunks %s into new %s" 226 % (chunks_map, chunks, chunks_new)) 227 228 if isinstance(source, Dataset): 229 if __debug__: 230 debug("DS", "Coarsing %d chunks into %d chunks for %s" 231 %(nchunks_orig, len(chunks_new), source)) 232 source.chunks = chunks_new 233 return 234 else: 235 return chunks_new
236
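# Usage sketch (assumed, illustrative example): eight single-sample chunks are
# grouped into four chunks of roughly equal size. Passing a plain list of
# chunk ids returns the remapped ids instead of modifying a dataset in-place.
#
#   >>> coarsenChunks(range(8), nchunks=4)
#   [0, 0, 1, 1, 2, 2, 3, 3]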

@datasetmethod
def getSamplesPerChunkLabel(dataset):
    """Returns an array with the number of samples per label in each chunk.

    Array shape is (chunks x labels).

    :Parameters:
      dataset : Dataset
        Source dataset.
    """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count

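# Usage sketch (assumed, illustrative example): for a balanced design with two
# chunks and two labels every (chunk, label) cell holds the same count.
#
#   >>> ds = Dataset(samples=N.random.randn(8, 2),
#   ...              labels=[0, 1] * 4, chunks=[0] * 4 + [1] * 4)
#   >>> getSamplesPerChunkLabel(ds).tolist()
#   [[2, 2], [2, 2]]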

class SequenceStats(dict):
    """Simple helper to provide representation of sequence statistics

    Matlab analog:
    http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m

    WARNING: Experimental -- API might change without warning!
    Current implementation is ugly!
    """

    def __init__(self, seq, order=2): #, chunks=None, perchunk=False):
        """Initialize SequenceStats

        :Parameters:
          seq : list or ndarray
            Actual sequence of labels

        :Keywords:
          order : int
            Maximal order of counter-balancing check. For perfect
            counterbalancing all matrices should be identical
        """
        """
          chunks : None or list or ndarray
            Chunks to use if `perchunk`=True
          perchunk .... TODO
        """
        dict.__init__(self)
        self.order = order
        self._seq = seq
        self.stats = None
        self._str_stats = None
        self.__compute()


    def __repr__(self):
        """Representation of SequenceStats
        """
        return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)

    def __str__(self):
        return self._str_stats

    def __compute(self):
        """Compute stats and string representation
        """
        # Do actual computation
        order = self.order
        seq = list(self._seq)               # assure list
        nsamples = len(seq)                 # # of samples/labels
        ulabels = sorted(list(set(seq)))    # unique labels
        nlabels = len(ulabels)              # # of labels

        # mapping for labels
        labels_map = dict([(l, i) for i, l in enumerate(ulabels)])

        # map sequence first
        seqm = [labels_map[i] for i in seq]
        nperlabel = N.bincount(seqm)

        res = dict(ulabels=ulabels)
        # Estimate counter-balance
        cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
        for cb in xrange(order):
            for i, j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
                cbcounts[cb, i, j] += 1
        res['cbcounts'] = cbcounts

        """
        Let's compute relative counter-balancing
        Ideally, nperlabel[i]/nlabels should precede each label
        """
        # Autocorrelation
        corr = []
        # for all possible shifts:
        for shift in xrange(1, nsamples):
            shifted = seqm[shift:] + seqm[:shift]
            # ??? Use pearsonr with p may be?
            corr += [N.corrcoef(seqm, shifted)[0, 1]]
        # ??? report high (anti)correlations?
        res['corrcoef'] = corr = N.array(corr)
        res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
        self.update(res)

        # Assign textual summary
        # XXX move into a helper function and do on demand
        t = [ [""] * (1 + self.order*(nlabels+1)) for i in xrange(nlabels+1) ]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1+cb*(nlabels+1)] = "O%d" % (cb+1)
            for i in xrange(nlabels+1):
                t[i][(cb+1)*(nlabels+1)] = " | "
            m = cbcounts[cb]
            # ??? there should be better way to get indexes
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1+i][1+cb*(nlabels+1)+j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout


    def plot(self):
        """Plot correlation coefficients
        """
        externals.exists('pylab', raiseException=True)
        import pylab as P
        P.plot(self['corrcoef'])
        P.title('Auto-correlation of the sequence')
        P.xlabel('Offset')
        P.ylabel('Correlation Coefficient')
        P.show()

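# Usage sketch (assumed, illustrative example): summarizing counter-balancing
# and autocorrelation of a label sequence. Printing the instance gives the
# textual report; the dict interface exposes the raw measures.
#
#   >>> ss = SequenceStats([0, 1, 0, 1, 0, 1, 0, 1], order=2)
#   >>> print ss                 # counter-balance tables and correlations
#   >>> ss['sumabscorr']         # sum of absolute autocorrelations
#   >>> ss.plot()                # requires pylab/matplotlib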