9 """Misc function performing operations on datasets.
10
11 All the functions defined in this module must accept dataset as the
12 first argument since they are bound to Dataset class in the trailer.
13 """
14
15 __docformat__ = 'restructuredtext'
16
17 from operator import isSequenceType
18
19 import numpy as N
20
21 from mvpa.datasets.base import Dataset, datasetmethod
22 from mvpa.base.dochelpers import table2string
23 from mvpa.misc.support import getBreakPoints
24
25 from mvpa.base import externals, warning
26
27 if __debug__:
28 from mvpa.base import debug
29
30 if externals.exists('scipy'):
31 from mvpa.datasets.miscfx_sp import detrend
32

@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
    """Z-score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done *in place*. Data upcasting is done
    automatically if necessary into `targetdtype`.

    If `baselinelabels` is provided and `mean` or `std` is not, the
    corresponding measure is computed only from samples with labels in
    `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated for each chunk individually.
    """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() <= 2:
        warning("Z-scoring chunk-wise with a chunk containing less than "
                "three samples will set features in these samples to either "
                "zero (with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    # cast to a floating point datatype if necessary
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if nothing is provided -- compute mean/std on all samples
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # calculate mean if necessary
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # calculate standard deviation if necessary
        if std is None:
            std = statsamples.std(**axisarg)

        # do the z-scoring, avoiding division by zero for invariant features
        if pervoxel:
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = set(dataset.idsbylabels(baselinelabels))

    # either process the dataset chunk by chunk or in one shot
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if not statids is None:
                statslicer = list(statids.intersection(set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])

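# Illustrative usage sketch (not part of the original module): z-scoring a
# small toy dataset chunk-wise. The Dataset constructor arguments `samples`,
# `labels` and `chunks` are assumed to be the standard ones of this package.
#
#     ds = Dataset(samples=N.random.normal(10, 3, size=(8, 5)),
#                  labels=[0, 1] * 4,
#                  chunks=[0, 0, 0, 0, 1, 1, 1, 1])
#     zscore(ds, perchunk=True)
#     # each chunk now has approximately zero mean and unit variance
#     print ds.samples[:4].mean(axis=0), ds.samples[:4].std(axis=0)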
129 """Apply a function to each row of the samples matrix of a dataset.
130
131 The functor given as `fx` has to honour an `axis` keyword argument in the
132 way that NumPy used it (e.g. NumPy.mean, var).
133
134 :Returns:
135 a new `Dataset` object with the aggregated feature(s).
136 """
137 agg = fx(dataset.samples, axis=1)
138
139 return Dataset(samples=N.array(agg, ndmin=2).T,
140 labels=dataset.labels,
141 chunks=dataset.chunks)
142
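# Illustrative usage sketch (hypothetical toy data, not from the module):
# collapse all features of every sample into their mean.
#
#     ds = Dataset(samples=N.random.normal(size=(6, 10)),
#                  labels=[0, 1] * 3, chunks=[0, 0, 1, 1, 2, 2])
#     ds_mean = aggregateFeatures(ds, fx=N.mean)
#     # ds_mean.nfeatures == 1 -- one aggregated value per sample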
146 """Returns a new dataset with all invariant features removed.
147 """
148 return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
149
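# Illustrative usage sketch (hypothetical toy data): features with zero
# variance, e.g. voxels that never change across samples, get dropped.
#
#     samples = N.random.normal(size=(10, 4))
#     samples[:, 2] = 5.0                    # make one feature invariant
#     ds = Dataset(samples=samples, labels=range(10), chunks=range(10))
#     ds_clean = removeInvariantFeatures(ds)
#     # ds_clean.nfeatures == 3 -- the constant column is gone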
153 """Change chunking of the dataset
154
155 Group chunks into groups to match desired number of chunks. Makes
156 sense if originally there were no strong groupping into chunks or
157 each sample was independent, thus belonged to its own chunk
158
159 :Parameters:
160 source : Dataset or list of chunk ids
161 dataset or list of chunk ids to operate on. If Dataset, then its chunks
162 get modified
163 nchunks : int
164 desired number of chunks
165 """
166
167 if isinstance(source, Dataset):
168 chunks = source.chunks
169 else:
170 chunks = source
171 chunks_unique = N.unique(chunks)
172 nchunks_orig = len(chunks_unique)
173
174 if nchunks_orig < nchunks:
175 raise ValueError, \
176 "Original number of chunks is %d. Cannot coarse them " \
177 "to get %d chunks" % (nchunks_orig, nchunks)
178
179
180 counts = dict(zip(chunks_unique, [ 0 ] * len(chunks_unique)))
181 for c in chunks:
182 counts[c] += 1
183
184
185
186
187
188 avg_chunk_size = N.sum(counts.values())*1.0/nchunks
189 chunks_groups = []
190 cur_chunk = []
191 nchunks = 0
192 cur_chunk_nsamples = 0
193 samples_counted = 0
194 for i, c in enumerate(chunks_unique):
195 cc = counts[c]
196
197 cur_chunk += [c]
198 cur_chunk_nsamples += cc
199
200
201 if (samples_counted + cur_chunk_nsamples
202 >= (nchunks+1)*avg_chunk_size) or i==nchunks_orig-1:
203 chunks_groups.append(cur_chunk)
204 samples_counted += cur_chunk_nsamples
205 cur_chunk_nsamples = 0
206 cur_chunk = []
207 nchunks += 1
208
209 if len(chunks_groups) != nchunks:
210 warning("Apparently logic in coarseChunks is wrong. "
211 "It was desired to get %d chunks, got %d"
212 % (nchunks, len(chunks_groups)))
213
214
215
216 chunks_map = {}
217 for i, group in enumerate(chunks_groups):
218 for c in group:
219 chunks_map[c] = i
220
221 chunks_new = [chunks_map[x] for x in chunks]
222
223 if __debug__:
224 debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
225 % (chunks_map, chunks, chunks_new))
226
227 if isinstance(source, Dataset):
228 if __debug__:
229 debug("DS", "Coarsing %d chunks into %d chunks for %s"
230 %(nchunks_orig, len(chunks_new), source))
231 source.chunks = chunks_new
232 return
233 else:
234 return chunks_new
235
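# Illustrative usage sketch (hypothetical data): regrouping per-sample chunks
# into two coarse chunks. Works on a Dataset (in-place) or on a plain list.
#
#     print coarsenChunks(range(8), nchunks=2)
#     # -> [0, 0, 0, 0, 1, 1, 1, 1]
#     ds = Dataset(samples=N.random.normal(size=(8, 3)),
#                  labels=[0, 1] * 4, chunks=range(8))
#     coarsenChunks(ds, nchunks=2)       # modifies ds.chunks in-place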
239 """Returns an array with the number of samples per label in each chunk.
240
241 Array shape is (chunks x labels).
242
243 :Parameters:
244 dataset: Dataset
245 Source dataset.
246 """
247 ul = dataset.uniquelabels
248 uc = dataset.uniquechunks
249
250 count = N.zeros((len(uc), len(ul)), dtype='uint')
251
252 for cc, c in enumerate(uc):
253 for lc, l in enumerate(ul):
254 count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
255 dataset.chunks == c))
256
257 return count
258
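# Illustrative usage sketch (hypothetical toy dataset): tabulate how many
# samples of each label fall into each chunk.
#
#     ds = Dataset(samples=N.zeros((6, 2)),
#                  labels=[0, 0, 1, 0, 1, 1], chunks=[0, 0, 0, 1, 1, 1])
#     print getSamplesPerChunkLabel(ds)
#     # -> [[2 1]
#     #     [1 2]]    (rows: chunks, columns: labels)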
261 """Simple helper to provide representation of sequence statistics
262
263 Matlab analog:
264 http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m
265
266 WARNING: Experimental -- API might change without warning!
267 Current implementation is ugly!
268 """
269
271 """Initialize SequenceStats
272
273 :Parameters:
274 seq : list or ndarray
275 Actual sequence of labels
276
277 :Keywords:
278 order : int
279 Maximal order of counter-balancing check. For perfect
280 counterbalancing all matrices should be identical
281 """
282 """
283 chunks : None or list or ndarray
284 Chunks to use if `perchunk`=True
285 perchunk .... TODO
286 """
287 dict.__init__(self)
288 self.order = order
289 self._seq = seq
290 self.stats = None
291 self._str_stats = None
292 self.__compute()
293
    def __repr__(self):
        """Representation of SequenceStats"""
        return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)

    def __str__(self):
        return self._str_stats

304 """Compute stats and string representation
305 """
306
307 order = self.order
308 seq = list(self._seq)
309 nsamples = len(seq)
310 ulabels = sorted(list(set(seq)))
311 nlabels = len(ulabels)
312
313
314 labels_map = dict([(l, i) for i,l in enumerate(ulabels)])
315
316
317 seqm = [labels_map[i] for i in seq]
318 nperlabel = N.bincount(seqm)
319
320 res = dict(ulabels=ulabels)
321
322 cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
323 for cb in xrange(order):
324 for i,j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
325 cbcounts[cb, i, j] += 1
326 res['cbcounts'] = cbcounts
327
328 """
329 Lets compute relative counter-balancing
330 Ideally, nperlabel[i]/nlabels should precede each label
331 """
332
333 corr = []
334
335 for shift in xrange(1, nsamples):
336 shifted = seqm[shift:] + seqm[:shift]
337
338 corr += [N.corrcoef(seqm, shifted)[0, 1]]
339
340 res['corrcoef'] = corr = N.array(corr)
341 res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
342 self.update(res)
343
        # assemble a textual summary table:
        # one block of (nlabels+1) columns per counter-balancing order
        t = [[""] * (1 + self.order * (nlabels + 1)) for i in xrange(nlabels + 1)]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i + 1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1 + cb * (nlabels + 1)] = "O%d" % (cb + 1)
            for i in xrange(nlabels + 1):
                t[i][(cb + 1) * (nlabels + 1)] = " | "
            m = cbcounts[cb]
            # fill in the counts for this order
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1 + i][1 + cb * (nlabels + 1) + j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout

    def plot(self):
        """Plot correlation coefficients"""
        externals.exists('pylab', raiseException=True)
        import pylab as P
        P.plot(self['corrcoef'])
        P.title('Auto-correlation of the sequence')
        P.xlabel('Offset')
        P.ylabel('Correlation Coefficient')
        P.show()

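# Illustrative usage sketch (hypothetical sequence): inspecting the
# counter-balancing of a label sequence. SequenceStats is a dict, so the
# computed measures are accessible by key.
#
#     ss = SequenceStats([0, 1, 0, 1, 0, 1, 0, 1], order=2)
#     print ss                   # counter-balance table + correlation summary
#     print ss['cbcounts'][0]    # 1st order transition counts between labels
#     ss.plot()                  # requires pylab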