9 """Misc function performing operations on datasets.
10
11 All the functions defined in this module must accept dataset as the
12 first argument since they are bound to Dataset class in the trailer.
13 """

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.base.dochelpers import table2string
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
39 """Z-Score the samples of a `Dataset` (in-place).
40
41 `mean` and `std` can be used to pass custom values to the z-scoring.
42 Both may be scalars or arrays.
43
44 All computations are done *in place*. Data upcasting is done
45 automatically if necessary into `targetdtype`
46
47 If `baselinelabels` provided, and `mean` or `std` aren't provided, it would
48 compute the corresponding measure based only on labels in `baselinelabels`
49
50 If `perchunk` is True samples within the same chunk are z-scored independent
51 of samples from other chunks, e.i. mean and standard deviation are
52 calculated individually.
53 """

    if __debug__ and perchunk \
      and N.array(dataset.samplesperchunk.values()).min() <= 2:
        warning("Z-scoring chunk-wise and one chunk with less than three "
                "samples will set features in these samples to either zero "
                "(with 1 sample in a chunk) "
                "or -1/+1 (with 2 samples in a chunk).")

    # integer samples cannot hold z-scored values -- upcast
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal helper: demean and scale `samples`, estimating any
        missing mean/std from `statsamples`."""
        if statsamples is None:
            # if no dedicated baseline samples -- use all of them
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        if mean is None:
            mean = statsamples.mean(**axisarg)
        samples -= mean

        if std is None:
            std = statsamples.std(**axisarg)

        if pervoxel:
            # guard against invariant features: divide only where std != 0
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # perform z-scoring either per chunk or on the whole dataset at once
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
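
# Example (an illustrative sketch, not part of the module API): z-score a
# small random dataset per chunk. Being bound via @datasetmethod, the same
# call is also available as ds.zscore().
#
#   import numpy as N
#   from mvpa.datasets.base import Dataset
#   ds = Dataset(samples=N.random.randn(8, 4),
#                labels=[0, 1] * 4,
#                chunks=[0, 0, 0, 0, 1, 1, 1, 1])
#   zscore(ds, perchunk=True)
#   # each chunk now has (approximately) zero mean and unit variance
#   print ds.samples.mean(axis=0)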


@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
130 """Apply a function to each row of the samples matrix of a dataset.
131
132 The functor given as `fx` has to honour an `axis` keyword argument in the
133 way that NumPy used it (e.g. NumPy.mean, var).
134
135 :Returns:
136 a new `Dataset` object with the aggregated feature(s).
137 """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)
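
# Example (illustrative sketch): collapse all features into their per-sample
# mean, yielding a dataset with a single feature.
#
#   ds = Dataset(samples=N.arange(12).reshape(4, 3).astype('float64'),
#                labels=[0, 0, 1, 1], chunks=[0, 1, 0, 1])
#   mean_ds = aggregateFeatures(ds, N.mean)
#   print mean_ds.samples        # shape (4, 1)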


@datasetmethod
def removeInvariantFeatures(dataset):
147 """Returns a new dataset with all invariant features removed.
148 """
149 return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
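
# Example (illustrative sketch): the second feature below is constant across
# samples (zero standard deviation), so it gets dropped.
#
#   ds = Dataset(samples=N.array([[1., 5., 2.],
#                                 [3., 5., 4.]]),
#                labels=[0, 1], chunks=[0, 0])
#   print removeInvariantFeatures(ds).nfeatures   # -> 2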


@datasetmethod
def coarsenChunks(source, nchunks=4):
154 """Change chunking of the dataset
155
156 Group chunks into groups to match desired number of chunks. Makes
157 sense if originally there were no strong groupping into chunks or
158 each sample was independent, thus belonged to its own chunk
159
160 :Parameters:
161 source : Dataset or list of chunk ids
162 dataset or list of chunk ids to operate on. If Dataset, then its chunks
163 get modified
164 nchunks : int
165 desired number of chunks
166 """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # count number of samples per chunk
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # greedily group original chunks so that each group collects roughly
    # the average number of samples per target chunk. Note: the counter was
    # originally also named `nchunks`, shadowing the requested number and
    # rendering the sanity check below vacuous; it is renamed here.
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks_so_far = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # close the current group whenever it reaches the running target
        # size, or at the very last original chunk
        if (samples_counted + cur_chunk_nsamples
            >= (nchunks_so_far + 1) * avg_chunk_size) or i == nchunks_orig - 1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks_so_far += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # assign a new chunk id to each original chunk
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, len(chunks_groups), source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
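
# Example (illustrative sketch): coarsen 6 per-sample chunks into 2 groups.
#
#   old_chunks = [0, 1, 2, 3, 4, 5]
#   print coarsenChunks(old_chunks, nchunks=2)   # -> [0, 0, 0, 1, 1, 1]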


@datasetmethod
def getSamplesPerChunkLabel(dataset):
240 """Returns an array with the number of samples per label in each chunk.
241
242 Array shape is (chunks x labels).
243
244 :Parameters:
245 dataset: Dataset
246 Source dataset.
247 """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count
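
# Example (illustrative sketch): with 2 chunks and 2 labels the result is a
# 2x2 table of sample counts.
#
#   ds = Dataset(samples=N.zeros((6, 1)),
#                labels=[0, 0, 1, 0, 1, 1],
#                chunks=[0, 0, 0, 1, 1, 1])
#   print getSamplesPerChunkLabel(ds)
#   # [[2 1]
#   #  [1 2]]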


class SequenceStats(dict):
262 """Simple helper to provide representation of sequence statistics
263
264 Matlab analog:
265 http://cfn.upenn.edu/aguirre/code/matlablib/mseq/mtest.m
266
267 WARNING: Experimental -- API might change without warning!
268 Current implementation is ugly!
269 """

    def __init__(self, seq, order=2):
272 """Initialize SequenceStats
273
274 :Parameters:
275 seq : list or ndarray
276 Actual sequence of labels
277
278 :Keywords:
279 order : int
280 Maximal order of counter-balancing check. For perfect
281 counterbalancing all matrices should be identical
282 """
283 """
284 chunks : None or list or ndarray
285 Chunks to use if `perchunk`=True
286 perchunk .... TODO
287 """
        dict.__init__(self)
        self.order = order
        self._seq = seq
        self.stats = None
        self._str_stats = None
        self.__compute()


    def __repr__(self):
297 """Representation of SequenceStats
298 """
299 return "SequenceStats(%s, order=%d)" % (repr(self._seq), self.order)

    def __str__(self):
        return self._str_stats

    def __compute(self):
305 """Compute stats and string representation
306 """
307
308 order = self.order
309 seq = list(self._seq)
310 nsamples = len(seq)
311 ulabels = sorted(list(set(seq)))
312 nlabels = len(ulabels)
313
314
315 labels_map = dict([(l, i) for i,l in enumerate(ulabels)])
316
317
318 seqm = [labels_map[i] for i in seq]
319 nperlabel = N.bincount(seqm)
320
321 res = dict(ulabels=ulabels)
322
323 cbcounts = N.zeros((order, nlabels, nlabels), dtype=int)
324 for cb in xrange(order):
325 for i,j in zip(seqm[:-(cb+1)], seqm[cb+1:]):
326 cbcounts[cb, i, j] += 1
327 res['cbcounts'] = cbcounts
328
329 """
330 Lets compute relative counter-balancing
331 Ideally, nperlabel[i]/nlabels should precede each label
332 """
        corr = []
        for shift in xrange(1, nsamples):
            # circular shift of the sequence by `shift` positions
            shifted = seqm[shift:] + seqm[:shift]
            corr += [N.corrcoef(seqm, shifted)[0, 1]]

        res['corrcoef'] = corr = N.array(corr)
        res['sumabscorr'] = sumabscorr = N.sum(N.abs(corr))
        self.update(res)

        # compose the string representation: one block of counter-balance
        # counts per order, separated by " | " columns
        t = [[""] * (1 + self.order * (nlabels + 1)) for i in xrange(nlabels + 1)]
        t[0][0] = "Labels/Order"
        for i, l in enumerate(ulabels):
            t[i+1][0] = '%s:' % l
        for cb in xrange(order):
            t[0][1 + cb * (nlabels + 1)] = "O%d" % (cb + 1)
            for i in xrange(nlabels + 1):
                t[i][(cb + 1) * (nlabels + 1)] = " | "
            m = cbcounts[cb]
            # fill in the counts
            ind = N.where(~N.isnan(m))
            for i, j in zip(*ind):
                t[1 + i][1 + cb * (nlabels + 1) + j] = '%d' % m[i, j]

        sout = "Original sequence had %d entries from set %s\n" \
               % (len(seq), ulabels) + \
               "Counter-balance table for orders up to %d:\n" % order \
               + table2string(t)
        sout += "Correlations: min=%.2g max=%.2g mean=%.2g sum(abs)=%.2g" \
                % (min(corr), max(corr), N.mean(corr), sumabscorr)
        self._str_stats = sout

    def plot(self):
371 """Plot correlation coefficients
372 """
373 externals.exists('pylab', raiseException=True)
374 import pylab as P
375 P.plot(self['corrcoef'])
376 P.title('Auto-correlation of the sequence')
377 P.xlabel('Offset')
378 P.ylabel('Correlation Coefficient')
379 P.show()
380
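
# Example (illustrative sketch): inspect counter-balancing of a label
# sequence. Since SequenceStats is a dict, individual statistics are
# available by key.
#
#   ss = SequenceStats([0, 1, 0, 1, 0, 1, 1, 0], order=2)
#   print ss                    # counter-balance table and correlations
#   print ss['cbcounts'][0]     # first-order transition counts
#   ss.plot()                   # requires pylab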