9 """Misc function performing operations on datasets.
10
11 All the functions defined in this module must accept dataset as the
12 first argument since they are bound to Dataset class in the trailer.
13 """

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
38 """Z-Score the samples of a `Dataset` (in-place).
39
40 `mean` and `std` can be used to pass custom values to the z-scoring.
41 Both may be scalars or arrays.
42
43 All computations are done *in place*. Data upcasting is done
44 automatically if necessary into `targetdtype`
45
46 If `baselinelabels` provided, and `mean` or `std` aren't provided, it would
47 compute the corresponding measure based only on labels in `baselinelabels`
48
49 If `perchunk` is True samples within the same chunk are z-scored independent
50 of samples from other chunks, e.i. mean and standard deviation are
51 calculated individually.
52 """

    if __debug__ and perchunk \
      and N.array(dataset.samplesperchunk.values()).min() < 2:
        warning("Z-scoring chunk-wise with a chunk containing fewer than "
                "two samples will set all features in these samples to "
                "zero.")

    # cast to a floating point dtype, since the in-place operations
    # below would not upcast integer data
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if no dedicated stats samples were provided, compute the
            # stats on the samples being z-scored themselves
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # compute mean if not provided
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # compute standard deviation if not provided
        if std is None:
            std = statsamples.std(**axisarg)

        # scale, avoiding division by zero for invariant features
        if pervoxel:
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # apply the z-scoring either chunk-wise or to all samples at once,
    # restricting the stats to the baseline samples if those were requested
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
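
# A minimal usage sketch (illustrative, not part of the original module):
# z-score a dataset per chunk and verify that each chunk's samples end up
# approximately standardized. The Dataset construction below assumes the
# standard (samples, labels, chunks) keyword interface used elsewhere in
# this module.
#
#   >>> import numpy as N
#   >>> from mvpa.datasets.base import Dataset
#   >>> ds = Dataset(samples=N.random.randn(8, 4).astype('float64'),
#   ...              labels=[0, 1] * 4, chunks=[0] * 4 + [1] * 4)
#   >>> zscore(ds, perchunk=True)
#   >>> N.abs(ds.samples[ds.chunks == 0].mean(axis=0)).max() < 1e-10
#   True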
119
123 """Apply a function to each row of the samples matrix of a dataset.
124
125 The functor given as `fx` has to honour an `axis` keyword argument in the
126 way that NumPy used it (e.g. NumPy.mean, var).
127
128 :Returns:
129 a new `Dataset` object with the aggregated feature(s).
130 """
131 agg = fx(dataset.samples, axis=1)
132
133 return Dataset(samples=N.array(agg, ndmin=2).T,
134 labels=dataset.labels,
135 chunks=dataset.chunks)
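
# Illustrative sketch (not part of the original module): aggregating with
# N.mean collapses all features of every sample into a single mean feature,
# preserving labels and chunks.
#
#   >>> ds = Dataset(samples=N.array([[1., 3.], [2., 4.]]),
#   ...              labels=[0, 1], chunks=[0, 0])
#   >>> aggregateFeatures(ds, N.mean).samples
#   array([[ 2.],
#          [ 3.]])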

@datasetmethod
def removeInvariantFeatures(dataset):
140 """Returns a new dataset with all invariant features removed.
141 """
142 return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
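
# Illustrative sketch (not part of the original module): a feature with zero
# standard deviation across samples is dropped.
#
#   >>> ds = Dataset(samples=N.array([[1., 5.], [2., 5.]]),
#   ...              labels=[0, 1], chunks=[0, 0])
#   >>> removeInvariantFeatures(ds).samples
#   array([[ 1.],
#          [ 2.]])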

@datasetmethod
def coarsenChunks(source, nchunks=4):
147 """Change chunking of the dataset
148
149 Group chunks into groups to match desired number of chunks. Makes
150 sense if originally there were no strong groupping into chunks or
151 each sample was independent, thus belonged to its own chunk
152
153 :Parameters:
154 source : Dataset or list of chunk ids
155 dataset or list of chunk ids to operate on. If Dataset, then its chunks
156 get modified
157 nchunks : int
158 desired number of chunks
159 """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # count the number of samples in each original chunk
    counts = dict(zip(chunks_unique, [ 0 ] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # group consecutive chunks so that each group collects close to the
    # average number of samples per target chunk; note that nchunks is
    # reused below as a counter of the groups created so far
    avg_chunk_size = N.sum(counts.values())*1.0/nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # time to close the current group?
        if (samples_counted + cur_chunk_nsamples
            >= (nchunks+1)*avg_chunk_size) or i == nchunks_orig-1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap each original chunk id onto the index of its group
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, nchunks, source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
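
# Illustrative sketch (not part of the original module): eight singleton
# chunks coarsened into two groups of four. Passing a plain list of chunk
# ids returns the remapped list instead of modifying a dataset.
#
#   >>> coarsenChunks(range(8), nchunks=2)
#   [0, 0, 0, 0, 1, 1, 1, 1]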

@datasetmethod
def getSamplesPerChunkLabel(dataset):
233 """Returns an array with the number of samples per label in each chunk.
234
235 Array shape is (chunks x labels).
236
237 :Parameters:
238 dataset: Dataset
239 Source dataset.
240 """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count

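# Illustrative sketch (not part of the original module): two chunks with two
# samples of each label yield a 2x2 count matrix (the exact dtype in the
# repr depends on the platform's default unsigned integer width).
#
#   >>> ds = Dataset(samples=N.zeros((8, 1)),
#   ...              labels=[0, 0, 1, 1] * 2, chunks=[0] * 4 + [1] * 4)
#   >>> getSamplesPerChunkLabel(ds)
#   array([[2, 2],
#          [2, 2]], dtype=...)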