1
2
3
4
5
6
7
8
9 """Misc function performing operations on datasets.
10
TODO: Should this live under mvpa.misc.signal, or perhaps something like
mvpa.misc.stats? Or maybe we should have mvpa.processing to store these
functions?
14
15 """
16
17 __docformat__ = 'restructuredtext'
18
19 import numpy as N
20 from sets import Set
21 from mvpa.datasets import Dataset
22
23
def zscore(dataset, mean = None, std = None,
           perchunk=True, baselinelabels=None, targetdtype='float64'):
    """Z-score the samples of a `Dataset` (in-place).

    Each feature (column of ``dataset.samples``) is shifted by `mean` and
    then divided by `std`. Features whose standard deviation is zero are
    only shifted, never divided.

    :Parameters:
      dataset
        Object whose ``samples`` array is modified. If the samples have a
        signed or unsigned integer dtype they are first converted with
        ``dataset.setSamplesDType(targetdtype)``.
      mean
        Custom mean as a scalar or as an array with one value per feature.
        If None it is computed from the data. A value of ``0`` is honoured
        as a custom mean.
      std
        Custom standard deviation as a scalar or as an array with one value
        per feature. If None it is computed from the data.
      perchunk
        If True, samples within the same chunk are z-scored independently
        of samples from other chunks, i.e. mean and standard deviation are
        calculated individually for every chunk. Custom `mean`/`std`
        values, if given, are applied to every chunk.
      baselinelabels
        If given, a mean or standard deviation that was not passed in is
        computed only from the samples returned by
        ``dataset.idsbylabels(baselinelabels)``.
      targetdtype
        dtype the samples are upcast to when they are integers.

    Nothing is returned; ``dataset.samples`` is modified instead.
    """
    # In-place subtraction and division cannot be stored in an integer
    # array, so signed ('i') and unsigned ('u') integers are upcast first.
    if dataset.samples.dtype.kind in 'iu':
        dataset.setSamplesDType(targetdtype)

    def zscore_block(samples, statsamples):
        """Z-score `samples` in place with stats from `statsamples`."""
        # Compare against None instead of testing truthiness: `not array`
        # raises for multi-element arrays and a scalar 0 is a valid value.
        if mean is None:
            block_mean = statsamples.mean(axis=0)
        else:
            block_mean = mean

        if std is None:
            block_std = statsamples.std(axis=0)
        else:
            block_std = std
        block_std = N.asanyarray(block_std)

        # Both statistics are determined before `samples` is touched, as
        # `statsamples` may be the very same array.
        samples -= block_mean

        if block_std.ndim == 0:
            # Scalar std: one divisor for all features, skipped if zero.
            if block_std != 0:
                samples /= block_std
        else:
            # Leave features without variance undivided to avoid x/0.
            nonzero = block_std != 0
            samples[:, nonzero] /= block_std[nonzero]

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = set(dataset.idsbylabels(baselinelabels))

    if perchunk:
        for chunk in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == chunk)[0]
            # Indexing with an index array yields a copy, hence the result
            # has to be written back into the dataset explicitly.
            chunksamples = dataset.samples[slicer]
            if statids is None:
                statsamples = chunksamples
            else:
                # Baseline samples that belong to the current chunk.
                statslicer = sorted(statids.intersection(slicer))
                statsamples = dataset.samples[statslicer]
            dataset.samples[slicer] = zscore_block(chunksamples, statsamples)
    elif statids is None:
        zscore_block(dataset.samples, dataset.samples)
    else:
        zscore_block(dataset.samples, dataset.samples[sorted(statids)])
91
92
93
95 """Apply a function to each row of the samples matrix of a dataset.
96
97 The functor given as `fx` has to honour an `axis` keyword argument in the
98 way that NumPy used it (e.g. NumPy.mean, var).
99
100 Returns a new `Dataset` object with the aggregated feature(s).
101 """
102 agg = fx(dataset.samples, axis=1)
103
104 return Dataset(samples=N.array(agg, ndmin=2).T,
105 labels=dataset.labels,
106 chunks=dataset.chunks)
107