1
2
3
4
5
6
7
8
9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15
18 """Cheap initialization.
19
20 :Parameter:
21 tail: str ['left', 'right']
22 Which tail of the distribution to report.
23 """
24 self._tail = tail
25
26
27 if self._tail not in ['left', 'right']:
28 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
29
30
def fit(self, measure, wdata, vdata=None):
    """Fit the distribution to the data (abstract placeholder).

    Concrete distributions are expected to override this method. The
    arguments are accepted but never inspected here.

    :Parameter:
      measure
        Measure handed to the fitting procedure.
      wdata
        Working dataset.
      vdata
        Optional validation dataset (default: None).

    :Raises:
      NotImplementedError
        Always, since this version provides no fitting logic.
    """
    raise NotImplementedError
34
35
37 """Implementations return the value of the cumulative distribution
38 function (left or right tail depending on the setting).
39 """
40 raise NotImplementedError
41
42
43
45
46
47
48
49
50
51
52
53 """Class to determine the distribution of a measure under the NULL
54 distribution (no signal).
55
56 No assumptions are made about the shape of the distribution under the null
57 hypothesis. Instead this distribution is estimated by performing multiple
58 measurements with permuted `label` vectors, hence no or random signal.
59
60 The distribution is estimated by calling fit() with an appropriate
61 `DatasetMeasure` or `TransferError` instance and a training and a
62 validation dataset (in case of a `TransferError`). For a customizable
63 amount of cycles the training data labels are permuted and the
64 corresponding measure computed. In case of a `TransferError` this is the
65 error when predicting the *correct* labels of the validation dataset.
66
67 The distribution can be queried using the `cdf()` method, which can be
68 configured to report probabilities/frequencies from `left` or `right` tail,
69 i.e. fraction of the distribution that is lower or larger than some
70 critical value.
71
72 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
73 returns an array of featurewise probabilities/frequencies.
74 """
def __init__(self, permutations=1000, **kwargs):
    """Cheap initialization; no measurements are computed here.

    :Parameter:
      permutations: int
        Number of measurement runs with permuted label vectors that
        are performed to estimate the distribution under the null
        hypothesis.
      **kwargs
        Passed on unchanged to `Distribution.__init__`.
    """
    # Let the base class process its own arguments first.
    Distribution.__init__(self, **kwargs)

    # Number of permutations used to estimate the null distribution.
    self.__permutations = permutations

    # No samples are available until fit() has stored them.
    self.__dist_samples = None
90
91
def fit(self, measure, wdata, vdata=None):
    """Fit the distribution by repeatedly permuting the labels of the
    working dataset and recomputing the measure each time.

    :Parameter:
      measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
        Callable used to compute one sample per permutation. It is
        called as ``measure(wdata)`` when no validation dataset is
        given and as ``measure(vdata, wdata)`` otherwise.
      wdata: `Dataset` which gets permuted and used to compute the
        measure/transfer error multiple times.
      vdata: `Dataset` used for validation.
        If provided, measure is assumed to be a `TransferError` and
        working and validation dataset are passed on to it.

    The collected samples are stored on the instance as an array, but
    only if every permutation cycle completed. Any exception raised by
    `measure` propagates to the caller after the final
    ``permuteLabels(False, ...)`` call has been made on `wdata`.
    """
    # One entry per permutation cycle: the measure under randomized signal.
    dist_samples = []

    try:
        # The counter itself is not needed; range() keeps this loop
        # usable on both Python 2 and Python 3.
        for _ in range(self.__permutations):
            # Draw a fresh permutation of the labels on every cycle.
            wdata.permuteLabels(True, perchunk=False)

            if vdata is not None:
                # Validation dataset goes first, permuted data second.
                dist_samples.append(measure(vdata, wdata))
            else:
                dist_samples.append(measure(wdata))
    finally:
        # Always undo the permutation (presumably this restores the
        # original labels -- confirm against Dataset.permuteLabels), so
        # a failing measure does not leave the caller's dataset with
        # shuffled labels.
        wdata.permuteLabels(False, perchunk=False)

    # Store as an array so that comparisons against it are elementwise.
    self.__dist_samples = N.asarray(dist_samples)
131
132
134 """Returns the frequency/probability of a value `x` given the estimated
135 distribution. Returned values are determined left or right tailed
136 depending on the constructor setting.
137
138 In case a `FeaturewiseDatasetMeasure` was used to estimate the
139 distribution the method returns an array. In that case `x` can be
140 a scalar value or an array of a matching shape.
141 """
142 if self._tail == 'left':
143 return (self.__dist_samples <= x).mean(axis=0)
144 else:
145 return (self.__dist_samples >= x).mean(axis=0)
146
147
148
150 """Proxy/Adaptor class for SciPy distributions.
151
152 All distributions from SciPy's 'stats' module can be used with this class.
153
154 >>> import numpy as N
155 >>> from scipy import stats
156 >>> from mvpa.clfs.stats import FixedDist
157 >>>
158 >>> dist = FixedDist(stats.norm(loc=2, scale=4))
159 >>> dist.cdf(2)
160 array(0.5)
161 >>>
162 >>> dist.cdf(N.arange(5))
163 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
164 >>>
165 >>> dist = FixedDist(stats.norm(loc=2, scale=4), tail='right')
166 >>> dist.cdf(N.arange(5))
167 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
168 """
170 """
171 :Parameter:
172 dist: distribution object
173 This can be any object that has a `cdf()` method to report the
174 cumulative distribution function values.
175 """
176 Distribution.__init__(self, **kwargs)
177
178 self._dist = dist
179
180
def fit(self, measure, wdata, vdata=None):
    """Accept the fitting arguments and do nothing.

    The distribution is already fixed, so there is nothing to estimate;
    `measure`, `wdata` and `vdata` are ignored and None is returned.
    """
    return None
184
185
187 """Return value of the cumulative distribution function at `x`.
188 """
189 if self._tail == 'left':
190 return self._dist.cdf(x)
191 else:
192 return 1 - self._dist.cdf(x)
193