Package mvpa :: Package clfs :: Module stats
[hide private]
[frames | no frames]

Source Code for Module mvpa.clfs.stats

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Estimator for classifier error distributions.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import numpy as N 
 14   
 15   
class Distribution(object):
    """Common interface for distributions that a measure is tested against.

    The constructor only records which tail of the distribution should be
    reported. Derived classes have to implement `fit()` and `cdf()`; the
    versions provided here raise `NotImplementedError`.
    """

    def __init__(self, tail='left'):
        """Cheap initialization.

        :Parameter:
          tail: str ['left', 'right']
            Which tail of the distribution to report.

        :Raises ValueError: if `tail` is neither 'left' nor 'right'.
        """
        # sanity check before storing anything
        if tail not in ('left', 'right'):
            # Call-style raise is valid on Python 2 and Python 3 alike. The
            # operand is wrapped in a 1-tuple so that a tuple-valued `tail`
            # cannot confuse the %-formatting.
            raise ValueError('Unknown value "%s" to `tail` argument.'
                             % (tail,))

        self._tail = tail


    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data.

        :Raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError


    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).

        :Raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError
41 42 43
class MCNullDist(Distribution):
    # XXX this should be the baseclass of a bunch of tests with more
    # sophisticated tests, perhaps making more assumptions about the data
    # TODO invent derived classes that make use of some reasonable assumptions
    # e.g. a gaussian distribution of transfer errors under the null hypothesis
    # this would have the advantage that a model could be fitted to a much
    # lower number of transfer errors and therefore dramatically reduce the
    # necessary CPU time. This is almost trivial to do with
    # scipy.stats.norm.{fit,cdf}
    """Class to determine the distribution of a measure under the NULL
    distribution (no signal).

    No assumptions are made about the shape of the distribution under the null
    hypothesis. Instead this distribution is estimated by performing multiple
    measurements with permuted `label` vectors, hence no or random signal.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    amount of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from `left` or `right` tail,
    i.e. fraction of the distribution that is lower or larger than some
    critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    def __init__(self, permutations=1000, **kwargs):
        """Cheap initialization.

        :Parameter:
          permutations: int
            This many classification attempts with permuted label vectors
            will be performed to determine the distribution under the null
            hypothesis.

        Any additional keyword arguments are passed on to
        `Distribution.__init__()`.
        """
        Distribution.__init__(self, **kwargs)

        # samples of the measure under the null hypothesis; None until
        # fit() has been called
        self.__dist_samples = None
        # number of permutations used to estimate the null distribution
        self.__permutations = permutations


    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute the labels in the training dataset.

        :Parameter:
          measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
            Callable used to compute the measure (or transfer error) for
            every permutation.
          wdata: `Dataset` which gets permuted and used to compute the
            measure/transfer error multiple times. Its labels are permuted
            in place and restored before this method returns, even if
            `measure` raises.
          vdata: `Dataset` used for validation.
            If provided measure is assumed to be a `TransferError` and
            working and validation dataset are passed onto it.
        """
        # holds the measure values obtained with randomized signal
        dist_samples = []

        try:
            # estimate null-distribution
            for _ in range(self.__permutations):
                # new permutation all the time
                # but only permute the training data and keep the testdata
                # constant
                # TODO this really needs to be more clever! If data samples
                # are shuffled within a class it really makes no difference
                # for the classifier, hence the number of permutations to
                # estimate the null-distribution of transfer errors can be
                # reduced dramatically when the *right* permutations (the
                # ones that matter) are done.
                wdata.permuteLabels(True, perchunk=False)

                # compute and store the measure of this permutation
                if vdata is not None:
                    # assume it has `TransferError` interface
                    dist_samples.append(measure(vdata, wdata))
                else:
                    dist_samples.append(measure(wdata))
        finally:
            # restore original labels -- also when the measure failed, so
            # the caller's dataset is never left in a permuted state
            wdata.permuteLabels(False, perchunk=False)

        # store the samples only after a complete, successful run
        self.__dist_samples = N.asarray(dist_samples)


    def cdf(self, x):
        """Returns the frequency/probability of a value `x` given the estimated
        distribution. Returned values are determined left or right tailed
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.

        :Raises RuntimeError: if called before `fit()`.
        """
        if self.__dist_samples is None:
            # without samples the comparison below would fail with an
            # unrelated, hard to interpret error
            raise RuntimeError('Distribution has not been estimated yet. '
                               'Call fit() before cdf().')

        # fraction of null samples at or beyond `x`, computed along the
        # permutation axis
        if self._tail == 'left':
            return (self.__dist_samples <= x).mean(axis=0)
        else:
            return (self.__dist_samples >= x).mean(axis=0)
146 147 148
class FixedDist(Distribution):
    """Adaptor that exposes an already defined distribution through the
    `Distribution` interface.

    Every distribution from SciPy's 'stats' module can be wrapped this way.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedDist
    >>>
    >>> dist = FixedDist(stats.norm(loc=2, scale=4))
    >>> dist.cdf(2)
    array(0.5)
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.cdf(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """

    def __init__(self, dist, **kwargs):
        """
        :Parameter:
          dist: distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.

        Remaining keyword arguments are handed to `Distribution.__init__()`.
        """
        self._dist = dist

        Distribution.__init__(self, **kwargs)


    def fit(self, measure, wdata, vdata=None):
        """No-op: the wrapped distribution is fixed, so nothing is fitted."""
        return None


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.

        For the 'left' tail the value of the wrapped object's `cdf()` is
        returned as is; otherwise its complement (one minus that value).
        """
        left_tail = self._dist.cdf(x)

        if self._tail != 'left':
            return 1 - left_tail
        return left_tail
193