Source Code for Module mvpa.clfs.stats

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Estimator for classifier error distributions."""

__docformat__ = 'restructuredtext'

import numpy as N

from mvpa.base import externals, warning
from mvpa.misc.state import ClassWithCollections, StateVariable

if __debug__:
    from mvpa.base import debug


class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples, correction='clip'):
        """
        :Parameters:
          dist_samples : ndarray
            Samples to be used to assess the distribution.
          correction : {'clip'} or None, optional
            Determines the behavior when .cdf is queried. If None -- no
            correction is made. If 'clip' -- values are clipped to lie
            in the range [1/(N+2), (N+1)/(N+2)] (simply because
            non-parametric assessment lacks the power to resolve with
            higher precision in the tails, so 'imaginary' samples are
            placed in each of the two tails).
        """
        self._dist_samples = N.ravel(dist_samples)
        self._correction = correction

    def __repr__(self):
        return '%s(%r%s)' % (
            self.__class__.__name__,
            self._dist_samples,
            ('', ', correction=%r' % self._correction)
            [int(self._correction != 'clip')])

    @staticmethod
    def fit(dist_samples):
        return [dist_samples]

    def cdf(self, x):
        """Returns the cdf value at `x`.
        """
        dist_samples = self._dist_samples
        res = N.vectorize(lambda v: (dist_samples <= v).mean())(x)
        if self._correction == 'clip':
            nsamples = len(dist_samples)
            N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
        elif self._correction is None:
            pass
        else:
            raise ValueError, \
                  '%r is incorrect value for correction parameter of %s' \
                  % (self._correction, self.__class__.__name__)
        return res
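
# Example (illustrative sketch, not part of the original module): with four
# stored samples the empirical cdf reports sample fractions, and the 'clip'
# correction keeps queries in the tails away from exact 0 and 1:
#
#   np_dist = Nonparametric([1, 2, 3, 4])
#   np_dist.cdf(2.5)    # -> 0.5, since 2 of 4 samples are <= 2.5
#   np_dist.cdf(0)      # -> 1/6 rather than 0, i.e. 1/(nsamples+2)
#   np_dist.cdf(5)      # -> 5/6 rather than 1, i.e. (nsamples+1)/(nsamples+2)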


def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails
        contains 1s if the value was from the right tail, and 0s if it
        was from the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, cdf_min, cdf_max))

    # no escape but to assure that the CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # we need to halve the significance
            cdf *= 2

    # Assure that NaNs didn't get a significant value
    cdf[N.isnan(x)] = 1.0
    if is_scalar:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res
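
# Example (illustrative sketch, not part of the original module): p-values
# from a standard normal cdf for a clearly right-tail value of 2.0 --
# 'right' and 'any' report the one-tailed p, 'both' doubles it:
#
#   from scipy import stats
#   _pvalue(2.0, stats.norm().cdf, 'right')   # -> ~0.023
#   _pvalue(2.0, stats.norm().cdf, 'any')     # -> ~0.023
#   _pvalue(2.0, stats.norm().cdf, 'both')    # -> ~0.046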


class NullDist(ClassWithCollections):
    """Base class for null-hypothesis testing.

    """

    # Although the base class does not benefit from states, derived
    # classes do (MCNullDist). For the sake of avoiding multiple
    # inheritance and the associated headache -- let them all be
    # ClassWithCollections; the performance hit should be negligible
    # in most scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameters:
          tail : str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._setTail(tail)

    def __repr__(self, prefixes=[]):
        return super(NullDist, self).__repr__(
            prefixes=["tail=%s" % `self.__tail`] + prefixes)

    def _setTail(self, tail):
        # sanity check
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError, 'Unknown value "%s" to `tail` argument.' \
                  % tail
        self.__tail = tail

    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError

    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError

    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined from the left, right, or any tail
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda x: x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since the resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class : class
            This can be any class which provides parameter estimates via
            a `fit()` method to initialize the instance, and provides a
            `cdf(x)` method for estimating the value of x in the CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations : int
            This many permutations of labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        self.__permutations = permutations
        """Number of permutations to compute the estimate of the null
        distribution."""

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which
        repeatedly permute labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            Measure or TransferError instance used to compute all errors.
          wdata : `Dataset`
            Dataset which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset`
            Dataset used for validation. If provided, measure is assumed
            to be a `TransferError` and the working and validation
            datasets are passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # decide on the arguments to measure
        if not vdata is None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation every time, but only permute the training
            # data and keep the test data constant
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutations, p+1), cr=True)

            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has the `TransferError` interface
            dist_samples.append(measure(*measure_args))

        if __debug__:
            debug('STATMC', '')

        # restore original labels
        wdata.permuteLabels(False, perchunk=False)

        # store samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit a distribution per each element

        # decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar -- just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? maybe use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if self._dist is None or not len(self._dist):
            # XXX We might not want to discriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [dist.cdf(v) for v, dist in zip(x, self._dist)]
        return N.array(cdfs).reshape(xshape)

    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. clean() binds
        the stored distributions to an empty list to let gc reclaim the
        memory.
        """
        self._dist = []
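
# Example (illustrative sketch, not part of the original module; assumes a
# PyMVPA `TransferError` instance `terr` and training/validation datasets
# `wdata`/`vdata` already exist): estimate the null distribution from 200
# permutations and look up the p-value of an observed error:
#
#   null = MCNullDist(permutations=200, tail='left')
#   null.fit(terr, wdata, vdata)
#   observed_error = terr(vdata, wdata)
#   null.p(observed_error)    # fraction of permutation errors at or below it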


class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    0.5
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameters:
          dist : distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.
        """
        NullDist.__init__(self, **kwargs)

        self._dist = dist

    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)

    def __repr__(self, prefixes=[]):
        prefixes_ = ["dist=%s" % `self._dist`]
        return super(FixedNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """
    def fit(self, measure, wdata, vdata=None):
        """Cares about dimensionality of the feature space in measure
        """

        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:              # XXX
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'):        # frozen already
            dist_gen = dist_gen.dist    # rv_frozen at least has it ;)

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has a stability issue, just run
    #  python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care of that
    # manually here
    def cdf(self, x):
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_) > 1)
        # XXX there might be a better implementation (faster/more elegant)
        # using N.clip, the only problem is that instability results might
        # flip the sign arbitrarily
        if len(bad_values[0]):
            # in this distribution we have the mean at 0, so we can take
            # that easily into account
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad < 0] = 0.0
            cdf_bad[x_bad >= 0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0/N.sqrt(nfeatures)), {}
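
# Example (illustrative sketch, not part of the original module; assumes
# scipy.stats is available): for a measure over a hypothetical 100-feature
# dataset the adapted parameters are loc=0, scale=1/sqrt(100)=0.1, i.e. the
# null distribution tightens as dimensionality grows:
#
#   adist = AdaptiveNormal(scipy.stats.norm, tail='right')
#   adist._adapt(100, None, None)    # -> ((0, 0.1), {})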


if externals.exists('scipy'):
    from mvpa.support.stats import scipy
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking by the detection power of the values in
    the tail of a tentative distribution.
    """

    # We need a way to fixate estimation of some parameters
    # (e.g. mean) so let's create a simple proxy, or maybe a class
    # generator later on, which would take care of punishing change
    # from the 'right' arguments

    import scipy

    class rv_semifrozen(object):
        """Helper proxy-class to fit a distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under BSD license
        http://www.scipy.org/License_Compatibility
        """

        def __init__(self, dist, loc=None, scale=None, args=None):

            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            args_i = [i for i, v in enumerate(args) if v is None]
            self._fargs = (list(args+theta), args_i)
            """Arguments which should get some fixed value"""

        def __call__(self, *args, **kwargs):
            """Upon call mimic a call to get the actual rv_frozen distribution
            """
            return self._dist(*args, **kwargs)

        def nnlf(self, theta, x):
            # - sum (log pdf(x, theta), axis=0)
            #   where theta are the parameters (including loc and scale)
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args if they were fixed
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if N.any(cond0):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,)*(self.numargs-Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None:
                x0 = x0 + (loc0,)
            if fargs[-1] is None:
                x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),),
                                        disp=0)

            # reconstruct the full parameter set
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x

        def __setattr__(self, a, v):
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)

        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)
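
    # Example (illustrative sketch, not part of the original module): fit a
    # normal distribution to data while keeping its mean fixed at 0, so only
    # the scale (and no extra shape arguments) is optimized:
    #
    #   data = N.random.normal(loc=0, scale=3, size=1000)
    #   semifrozen = rv_semifrozen(scipy.stats.norm, loc=0)
    #   semifrozen.fit(data)    # -> array([ 0., ~3.]), i.e. (loc, scale)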

    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by the minimal reported distance after
               estimating the parameters of the distribution. Parameter
               `p=0.05` sets the threshold to reject the null-hypothesis
               that the distribution is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
               an incorrect kstest implementation and do not function
               properly
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. The entry 'scipy' adds all
            known in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
              data,
              distributions=['rdist',
                             ('rdist', {'name':'rdist_fixed',
                                        'loc': 0.0,
                                        'args': (10,)})],
              nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if not test in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # data sampled
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # let's see if the 'scipy' entry was in there
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError, \
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform actions which might puke for some distributions
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)
                # specify distribution 'optimizer'. If loc or scale was
                # provided, use the home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_, loc=loc_,
                                             scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # We need to compare detection under the given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
                    if __debug__:
                        res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__:
                        res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError,
                    NotImplementedError), e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [(D, dist_gen, dist_name, dist_params)]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s'
                          % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort in ascending order, so smaller is better
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' \
                      % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults-1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # return all the results
        return results


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to the plots of parametrized
            distributions
          legend : int
            Whether to provide a legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds the distance measure.
            3 -- adds tp/fp/fn in the case p is provided
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            red. False positives and false negatives are marked with
            triangle-up and triangle-down symbols correspondingly
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's
        x = hist[1]
        dx = x[expand_tails] - x[0]     # how much to expand tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- "
                    "nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1:
                label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under the given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn not by the centers of the bins but
            #       by the values in data in the ranges covered by those
            #       bins. Then it would correspond to the values mentioned
            #       in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitly -- marked by the color
                # of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)

#if True:
#    data = N.random.normal(size=(1000,1));
#    matches = matchDistribution(
#        data,
#        distributions=['scipy',
#                       ('norm', {'name':'norm_known',
#                                 'scale': 1.0,
#                                 'loc': 0.0})],
#        nsamples=30, test='p-roc', p=0.05)
#    P.figure(); plotDistributionMatches(data, matches, nbins=101,
#                                        p=0.05, legend=4, nbest=5)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be the defaults as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)
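
# Example (illustrative sketch, not part of the original module): a frozen
# scipy distribution has no fit() and gets wrapped into FixedNullDist, while
# a distribution class such as Nonparametric (which has fit()) is wrapped
# into MCNullDist:
#
#   autoNullDist(scipy.stats.norm(0, 1))    # -> FixedNullDist(...)
#   autoNullDist(Nonparametric)             # -> MCNullDist(...)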


# if no scipy, we need nanmean
def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis

def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean."""
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis)*1.0/Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis)/factor
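
# Example (illustrative sketch, not part of the original module): NaNs are
# excluded from both the sum and the count, so the mean is taken over the
# valid entries only:
#
#   nanmean(N.array([1.0, N.nan, 3.0]))    # -> 2.0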