
Source Code for Module mvpa.clfs.stats

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Estimator for classifier error distributions."""

__docformat__ = 'restructuredtext'

import numpy as N

from mvpa.base import externals, warning
from mvpa.misc.state import ClassWithCollections, StateVariable

if __debug__:
    from mvpa.base import debug


class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples, correction='clip'):
        """
        :Parameters:
          dist_samples : ndarray
            Samples to be used to assess the distribution.
          correction : {'clip'} or None, optional
            Determines the behavior when .cdf is queried. If None -- no
            correction is made. If 'clip' -- values are clipped to lie
            in the range [1/(N+2), (N+1)/(N+2)] (simply because
            non-parametric assessment lacks the power to resolve with
            higher precision in the tails, so 'imaginary' samples are
            placed in each of the two tails).
        """
        self._dist_samples = N.ravel(dist_samples)
        self._correction = correction

    def __repr__(self):
        return '%s(%r%s)' % (
            self.__class__.__name__,
            self._dist_samples,
            ('', ', correction=%r' % self._correction)
            [int(self._correction != 'clip')])

    @staticmethod
    def fit(dist_samples):
        return [dist_samples]

    def cdf(self, x):
        """Returns the cdf value at `x`.
        """
        dist_samples = self._dist_samples
        res = N.vectorize(lambda v: (dist_samples <= v).mean())(x)
        if self._correction == 'clip':
            nsamples = len(dist_samples)
            N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
        elif self._correction is None:
            pass
        else:
            raise ValueError, \
                  '%r is an incorrect value for the correction parameter of %s' \
                  % (self._correction, self.__class__.__name__)
        return res

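# Illustrative sketch (not from the original sources): querying the
# empirical cdf, with the default 'clip' correction keeping the tails
# away from exact 0/1 so derived p-values stay non-degenerate. Sample
# values are hypothetical.
#
#   np_dist = Nonparametric(N.array([0.2, 0.3, 0.5, 0.7, 0.9]))
#   np_dist.cdf(0.6)    # 3 of 5 samples are <= 0.6 -> 0.6
#   np_dist.cdf(0.0)    # raw cdf 0.0, clipped up to 1/(5+2)
#   np_dist.cdf(1.0)    # raw cdf 1.0, clipped down to (5+1)/(5+2)
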

def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails
        contains 1s if the value came from the right tail, and 0s if it
        came from the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, cdf_min, cdf_max))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # we need to halve the significance
            cdf *= 2

    # Assure that NaNs don't get a significant value
    cdf[N.isnan(x)] = 1.0
    if is_scalar:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res

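# Illustrative sketch (not from the original sources) of the tail
# handling in _pvalue, reusing the Nonparametric cdf from above;
# `samples` is hypothetical:
#
#   samples = N.random.normal(size=200)
#   cdf = Nonparametric(samples).cdf
#   _pvalue(2.5, cdf, 'right')   # small p for a value in the right tail
#   _pvalue(2.5, cdf, 'left')    # ~1 minus the above
#   _pvalue(2.5, cdf, 'both')    # two-tailed: the one-tailed p doubled
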

class NullDist(ClassWithCollections):
    """Base class for null-hypothesis testing.
    """

    # Although the base class is not benefiting from states, derived
    # classes do (MCNullDist). For the sake of avoiding multiple
    # inheritance and the associated headache -- let them all be
    # ClassWithCollections; the performance hit should be negligible
    # in most scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameter:
          tail : str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._setTail(tail)

    def __repr__(self, prefixes=[]):
        return super(NullDist, self).__repr__(
            prefixes=["tail=%s" % `self.__tail`] + prefixes)

    def _setTail(self, tail):
        # sanity check
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError, 'Unknown value "%s" for the `tail` argument.' \
                  % tail
        self.__tail = tail

    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError

    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError

    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined from the left, the right, or any
        tail depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda x:x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since the resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class : class
            This can be any class which provides parameter estimates
            via a `fit()` method to initialize the instance, and provides
            a `cdf(x)` method for estimating the value of x in the CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations : int
            This many permutations of the labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        self.__permutations = permutations
        """Number of permutations used to estimate the null distribution."""

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata : `Dataset`
            Dataset which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset`
            Dataset used for validation. If provided, measure is assumed
            to be a `TransferError` and the working and validation datasets
            are passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # Needs to be imported here upon demand due to circular imports
        # TODO: place MC into a separate module
        from mvpa.clfs.base import DegenerateInputError, FailedToTrainError

        # decide on the arguments to measure
        if vdata is not None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        skipped = 0                     # number of skipped permutations
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i" \
                      % (self.__permutations, p+1), cr=True)

            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has a `TransferError` interface
            try:
                dist_samples.append(measure(*measure_args))
            except (DegenerateInputError, FailedToTrainError), e:
                if __debug__:
                    debug('STATMC', " skipped", cr=True)
                warning("Failed to estimate %s on %s, due to %s. "
                        "Permutation %d skipped." %
                        (measure, measure_args, e, p))
                skipped += 1
                continue

        if __debug__:
            debug('STATMC', ' Skipped: %d permutations' % skipped)

        # restore original labels
        wdata.permuteLabels(False, perchunk=False)

        # store samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit a distribution per each element

        # decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, so create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? maybe use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if self._dist is None:
            # XXX We might not want to discriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [dist.cdf(v) for v, dist in zip(x, self._dist)]
        return N.array(cdfs).reshape(xshape)

    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean binds
        `_dist` to an empty list to let gc reclaim the memory.
        """
        self._dist = []

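# Illustrative sketch (not from the original sources) of the intended
# MCNullDist workflow; `transfer_error`, `training_data` and
# `testing_data` are hypothetical stand-ins for a `TransferError` and
# two `Dataset`s:
#
#   null = MCNullDist(permutations=200, tail='left')
#   null.fit(transfer_error, training_data, testing_data)
#   err = transfer_error(testing_data, training_data)
#   null.p(err)   # probability of such an error under permuted labels
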

class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    1.0
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameter:
          dist : distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.
        """
        NullDist.__init__(self, **kwargs)

        self._dist = dist

    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)

    def __repr__(self, prefixes=[]):
        prefixes_ = ["dist=%s" % `self._dist`]
        return super(FixedNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """

    def fit(self, measure, wdata, vdata=None):
        """Cares about the dimensionality of the feature space in measure
        """

        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:              # XXX
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'): # frozen already
            dist_gen = dist_gen.dist     # rv_frozen at least has it ;)

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has a stability issue, just run
    #  python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care of that
    # manually here
    def cdf(self, x):
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_) > 1)
        # XXX there might be a better implementation (faster/more elegant)
        # using N.clip; the only problem is that instability results might
        # flip the sign arbitrarily
        if len(bad_values[0]):
            # in this distribution we have the mean at 0, so we can take
            # that easily into account
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad < 0] = 0.0
            cdf_bad[x_bad >= 0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0 / N.sqrt(nfeatures)), {}

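# Illustrative sketch (not from the original sources): for the mean of
# nfeatures i.i.d. unit-variance features the null standard deviation
# shrinks as 1/sqrt(nfeatures), which is what _adapt() above encodes;
# `measure` and `wdata` are hypothetical:
#
#   import scipy.stats
#   anorm = AdaptiveNormal(scipy.stats.norm, tail='right')
#   anorm.fit(measure, wdata)   # freezes norm(0, 1/sqrt(nfeatures))
#   anorm.p(0.1)                # p-value under the adapted normal
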

if externals.exists('scipy'):
    from mvpa.support.stats import scipy
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking by the detection power of the values in
    the tail of a tentative distribution.
    """

    # We need a way to fix the estimation of some parameters (e.g. the
    # mean), so let's create a simple proxy (or maybe a class generator
    # later on) which takes care of penalizing changes from the 'right'
    # arguments.

    import scipy

    class rv_semifrozen(object):
        """Helper proxy-class to fit a distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under the BSD license
        http://www.scipy.org/License_Compatibility
        """

        def __init__(self, dist, loc=None, scale=None, args=None):
            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            args_i = [i for i, v in enumerate(args) if v is None]
            self._fargs = (list(args + theta), args_i)
            """Arguments which should get some fixed value"""

        def __call__(self, *args, **kwargs):
            """Upon call mimic a call to get an actual rv_frozen distribution
            """
            return self._dist(*args, **kwargs)

        def nnlf(self, theta, x):
            # - sum(log pdf(x, theta), axis=0)
            #   where theta are the parameters (including loc and scale)
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args if they were fixed
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x - loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if N.any(cond0):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x) * N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,) * (self.numargs - Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None:
                x0 = x0 + (loc0,)
            if fargs[-1] is None:
                x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0,
                                        args=(N.ravel(data),), disp=0)

            # reconstruct back
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x

        def __setattr__(self, a, v):
            if a not in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)

        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if a not in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)

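    # Illustrative sketch (not from the original sources): fitting a
    # normal distribution while keeping its location fixed at 0, so only
    # the scale gets optimized; the data are hypothetical:
    #
    #   halffrozen = rv_semifrozen(scipy.stats.norm, loc=0)
    #   params = halffrozen.fit(N.random.normal(scale=3.0, size=1000))
    #   # params[-2:] -> (0, fitted_scale); the fixed loc is untouched
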

    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          args : tuple or None
            Additional shape arguments for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by the minimal reported distance after
               estimating the parameters of the distribution. The parameter
               `p=0.05` sets the threshold to reject the null-hypothesis
               that the distribution is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
               an incorrect kstest implementation and do not function
               properly
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. The entry 'scipy' adds all
            known in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
            data,
            distributions=['rdist',
                           ('rdist', {'name': 'rdist_fixed',
                                      'loc': 0.0,
                                      'args': (10,)})],
            nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if test not in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # data sampled
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # lets see if the 'scipy' entry was in there
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError, \
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform actions which might puke for some distributions
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)
                # specify the distribution 'optimizer'. If loc or scale was
                # provided, use the home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_,
                                             loc=loc_, scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # We need to compare detection under the given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
                    if __debug__: res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError,
                    NotImplementedError), e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [(D, dist_gen, dist_name, dist_params)]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s'
                          % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort in ascending order, so smaller is better
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults - 1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # return all the results
        return results

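    # Illustrative sketch (not from the original sources) of consuming
    # the return value: results are sorted in ascending order of the
    # test statistic D, so the first entry is the best match; `data` is
    # hypothetical:
    #
    #   matches = matchDistribution(data, test='kstest')
    #   if len(matches):
    #       D, dist_gen, dist_name, dist_params = matches[0]
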

if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot the best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to the plots of the parametrized
            distributions
          legend : int
            Whether to provide a legend, and how much detail to include:
            1 -- just lists the distributions.
            2 -- adds the distance measure.
            3 -- adds tp/fp/fn counts in the case p is provided.
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            red. False positives and false negatives are marked with
            triangle-up and triangle-down symbols respectively
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's
        x = hist[1]
        dx = x[expand_tails] - x[0]     # how much to expand the tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- "
                    "nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1:
                label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under the given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn not by the centers of the bins but
            #       by the values in data within the ranges covered by
            #       those bins. Then it would correspond to the values
            #       mentioned in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitly -- marked by the color
                # of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)


#if True:
#    data = N.random.normal(size=(1000,1));
#    matches = matchDistribution(
#        data,
#        distributions=['scipy',
#                       ('norm', {'name': 'norm_known',
#                                 'scale': 1.0,
#                                 'loc': 0.0})],
#        nsamples=30, test='p-roc', p=0.05)
#    P.figure(); plotDistributionMatches(data, matches, nbins=101,
#                                        p=0.05, legend=4, nbest=5)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist`, if needed, into an
    appropriate NullDist

    tail and other arguments are assumed to be the defaults as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)

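# Illustrative sketch (not from the original sources) of autoNullDist
# dispatch: anything with a .fit method (e.g. Nonparametric) gets
# wrapped into MCNullDist, frozen distributions into FixedNullDist:
#
#   autoNullDist(None)                    # passed through unchanged
#   autoNullDist(Nonparametric)           # -> MCNullDist(Nonparametric)
#   # autoNullDist(stats.norm(0, 1))      # -> FixedNullDist (needs scipy)
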

# if no scipy, we need nanmean
def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis

def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring NaNs.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean."""
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis)*1.0/Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis)/factor

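# Illustrative sketch (not from the original sources): nanmean ignores
# NaNs by zeroing them and rescaling the mean by the fraction of
# non-NaN entries per position:
#
#   x = N.array([[1., N.nan],
#                [3., 5.]])
#   nanmean(x, axis=0)   # -> array([ 2.,  5.])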