
Source Code for Module mvpa.clfs.stats

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Estimator for classifier error distributions."""

__docformat__ = 'restructuredtext'

import numpy as N

from mvpa.base import externals, warning
from mvpa.misc.state import Stateful, StateVariable

if __debug__:
    from mvpa.base import debug


class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples):
        self._dist_samples = N.ravel(dist_samples)


    @staticmethod
    def fit(dist_samples):
        return [dist_samples]


    def cdf(self, x):
        """Returns the cdf value at `x`.
        """
        return N.vectorize(lambda v:(self._dist_samples <= v).mean())(x)
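
# Example (illustrative sketch, not executed): Nonparametric simply stores the
# samples and reports the empirical cdf, i.e. the fraction of stored values
# that are <= x.
#
#   samples = N.array([1, 2, 3, 4])
#   Nonparametric(samples).cdf(2.5)              # -> 0.5
#   Nonparametric(samples).cdf(N.array([0, 5]))  # -> array([ 0.,  1.])
#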

def _pvalue(x, cdf_func, tail, return_tails=False):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails contains
        1s for values that came from the right tail and 0s for values that
        came from the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            warning('Stability check of cdf %s failed. Min=%s, max=%s' % \
                  (cdf_func, cdf_min, cdf_max))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # we need to halve the significance
            cdf *= 2

    # Assure that NaNs didn't get a significant value
    cdf[N.isnan(x)] = 1.0
    if is_scalar: res = cdf[0]
    else:         res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res


class NullDist(Stateful):
    """Base class for null-hypothesis testing.

    """

    # Although the base class is not benefiting from states, derived
    # classes do (MCNullDist). For the sake of avoiding multiple
    # inheritance and the associated headache -- let them all be Stateful;
    # the performance hit should be negligible in most scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameters:
          tail : str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        Stateful.__init__(self, **kwargs)

        self._setTail(tail)


    def _setTail(self, tail):
        # sanity check
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError, 'Unknown value "%s" to `tail` argument.' \
                  % tail
        self.__tail = tail


    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError


    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError


    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined from the left, the right, or any tail,
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda x:x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since the resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class : class
            This can be any class which provides parameter estimation via a
            `fit()` method used to initialize the instance, and provides a
            `cdf(x)` method for estimating the value of x in the CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations : int
            This many permutations of labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        self.__permutations = permutations
        """Number of permutations used to compute the estimate of the null
        distribution."""

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata : `Dataset`
            Dataset which gets permuted and is used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset`
            Dataset used for validation. If provided, measure is assumed to
            be a `TransferError` and the working and validation datasets are
            passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # decide on the arguments to measure
        if not vdata is None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has the `TransferError` interface
            dist_samples.append(measure(*measure_args))

        # restore original labels
        wdata.permuteLabels(False, perchunk=False)

        # store samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit distribution per each element

        # to decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? maybe use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if self._dist is None or not len(self._dist):
            # XXX We might not want to discriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
        return N.array(cdfs).reshape(xshape)

    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean rebinds
        `_dist` to an empty list to let gc reclaim the memory.
        """
        self._dist = []
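
# Example (illustrative sketch, not executed): the typical workflow pairs
# MCNullDist with a TransferError (or any DatasetMeasure). The classifier
# and dataset names below are placeholders, not part of this module.
#
#   from mvpa.clfs.transerror import TransferError
#   terr = TransferError(clf)
#   null = MCNullDist(permutations=200, tail='left')
#   null.fit(terr, training_dataset, testing_dataset)
#   # p-value of the actually observed transfer error
#   p = null.p(terr(testing_dataset, training_dataset))
#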


class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    0.5
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameters:
          dist : distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.
        """
        NullDist.__init__(self, **kwargs)

        self._dist = dist

    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """
    def fit(self, measure, wdata, vdata=None):
        """Cares about dimensionality of the feature space in measure
        """

        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:              # XXX
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'): # frozen already
            dist_gen = dist_gen.dist     # rv_frozen at least has it ;)

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has a stability issue, just run
    #  python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care of that manually
    # here
    def cdf(self, x):
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_)>1)
        # XXX there might be a better implementation (faster/more elegant)
        #     using N.clip; the only problem is that instability results
        #     might flip the sign arbitrarily
        if len(bad_values[0]):
            # in this distribution we have mean at 0, so we can take that
            # easily into account
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad<0] = 0.0
            cdf_bad[x_bad>=0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive normal distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0/N.sqrt(nfeatures)), {}


if externals.exists('scipy'):
    import scipy.stats
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking by the detection power of the values in
    the tail of a tentative distribution.

    """

    # We need a way to fixate estimation of some parameters
    # (e.g. mean), so let's create a simple proxy, or maybe a class
    # generator later on, which would take care of penalizing changes
    # away from the 'right' (fixed) arguments

    import scipy

    class rv_semifrozen(object):
        """Helper proxy-class to fit distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under BSD license
        http://www.scipy.org/License_Compatibility
        """

        def __init__(self, dist, loc=None, scale=None, args=None):

            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            args_i = [i for i,v in enumerate(args) if v is None]
            self._fargs = (list(args+theta), args_i)
            """Arguments which should get some fixed value"""

        def nnlf(self, theta, x):
            # - sum (log pdf(x, theta), axis=0)
            #   where theta are the parameters (including loc and scale)
            #
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args if some were fixed
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if (N.any(cond0)):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,)*(self.numargs-Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None: x0 = x0 + (loc0,)
            if fargs[-1] is None: x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)

            # reconstruct back
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            #raise ValueError
            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x

        def __setattr__(self, a, v):
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)

        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)
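
    # Example (illustrative sketch, not executed): fit a normal distribution
    # to some samples while keeping its mean fixed at 0, so only the scale
    # (and any shape arguments) gets estimated. Variable names are
    # placeholders, not part of this module.
    #
    #   samples = N.random.normal(loc=0, scale=2, size=1000)
    #   norm_fixed_mean = rv_semifrozen(scipy.stats.norm, loc=0)
    #   params = norm_fixed_mean.fit(samples)    # -> approx. array([ 0.,  2.])
    #
    # matchDistribution() below uses the same mechanism whenever `loc`,
    # `scale` or `args` are supplied.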

    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by the minimal reported distance after
               estimating the parameters of the distribution. Parameter
               `p=0.05` sets the threshold to reject the null hypothesis
               that the distribution is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
               an incorrect kstest implementation and do not function
               properly
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. The entry 'scipy' adds all
            distributions known to scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
            data,
            distributions=['rdist',
                           ('rdist', {'name':'rdist_fixed',
                                      'loc': 0.0,
                                      'args': (10,)})],
            nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if not test in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # data sampled
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in the non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # lets see if a 'scipy' entry was in there
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d)==2 and isinstance(d[1], dict)):
                    raise ValueError,\
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform actions which might puke for some distributions
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)
                # specify distribution 'optimizer'. If loc or scale was
                # provided, use the home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # We need to compare detection under the given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
                    if __debug__: res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError), e: #Exception, e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [ (D, dist_gen, dist_name, dist_params) ]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort in ascending order, so smaller is better
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults>0:
                nnextbest = min(2, nresults-1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # return all the results
        return results


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot the best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to the plots of the parametrized
            distributions
          legend : int
            Whether to provide a legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds the distance measure
            3 -- tp/fp/fn in case p is provided
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            red. False positives and false negatives are marked as
            triangle-up and triangle-down symbols correspondingly
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's
        x = hist[1]
        dx = x[expand_tails] - x[0] # how much to expand tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w','r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1: label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under the given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                         tuple(map(N.sum, [tp,fp,fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn not by the centers of the bins but
            #       by the values in data in the ranges covered by those
            #       bins. Then it would correspond to the values mentioned
            #       in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitly -- marked by the color of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)


#if True:
#    data = N.random.normal(size=(1000,1));
#    matches = matchDistribution(
#        data,
#        distributions=['scipy',
#                       ('norm', {'name':'norm_known',
#                                 'scale': 1.0,
#                                 'loc': 0.0})],
#        nsamples=30, test='p-roc', p=0.05)
#    P.figure(); plotDistributionMatches(data, matches, nbins=101,
#                                        p=0.05, legend=4, nbest=5)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be the defaults as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)
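
# Example (illustrative sketch, not executed): autoNullDist() picks the
# wrapper based on what it is given -- the arguments below are placeholders.
#
#   autoNullDist(None)                      # -> None
#   autoNullDist(MCNullDist())              # NullDist instance -> returned unchanged
#   autoNullDist(scipy.stats.norm)          # has .fit -> MCNullDist(dist_class=norm)
#   autoNullDist(scipy.stats.norm(0, 1))    # frozen, only .cdf -> FixedNullDist
#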

# if no scipy, we need nanmean
def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis

def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Returns:
      m : float
        the mean."""
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis)*1.0/Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis)/factor
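
# Example (illustrative sketch, not executed): nanmean() simply ignores NaNs
# along the chosen axis.
#
#   a = N.array([1.0, N.nan, 3.0])
#   nanmean(a)            # -> 2.0, whereas N.mean(a) -> nan
#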