1
2
3
4
5
6
7
8
9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import ClassWithCollections, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
26 """
27
28 - def __init__(self, dist_samples, correction='clip'):
29 """
30 :Parameters:
31 dist_samples : ndarray
32 Samples to be used to assess the distribution.
33 correction : {'clip'} or None, optional
34 Determines the behavior when .cdf is queried. If None -- no
35 correction is made. If 'clip' -- values are clipped to lie
36 in the range [1/(N+2), (N+1)/(N+2)] (simply because
37 non-parametric assessment lacks the power to resolve with
38 higher precision in the tails, so 'imagery' samples are
39 placed in each of the two tails).
40 """
41 self._dist_samples = N.ravel(dist_samples)
42 self._correction = correction
43
45 return '%s(%r%s)' % (
46 self.__class__.__name__,
47 self._dist_samples,
48 ('', ', correction=%r' % self._correction)
49 [int(self._correction != 'clip')])
50
51 @staticmethod
52 - def fit(dist_samples):
54
55
57 """Returns the cdf value at `x`.
58 """
59 dist_samples = self._dist_samples
60 res = N.vectorize(lambda v:(dist_samples <= v).mean())(x)
61 if self._correction == 'clip':
62 nsamples = len(dist_samples)
63 N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
64 elif self._correction is None:
65 pass
66 else:
67 raise ValueError, \
68 '%r is incorrect value for correction parameter of %s' \
69 % (self._correction, self.__class__.__name__)
70 return res
71
72
73 -def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
74 """Helper function to return p-value(x) given cdf and tail
75
76 :Parameters:
77 cdf_func : callable
78 Function to be used to derive cdf values for x
79 tail : str ('left', 'right', 'any', 'both')
80 Which tail of the distribution to report. For 'any' and 'both'
81 it chooses the tail it belongs to based on the comparison to
82 p=0.5. In the case of 'any' significance is taken like in a
83 one-tailed test.
84 return_tails : bool
85 If True, a tuple return (pvalues, tails), where tails contain
86 1s if value was from the right tail, and 0 if the value was
87 from the left tail.
88 """
89 is_scalar = N.isscalar(x)
90 if is_scalar:
91 x = [x]
92
93 cdf = cdf_func(x)
94
95 if __debug__ and 'CHECK_STABILITY' in debug.active:
96 cdf_min, cdf_max = N.min(cdf), N.max(cdf)
97 if cdf_min < 0 or cdf_max > 1.0:
98 s = ('', ' for %s' % name)[int(name is not None)]
99 warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
100 (cdf_func, s, cdf_min, cdf_max))
101
102
103
104 cdf = N.clip(cdf, 0, 1.0)
105
106 if tail == 'left':
107 if return_tails:
108 right_tail = N.zeros(cdf.shape, dtype=bool)
109 elif tail == 'right':
110 cdf = 1 - cdf
111 if return_tails:
112 right_tail = N.ones(cdf.shape, dtype=bool)
113 elif tail in ('any', 'both'):
114 right_tail = (cdf >= 0.5)
115 cdf[right_tail] = 1.0 - cdf[right_tail]
116 if tail == 'both':
117
118 cdf *= 2
119
120
121 cdf[N.isnan(x)] = 1.0
122 if is_scalar: res = cdf[0]
123 else: res = cdf
124
125 if return_tails:
126 return (res, right_tail)
127 else:
128 return res
129
130
131 -class NullDist(ClassWithCollections):
132 """Base class for null-hypothesis testing.
133
134 """
135
136
137
138
139
140 _ATTRIBUTE_COLLECTIONS = ['states']
141
142 - def __init__(self, tail='both', **kwargs):
143 """Cheap initialization.
144
145 :Parameter:
146 tail: str ('left', 'right', 'any', 'both')
147 Which tail of the distribution to report. For 'any' and 'both'
148 it chooses the tail it belongs to based on the comparison to
149 p=0.5. In the case of 'any' significance is taken like in a
150 one-tailed test.
151 """
152 ClassWithCollections.__init__(self, **kwargs)
153
154 self._setTail(tail)
155
157 return super(NullDist, self).__repr__(
158 prefixes=["tail=%s" % `self.__tail`] + prefixes)
159
160
162
163 if tail not in ('left', 'right', 'any', 'both'):
164 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
165 % tail
166 self.__tail = tail
167
168
169 - def fit(self, measure, wdata, vdata=None):
170 """Implement to fit the distribution to the data."""
171 raise NotImplementedError
172
173
175 """Implementations return the value of the cumulative distribution
176 function (left or right tail dpending on the setting).
177 """
178 raise NotImplementedError
179
180
181 - def p(self, x, **kwargs):
182 """Returns the p-value for values of `x`.
183 Returned values are determined left, right, or from any tail
184 depending on the constructor setting.
185
186 In case a `FeaturewiseDatasetMeasure` was used to estimate the
187 distribution the method returns an array. In that case `x` can be
188 a scalar value or an array of a matching shape.
189 """
190 return _pvalue(x, self.cdf, self.__tail, **kwargs)
191
192
193 tail = property(fget=lambda x:x.__tail, fset=_setTail)
194
197 """Null-hypothesis distribution is estimated from randomly permuted data labels.
198
199 The distribution is estimated by calling fit() with an appropriate
200 `DatasetMeasure` or `TransferError` instance and a training and a
201 validation dataset (in case of a `TransferError`). For a customizable
202 amount of cycles the training data labels are permuted and the
203 corresponding measure computed. In case of a `TransferError` this is the
204 error when predicting the *correct* labels of the validation dataset.
205
206 The distribution can be queried using the `cdf()` method, which can be
207 configured to report probabilities/frequencies from `left` or `right` tail,
208 i.e. fraction of the distribution that is lower or larger than some
209 critical value.
210
211 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
212 returns an array of featurewise probabilities/frequencies.
213 """
214
215 _DEV_DOC = """
216 TODO automagically decide on the number of samples/permutations needed
217 Caution should be paid though since resultant distributions might be
218 quite far from some conventional ones (e.g. Normal) -- it is expected to
219 them to be bimodal (or actually multimodal) in many scenarios.
220 """
221
222 dist_samples = StateVariable(enabled=False,
223 doc='Samples obtained for each permutation')
224
226 """Initialize Monte-Carlo Permutation Null-hypothesis testing
227
228 :Parameters:
229 dist_class: class
230 This can be any class which provides parameters estimate
231 using `fit()` method to initialize the instance, and
232 provides `cdf(x)` method for estimating value of x in CDF.
233 All distributions from SciPy's 'stats' module can be used.
234 permutations: int
235 This many permutations of label will be performed to
236 determine the distribution under the null hypothesis.
237 """
238 NullDist.__init__(self, **kwargs)
239
240 self._dist_class = dist_class
241 self._dist = []
242
243 self.__permutations = permutations
244 """Number of permutations to compute the estimate the null
245 distribution."""
246
248 prefixes_ = ["permutations=%s" % self.__permutations]
249 if self._dist_class != Nonparametric:
250 prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
251 return super(MCNullDist, self).__repr__(
252 prefixes=prefixes_ + prefixes)
253
254
255 - def fit(self, measure, wdata, vdata=None):
256 """Fit the distribution by performing multiple cycles which repeatedly
257 permuted labels in the training dataset.
258
259 :Parameters:
260 measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
261 TransferError instance used to compute all errors.
262 wdata: `Dataset` which gets permuted and used to compute the
263 measure/transfer error multiple times.
264 vdata: `Dataset` used for validation.
265 If provided measure is assumed to be a `TransferError` and
266 working and validation dataset are passed onto it.
267 """
268 dist_samples = []
269 """Holds the values for randomized labels."""
270
271
272
273 from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
274
275
276 if not vdata is None:
277 measure_args = [vdata, wdata]
278 else:
279 measure_args = [wdata]
280
281
282 skipped = 0
283 for p in xrange(self.__permutations):
284
285
286
287 if __debug__:
288 debug('STATMC', "Doing %i permutations: %i" \
289 % (self.__permutations, p+1), cr=True)
290
291
292
293
294
295
296 wdata.permuteLabels(True, perchunk=False)
297
298
299
300 try:
301 dist_samples.append(measure(*measure_args))
302 except (DegenerateInputError, FailedToTrainError), e:
303 if __debug__:
304 debug('STATMC', " skipped", cr=True)
305 warning("Failed to estimate %s on %s, due to %s. "
306 "Permutation %d skipped." %
307 (measure, measure_args, e, p))
308 skipped += 1
309 continue
310
311 if __debug__:
312 debug('STATMC', ' Skipped: %d permutations' % skipped)
313
314
315 wdata.permuteLabels(False, perchunk=False)
316
317
318 self.dist_samples = dist_samples = N.asarray(dist_samples)
319
320
321
322
323 shape = dist_samples.shape
324 nshape = len(shape)
325
326
327 if nshape == 1:
328 dist_samples = dist_samples[:, N.newaxis]
329
330
331
332 dist_samples_rs = dist_samples.reshape((shape[0], -1))
333 dist = []
334 for samples in dist_samples_rs.T:
335 params = self._dist_class.fit(samples)
336 if __debug__ and 'STAT' in debug.active:
337 debug('STAT', 'Estimated parameters for the %s are %s'
338 % (self._dist_class, str(params)))
339 dist.append(self._dist_class(*params))
340 self._dist = dist
341
342
344 """Return value of the cumulative distribution function at `x`.
345 """
346 if self._dist is None:
347
348
349
350 raise RuntimeError, "Distribution has to be fit first"
351
352 is_scalar = N.isscalar(x)
353 if is_scalar:
354 x = [x]
355
356 x = N.asanyarray(x)
357 xshape = x.shape
358
359 x = x.reshape((-1,))
360
361 if len(self._dist) != len(x):
362 raise ValueError, 'Distribution was fit for structure with %d' \
363 ' elements, whenever now queried with %d elements' \
364 % (len(self._dist), len(x))
365
366
367 cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
368 return N.array(cdfs).reshape(xshape)
369
370
372 """Clean stored distributions
373
374 Storing all of the distributions might be too expensive
375 (e.g. in case of Nonparametric), and the scope of the object
376 might be too broad to wait for it to be destroyed. Clean would
377 bind dist_samples to empty list to let gc revoke the memory.
378 """
379 self._dist = []
380
384 """Proxy/Adaptor class for SciPy distributions.
385
386 All distributions from SciPy's 'stats' module can be used with this class.
387
388 >>> import numpy as N
389 >>> from scipy import stats
390 >>> from mvpa.clfs.stats import FixedNullDist
391 >>>
392 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
393 >>> dist.p(2)
394 0.5
395 >>>
396 >>> dist.cdf(N.arange(5))
397 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
398 >>>
399 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
400 >>> dist.p(N.arange(5))
401 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
402 """
404 """
405 :Parameter:
406 dist: distribution object
407 This can be any object the has a `cdf()` method to report the
408 cumulative distribition function values.
409 """
410 NullDist.__init__(self, **kwargs)
411
412 self._dist = dist
413
414
415 - def fit(self, measure, wdata, vdata=None):
416 """Does nothing since the distribution is already fixed."""
417 pass
418
419
421 """Return value of the cumulative distribution function at `x`.
422 """
423 return self._dist.cdf(x)
424
425
427 prefixes_ = ["dist=%s" % `self._dist`]
428 return super(FixedNullDist, self).__repr__(
429 prefixes=prefixes_ + prefixes)
430
433 """Adaptive distribution which adjusts parameters according to the data
434
435 WiP: internal implementation might change
436 """
437 - def fit(self, measure, wdata, vdata=None):
438 """Cares about dimensionality of the feature space in measure
439 """
440
441 try:
442 nfeatures = len(measure.feature_ids)
443 except ValueError:
444 nfeatures = N.prod(wdata.shape[1:])
445
446 dist_gen = self._dist
447 if not hasattr(dist_gen, 'fit'):
448 dist_gen = dist_gen.dist
449
450 args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
451 if __debug__:
452 debug('STAT', 'Adapted parameters for %s to be %s, %s'
453 % (dist_gen, args, kwargs))
454 self._dist = dist_gen(*args, **kwargs)
455
456
457 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
458 raise NotImplementedError
459
462 """Adaptive rdist: params are (nfeatures-1, 0, 1)
463 """
464
465 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
467
468
469
470
471
473 cdf_ = self._dist.cdf(x)
474 bad_values = N.where(N.abs(cdf_)>1)
475
476
477
478 if len(bad_values[0]):
479
480
481 cdf_bad = cdf_[bad_values]
482 x_bad = x[bad_values]
483 cdf_bad[x_bad<0] = 0.0
484 cdf_bad[x_bad>=0] = 1.0
485 cdf_[bad_values] = cdf_bad
486 return cdf_
487
490 """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
491 """
492
493 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
495
496
497 if externals.exists('scipy'):
498 from mvpa.support.stats import scipy
499 from scipy.stats import kstest
500 """
501 Thoughts:
502
503 So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
504 check/reject H0 that samples come from a given distribution. But
505 since it is based on a full range of data, we might better of with
506 some ad-hoc checking by the detection power of the values in the
507 tail of a tentative distribution.
508
509 """
510
511
512
513
514
515
516 import scipy
519 """Helper proxy-class to fit distribution when some parameters are known
520
521 It is an ugly hack with snippets of code taken from scipy, which is
522 Copyright (c) 2001, 2002 Enthought, Inc.
523 and is distributed under BSD license
524 http://www.scipy.org/License_Compatibility
525 """
526
527 - def __init__(self, dist, loc=None, scale=None, args=None):
528
529 self._dist = dist
530
531 theta = (loc, scale)
532
533 Narg_ = dist.numargs
534 if args is not None:
535 Narg = len(args)
536 if Narg > Narg_:
537 raise ValueError, \
538 'Distribution %s has only %d arguments. Got %d' \
539 % (dist, Narg_, Narg)
540 args += (None,) * (Narg_ - Narg)
541 else:
542 args = (None,) * Narg_
543
544 args_i = [i for i,v in enumerate(args) if v is None]
545 self._fargs = (list(args+theta), args_i)
546 """Arguments which should get some fixed value"""
547
548
550 """Upon call mimic call to get actual rv_frozen distribution
551 """
552 return self._dist(*args, **kwargs)
553
554
555 - def nnlf(self, theta, x):
556
557
558
559 fargs, fargs_i = self._fargs
560 try:
561 i=-1
562 if fargs[-1] is not None:
563 scale = fargs[-1]
564 else:
565 scale = theta[i]
566 i -= 1
567
568 if fargs[-2] is not None:
569 loc = fargs[-2]
570 else:
571 loc = theta[i]
572 i -= 1
573
574 args = theta[:i+1]
575
576 for i,a in zip(fargs_i, args):
577 fargs[i] = a
578 args = fargs[:-2]
579
580 except IndexError:
581 raise ValueError, "Not enough input arguments."
582 if not self._argcheck(*args) or scale <= 0:
583 return N.inf
584 x = N.asarray((x-loc) / scale)
585 cond0 = (x <= self.a) | (x >= self.b)
586 if (N.any(cond0)):
587 return N.inf
588 else:
589 return self._nnlf(x, *args) + len(x)*N.log(scale)
590
591 - def fit(self, data, *args, **kwds):
592 loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
593 fargs, fargs_i = self._fargs
594 Narg = len(args)
595 Narg_ = self.numargs
596 if Narg != Narg_:
597 if Narg > Narg_:
598 raise ValueError, "Too many input arguments."
599 else:
600 args += (1.0,)*(self.numargs-Narg)
601
602
603
604 if len(fargs_i) != Narg_:
605 x0 = tuple([args[i] for i in fargs_i])
606 else:
607 x0 = args
608 if fargs[-2] is None: x0 = x0 + (loc0,)
609 if fargs[-1] is None: x0 = x0 + (scale0,)
610
611 opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)
612
613
614 i = 0
615 loc, scale = fargs[-2:]
616 if fargs[-1] is None:
617 i -= 1
618 scale = opt_x[i]
619 if fargs[-2] is None:
620 i -= 1
621 loc = opt_x[i]
622
623
624 for i in fargs_i:
625 fargs[i] = opt_x[i]
626
627
628 opt_x = N.hstack((fargs[:-2], (loc, scale)))
629 return opt_x
630
631
633 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
634 self._dist.__setattr__(a, v)
635 else:
636 object.__setattr__(self, a, v)
637
638
640 """We need to redirect all queries correspondingly
641 """
642 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
643 return getattr(self._dist, a)
644 else:
645 return object.__getattribute__(self, a)
646
647
648
649 - def matchDistribution(data, nsamples=None, loc=None, scale=None,
650 args=None, test='kstest', distributions=None,
651 **kwargs):
652 """Determine best matching distribution.
653
654 Can be used for 'smelling' the data, as well to choose a
655 parametric distribution for data obtained from non-parametric
656 testing (e.g. `MCNullDist`).
657
658 WiP: use with caution, API might change
659
660 :Parameters:
661 data : N.ndarray
662 Array of the data for which to deduce the distribution. It has
663 to be sufficiently large to make a reliable conclusion
664 nsamples : int or None
665 If None -- use all samples in data to estimate parametric
666 distribution. Otherwise use only specified number randomly selected
667 from data.
668 loc : float or None
669 Loc for the distribution (if known)
670 scale : float or None
671 Scale for the distribution (if known)
672 test : basestring
673 What kind of testing to do. Choices:
674 'p-roc' : detection power for a given ROC. Needs two
675 parameters: `p=0.05` and `tail='both'`
676 'kstest' : 'full-body' distribution comparison. The best
677 choice is made by minimal reported distance after estimating
678 parameters of the distribution. Parameter `p=0.05` sets
679 threshold to reject null-hypothesis that distribution is the
680 same.
681 WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
682 incorrect kstest implementation and do not function
683 properly
684 distributions : None or list of basestring or tuple(basestring, dict)
685 Distributions to check. If None, all known in scipy.stats
686 are tested. If distribution is specified as a tuple, then
687 it must contain name and additional parameters (name, loc,
688 scale, args) in the dictionary. Entry 'scipy' adds all known
689 in scipy.stats.
690 **kwargs
691 Additional arguments which are needed for each particular test
692 (see above)
693
694 :Example:
695 data = N.random.normal(size=(1000,1));
696 matches = matchDistribution(
697 data,
698 distributions=['rdist',
699 ('rdist', {'name':'rdist_fixed',
700 'loc': 0.0,
701 'args': (10,)})],
702 nsamples=30, test='p-roc', p=0.05)
703 """
704
705
706 _KNOWN_TESTS = ['p-roc', 'kstest']
707 if not test in _KNOWN_TESTS:
708 raise ValueError, 'Unknown kind of test %s. Known are %s' \
709 % (test, _KNOWN_TESTS)
710
711 data = N.ravel(data)
712
713 if nsamples is not None:
714 if __debug__:
715 debug('STAT', 'Sampling %d samples from data for the ' \
716 'estimation of the distributions parameters' % nsamples)
717 indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
718 data_selected = data[indexes_selected]
719 else:
720 indexes_selected = N.arange(len(data))
721 data_selected = data
722
723 p_thr = kwargs.get('p', 0.05)
724 if test == 'p-roc':
725 tail = kwargs.get('tail', 'both')
726 data_p = _pvalue(data, Nonparametric(data).cdf, tail)
727 data_p_thr = N.abs(data_p) <= p_thr
728 true_positives = N.sum(data_p_thr)
729 if true_positives == 0:
730 raise ValueError, "Provided data has no elements in non-" \
731 "parametric distribution under p<=%.3f. Please " \
732 "increase the size of data or value of p" % p
733 if __debug__:
734 debug('STAT_', 'Number of positives in non-parametric '
735 'distribution is %d' % true_positives)
736
737 if distributions is None:
738 distributions = ['scipy']
739
740
741 try:
742 scipy_ind = distributions.index('scipy')
743 distributions.pop(scipy_ind)
744 distributions += scipy.stats.distributions.__all__
745 except ValueError:
746 pass
747
748 results = []
749 for d in distributions:
750 dist_gen, loc_, scale_, args_ = None, loc, scale, args
751 if isinstance(d, basestring):
752 dist_gen = d
753 dist_name = d
754 elif isinstance(d, tuple):
755 if not (len(d)==2 and isinstance(d[1], dict)):
756 raise ValueError,\
757 "Tuple specification of distribution must be " \
758 "(d, {params}). Got %s" % (d,)
759 dist_gen = d[0]
760 loc_ = d[1].get('loc', loc)
761 scale_ = d[1].get('scale', scale)
762 args_ = d[1].get('args', args)
763 dist_name = d[1].get('name', str(dist_gen))
764 else:
765 dist_gen = d
766 dist_name = str(d)
767
768
769 try:
770 dist_gen_ = getattr(scipy.stats, dist_gen)
771
772
773 if args_ is not None or loc_ is not None or scale_ is not None:
774 dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
775 else:
776 dist_opt = dist_gen_
777 dist_params = dist_opt.fit(data_selected)
778 if __debug__:
779 debug('STAT__',
780 'Got distribution parameters %s for %s'
781 % (dist_params, dist_name))
782 if test == 'p-roc':
783 cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
784
785 cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
786 cdf_p_thr = cdf_p <= p_thr
787 D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
788 if __debug__: res_sum = 'D=%.2f' % D
789 elif test == 'kstest':
790 D, p = kstest(data, d, args=dist_params)
791 if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
792 except (TypeError, ValueError, AttributeError,
793 NotImplementedError), e:
794 if __debug__:
795 debug('STAT__',
796 'Testing for %s distribution failed due to %s'
797 % (d, str(e)))
798 continue
799
800 if p > p_thr and not N.isnan(D):
801 results += [ (D, dist_gen, dist_name, dist_params) ]
802 if __debug__:
803 debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
804 else:
805 if __debug__:
806 debug('STAT__', 'Cannot consider %s dist. with %s'
807 % (d, res_sum))
808 continue
809
810
811 results.sort()
812
813 if __debug__ and 'STAT' in debug.active:
814
815 nresults = len(results)
816 sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
817 if nresults>0:
818 nnextbest = min(2, nresults-1)
819 snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
820 debug('STAT', 'Best distribution %s. Next best: %s'
821 % (sresult(results[0]), snextbest))
822 else:
823 debug('STAT', 'Could not find suitable distribution')
824
825
826 return results
827
828
829 if externals.exists('pylab'):
830 import pylab as P
831
832 - def plotDistributionMatches(data, matches, nbins=31, nbest=5,
833 expand_tails=8, legend=2, plot_cdf=True,
834 p=None, tail='both'):
835 """Plot best matching distributions
836
837 :Parameters:
838 data : N.ndarray
839 Data which was used to obtain the matches
840 matches : list of tuples
841 Sorted matches as provided by matchDistribution
842 nbins : int
843 Number of bins in the histogram
844 nbest : int
845 Number of top matches to plot
846 expand_tails : int
847 How many bins away to add to parametrized distributions
848 plots
849 legend : int
850 Either to provide legend and statistics in the legend.
851 1 -- just lists distributions.
852 2 -- adds distance measure
853 3 -- tp/fp/fn in the case if p is provided
854 plot_cdf : bool
855 Either to plot cdf for data using non-parametric distribution
856 p : float or None
857 If not None, visualize null-hypothesis testing (given p).
858 Bars in the histogram which fall under given p are colored
859 in red. False positives and false negatives are marked as
860 triangle up and down symbols correspondingly
861 tail : ('left', 'right', 'any', 'both')
862 If p is not None, the choise of tail for null-hypothesis
863 testing
864
865 :Returns: tuple(histogram, list of lines)
866 """
867
868 hist = P.hist(data, nbins, normed=1, align='center')
869 data_range = [N.min(data), N.max(data)]
870
871
872 x = hist[1]
873 dx = x[expand_tails] - x[0]
874 x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))
875
876 nonparam = Nonparametric(data)
877
878 if plot_cdf:
879 P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)
880
881 p_thr = p
882
883 data_p = _pvalue(data, nonparam.cdf, tail)
884 data_p_thr = (data_p <= p_thr).ravel()
885
886 x_p = _pvalue(x, Nonparametric(data).cdf, tail)
887 x_p_thr = N.abs(x_p) <= p_thr
888
889 for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
890 bar.set_facecolor(('w','r')[int(thr)])
891
892 if not len(matches):
893
894 warning("No matching distributions were provided -- nothing to plot")
895 return (hist, )
896
897 lines = []
898 labels = []
899 for i in xrange(min(nbest, len(matches))):
900 D, dist_gen, dist_name, params = matches[i]
901 dist = getattr(scipy.stats, dist_gen)(*params)
902
903 label = '%s' % (dist_name)
904 if legend > 1: label += '(D=%.2f)' % (D)
905
906 xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
907 xcdf_p_thr = (xcdf_p <= p_thr).ravel()
908
909 if p is not None and legend > 2:
910
911 data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
912 data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()
913
914
915 tp = N.logical_and(data_cdf_p_thr, data_p_thr)
916
917 fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
918
919 fn = N.logical_and(~data_cdf_p_thr, data_p_thr)
920
921 label += ' tp/fp/fn=%d/%d/%d)' % \
922 tuple(map(N.sum, [tp,fp,fn]))
923
924 pdf = dist.pdf(x)
925 line = P.plot(x, pdf, '-', linewidth=2, label=label)
926 color = line[0].get_color()
927
928 if plot_cdf:
929 cdf = dist.cdf(x)
930 P.plot(x, cdf, ':', linewidth=1, color=color, label=label)
931
932
933
934
935
936 if p is not None:
937
938 xtp = N.logical_and(xcdf_p_thr, x_p_thr)
939
940 xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
941
942 xfn = N.logical_and(~xcdf_p_thr, x_p_thr)
943
944
945
946 P.plot(x[xfp], pdf[xfp], '^', color=color)
947 P.plot(x[xfn], pdf[xfn], 'v', color=color)
948
949 lines.append(line)
950 labels.append(label)
951
952 if legend:
953 P.legend(lines, labels)
954
955 return (hist, lines)
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970 -def autoNullDist(dist):
971 """Cheater for human beings -- wraps `dist` if needed with some
972 NullDist
973
974 tail and other arguments are assumed to be default as in
975 NullDist/MCNullDist
976 """
977 if dist is None or isinstance(dist, NullDist):
978 return dist
979 elif hasattr(dist, 'fit'):
980 if __debug__:
981 debug('STAT', 'Wrapping %s into MCNullDist' % dist)
982 return MCNullDist(dist)
983 else:
984 if __debug__:
985 debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
986 return FixedNullDist(dist)
987
991 if axis is None:
992 a = N.ravel(a)
993 outaxis = 0
994 else:
995 a = N.asarray(a)
996 outaxis = axis
997 return a, outaxis
998
1000 """Compute the mean over the given axis ignoring nans.
1001
1002 :Parameters:
1003 x : ndarray
1004 input array
1005 axis : int
1006 axis along which the mean is computed.
1007
1008 :Results:
1009 m : float
1010 the mean."""
1011 x, axis = _chk_asarray(x,axis)
1012 x = x.copy()
1013 Norig = x.shape[axis]
1014 factor = 1.0-N.sum(N.isnan(x),axis)*1.0/Norig
1015
1016 x[N.isnan(x)] = 0
1017 return N.mean(x,axis)/factor
1018