9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import ClassWithCollections, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
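
    A minimal illustration (the empirical cdf of 100 integer samples,
    using the default 'clip' correction)::

        >>> npdist = Nonparametric(N.arange(100))
        >>> float(npdist.cdf(49.5))
        0.5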
26 """

    def __init__(self, dist_samples, correction='clip'):
        """
        :Parameters:
          dist_samples : ndarray
            Samples to be used to assess the distribution.
          correction : {'clip'} or None, optional
            Determines the behavior when .cdf is queried. If None -- no
            correction is made. If 'clip' -- values are clipped to lie
            in the range [1/(N+2), (N+1)/(N+2)] (simply because
            non-parametric assessment lacks the power to resolve with
            higher precision in the tails, so imaginary samples are
            placed in each of the two tails).
        """
        self._dist_samples = N.ravel(dist_samples)
        self._correction = correction

    def __repr__(self):
        return '%s(%r%s)' % (
            self.__class__.__name__,
            self._dist_samples,
            ('', ', correction=%r' % self._correction)
            [int(self._correction != 'clip')])

    @staticmethod
    def fit(dist_samples):
        # the samples themselves are the only "parameter" needed to
        # construct a Nonparametric instance
        return [dist_samples]

57 """Returns the cdf value at `x`.
58 """
59 dist_samples = self._dist_samples
60 res = N.vectorize(lambda v:(dist_samples <= v).mean())(x)
61 if self._correction == 'clip':
62 nsamples = len(dist_samples)
63 N.clip(res, 1.0/(nsamples+2), (nsamples+1.0)/(nsamples+2), res)
64 elif self._correction is None:
65 pass
66 else:
67 raise ValueError, \
68 '%r is incorrect value for correction parameter of %s' \
69 % (self._correction, self.__class__.__name__)
70 return res


def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      x : scalar or ndarray
        Value(s) for which to compute the p-value(s).
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails
        contains 1s if the value was from the right tail, and 0s if
        the value was from the left tail.
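
    A small sketch using the non-parametric cdf defined above::

        >>> round(float(_pvalue(5, Nonparametric(N.arange(10)).cdf, 'right')), 4)
        0.4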
88 """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s'
                    % (cdf_func, s, cdf_min, cdf_max))

    # assure that cdf values lie within [0, 1] before deriving p-values
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # in a two-tailed test the p-value doubles
            cdf *= 2

    # p-values are not defined for NaN inputs -- assign 1.0
    cdf[N.isnan(x)] = 1.0
    if is_scalar:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res


class NullDist(ClassWithCollections):
    """Base class for null-hypothesis testing.

    """

    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameter:
          tail : str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._setTail(tail)

    def __repr__(self, prefixes=[]):
        return super(NullDist, self).__repr__(
            prefixes=["tail=%r" % self.__tail] + prefixes)

    def _setTail(self, tail):
        # assure that the tail is one of the known literals
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError('Unknown value "%s" to `tail` argument.'
                             % tail)
        self.__tail = tail

    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError

    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError

    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.

        Returned values are determined from the left, the right, or any
        tail depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)

    tail = property(fget=lambda x: x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in the case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In the case of a `TransferError` this is
    the error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
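
    A sketch of typical use with a `TransferError` (`transfer_error`,
    `working_ds`, `validation_ds` and `observed_error` are illustrative
    names, not part of this module)::

        null = MCNullDist(permutations=200, tail='left')
        null.fit(transfer_error, working_ds, validation_ds)
        p = null.p(observed_error)  # p-value of the observed transfer error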
213 """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since the resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class : class
            This can be any class which provides parameters estimated
            via a `fit()` method used to initialize the instance, and
            which provides a `cdf(x)` method for estimating the value of
            x in the CDF. All distributions from SciPy's 'stats' module
            can be used.
          permutations : int
            This many permutations of the labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []

        self.__permutations = permutations
        """Number of permutations used to estimate the null distribution."""

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%r' % self._dist_class)
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)

    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute the labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata : `Dataset`
            Dataset which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset`
            Dataset used for validation.
            If provided, measure is assumed to be a `TransferError` and
            the working and validation datasets are passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # decide on the arguments for the measure
        if vdata is not None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate the distribution from the requested number of
        # permutations
        for p in xrange(self.__permutations):
            if __debug__:
                debug('STATMC', "Doing %i permutations: %i"
                      % (self.__permutations, p+1), cr=True)

            # new permutation of the training labels on every cycle
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            dist_samples.append(measure(*measure_args))

        if __debug__:
            debug('STATMC', '')

        # restore the original labels
        wdata.permuteLabels(False, perchunk=False)

        # store the samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # if the measure returned scalars, add an artificial dimension
        # so that the fitting below can treat scalar and featurewise
        # measures uniformly
        shape = dist_samples.shape
        nshape = len(shape)
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit the distribution per each element
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if self._dist is None or not len(self._dist):
            # also catches the empty list left behind by clean()
            raise RuntimeError("Distribution has to be fit first")

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure that x is a 1D array for the element-wise lookup
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError('Distribution was fit for a structure with %d'
                             ' elements, but is now queried with %d elements'
                             % (len(self._dist), len(x)))

        # extract the cdf value per each element
        cdfs = [dist.cdf(v) for v, dist in zip(x, self._dist)]
        return N.array(cdfs).reshape(xshape)

    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean binds
        the stored distributions to an empty list to let the garbage
        collector reclaim the memory.
        """
        self._dist = []
370 """Proxy/Adaptor class for SciPy distributions.
371
372 All distributions from SciPy's 'stats' module can be used with this class.
373
374 >>> import numpy as N
375 >>> from scipy import stats
376 >>> from mvpa.clfs.stats import FixedNullDist
377 >>>
378 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
379 >>> dist.p(2)
380 0.5
381 >>>
382 >>> dist.cdf(N.arange(5))
383 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
384 >>>
385 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
386 >>> dist.p(N.arange(5))
387 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
388 """
390 """
391 :Parameter:
392 dist: distribution object
393 This can be any object the has a `cdf()` method to report the
394 cumulative distribition function values.
395 """
396 NullDist.__init__(self, **kwargs)
397
398 self._dist = dist
399

    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass

    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)

    def __repr__(self, prefixes=[]):
        prefixes_ = ["dist=%r" % self._dist]
        return super(FixedNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """

    def fit(self, measure, wdata, vdata=None):
        """Cares about the dimensionality of the feature space in measure
        """
        # figure out the dimensionality of the feature space
        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'):   # frozen distribution
            dist_gen = dist_gen.dist       # rv_frozen carries the generator

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    def cdf(self, x):
        """cdf which fixes up values outside of [0, 1].

        Some versions of scipy's rdist may return cdf values outside of
        [0, 1] in the far tails; those are mapped to 0 or 1 depending on
        the sign of x.
        """
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_) > 1)
        if len(bad_values[0]):
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad < 0] = 0.0
            cdf_bad[x_bad >= 0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
477 """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0 / N.sqrt(nfeatures)), {}


if externals.exists('scipy'):
    from mvpa.support.stats import scipy
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad hoc checking of the detection power of the values in
    the tail of a tentative distribution.
    """

    # scipy.optimize.fmin is used by rv_semifrozen.fit below
    import scipy
    import scipy.optimize
505 """Helper proxy-class to fit distribution when some parameters are known
506
507 It is an ugly hack with snippets of code taken from scipy, which is
508 Copyright (c) 2001, 2002 Enthought, Inc.
509 and is distributed under BSD license
510 http://www.scipy.org/License_Compatibility
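
        For example, to keep `loc` fixed at 0.0 while fitting the remaining
        parameters of a distribution (an illustrative sketch; `data` is any
        1d array)::

            semi = rv_semifrozen(scipy.stats.norm, loc=0.0)
            params = semi.fit(data)  # optimizes only the free parameters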
511 """

        def __init__(self, dist, loc=None, scale=None, args=None):
            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # number of shape arguments the distribution takes
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError(
                        'Distribution %s has only %d arguments. Got %d'
                        % (dist, Narg_, Narg))
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            # indices of the arguments which are still free (not fixed)
            args_i = [i for i, v in enumerate(args) if v is None]
            self._fargs = (list(args + theta), args_i)
            """Arguments which should get some fixed value"""

        def __call__(self, *args, **kwargs):
            """Upon call mimic call to get actual rv_frozen distribution
            """
            return self._dist(*args, **kwargs)

        def nnlf(self, theta, x):
            # negative log likelihood:
            #   -sum(log pdf(x, theta), axis=0)
            # where theta are the free parameters (shape args, loc, scale)
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]

                # fill in the fixed arguments
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError("Not enough input arguments.")
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x - loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if N.any(cond0):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x) * N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError("Too many input arguments.")
                else:
                    args += (1.0,) * (self.numargs - Narg)

            # provide initial values only for the free arguments,
            # appending loc and scale if they are not fixed
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None: x0 = x0 + (loc0,)
            if fargs[-1] is None: x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0,
                                        args=(N.ravel(data),), disp=0)

            # reconstruct loc and scale: either fixed or optimized
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign the optimized values to the free arguments
            for i in fargs_i:
                fargs[i] = opt_x[i]

            # and compose the final result
            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x

        def __setattr__(self, a, v):
            if a not in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)

        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if a not in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)


    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion.
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
            'p-roc' -- detection power for a given ROC. Needs two
            parameters: `p=0.05` and `tail='both'`.
            'kstest' -- 'full-body' distribution comparison. The best
            choice is made by the minimal reported distance after
            estimating the parameters of the distribution. Parameter
            `p=0.05` sets the threshold to reject the null hypothesis
            that the distribution is the same.
            WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
            an incorrect kstest implementation and do not function
            properly.
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. The entry 'scipy' adds all
            known in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:

            data = N.random.normal(size=(1000, 1))
            matches = matchDistribution(
                data,
                distributions=['rdist',
                               ('rdist', {'name': 'rdist_fixed',
                                          'loc': 0.0,
                                          'args': (10,)})],
                nsamples=30, test='p-roc', p=0.05)
        """
        # handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if test not in _KNOWN_TESTS:
            raise ValueError('Unknown kind of test %s. Known are %s'
                             % (test, _KNOWN_TESTS))

        data = N.ravel(data)

        # subsample the data for the estimation if requested
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the '
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError("Provided data has no elements in non-"
                                 "parametric distribution under p<=%.3f. "
                                 "Please increase the size of data or the "
                                 "value of p" % p_thr)
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # expand 'scipy' into all distributions known to scipy.stats
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError(
                        "Tuple specification of distribution must be "
                        "(d, {params}). Got %s" % (d,))
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform the fitting
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)

                # fix the known parameters while fitting, if any were given
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_, loc=loc_,
                                             scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # compare the detection of significant values
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr)) \
                           * 1.0 / true_positives, 1
                    if __debug__: res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError,
                    NotImplementedError), e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [(D, dist_gen, dist_name, dist_params)]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s'
                          % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort results by the goodness of fit (ascending distance D)
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # report the best match and a few runners-up
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' \
                      % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults - 1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find suitable distribution')

        return results


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
821 """Plot best matching distributions
822
823 :Parameters:
824 data : N.ndarray
825 Data which was used to obtain the matches
826 matches : list of tuples
827 Sorted matches as provided by matchDistribution
828 nbins : int
829 Number of bins in the histogram
830 nbest : int
831 Number of top matches to plot
832 expand_tails : int
833 How many bins away to add to parametrized distributions
834 plots
835 legend : int
836 Either to provide legend and statistics in the legend.
837 1 -- just lists distributions.
838 2 -- adds distance measure
839 3 -- tp/fp/fn in the case if p is provided
840 plot_cdf : bool
841 Either to plot cdf for data using non-parametric distribution
842 p : float or None
843 If not None, visualize null-hypothesis testing (given p).
844 Bars in the histogram which fall under given p are colored
845 in red. False positives and false negatives are marked as
846 triangle up and down symbols correspondingly
847 tail : ('left', 'right', 'any', 'both')
848 If p is not None, the choise of tail for null-hypothesis
849 testing
850
851 :Returns: tuple(histogram, list of lines)
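
        Example (a sketch, assuming an interactive matplotlib session and
        data from which matches were computed via matchDistribution)::

            matches = matchDistribution(data, test='p-roc', p=0.05)
            hist, lines = plotDistributionMatches(data, matches,
                                                  nbest=3, p=0.05)
            P.show()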
852 """
        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's to evaluate the distributions at: expand the histogram
        # bin centers by a few bins into both tails
        x = hist[1]
        dx = x[expand_tails] - x[0]    # how much to expand the tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot the non-parametric cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr

        # color the significant bars in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # there is nothing to plot
            warning("No matching distributions were provided -- "
                    "nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1:
                label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # compare the detection of each sample against the
                # non-parametric estimate
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # mark false positives and false negatives on the pdf
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be default as in
    NullDist/MCNullDist
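
    For illustration (assuming ``from scipy import stats``)::

        autoNullDist(stats.norm)    # has .fit -> wrapped into MCNullDist
        autoNullDist(stats.norm())  # frozen -> wrapped into FixedNullDist
        autoNullDist(None)          # passed through unchanged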
962 """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)


def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis


def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean.
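
    Example (illustrative; plain `N.mean` would return nan here)::

        >>> nanmean(N.array([2.0, N.nan]))
        2.0
    """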
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis) * 1.0 / Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis) / factor
1004