9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import Stateful, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
26 """
27
30
31
32 @staticmethod
33 - def fit(dist_samples):
35
36
38 """Returns the cdf value at `x`.
39 """
40 return N.vectorize(lambda v:(self._dist_samples <= v).mean())(x)
41
42
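

# Illustrative sketch (not part of the original module): Nonparametric
# derives an empirical cdf directly from stored samples, so for symmetric,
# zero-centered samples cdf(0.0) should come out near 0.5. The helper name
# `_demo_nonparametric` is hypothetical.
def _demo_nonparametric():
    samples = N.random.normal(size=200)
    dist = Nonparametric(samples)
    # fraction of stored samples <= each query value
    return dist.cdf(N.array([-1.0, 0.0, 1.0]))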


def _pvalue(x, cdf_func, tail, return_tails=False):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      x : scalar or ndarray
        Value(s) for which to compute the p-value(s).
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, return a tuple (pvalues, tails), where tails contains
        1s if the value was from the right tail, and 0s if it was from
        the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            warning('Stability check of cdf %s failed. Min=%s, max=%s' % \
                    (cdf_func, cdf_min, cdf_max))

    # no escape but to assure that cdf stays in the range [0, 1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # we need to halve the significance
            cdf *= 2

    # assure that NaN values do not come out significant
    cdf[N.isnan(x)] = 1.0
    if is_scalar:
        res = cdf[0]
    else:
        res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res
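

# Illustrative sketch (not part of the original module): _pvalue with a
# known parametric cdf. With tail='right', large values of x yield small
# p-values. The helper name `_demo_pvalue` is hypothetical.
def _demo_pvalue():
    if externals.exists('scipy'):
        from scipy.stats import norm
        # roughly [0.977, 0.5, 0.023] under N(0, 1)
        return _pvalue(N.array([-2.0, 0.0, 2.0]), norm(0, 1).cdf, 'right')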
101 """Base class for null-hypothesis testing.
102
103 """
104
105
106
107
108
109 _ATTRIBUTE_COLLECTIONS = ['states']
110
111 - def __init__(self, tail='both', **kwargs):
112 """Cheap initialization.
113
114 :Parameter:
115 tail: str ('left', 'right', 'any', 'both')
116 Which tail of the distribution to report. For 'any' and 'both'
117 it chooses the tail it belongs to based on the comparison to
118 p=0.5. In the case of 'any' significance is taken like in a
119 one-tailed test.
120 """
121 Stateful.__init__(self, **kwargs)
122
123 self._setTail(tail)
124
125
127
128 if tail not in ('left', 'right', 'any', 'both'):
129 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
130 % tail
131 self.__tail = tail
132
133
134 - def fit(self, measure, wdata, vdata=None):
135 """Implement to fit the distribution to the data."""
136 raise NotImplementedError
137
138
140 """Implementations return the value of the cumulative distribution
141 function (left or right tail dpending on the setting).
142 """
143 raise NotImplementedError
144
145
146 - def p(self, x, **kwargs):
147 """Returns the p-value for values of `x`.
148 Returned values are determined left, right, or from any tail
149 depending on the constructor setting.
150
151 In case a `FeaturewiseDatasetMeasure` was used to estimate the
152 distribution the method returns an array. In that case `x` can be
153 a scalar value or an array of a matching shape.
154 """
155 return _pvalue(x, self.cdf, self.__tail, **kwargs)
156
157
158 tail = property(fget=lambda x:x.__tail, fset=_setTail)
159
162 """Null-hypothesis distribution is estimated from randomly permuted data labels.
163
164 The distribution is estimated by calling fit() with an appropriate
165 `DatasetMeasure` or `TransferError` instance and a training and a
166 validation dataset (in case of a `TransferError`). For a customizable
167 amount of cycles the training data labels are permuted and the
168 corresponding measure computed. In case of a `TransferError` this is the
169 error when predicting the *correct* labels of the validation dataset.
170
171 The distribution can be queried using the `cdf()` method, which can be
172 configured to report probabilities/frequencies from `left` or `right` tail,
173 i.e. fraction of the distribution that is lower or larger than some
174 critical value.
175
176 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
177 returns an array of featurewise probabilities/frequencies.
178 """
179
180 _DEV_DOC = """
181 TODO automagically decide on the number of samples/permutations needed
182 Caution should be paid though since resultant distributions might be
183 quite far from some conventional ones (e.g. Normal) -- it is expected to
184 them to be bimodal (or actually multimodal) in many scenarios.
185 """
186
187 dist_samples = StateVariable(enabled=False,
188 doc='Samples obtained for each permutation')
189
191 """Initialize Monte-Carlo Permutation Null-hypothesis testing
192
193 :Parameter:
194 dist_class: class
195 This can be any class which provides parameters estimate
196 using `fit()` method to initialize the instance, and
197 provides `cdf(x)` method for estimating value of x in CDF.
198 All distributions from SciPy's 'stats' module can be used.
199 permutations: int
200 This many permutations of label will be performed to
201 determine the distribution under the null hypothesis.
202 """
203 NullDist.__init__(self, **kwargs)
204
205 self._dist_class = dist_class
206 self._dist = []
207
208 self.__permutations = permutations
209 """Number of permutations to compute the estimate the null
210 distribution."""
211
212
213


    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute labels in the training dataset.

        :Parameters:
          measure : (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata : `Dataset` which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata : `Dataset` used for validation.
            If provided, measure is assumed to be a `TransferError` and
            the working and validation datasets are passed on to it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # decide on the arguments to the measure
        if not vdata is None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate the null-distribution
        for p in xrange(self.__permutations):
            # new permutation on each cycle: permute the labels of the
            # training data only, without preserving the distribution
            # of labels per chunk
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation,
            # assuming it returns a scalar or an array
            dist_samples.append(measure(*measure_args))

        # restore the original labels
        wdata.permuteLabels(False, perchunk=False)

        # store the samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit a distribution per each element

        # decide whether the measure returned scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if there is just 1 dimension, the measure was a scalar --
        # create an artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element
        # XXX could be more elegant, e.g. by using N.vectorize
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        # self._dist is set to an empty list by __init__ and clean(),
        # so check for both conditions
        if self._dist is None or not len(self._dist):
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure that x is a 1D array for the query
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
        return N.array(cdfs).reshape(xshape)


    def clean(self):
        """Clean stored distributions.

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean binds
        dist_samples to an empty list to let gc reclaim the memory.
        """
        self._dist = []
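

# Illustrative sketch (not part of the original module), assuming a PyMVPA
# `Dataset` `ds`, a `TransferError` `terr` and a validation dataset `vds`:
#
#     null = MCNullDist(permutations=200, tail='left')
#     null.fit(terr, ds, vds)      # permutes the labels of `ds` 200 times
#     p = null.p(observed_error)   # p-value of the actually observed error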
318
322 """Proxy/Adaptor class for SciPy distributions.
323
324 All distributions from SciPy's 'stats' module can be used with this class.
325
326 >>> import numpy as N
327 >>> from scipy import stats
328 >>> from mvpa.clfs.stats import FixedNullDist
329 >>>
330 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
331 >>> dist.p(2)
332 0.5
333 >>>
334 >>> dist.cdf(N.arange(5))
335 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
336 >>>
337 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
338 >>> dist.p(N.arange(5))
339 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
340 """
342 """
343 :Parameter:
344 dist: distribution object
345 This can be any object the has a `cdf()` method to report the
346 cumulative distribition function values.
347 """
348 NullDist.__init__(self, **kwargs)
349
350 self._dist = dist
351
352
353 - def fit(self, measure, wdata, vdata=None):
354 """Does nothing since the distribution is already fixed."""
355 pass
356
357
359 """Return value of the cumulative distribution function at `x`.
360 """
361 return self._dist.cdf(x)
362
365 """Adaptive distribution which adjusts parameters according to the data
366
367 WiP: internal implementation might change
368 """
369 - def fit(self, measure, wdata, vdata=None):
370 """Cares about dimensionality of the feature space in measure
371 """
372
373 try:
374 nfeatures = len(measure.feature_ids)
375 except ValueError:
376 nfeatures = N.prod(wdata.shape[1:])
377
378 dist_gen = self._dist
379 if not hasattr(dist_gen, 'fit'):
380 dist_gen = dist_gen.dist
381
382 args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
383 if __debug__:
384 debug('STAT', 'Adapted parameters for %s to be %s, %s'
385 % (dist_gen, args, kwargs))
386 self._dist = dist_gen(*args, **kwargs)
387
388
389 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
390 raise NotImplementedError
391
394 """Adaptive rdist: params are (nfeatures-1, 0, 1)
395 """
396
397 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
399
400
401
402
403
405 cdf_ = self._dist.cdf(x)
406 bad_values = N.where(N.abs(cdf_)>1)
407
408
409
410 if len(bad_values[0]):
411
412
413 cdf_bad = cdf_[bad_values]
414 x_bad = x[bad_values]
415 cdf_bad[x_bad<0] = 0.0
416 cdf_bad[x_bad>=0] = 1.0
417 cdf_[bad_values] = cdf_bad
418 return cdf_
419
422 """Adaptive rdist: params are (0, sqrt(1/nfeatures))
423 """
424
425 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
427
428


if externals.exists('scipy'):
    import scipy.stats
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking by the detection power of the values in
    the tail of a tentative distribution.

    """

    # We need a proxy class to fit a distribution while keeping some of
    # its parameters fixed
    import scipy
451 """Helper proxy-class to fit distribution when some parameters are known
452
453 It is an ugly hack with snippets of code taken from scipy, which is
454 Copyright (c) 2001, 2002 Enthought, Inc.
455 and is distributed under BSD license
456 http://www.scipy.org/License_Compatibility
457 """
458
459 - def __init__(self, dist, loc=None, scale=None, args=None):
460
461 self._dist = dist
462
463 theta = (loc, scale)
464
465 Narg_ = dist.numargs
466 if args is not None:
467 Narg = len(args)
468 if Narg > Narg_:
469 raise ValueError, \
470 'Distribution %s has only %d arguments. Got %d' \
471 % (dist, Narg_, Narg)
472 args += (None,) * (Narg_ - Narg)
473 else:
474 args = (None,) * Narg_
475
476 args_i = [i for i,v in enumerate(args) if v is None]
477 self._fargs = (list(args+theta), args_i)
478 """Arguments which should get some fixed value"""
479
480


        def nnlf(self, theta, x):
            # -sum(log pdf(x, theta), axis=0), where theta are the
            # parameters (including loc and scale)
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust the args for the fixed ones
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if N.any(cond0):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)


        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,) * (self.numargs - Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if they are not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None:
                x0 = x0 + (loc0,)
            if fargs[-1] is None:
                x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0,
                                        args=(N.ravel(data),), disp=0)

            # reconstruct the full set of parameters: pick up optimized
            # loc and scale if they were not fixed
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign the values which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            # and compose the final list of parameters
            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x


        def __setattr__(self, a, v):
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)


        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)
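

    # Illustrative sketch (not part of the original module): fit a normal
    # distribution while keeping loc fixed at 0, so that only the scale
    # parameter gets optimized. The helper name `_demo_rv_semifrozen` is
    # hypothetical.
    def _demo_rv_semifrozen():
        data = N.random.normal(scale=3.0, size=500)
        frozen = rv_semifrozen(scipy.stats.norm, loc=0)
        # returns the full parameter set, here (loc=0, scale~3)
        return frozen.fit(data)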


    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to allow a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by the minimal reported distance after
               estimating the parameters of the distribution. The parameter
               `p=0.05` sets the threshold to reject the null-hypothesis
               that the distribution is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
               an incorrect kstest implementation and do not function
               properly.
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and the additional parameters (name,
            loc, scale, args) in the dictionary. The entry 'scipy' adds
            all known in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
            data,
            distributions=['rdist',
                           ('rdist', {'name': 'rdist_fixed',
                                      'loc': 0.0,
                                      'args': (10,)})],
            nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if not test in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # select samples to be used for the parameter estimation
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in the non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in the non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # check if the 'scipy' entry was in there -- it stands for all
        # distributions known to scipy.stats
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError, \
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform the fitting
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)

                # choose the distribution 'optimizer': if loc, scale or
                # args were provided, use the home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_,
                                             loc=loc_, scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # compare the detection under the given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
                    if __debug__:
                        res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__:
                        res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError), e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [ (D, dist_gen, dist_name, dist_params) ]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s'
                          % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort the results so that the best matching distribution
        # (smallest distance) comes first
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults-1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # and return all the results
        return results
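

    # Illustrative sketch (not part of the original module): smell-test
    # normally distributed data against two candidate distributions using
    # the 'p-roc' criterion. The helper name `_demo_matchDistribution` is
    # hypothetical.
    def _demo_matchDistribution():
        data = N.random.normal(size=1000)
        return matchDistribution(data, nsamples=30,
                                 distributions=['norm', 'uniform'],
                                 test='p-roc', p=0.05)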


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot the best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to extend the plots of the parametrized
            distributions
          legend : int
            Whether to provide a legend and statistics in the legend.
            1 -- just lists the distributions.
            2 -- adds the distance measure.
            3 -- adds tp/fp/fn in the case that p is provided.
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            red. False positives and false negatives are marked as
            triangle-up and triangle-down symbols, correspondingly.
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's: expand the tails of the histogram's bin centers
        x = hist[1]
        dx = x[expand_tails] - x[0]
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot the non-parametric cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr

        # color the bars which pass the thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # there is nothing to plot
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1:
                label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # compare the detection to the non-parametric one
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn by the values in the tail of the
            # tentative distribution instead of p-values

            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # mark false positives and false negatives
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)
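

    # Illustrative sketch (not part of the original module), assuming the
    # matches came from matchDistribution above:
    #
    #     matches = matchDistribution(data, test='p-roc', p=0.05)
    #     hist, lines = plotDistributionMatches(data, matches, nbest=3,
    #                                           p=0.05, tail='both')
    #     P.show()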


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be default as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)
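

# Illustrative sketch (not part of the original module): autoNullDist wraps
# a frozen distribution (no fit method) into FixedNullDist, and a
# distribution generator (with a fit method) into MCNullDist. The helper
# name `_demo_autoNullDist` is hypothetical.
def _demo_autoNullDist():
    if externals.exists('scipy'):
        import scipy.stats
        fixed = autoNullDist(scipy.stats.norm(0, 1))  # -> FixedNullDist
        mc = autoNullDist(scipy.stats.norm)           # -> MCNullDist
        return fixed, mc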


def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis
925 """Compute the mean over the given axis ignoring nans.
926
927 :Parameters:
928 x : ndarray
929 input array
930 axis : int
931 axis along which the mean is computed.
932
933 :Results:
934 m : float
935 the mean."""
936 x, axis = _chk_asarray(x,axis)
937 x = x.copy()
938 Norig = x.shape[axis]
939 factor = 1.0-N.sum(N.isnan(x),axis)*1.0/Norig
940
941 x[N.isnan(x)] = 0
942 return N.mean(x,axis)/factor
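

# Illustrative sketch (not part of the original module): nanmean ignores
# NaNs when averaging, whereas N.mean would propagate them. The helper name
# `_demo_nanmean` is hypothetical.
def _demo_nanmean():
    x = N.array([1.0, N.nan, 3.0])
    return nanmean(x)   # gives 2.0, while N.mean(x) would yield nan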
943