1
2
3
4
5
6
7
8
9 """Estimator for classifier error distributions."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from mvpa.base import externals, warning
16 from mvpa.misc.state import ClassWithCollections, StateVariable
17
18 if __debug__:
19 from mvpa.base import debug
20
21
23 """Non-parametric 1d distribution -- derives cdf based on stored values.
24
25 Introduced to complement parametric distributions present in scipy.stats.
26 """
27
30
31
32 @staticmethod
33 - def fit(dist_samples):
35
36
38 """Returns the cdf value at `x`.
39 """
40 return N.vectorize(lambda v:(self._dist_samples <= v).mean())(x)
41
42
def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Derive p-value(s) for `x` from a cdf function and a tail choice.

    :Parameters:
      cdf_func : callable
        Function used to obtain cdf values for `x`
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        the tail is chosen per-value by comparing the cdf to p=0.5; for
        'any' significance is taken as in a one-tailed test.
      return_tails : bool
        If True, return a tuple (pvalues, tails) where `tails` holds
        True for values from the right tail and False for values from
        the left tail.
    """
    scalar_input = N.isscalar(x)
    if scalar_input:
        x = [x]

    p = cdf_func(x)

    # Optional sanity check: a proper cdf must stay within [0, 1]
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        p_min, p_max = N.min(p), N.max(p)
        if p_min < 0 or p_max > 1.0:
            suffix = (' for %s' % name) if name is not None else ''
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, suffix, p_min, p_max))

    # Guard against slight numerical excursions outside [0, 1]
    p = N.clip(p, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(p.shape, dtype=bool)
    elif tail == 'right':
        p = 1 - p
        if return_tails:
            right_tail = N.ones(p.shape, dtype=bool)
    elif tail in ('any', 'both'):
        # pick the closer tail per value; double for two-tailed testing
        right_tail = (p >= 0.5)
        p[right_tail] = 1.0 - p[right_tail]
        if tail == 'both':
            p *= 2

    # NaN inputs are reported as maximally insignificant
    p[N.isnan(x)] = 1.0

    result = p[0] if scalar_input else p

    if return_tails:
        return (result, right_tail)
    return result
99
100
102 """Base class for null-hypothesis testing.
103
104 """
105
106
107
108
109
110 _ATTRIBUTE_COLLECTIONS = ['states']
111
def __init__(self, tail='both', **kwargs):
    """Initialize the null distribution with a tail of interest.

    :Parameter:
      tail: str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        the tail is chosen per-value by comparing the cdf to p=0.5; for
        'any' significance is taken as in a one-tailed test.
    """
    ClassWithCollections.__init__(self, **kwargs)
    # validates the value and stores it in the private attribute
    self._setTail(tail)
125
127 return super(NullDist, self).__repr__(
128 prefixes=["tail=%s" % `self.__tail`] + prefixes)
129
130
132
133 if tail not in ('left', 'right', 'any', 'both'):
134 raise ValueError, 'Unknown value "%s" to `tail` argument.' \
135 % tail
136 self.__tail = tail
137
138
def fit(self, measure, wdata, vdata=None):
    """Fit the distribution to the data -- subclasses must implement this."""
    raise NotImplementedError
142
143
145 """Implementations return the value of the cumulative distribution
146 function (left or right tail dpending on the setting).
147 """
148 raise NotImplementedError
149
150
def p(self, x, **kwargs):
    """Return the p-value(s) for value(s) `x`.

    Whether the value is taken from the left, right, or closest tail is
    decided by the `tail` setting given at construction time.

    If the distribution was estimated with a
    `FeaturewiseDatasetMeasure`, an array is returned; `x` may then be
    either a scalar or an array of matching shape.
    """
    cdf_func = self.cdf
    return _pvalue(x, cdf_func, self.__tail, **kwargs)
161
162
163 tail = property(fget=lambda x:x.__tail, fset=_setTail)
164
165
167 """Null-hypothesis distribution is estimated from randomly permuted data labels.
168
169 The distribution is estimated by calling fit() with an appropriate
170 `DatasetMeasure` or `TransferError` instance and a training and a
171 validation dataset (in case of a `TransferError`). For a customizable
172 amount of cycles the training data labels are permuted and the
173 corresponding measure computed. In case of a `TransferError` this is the
174 error when predicting the *correct* labels of the validation dataset.
175
176 The distribution can be queried using the `cdf()` method, which can be
177 configured to report probabilities/frequencies from `left` or `right` tail,
178 i.e. fraction of the distribution that is lower or larger than some
179 critical value.
180
181 This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
182 returns an array of featurewise probabilities/frequencies.
183 """
184
185 _DEV_DOC = """
186 TODO automagically decide on the number of samples/permutations needed
187 Caution should be paid though since resultant distributions might be
188 quite far from some conventional ones (e.g. Normal) -- it is expected to
189 them to be bimodal (or actually multimodal) in many scenarios.
190 """
191
192 dist_samples = StateVariable(enabled=False,
193 doc='Samples obtained for each permutation')
194
196 """Initialize Monte-Carlo Permutation Null-hypothesis testing
197
198 :Parameters:
199 dist_class: class
200 This can be any class which provides parameters estimate
201 using `fit()` method to initialize the instance, and
202 provides `cdf(x)` method for estimating value of x in CDF.
203 All distributions from SciPy's 'stats' module can be used.
204 permutations: int
205 This many permutations of label will be performed to
206 determine the distribution under the null hypothesis.
207 """
208 NullDist.__init__(self, **kwargs)
209
210 self._dist_class = dist_class
211 self._dist = []
212
213 self.__permutations = permutations
214 """Number of permutations to compute the estimate the null
215 distribution."""
216
218 prefixes_ = ["permutations=%s" % self.__permutations]
219 if self._dist_class != Nonparametric:
220 prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
221 return super(MCNullDist, self).__repr__(
222 prefixes=prefixes_ + prefixes)
223
224
def fit(self, measure, wdata, vdata=None):
    """Estimate the null distribution by repeatedly permuting labels.

    :Parameters:
      measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
        Measure (or transfer error) computed for every permutation.
      wdata: `Dataset` whose labels get permuted and on which the
        measure/transfer error is computed multiple times.
      vdata: `Dataset` used for validation.
        If provided, `measure` is assumed to be a `TransferError` and
        both working and validation datasets are passed to it.
    """
    # arguments handed to the measure on every permutation cycle
    if vdata is not None:
        measure_args = [vdata, wdata]
    else:
        measure_args = [wdata]

    samples = []
    for perm in xrange(self.__permutations):
        # shuffle labels in-place (across chunks) and record the measure
        wdata.permuteLabels(True, perchunk=False)
        samples.append(measure(*measure_args))

    # restore the original labeling
    wdata.permuteLabels(False, perchunk=False)

    samples = N.asarray(samples)
    # expose raw permutation results through the state variable
    self.dist_samples = samples

    orig_shape = samples.shape
    # promote 1d results to a single-feature column
    if len(orig_shape) == 1:
        samples = samples[:, N.newaxis]

    # fit one distribution per feature: flatten all but the samples axis
    flat_samples = samples.reshape((orig_shape[0], -1))
    fitted = []
    for feature_samples in flat_samples.T:
        params = self._dist_class.fit(feature_samples)
        if __debug__ and 'STAT' in debug.active:
            debug('STAT', 'Estimated parameters for the %s are %s'
                  % (self._dist_class, str(params)))
        fitted.append(self._dist_class(*params))
    self._dist = fitted
290
291
293 """Return value of the cumulative distribution function at `x`.
294 """
295 if self._dist is None:
296
297
298
299 raise RuntimeError, "Distribution has to be fit first"
300
301 is_scalar = N.isscalar(x)
302 if is_scalar:
303 x = [x]
304
305 x = N.asanyarray(x)
306 xshape = x.shape
307
308 x = x.reshape((-1,))
309
310 if len(self._dist) != len(x):
311 raise ValueError, 'Distribution was fit for structure with %d' \
312 ' elements, whenever now queried with %d elements' \
313 % (len(self._dist), len(x))
314
315
316 cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
317 return N.array(cdfs).reshape(xshape)
318
319
321 """Clean stored distributions
322
323 Storing all of the distributions might be too expensive
324 (e.g. in case of Nonparametric), and the scope of the object
325 might be too broad to wait for it to be destroyed. Clean would
326 bind dist_samples to empty list to let gc revoke the memory.
327 """
328 self._dist = []
329
330
331
333 """Proxy/Adaptor class for SciPy distributions.
334
335 All distributions from SciPy's 'stats' module can be used with this class.
336
337 >>> import numpy as N
338 >>> from scipy import stats
339 >>> from mvpa.clfs.stats import FixedNullDist
340 >>>
341 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
342 >>> dist.p(2)
343 0.5
344 >>>
345 >>> dist.cdf(N.arange(5))
346 array([ 0.30853754, 0.40129367, 0.5 , 0.59870633, 0.69146246])
347 >>>
348 >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
349 >>> dist.p(N.arange(5))
350 array([ 0.69146246, 0.59870633, 0.5 , 0.40129367, 0.30853754])
351 """
353 """
354 :Parameter:
355 dist: distribution object
356 This can be any object the has a `cdf()` method to report the
357 cumulative distribition function values.
358 """
359 NullDist.__init__(self, **kwargs)
360
361 self._dist = dist
362
363
def fit(self, measure, wdata, vdata=None):
    """No-op: the wrapped distribution is fixed, nothing to fit."""
    pass
367
368
370 """Return value of the cumulative distribution function at `x`.
371 """
372 return self._dist.cdf(x)
373
374
376 prefixes_ = ["dist=%s" % `self._dist`]
377 return super(FixedNullDist, self).__repr__(
378 prefixes=prefixes_ + prefixes)
379
380
382 """Adaptive distribution which adjusts parameters according to the data
383
384 WiP: internal implementation might change
385 """
def fit(self, measure, wdata, vdata=None):
    """Adapt the wrapped distribution to the dimensionality of the measure.

    Determines the number of features from the measure itself or, failing
    that, from the shape of the working dataset, and lets `_adapt` compute
    the concrete distribution parameters.
    """
    # assumes measure.feature_ids raises ValueError when not
    # available -- TODO confirm against measure implementations
    try:
        nfeatures = len(measure.feature_ids)
    except ValueError:
        nfeatures = N.prod(wdata.shape[1:])

    # unwrap a frozen distribution to reach the generator with .fit
    generator = self._dist
    if not hasattr(generator, 'fit'):
        generator = generator.dist

    args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
    if __debug__:
        debug('STAT', 'Adapted parameters for %s to be %s, %s'
              % (generator, args, kwargs))
    self._dist = generator(*args, **kwargs)
404
405
def _adapt(self, nfeatures, measure, wdata, vdata=None):
    """Return (args, kwargs) for the distribution -- subclass responsibility."""
    raise NotImplementedError
408
409
411 """Adaptive rdist: params are (nfeatures-1, 0, 1)
412 """
413
414 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
416
417
418
419
420
422 cdf_ = self._dist.cdf(x)
423 bad_values = N.where(N.abs(cdf_)>1)
424
425
426
427 if len(bad_values[0]):
428
429
430 cdf_bad = cdf_[bad_values]
431 x_bad = x[bad_values]
432 cdf_bad[x_bad<0] = 0.0
433 cdf_bad[x_bad>=0] = 1.0
434 cdf_[bad_values] = cdf_bad
435 return cdf_
436
437
439 """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
440 """
441
442 - def _adapt(self, nfeatures, measure, wdata, vdata=None):
444
445
446 if externals.exists('scipy'):
447 from mvpa.support.stats import scipy
448 from scipy.stats import kstest
449 """
450 Thoughts:
451
452 So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
453 check/reject H0 that samples come from a given distribution. But
454 since it is based on a full range of data, we might better of with
455 some ad-hoc checking by the detection power of the values in the
456 tail of a tentative distribution.
457
458 """
459
460
461
462
463
464
465 import scipy
466
468 """Helper proxy-class to fit distribution when some parameters are known
469
470 It is an ugly hack with snippets of code taken from scipy, which is
471 Copyright (c) 2001, 2002 Enthought, Inc.
472 and is distributed under BSD license
473 http://www.scipy.org/License_Compatibility
474 """
475
def __init__(self, dist, loc=None, scale=None, args=None):
    """Wrap `dist`, pinning whichever of loc/scale/shape args are non-None.

    :Parameters:
      dist : scipy distribution generator to wrap
      loc, scale : float or None
        Fixed location/scale, or None to leave them free for fitting
      args : tuple or None
        Shape arguments; None entries remain free for fitting
    """
    self._dist = dist
    loc_scale = (loc, scale)

    nargs_dist = dist.numargs
    if args is None:
        # all shape arguments remain free
        args = (None,) * nargs_dist
    else:
        if len(args) > nargs_dist:
            raise ValueError(
                'Distribution %s has only %d arguments. Got %d'
                % (dist, nargs_dist, len(args)))
        # pad with None so every shape argument has an entry
        args += (None,) * (nargs_dist - len(args))

    # indices of the shape arguments which are still free
    free_i = [i for i, v in enumerate(args) if v is None]
    # arguments which should get some fixed value
    self._fargs = (list(args + loc_scale), free_i)
496
497
def nnlf(self, theta, x):
    """Negative log-likelihood of `x` under parameters `theta`.

    Mirrors scipy's nnlf but substitutes the fixed parameters stored in
    `self._fargs`; `theta` carries only the free ones
    (free shape args, then loc, then scale -- trailing entries first).
    """
    fargs, fargs_i = self._fargs
    try:
        # walk theta from its tail: scale, then loc, unless fixed.
        # NOTE(review): decrement order reconstructed from the scipy
        # nnlf algorithm this copies -- confirm against upstream.
        pos = -1
        if fargs[-1] is not None:
            scale = fargs[-1]
        else:
            scale = theta[pos]
            pos -= 1

        if fargs[-2] is not None:
            loc = fargs[-2]
        else:
            loc = theta[pos]
            pos -= 1

        free_shape = theta[:pos + 1]

        # substitute the free shape values into their slots
        for idx, val in zip(fargs_i, free_shape):
            fargs[idx] = val
        args = fargs[:-2]

    except IndexError:
        raise ValueError("Not enough input arguments.")
    if not self._argcheck(*args) or scale <= 0:
        return N.inf
    x = N.asarray((x - loc) / scale)
    # values outside the distribution support yield infinite nnlf
    cond0 = (x <= self.a) | (x >= self.b)
    if N.any(cond0):
        return N.inf
    else:
        return self._nnlf(x, *args) + len(x) * N.log(scale)
533
def fit(self, data, *args, **kwds):
    """Fit only the free parameters; fixed ones are substituted back.

    Returns the full parameter vector (shape args, loc, scale), mixing
    optimized and fixed values, like scipy's `fit`.
    """
    loc0 = kwds.get('loc', 0.0)
    scale0 = kwds.get('scale', 1.0)
    fargs, fargs_i = self._fargs

    nargs_ = self.numargs
    if len(args) != nargs_:
        if len(args) > nargs_:
            raise ValueError("Too many input arguments.")
        # pad missing starting values for shape parameters
        args += (1.0,) * (nargs_ - len(args))

    # starting point: only the still-free shape parameters
    if len(fargs_i) != nargs_:
        x0 = tuple([args[i] for i in fargs_i])
    else:
        x0 = args
    if fargs[-2] is None:
        x0 = x0 + (loc0,)
    if fargs[-1] is None:
        x0 = x0 + (scale0,)

    opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)

    # recover loc/scale (optimized or fixed) from the tail of opt_x
    pos = 0
    loc, scale = fargs[-2:]
    if fargs[-1] is None:
        pos -= 1
        scale = opt_x[pos]
    if fargs[-2] is None:
        pos -= 1
        loc = opt_x[pos]

    # substitute optimized free shape values into their slots
    for i in fargs_i:
        fargs[i] = opt_x[i]

    # assemble the full parameter vector
    return N.hstack((fargs[:-2], (loc, scale)))
573
574
576 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
577 self._dist.__setattr__(a, v)
578 else:
579 object.__setattr__(self, a, v)
580
581
583 """We need to redirect all queries correspondingly
584 """
585 if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
586 return getattr(self._dist, a)
587 else:
588 return object.__getattribute__(self, a)
589
590
591
592 - def matchDistribution(data, nsamples=None, loc=None, scale=None,
593 args=None, test='kstest', distributions=None,
594 **kwargs):
595 """Determine best matching distribution.
596
597 Can be used for 'smelling' the data, as well to choose a
598 parametric distribution for data obtained from non-parametric
599 testing (e.g. `MCNullDist`).
600
601 WiP: use with caution, API might change
602
603 :Parameters:
604 data : N.ndarray
605 Array of the data for which to deduce the distribution. It has
606 to be sufficiently large to make a reliable conclusion
607 nsamples : int or None
608 If None -- use all samples in data to estimate parametric
609 distribution. Otherwise use only specified number randomly selected
610 from data.
611 loc : float or None
612 Loc for the distribution (if known)
613 scale : float or None
614 Scale for the distribution (if known)
615 test : basestring
616 What kind of testing to do. Choices:
617 'p-roc' : detection power for a given ROC. Needs two
618 parameters: `p=0.05` and `tail='both'`
619 'kstest' : 'full-body' distribution comparison. The best
620 choice is made by minimal reported distance after estimating
621 parameters of the distribution. Parameter `p=0.05` sets
622 threshold to reject null-hypothesis that distribution is the
623 same.
624 WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
625 incorrect kstest implementation and do not function
626 properly
627 distributions : None or list of basestring or tuple(basestring, dict)
628 Distributions to check. If None, all known in scipy.stats
629 are tested. If distribution is specified as a tuple, then
630 it must contain name and additional parameters (name, loc,
631 scale, args) in the dictionary. Entry 'scipy' adds all known
632 in scipy.stats.
633 **kwargs
634 Additional arguments which are needed for each particular test
635 (see above)
636
637 :Example:
638 data = N.random.normal(size=(1000,1));
639 matches = matchDistribution(
640 data,
641 distributions=['rdist',
642 ('rdist', {'name':'rdist_fixed',
643 'loc': 0.0,
644 'args': (10,)})],
645 nsamples=30, test='p-roc', p=0.05)
646 """
647
648
649 _KNOWN_TESTS = ['p-roc', 'kstest']
650 if not test in _KNOWN_TESTS:
651 raise ValueError, 'Unknown kind of test %s. Known are %s' \
652 % (test, _KNOWN_TESTS)
653
654 data = N.ravel(data)
655
656 if nsamples is not None:
657 if __debug__:
658 debug('STAT', 'Sampling %d samples from data for the ' \
659 'estimation of the distributions parameters' % nsamples)
660 indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
661 data_selected = data[indexes_selected]
662 else:
663 indexes_selected = N.arange(len(data))
664 data_selected = data
665
666 p_thr = kwargs.get('p', 0.05)
667 if test == 'p-roc':
668 tail = kwargs.get('tail', 'both')
669 data_p = _pvalue(data, Nonparametric(data).cdf, tail)
670 data_p_thr = N.abs(data_p) <= p_thr
671 true_positives = N.sum(data_p_thr)
672 if true_positives == 0:
673 raise ValueError, "Provided data has no elements in non-" \
674 "parametric distribution under p<=%.3f. Please " \
675 "increase the size of data or value of p" % p
676 if __debug__:
677 debug('STAT_', 'Number of positives in non-parametric '
678 'distribution is %d' % true_positives)
679
680 if distributions is None:
681 distributions = ['scipy']
682
683
684 try:
685 scipy_ind = distributions.index('scipy')
686 distributions.pop(scipy_ind)
687 distributions += scipy.stats.distributions.__all__
688 except ValueError:
689 pass
690
691 results = []
692 for d in distributions:
693 dist_gen, loc_, scale_, args_ = None, loc, scale, args
694 if isinstance(d, basestring):
695 dist_gen = d
696 dist_name = d
697 elif isinstance(d, tuple):
698 if not (len(d)==2 and isinstance(d[1], dict)):
699 raise ValueError,\
700 "Tuple specification of distribution must be " \
701 "(d, {params}). Got %s" % (d,)
702 dist_gen = d[0]
703 loc_ = d[1].get('loc', loc)
704 scale_ = d[1].get('scale', scale)
705 args_ = d[1].get('args', args)
706 dist_name = d[1].get('name', str(dist_gen))
707 else:
708 dist_gen = d
709 dist_name = str(d)
710
711
712 try:
713 dist_gen_ = getattr(scipy.stats, dist_gen)
714
715
716 if args_ is not None or loc_ is not None or scale_ is not None:
717 dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
718 else:
719 dist_opt = dist_gen_
720 dist_params = dist_opt.fit(data_selected)
721 if __debug__:
722 debug('STAT__',
723 'Got distribution parameters %s for %s'
724 % (dist_params, dist_name))
725 if test == 'p-roc':
726 cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
727
728 cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
729 cdf_p_thr = cdf_p <= p_thr
730 D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
731 if __debug__: res_sum = 'D=%.2f' % D
732 elif test == 'kstest':
733 D, p = kstest(data, d, args=dist_params)
734 if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
735 except (TypeError, ValueError, AttributeError,
736 NotImplementedError), e:
737 if __debug__:
738 debug('STAT__',
739 'Testing for %s distribution failed due to %s'
740 % (d, str(e)))
741 continue
742
743 if p > p_thr and not N.isnan(D):
744 results += [ (D, dist_gen, dist_name, dist_params) ]
745 if __debug__:
746 debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
747 else:
748 if __debug__:
749 debug('STAT__', 'Cannot consider %s dist. with %s'
750 % (d, res_sum))
751 continue
752
753
754 results.sort()
755
756 if __debug__ and 'STAT' in debug.active:
757
758 nresults = len(results)
759 sresult = lambda r:'%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
760 if nresults>0:
761 nnextbest = min(2, nresults-1)
762 snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
763 debug('STAT', 'Best distribution %s. Next best: %s'
764 % (sresult(results[0]), snextbest))
765 else:
766 debug('STAT', 'Could not find suitable distribution')
767
768
769 return results
770
771
if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to parametrized distributions plots
          legend : int
            Either to provide legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds distance measure
            3 -- tp/fp/fn in the case if p is provided
          plot_cdf : bool
            Either to plot cdf for data using non-parametric distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under given p are colored
            in red. False positives and false negatives are marked as
            triangle up and down symbols correspondingly
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choise of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """
        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # extend the x range beyond the histogram by expand_tails bins
        x = hist[1]
        dx = x[expand_tails] - x[0]
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # empirical cdf as a dashed black line
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr

        # color significant histogram bars red, others white
        for is_sig, patch in zip(x_p_thr[expand_tails:], hist[2]):
            patch.set_facecolor(('w', 'r')[int(is_sig)])

        if not len(matches):
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines, labels = [], []
        for rank in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[rank]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1:
                label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # compare thresholding decisions against non-parametric ones
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d)' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            # dotted cdf in the same color
            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            if p is not None:
                # mark false positives/negatives along the x grid
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)
899
900
901
902
903
904
905
906
907
908
909
910
911
912
914 """Cheater for human beings -- wraps `dist` if needed with some
915 NullDist
916
917 tail and other arguments are assumed to be default as in
918 NullDist/MCNullDist
919 """
920 if dist is None or isinstance(dist, NullDist):
921 return dist
922 elif hasattr(dist, 'fit'):
923 if __debug__:
924 debug('STAT', 'Wrapping %s into MCNullDist' % dist)
925 return MCNullDist(dist)
926 else:
927 if __debug__:
928 debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
929 return FixedNullDist(dist)
930
931
932
934 if axis is None:
935 a = N.ravel(a)
936 outaxis = 0
937 else:
938 a = N.asarray(a)
939 outaxis = axis
940 return a, outaxis
941
943 """Compute the mean over the given axis ignoring nans.
944
945 :Parameters:
946 x : ndarray
947 input array
948 axis : int
949 axis along which the mean is computed.
950
951 :Results:
952 m : float
953 the mean."""
954 x, axis = _chk_asarray(x,axis)
955 x = x.copy()
956 Norig = x.shape[axis]
957 factor = 1.0-N.sum(N.isnan(x),axis)*1.0/Norig
958
959 x[N.isnan(x)] = 0
960 return N.mean(x,axis)/factor
961