1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA basic Classifiers"""
10
11 from mvpa.support.copy import deepcopy
12 from mvpa.base import externals
13
14 from mvpa.datasets import Dataset
15 from mvpa.mappers.mask import MaskMapper
16 from mvpa.datasets.splitters import NFoldSplitter, OddEvenSplitter
17
18 from mvpa.misc.exceptions import UnknownStateError
19
20 from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
21 from mvpa.clfs.meta import CombinedClassifier, \
22 BinaryClassifier, MulticlassClassifier, \
23 SplitClassifier, MappedClassifier, FeatureSelectionClassifier, \
24 TreeClassifier
25 from mvpa.clfs.transerror import TransferError
26 from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
27
28 from tests_warehouse import *
29 from tests_warehouse_clfs import *
30
31
32
33
34 _degenerate_allowed_exceptions = [DegenerateInputError, FailedToTrainError]
35 if externals.exists('rpy'):
36 import rpy
37 _degenerate_allowed_exceptions += [rpy.RPyRException]
41
43 self.clf_sign = SameSignClassifier()
44 self.clf_less1 = Less1Classifier()
45
46
47 self.data_bin_1 = Dataset(
48 samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
49 labels=[1, 1, 1, -1, -1],
50 chunks=[0, 1, 2, 2, 3])
51
79
80
82
83
84 bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
85 self.clf_sign.clone()])
86
87 self.failUnlessEqual(list(bclf.predict(self.data_bin_1.samples)),
88 list(self.data_bin_1.labels),
89 msg="Boosted classifier should work")
90 self.failUnlessEqual(bclf.predict(self.data_bin_1.samples),
91 self.clf_sign.predict(self.data_bin_1.samples),
92 msg="Boosted classifier should have the same as regular")
93
94
114
115
116
118 ds = Dataset(samples=[ [0,0], [0,1], [1,100], [-1,0], [-1,-3], [ 0,-10] ],
119 labels=[ 'sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
120 testdata = [ [0,0], [10,10], [-10, -1], [0.1, -0.1], [-0.2, 0.2] ]
121
122
123 clf = SameSignClassifier()
124
125
126 bclf1 = BinaryClassifier(clf=clf,
127 poslabels=['sp', 'sn'],
128 neglabels=['dp', 'dn'])
129
130 orig_labels = ds.labels[:]
131 bclf1.train(ds)
132
133 self.failUnless(bclf1.predict(testdata) ==
134 [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
135 ['dn', 'dp'], ['dn', 'dp']])
136
137 self.failUnless((ds.labels == orig_labels).all(),
138 msg="BinaryClassifier should not alter labels")
139
140
141 @sweepargs(clf=clfswh['binary'])
150
151
152 @sweepargs(clf=clfswh[:] + regrswh[:])
154 """Basic testing of the clf summary
155 """
156 summary1 = clf.summary()
157 self.failUnless('not yet trained' in summary1)
158 clf.train(datasets['uni2small'])
159 summary = clf.summary()
160
161 self.failUnless(len(summary) > len(summary1))
162 self.failUnless(not 'not yet trained' in summary)
163
164
165 @sweepargs(clf=clfswh[:] + regrswh[:])
204
205
206
208 ds = self.data_bin_1
209 clf = SplitClassifier(clf=SameSignClassifier(),
210 splitter=NFoldSplitter(1),
211 enable_states=['confusion', 'training_confusion',
212 'feature_ids'])
213 clf.train(ds)
214 error = clf.confusion.error
215 tr_error = clf.training_confusion.error
216
217 clf2 = clf.clone()
218 cv = CrossValidatedTransferError(
219 TransferError(clf2),
220 NFoldSplitter(),
221 enable_states=['confusion', 'training_confusion'])
222 cverror = cv(ds)
223 tr_cverror = cv.training_confusion.error
224
225 self.failUnlessEqual(error, cverror,
226 msg="We should get the same error using split classifier as"
227 " using CrossValidatedTransferError. Got %s and %s"
228 % (error, cverror))
229
230 self.failUnlessEqual(tr_error, tr_cverror,
231 msg="We should get the same training error using split classifier as"
232 " using CrossValidatedTransferError. Got %s and %s"
233 % (tr_error, tr_cverror))
234
235 self.failUnlessEqual(clf.confusion.percentCorrect,
236 100,
237 msg="Dummy clf should train perfectly")
238 self.failUnlessEqual(len(clf.confusion.sets),
239 len(ds.uniquechunks),
240 msg="Should have 1 confusion per each split")
241 self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
242 msg="Should have number of classifiers equal # of epochs")
243 self.failUnlessEqual(clf.predict(ds.samples), list(ds.labels),
244 msg="Should classify correctly")
245
246
247
248
249
250
251
252
253
254
255
256
257
258 summary = clf.summary()
259
260
261 @sweepargs(clf_=clfswh['binary', '!meta'])
263 clf2 = clf_.clone()
264 ds = datasets['uni2medium']
265 clf = SplitClassifier(clf=clf_,
266 splitter=NFoldSplitter(1),
267 enable_states=['confusion', 'feature_ids'])
268 clf.train(ds)
269 error = clf.confusion.error
270
271 cv = CrossValidatedTransferError(
272 TransferError(clf2),
273 NFoldSplitter(),
274 enable_states=['confusion', 'training_confusion'])
275 cverror = cv(ds)
276
277 self.failUnless(abs(error-cverror)<0.01,
278 msg="We should get the same error using split classifier as"
279 " using CrossValidatedTransferError. Got %s and %s"
280 % (error, cverror))
281
282 if cfg.getboolean('tests', 'labile', default='yes'):
283 self.failUnless(error < 0.25,
284 msg="clf should generalize more or less fine. "
285 "Got error %s" % error)
286 self.failUnlessEqual(len(clf.confusion.sets), len(ds.uniquechunks),
287 msg="Should have 1 confusion per each split")
288 self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
289 msg="Should have number of classifiers equal # of epochs")
290
291
292
293
294
312
313
315 samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
316 testdata3 = Dataset(samples=samples, labels=1)
317 res110 = [1, 1, 1, -1, -1]
318 res101 = [-1, 1, -1, -1, 1]
319 res011 = [-1, 1, -1, 1, -1]
320
321 clf110 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,1,0])))
322 clf101 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,0,1])))
323 clf011 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([0,1,1])))
324
325 self.failUnlessEqual(clf110.predict(samples), res110)
326 self.failUnlessEqual(clf101.predict(samples), res101)
327 self.failUnlessEqual(clf011.predict(samples), res011)
328
329
331 from test_rfe import SillySensitivityAnalyzer
332 from mvpa.featsel.base import \
333 SensitivityBasedFeatureSelection
334 from mvpa.featsel.helpers import \
335 FixedNElementTailSelector
336
337
338 sens_ana = SillySensitivityAnalyzer()
339
340 sens_ana_rev = SillySensitivityAnalyzer(mult=-1)
341
342
343 feat_sel = SensitivityBasedFeatureSelection(sens_ana,
344 FixedNElementTailSelector(1, mode='discard'))
345
346 feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
347 FixedNElementTailSelector(1))
348
349 samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
350
351 testdata3 = Dataset(samples=samples, labels=1)
352
353 traindata = Dataset(samples=N.array([ [0, 0,-1], [1,0,1] ]), labels=[1,2])
354
355
356 res110 = [1, 1, 1, -1, -1]
357 res011 = [-1, 1, -1, 1, -1]
358
359
360 clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
361 enable_states=['feature_ids'])
362
363 self.clf_sign.states._changeTemporarily(enable_states=['values'])
364 clf011.train(traindata)
365
366 self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
367
368 self.failUnless(len(clf011.values) == len(res110),
369 msg="We need to pass values into ProxyClassifier")
370 self.clf_sign.states._resetEnabledTemporarily()
371
372 self.failUnlessEqual(len(clf011.feature_ids), 2)
373 "Feature selection classifier had to be trained on 2 features"
374
375
376 clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
377 clf011.train(traindata)
378 self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
379
408
409
411 """Basic tests for TreeClassifier
412 """
413 ds = datasets['uni4small']
414
415 clfs = clfswh['binary', '!plr']
416
417
418 clfs = [clfs[i] for i in N.random.permutation(len(clfs))]
419
420 tclf = TreeClassifier(clfs[0], {
421 'L0+2' : (('L0', 'L2'), clfs[1]),
422 'L2+3' : ((2, 3), clfs[2])})
423 self.failUnlessRaises(ValueError, tclf.train, ds)
424 """Should raise exception since label 2 is in both"""
425
426
427 tclf = TreeClassifier(clfs[0], {
428 'L0+5' : (('L0', 'L5'), clfs[1]),
429 'L2+3' : ((2, 3), clfs[2])})
430 self.failUnlessRaises(ValueError, tclf.train, ds)
431 """Should raise exception since no group for L1"""
432
433
434 tclf = TreeClassifier(clfs[0], {
435 'L0+1' : (('L0', 1), clfs[1]),
436 'L2+3' : ((2, 3), clfs[2])})
437
438
439 cv = CrossValidatedTransferError(
440 TransferError(tclf),
441 OddEvenSplitter(),
442 enable_states=['confusion', 'training_confusion'])
443 cverror = cv(ds)
444 try:
445 rtclf = repr(tclf)
446 except:
447 self.fail(msg="Could not obtain repr for TreeClassifier")
448
449
450 self.failUnless(tclf.clfs['L0+1'] is clfs[1])
451 self.failUnless(tclf.clfs['L2+3'] is clfs[2])
452
453 cvtrc = cv.training_confusion
454 cvtc = cv.confusion
455 if cfg.getboolean('tests', 'labile', default='yes'):
456
457 self.failUnless(cvtrc != cvtc)
458 self.failUnless(cverror < 0.3,
459 msg="Got too high error = %s using %s"
460 % (cverror, tclf))
461
462
463 tclf = TreeClassifier(clfs[0], {
464 'L0' : (('L0',), clfs[1]),
465 'L1+2+3' : ((1, 2, 3), clfs[2])})
466
467
468
469 @sweepargs(clf=clfswh[:])
486
487 @sweepargs(clf=clfswh['linear', 'svm', 'libsvm', '!meta'])
489 oldC = None
490
491
492
493 if clf.params.isKnown('C') and clf.C<0:
494 oldC = clf.C
495 clf.C = 1.0
496
497 svm, svm2 = clf, clf.clone()
498 svm2.states.enable(['training_confusion'])
499
500 mclf = MulticlassClassifier(clf=svm,
501 enable_states=['training_confusion'])
502
503 svm2.train(datasets['uni2small_train'])
504 mclf.train(datasets['uni2small_train'])
505 s1 = str(mclf.training_confusion)
506 s2 = str(svm2.training_confusion)
507 self.failUnlessEqual(s1, s2,
508 msg="Multiclass clf should provide same results as built-in "
509 "libsvm's %s. Got %s and %s" % (svm2, s1, s2))
510
511 svm2.untrain()
512
513 self.failUnless(svm2.trained == False,
514 msg="Un-Trained SVM should be untrained")
515
516 self.failUnless(N.array([x.trained for x in mclf.clfs]).all(),
517 msg="Trained Boosted classifier should have all primary classifiers trained")
518 self.failUnless(mclf.trained,
519 msg="Trained Boosted classifier should be marked as trained")
520
521 mclf.untrain()
522
523 self.failUnless(not mclf.trained,
524 msg="UnTrained Boosted classifier should not be trained")
525 self.failUnless(not N.array([x.trained for x in mclf.clfs]).any(),
526 msg="UnTrained Boosted classifier should have no primary classifiers trained")
527
528 if oldC is not None:
529 clf.C = oldC
530
531
532 @sweepargs(clf=clfswh['svm', '!meta'])
550
551
552 @sweepargs(clf=clfswh['retrainable'])
554
555 clf = clf.clone()
556 clf.states._changeTemporarily(enable_states = ['values'],
557
558
559 disable_states=['training_confusion'])
560 clf_re = clf.clone()
561
562
563 clf_re._setRetrainable(True)
564
565
566
567 dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
568 'nonbogus_features':[2,4], 'snr': 5.0}
569
570
571
572
573
574 dstrain = deepcopy(datasets['uni2large_train'])
575 dstest = deepcopy(datasets['uni2large_test'])
576
577 clf.untrain()
578 clf_re.untrain()
579 trerr, trerr_re = TransferError(clf), \
580 TransferError(clf_re, disable_states=['training_confusion'])
581
582
583 err_1 = trerr(dstest, dstrain)
584 self.failUnless(err_1<0.3,
585 msg="We should test here on easy dataset. Got error of %s" % err_1)
586 values_1 = clf.values[:]
587
588 eps = 0.05
589 corrcoef_eps = 0.85
590
591
592 def batch_test(retrain=True, retest=True, closer=True):
593 err = trerr(dstest, dstrain)
594 err_re = trerr_re(dstest, dstrain)
595 corr = N.corrcoef(clf.values, clf_re.values)[0,1]
596 corr_old = N.corrcoef(values_1, clf_re.values)[0,1]
597 if __debug__:
598 debug('TEST', "Retraining stats: errors %g %g corr %g "
599 "with old error %g corr %g" %
600 (err, err_re, corr, err_1, corr_old))
601 self.failUnless(clf_re.states.retrained == retrain,
602 ("Must fully train",
603 "Must retrain instead of full training")[retrain])
604 self.failUnless(clf_re.states.repredicted == retest,
605 ("Must fully test",
606 "Must retest instead of full testing")[retest])
607 self.failUnless(corr > corrcoef_eps,
608 msg="Result must be close to the one without retraining."
609 " Got corrcoef=%s" % (corr))
610 if closer:
611 self.failUnless(corr >= corr_old,
612 msg="Result must be closer to current without retraining"
613 " than to old one. Got corrcoef=%s" % (corr_old))
614
615
616 for i in xrange(3):
617 flag = bool(i!=0)
618
619
620
621 batch_test(retrain=flag, retest=flag, closer=False)
622
623
624 if 'C' in clf.params.names:
625 clf.params.C *= 0.1
626 clf_re.params.C *= 0.1
627 batch_test()
628 elif 'sigma_noise' in clf.params.names:
629 clf.params.sigma_noise *= 100
630 clf_re.params.sigma_noise *= 100
631 batch_test()
632 else:
633 raise RuntimeError, \
634 'Please implement testing while changing some of the ' \
635 'params for clf %s' % clf
636
637
638 if hasattr(clf, 'kernel_params') and len(clf.kernel_params.names):
639 clf.kernel_params.gamma = 0.1
640 clf_re.kernel_params.gamma = 0.1
641
642
643 batch_test(retest=not('gamma' in clf.kernel_params.names))
644
645
646 oldlabels = dstrain.labels[:]
647 dstrain.permuteLabels(status=True, assure_permute=True)
648 self.failUnless((oldlabels != dstrain.labels).any(),
649 msg="We should succeed at permutting -- now got the same labels")
650 batch_test()
651
652
653 oldlabels = dstest.labels[:]
654 dstest.permuteLabels(status=True, assure_permute=True)
655 self.failUnless((oldlabels != dstest.labels).any(),
656 msg="We should succeed at permutting -- now got the same labels")
657 batch_test()
658
659
660
661 if not clf.__class__.__name__ in ['GPR']:
662 oldsamples = dstrain.samples.copy()
663 dstrain.samples[:] += dstrain.samples*0.05
664 self.failUnless((oldsamples != dstrain.samples).any())
665 batch_test(retest=False)
666 clf.states._resetEnabledTemporarily()
667
668
669
670 clf_re.retrain(dstrain); self.failUnless(clf_re.states.retrained)
671 clf_re.retrain(dstrain, labels=True); self.failUnless(clf_re.states.retrained)
672 clf_re.retrain(dstrain, traindataset=True); self.failUnless(clf_re.states.retrained)
673
674
675 clf_re.repredict(dstest.samples);
676 self.failUnless(clf_re.states.repredicted)
677 self.failUnlessRaises(RuntimeError, clf_re.repredict,
678 dstest.samples, labels=True,
679 msg="for now retesting with anything changed makes no sense")
680 clf_re._setRetrainable(False)
681
682
684 """Test all classifiers for conformant behavior
685 """
686 for clf_, traindata in \
687 [(clfswh['binary'], datasets['dumb2']),
688 (clfswh['multiclass'], datasets['dumb'])]:
689 traindata_copy = deepcopy(traindata)
690 for clf in clf_:
691 clf.train(traindata)
692 self.failUnless(
693 (traindata.samples == traindata_copy.samples).all(),
694 "Training of a classifier shouldn't change original dataset")
695
696
697
698
699
700
701 self.failUnless(str(clf) != "")
702 self.failUnless(repr(clf) != "")
703
704
705
706
707
708
709 @sweepargs(clf=clfswh['!smlr', '!knn', '!gnb', '!lars', '!meta', '!ridge'])
711 """To check if known/present Classifiers are working properly
712 with samples being first dimension. Started to worry about
713 possible problems while looking at sg where samples are 2nd
714 dimension
715 """
716
717
718
719 traindatas = [
720 Dataset(samples=N.array([ [0, 0, 1.0],
721 [1, 0, 0] ]), labels=[0, 1]),
722 Dataset(samples=N.array([ [0, 0.0],
723 [1, 1] ]), labels=[0, 1])]
724
725 clf.states._changeTemporarily(enable_states = ['training_confusion'])
726 for traindata in traindatas:
727 clf.train(traindata)
728 self.failUnlessEqual(clf.training_confusion.percentCorrect, 100.0,
729 "Classifier %s must have 100%% correct learning on %s. Has %f" %
730 (`clf`, traindata.samples, clf.training_confusion.percentCorrect))
731
732
733 for i in xrange(traindata.nsamples):
734 sample = traindata.samples[i,:]
735 predicted = clf.predict([sample])
736 self.failUnlessEqual([predicted], traindata.labels[i],
737 "We must be able to predict sample %s using " % sample +
738 "classifier %s" % `clf`)
739 clf.states._resetEnabledTemporarily()
740
743
744
if __name__ == '__main__':
    # NOTE(review): 'runner' presumably executes the test suite on import
    # (PyMVPA test convention) -- confirm
    import runner
747