1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA basic Classifiers"""
10
11 from mvpa.support.copy import deepcopy
12 from mvpa.base import externals
13
14 from mvpa.datasets import Dataset
15 from mvpa.mappers.mask import MaskMapper
16 from mvpa.datasets.splitters import NFoldSplitter, OddEvenSplitter
17
18 from mvpa.misc.exceptions import UnknownStateError
19
20 from mvpa.clfs.base import DegenerateInputError, FailedToTrainError
21 from mvpa.clfs.meta import CombinedClassifier, \
22 BinaryClassifier, MulticlassClassifier, \
23 SplitClassifier, MappedClassifier, FeatureSelectionClassifier, \
24 TreeClassifier
25 from mvpa.clfs.transerror import TransferError
26 from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
27
28 from tests_warehouse import *
29 from tests_warehouse_clfs import *
30
31
32
33
# Exception types collected for later use by the tests; judging by the name
# they are the ones tolerated on degenerate input (the consumer is not
# visible in this part of the file).
_degenerate_allowed_exceptions = [DegenerateInputError, FailedToTrainError]
if externals.exists('rpy'):
    # rpy is optional: only when it is present is its exception type added.
    import rpy
    _degenerate_allowed_exceptions.append(rpy.RPyRException)
41
# Fixture state: two dummy classifiers plus a tiny binary dataset.
# NOTE(review): the enclosing ``def`` line is not visible in this view.
self.clf_sign = SameSignClassifier()
self.clf_less1 = Less1Classifier()

# Five two-feature samples, labelled +1/-1 and assigned to chunks 0..3.
bin_samples = [[0, 0], [-10, -1], [1, 0.1], [1, -1], [-1, 1]]
bin_labels = [1, 1, 1, -1, -1]
bin_chunks = [0, 1, 2, 2, 3]
self.data_bin_1 = Dataset(samples=bin_samples,
                          labels=bin_labels,
                          chunks=bin_chunks)
51
79
80
82
83
# Combine two clones of the sign classifier and verify that the ensemble
# (a) reproduces the fixture labels and (b) agrees with a single classifier.
samples = self.data_bin_1.samples
members = [self.clf_sign.clone(), self.clf_sign.clone()]
bclf = CombinedClassifier(clfs=members)

combined_as_list = list(bclf.predict(samples))
expected_labels = list(self.data_bin_1.labels)
self.failUnlessEqual(combined_as_list, expected_labels,
                     msg="Boosted classifier should work")

combined = bclf.predict(samples)
single = self.clf_sign.predict(samples)
self.failUnlessEqual(combined, single,
                     msg="Boosted classifier should have the same as regular")
93
94
114
115
116
# BinaryClassifier check: a four-label dataset is split into a positive
# ('sp', 'sn') and a negative ('dp', 'dn') label group, the wrapper is
# trained, and predictions are expected to come back as label groups.
ds = Dataset(samples=[[0, 0], [0, 1], [1, 100], [-1, 0], [-1, -3], [0, -10]],
             labels=['sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
testdata = [[0, 0], [10, 10], [-10, -1], [0.1, -0.1], [-0.2, 0.2]]

clf = SameSignClassifier()

bclf1 = BinaryClassifier(clf=clf,
                         poslabels=['sp', 'sn'],
                         neglabels=['dp', 'dn'])

# Snapshot the labels with a real copy.  ds.labels is treated as a NumPy
# array below (``.all()`` on the comparison), and slicing such an array with
# ``[:]`` only creates a view -- an in-place change made during training
# would show up in the "snapshot" too and the check could never fail.
orig_labels = ds.labels.copy()
bclf1.train(ds)

self.failUnless(bclf1.predict(testdata) ==
                [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
                 ['dn', 'dp'], ['dn', 'dp']])

self.failUnless((ds.labels == orig_labels).all(),
                msg="BinaryClassifier should not alter labels")
139
140
141 @sweepargs(clf=clfswh['binary'])
150
151
152 @sweepargs(clf=clfswh[:] + regrswh[:])
"""Check that clf.summary() changes once the classifier is trained
"""
untrained_summary = clf.summary()
self.failUnless('not yet trained' in untrained_summary)

clf.train(datasets['uni2small'])
trained_summary = clf.summary()

# After training the summary has to grow and lose the "untrained" marker.
self.failUnless(len(untrained_summary) < len(trained_summary))
self.failUnless('not yet trained' not in trained_summary)
163
164
165 @sweepargs(clf=clfswh[:] + regrswh[:])
204
205
206
# SplitClassifier versus an explicit CrossValidatedTransferError run on the
# fixture dataset: both must report the same test and training errors.
ds = self.data_bin_1
split_clf = SplitClassifier(
    clf=SameSignClassifier(),
    splitter=NFoldSplitter(1),
    enable_states=['confusion', 'training_confusion', 'feature_ids'])
split_clf.train(ds)
split_error = split_clf.confusion.error
split_tr_error = split_clf.training_confusion.error

# Independent reference: cross-validate a clone of the split classifier.
cv_clf = split_clf.clone()
cv = CrossValidatedTransferError(
    TransferError(cv_clf),
    NFoldSplitter(),
    enable_states=['confusion', 'training_confusion'])
cv_error = cv(ds)
cv_tr_error = cv.training_confusion.error

error_msg = ("We should get the same error using split classifier as"
             " using CrossValidatedTransferError. Got %s and %s"
             % (split_error, cv_error))
self.failUnlessEqual(split_error, cv_error, msg=error_msg)

tr_error_msg = ("We should get the same training error using split classifier as"
                " using CrossValidatedTransferError. Got %s and %s"
                % (split_tr_error, cv_tr_error))
self.failUnlessEqual(split_tr_error, cv_tr_error, msg=tr_error_msg)

self.failUnlessEqual(split_clf.confusion.percentCorrect, 100,
                     msg="Dummy clf should train perfectly")

# One confusion set and one trained classifier are expected per chunk.
self.failUnlessEqual(len(split_clf.confusion.sets), len(ds.uniquechunks),
                     msg="Should have 1 confusion per each split")
self.failUnlessEqual(len(split_clf.clfs), len(ds.uniquechunks),
                     msg="Should have number of classifiers equal # of epochs")
self.failUnlessEqual(split_clf.predict(ds.samples), list(ds.labels),
                     msg="Should classify correctly")

# Only checks that building the summary does not raise.
summary = split_clf.summary()
259
260
261 @sweepargs(clf_=clfswh['binary', '!meta'])
# Same comparison as for the dummy classifier, but for the swept classifier
# ``clf_`` on the 'uni2medium' dataset; errors only need to agree to 0.01.
reference_clf = clf_.clone()
ds = datasets['uni2medium']
split_clf = SplitClassifier(clf=clf_,
                            splitter=NFoldSplitter(1),
                            enable_states=['confusion', 'feature_ids'])
split_clf.train(ds)
split_error = split_clf.confusion.error

cv = CrossValidatedTransferError(
    TransferError(reference_clf),
    NFoldSplitter(),
    enable_states=['confusion', 'training_confusion'])
cv_error = cv(ds)

agreement_msg = ("We should get the same error using split classifier as"
                 " using CrossValidatedTransferError. Got %s and %s"
                 % (split_error, cv_error))
self.failUnless(abs(split_error - cv_error) < 0.01, msg=agreement_msg)

# The generalization check is only run when labile tests are enabled.
# NOTE(review): original indentation is lost; only this assertion is assumed
# to be guarded by the config switch -- confirm.
if cfg.getboolean('tests', 'labile', default='yes'):
    self.failUnless(split_error < 0.25,
                    msg="clf should generalize more or less fine. "
                        "Got error %s" % split_error)

self.failUnlessEqual(len(split_clf.confusion.sets), len(ds.uniquechunks),
                     msg="Should have 1 confusion per each split")
self.failUnlessEqual(len(split_clf.clfs), len(ds.uniquechunks),
                     msg="Should have number of classifiers equal # of epochs")
290
291
292
293
294
312
313
# MappedClassifier check: the sign classifier is wrapped behind three
# different feature masks and each wrapper must yield its own predictions.
samples = N.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1], [-1, 0, 1], [1, -1, 1]])
testdata3 = Dataset(samples=samples, labels=1)

# (feature mask, expected predictions) pairs, in the original order.
cases = [
    ([1, 1, 0], [1, 1, 1, -1, -1]),
    ([1, 0, 1], [-1, 1, -1, -1, 1]),
    ([0, 1, 1], [-1, 1, -1, 1, -1]),
]

# Build all wrappers first, then predict, keeping the original call order.
mapped_clfs = [MappedClassifier(clf=self.clf_sign,
                                mapper=MaskMapper(N.array(mask)))
               for mask, _ in cases]

for mapped_clf, (_, expected) in zip(mapped_clfs, cases):
    self.failUnlessEqual(mapped_clf.predict(samples), expected)
328
329
# FeatureSelectionClassifier check: the sign classifier is combined with two
# sensitivity-based feature selections (regular and sign-reversed analyzer)
# and must predict as if only the selected features were present.
from test_rfe import SillySensitivityAnalyzer
from mvpa.featsel.base import \
    SensitivityBasedFeatureSelection
from mvpa.featsel.helpers import \
    FixedNElementTailSelector

sens_ana = SillySensitivityAnalyzer()
# Same analyzer with the multiplier flipped to -1.
sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

# Selection that discards one tail element ...
feat_sel = SensitivityBasedFeatureSelection(
    sens_ana, FixedNElementTailSelector(1, mode='discard'))
# ... and one built on the reversed analyzer with the selector's default mode.
feat_sel_rev = SensitivityBasedFeatureSelection(
    sens_ana_rev, FixedNElementTailSelector(1))

samples = N.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1], [-1, 0, 1], [1, -1, 1]])

testdata3 = Dataset(samples=samples, labels=1)
traindata = Dataset(samples=N.array([[0, 0, -1], [1, 0, 1]]), labels=[1, 2])

# Expected predictions; the names suggest feature masks 110 and 011.
res110 = [1, 1, 1, -1, -1]
res011 = [-1, 1, -1, 1, -1]

clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                    enable_states=['feature_ids'])

# 'values' is enabled on the shared classifier only for this section; the
# finally clause restores the state even when an assertion fails, so other
# tests using self.clf_sign are not affected.
self.clf_sign.states._changeTemporarily(enable_states=['values'])
try:
    clf011.train(traindata)

    self.failUnlessEqual(clf011.predict(testdata3.samples), res011)

    self.failUnless(len(clf011.values) == len(res110),
                    msg="We need to pass values into ProxyClassifier")
finally:
    self.clf_sign.states._resetEnabledTemporarily()

# The explanation used to be a bare string statement after the assertion and
# was therefore never shown on failure; it is now passed as the message.
self.failUnlessEqual(
    len(clf011.feature_ids), 2,
    msg="Feature selection classifier had to be trained on 2 features")

# Repeat with the reversed selection: the other expected result applies.
clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
clf011.train(traindata)
self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
379
408
409
"""Basic tests for TreeClassifier
"""
ds = datasets['uni4small']
clfs = clfswh['binary']

# Random order, so the three classifiers used below vary between runs.
clfs = [clfs[i] for i in N.random.permutation(len(clfs))]

# Should raise exception since label 2 is in both
tclf = TreeClassifier(clfs[0], {
    'L0+2': (('L0', 'L2'), clfs[1]),
    'L2+3': ((2, 3), clfs[2])})
self.failUnlessRaises(ValueError, tclf.train, ds)

# Should raise exception since no group for L1
tclf = TreeClassifier(clfs[0], {
    'L0+5': (('L0', 'L5'), clfs[1]),
    'L2+3': ((2, 3), clfs[2])})
self.failUnlessRaises(ValueError, tclf.train, ds)

# A valid grouping: every label is covered exactly once.
tclf = TreeClassifier(clfs[0], {
    'L0+1': (('L0', 1), clfs[1]),
    'L2+3': ((2, 3), clfs[2])})

cv = CrossValidatedTransferError(
    TransferError(tclf),
    OddEvenSplitter(),
    enable_states=['confusion', 'training_confusion'])
cverror = cv(ds)
try:
    rtclf = repr(tclf)
except Exception:
    # Only genuine errors count as a repr failure; a bare ``except`` would
    # also turn KeyboardInterrupt/SystemExit into a test failure.
    self.fail(msg="Could not obtain repr for TreeClassifier")

# The group classifiers must be the very objects that were passed in.
self.failUnless(tclf.clfs['L0+1'] is clfs[1])
self.failUnless(tclf.clfs['L2+3'] is clfs[2])

cvtrc = cv.training_confusion
cvtc = cv.confusion
# NOTE(review): original indentation is lost; both checks are assumed to be
# guarded by the 'labile' switch -- confirm.
if cfg.getboolean('tests', 'labile', default='yes'):
    self.failUnless(cvtrc != cvtc)
    self.failUnless(cverror < 0.3)

# Grouping with a single-label branch; only construction is exercised here
# (nothing after this point in the visible code uses the object).
tclf = TreeClassifier(clfs[0], {
    'L0': (('L0',), clfs[1]),
    'L1+2+3': ((1, 2, 3), clfs[2])})
463
464
465
466 @sweepargs(clf=clfswh[:])
483
484 @sweepargs(clf=clfswh['linear', 'svm', 'libsvm', '!meta'])
# MulticlassClassifier check: wrapping a classifier must give the same
# training confusion as the classifier on its own, and trained/untrained
# flags must propagate to the wrapped classifiers.
oldC = None

# A negative C is replaced by 1.0 for the duration of the test and restored
# afterwards.
if clf.params.isKnown('C') and clf.C < 0:
    oldC = clf.C
    clf.C = 1.0

try:
    svm, svm2 = clf, clf.clone()
    svm2.states.enable(['training_confusion'])

    mclf = MulticlassClassifier(clf=svm,
                                enable_states=['training_confusion'])

    svm2.train(datasets['uni2small_train'])
    mclf.train(datasets['uni2small_train'])
    s1 = str(mclf.training_confusion)
    s2 = str(svm2.training_confusion)
    self.failUnlessEqual(s1, s2,
        msg="Multiclass clf should provide same results as built-in "
            "libsvm's %s. Got %s and %s" % (svm2, s1, s2))

    svm2.untrain()

    self.failUnless(not svm2.trained,
        msg="Un-Trained SVM should be untrained")

    # Untraining the clone must not have touched the wrapper.
    self.failUnless(N.array([x.trained for x in mclf.clfs]).all(),
        msg="Trained Boosted classifier should have all primary classifiers trained")
    self.failUnless(mclf.trained,
        msg="Trained Boosted classifier should be marked as trained")

    mclf.untrain()

    self.failUnless(not mclf.trained,
        msg="UnTrained Boosted classifier should not be trained")
    self.failUnless(not N.array([x.trained for x in mclf.clfs]).any(),
        msg="UnTrained Boosted classifier should have no primary classifiers trained")
finally:
    # Restore C even when an assertion above fails: ``clf`` comes from the
    # sweep argument and would otherwise keep the overridden value.
    if oldC is not None:
        clf.C = oldC
527
528
529 @sweepargs(clf=clfswh['svm', '!meta'])
547
548
549 @sweepargs(clf=clfswh['retrainable'])
551
# Retrainable classifiers: a retrainable clone (clf_re) is run side by side
# with a regular clone (clf) while parameters, labels and samples change; the
# two must keep producing strongly correlated values and clf_re must report
# the expected retrained/repredicted flags.
# NOTE(review): original indentation is lost; block nesting below was
# reconstructed from the syntax -- confirm against the original file.
clf = clf.clone()
clf.states._changeTemporarily(enable_states=['values'],
                              disable_states=['training_confusion'])
clf_re = clf.clone()
clf_re._setRetrainable(True)

# Private deep copies: labels and samples are modified further down.
dstrain = deepcopy(datasets['uni2large_train'])
dstest = deepcopy(datasets['uni2large_test'])

clf.untrain()
clf_re.untrain()
trerr, trerr_re = TransferError(clf), \
                  TransferError(clf_re, disable_states=['training_confusion'])

# Baseline run of the regular classifier; its values are kept for later
# comparison.
err_1 = trerr(dstest, dstrain)
self.failUnless(err_1 < 0.3,
    msg="We should test here on easy dataset. Got error of %s" % err_1)
values_1 = clf.values[:]

# Minimal correlation required between regular and retrainable values.
corrcoef_eps = 0.85


def batch_test(retrain=True, retest=True, closer=True):
    """Run both classifiers once and compare them.

    retrain / retest -- expected values of clf_re.states.retrained and
                        clf_re.states.repredicted after the run
    closer           -- if true, the retrainable values must correlate at
                        least as well with the current regular values as
                        with the baseline values
    """
    err = trerr(dstest, dstrain)
    err_re = trerr_re(dstest, dstrain)
    corr = N.corrcoef(clf.values, clf_re.values)[0, 1]
    corr_old = N.corrcoef(values_1, clf_re.values)[0, 1]
    if __debug__:
        debug('TEST', "Retraining stats: errors %g %g corr %g "
              "with old error %g corr %g" %
              (err, err_re, corr, err_1, corr_old))
    self.failUnless(clf_re.states.retrained == retrain,
                    ("Must fully train",
                     "Must retrain instead of full training")[retrain])
    self.failUnless(clf_re.states.repredicted == retest,
                    ("Must fully test",
                     "Must retest instead of full testing")[retest])
    self.failUnless(corr > corrcoef_eps,
        msg="Result must be close to the one without retraining."
            " Got corrcoef=%s" % (corr))
    if closer:
        self.failUnless(corr >= corr_old,
            msg="Result must be closer to current without retraining"
                " than to old one. Got corrcoef=%s" % (corr_old))


# First pass expects a full train/test, subsequent passes a retrain/retest.
for i in xrange(3):
    flag = bool(i != 0)
    batch_test(retrain=flag, retest=flag, closer=False)

# Change one regular parameter on both classifiers.
if 'C' in clf.params.names:
    clf.params.C *= 0.1
    clf_re.params.C *= 0.1
    batch_test()
elif 'sigma_noise' in clf.params.names:
    clf.params.sigma_noise *= 100
    clf_re.params.sigma_noise *= 100
    batch_test()
else:
    # Call form of raise: valid in Python 2 and 3 alike.
    raise RuntimeError(
        'Please implement testing while changing some of the '
        'params for clf %s' % clf)

# Change a kernel parameter, if the classifier has any.
if hasattr(clf, 'kernel_params') and len(clf.kernel_params.names):
    clf.kernel_params.gamma = 0.1
    clf_re.kernel_params.gamma = 0.1
    # A full re-test is expected only when 'gamma' is a known kernel parameter.
    batch_test(retest=not('gamma' in clf.kernel_params.names))

# Permute training labels.  Snapshots are real copies (as already done for
# the samples below) because ``[:]`` on a NumPy array is merely a view.
oldlabels = dstrain.labels.copy()
dstrain.permuteLabels(status=True, assure_permute=True)
self.failUnless((oldlabels != dstrain.labels).any(),
    msg="We should succeed at permutting -- now got the same labels")
batch_test()

# Permute testing labels.
oldlabels = dstest.labels.copy()
dstest.permuteLabels(status=True, assure_permute=True)
self.failUnless((oldlabels != dstest.labels).any(),
    msg="We should succeed at permutting -- now got the same labels")
batch_test()

# Change the training samples themselves; skipped for class 'GPR'.
if clf.__class__.__name__ not in ['GPR']:
    oldsamples = dstrain.samples.copy()
    dstrain.samples[:] += dstrain.samples*0.05
    self.failUnless((oldsamples != dstrain.samples).any())
    batch_test(retest=False)
clf.states._resetEnabledTemporarily()

# Explicit retrain() calls must flag the classifier as retrained.
clf_re.retrain(dstrain)
self.failUnless(clf_re.states.retrained)
clf_re.retrain(dstrain, labels=True)
self.failUnless(clf_re.states.retrained)
clf_re.retrain(dstrain, traindataset=True)
self.failUnless(clf_re.states.retrained)

# Explicit repredict() must flag repredicted, and reject a changed-flag call.
clf_re.repredict(dstest.samples)
self.failUnless(clf_re.states.repredicted)
self.failUnlessRaises(RuntimeError, clf_re.repredict,
                      dstest.samples, labels=True,
                      msg="for now retesting with anything changed makes no sense")
clf_re._setRetrainable(False)
678
679
"""Test all classifiers for conformant behavior

Binary classifiers are trained on datasets['dumb2'], multiclass ones on
datasets['dumb']; training must leave the dataset samples unchanged.
"""
for clf_, traindata in \
        [(clfswh['binary'], datasets['dumb2']),
         (clfswh['multiclass'], datasets['dumb'])]:
    # Reference copy taken before any classifier sees the data.
    traindata_copy = deepcopy(traindata)
    for clf in clf_:
        clf.train(traindata)
        self.failUnless(
            (traindata.samples == traindata_copy.samples).all(),
            "Training of a classifier shouldn't change original dataset")

# NOTE(review): original indentation is lost.  These two checks are placed
# after the loops here, so they only see the last classifier trained above --
# confirm against the original file.
self.failUnless(str(clf) != "")
self.failUnless(repr(clf) != "")
700
701
702
703
704
705
706 @sweepargs(clf=clfswh['!smlr', '!knn', '!gnb', '!lars', '!meta', '!ridge'])
"""To check if known/present Classifiers are working properly
with samples being first dimension. Started to worry about
possible problems while looking at sg where samples are 2nd
dimension
"""
# Two tiny two-sample datasets (3 and 2 features) the classifier has to
# learn perfectly.
traindatas = [
    Dataset(samples=N.array([[0, 0, 1.0],
                             [1, 0, 0]]), labels=[-1, 1]),
    Dataset(samples=N.array([[0, 0.0],
                             [1, 1]]), labels=[-1, 1])]

clf.states._changeTemporarily(enable_states=['training_confusion'])
try:
    for traindata in traindatas:
        clf.train(traindata)
        # repr() replaces the backtick form, which only exists in Python 2.
        self.failUnlessEqual(clf.training_confusion.percentCorrect, 100.0,
            "Classifier %s must have 100%% correct learning on %s. Has %f" %
            (repr(clf), traindata.samples,
             clf.training_confusion.percentCorrect))

        # Every single training sample must be predicted correctly as well.
        for i in xrange(traindata.nsamples):
            sample = traindata.samples[i, :]
            predicted = clf.predict([sample])
            # NOTE(review): this compares a nested list with a scalar label;
            # it presumably passes only through NumPy's elementwise
            # comparison -- confirm whether predicted[0] was intended.
            self.failUnlessEqual([predicted], traindata.labels[i],
                "We must be able to predict sample %s using " % sample +
                "classifier %s" % repr(clf))
finally:
    # Undo the temporary state change even if an assertion failed, since
    # ``clf`` comes from the sweep argument.
    clf.states._resetEnabledTemporarily()
737
740
741
# When executed as a script, import the project's ``runner`` module.
# NOTE(review): presumably importing it is what starts the tests -- confirm.
if __name__ == '__main__':
    import runner
744