1
2
3
4
5
6
7
8
9 """Unit tests for PyMVPA basic Classifiers"""
10
11 from mvpa.support.copy import deepcopy
12
13 from mvpa.datasets import Dataset
14 from mvpa.mappers.mask import MaskMapper
15 from mvpa.datasets.splitters import NFoldSplitter, OddEvenSplitter
16
17 from mvpa.misc.exceptions import UnknownStateError
18
19 from mvpa.clfs.base import Classifier
20 from mvpa.clfs.meta import CombinedClassifier, \
21 BinaryClassifier, MulticlassClassifier, \
22 SplitClassifier, MappedClassifier, FeatureSelectionClassifier, \
23 TreeClassifier
24 from mvpa.clfs.transerror import TransferError
25 from mvpa.algorithms.cvtranserror import CrossValidatedTransferError
26
27 from tests_warehouse import *
28 from tests_warehouse_clfs import *
29
31
33 self.clf_sign = SameSignClassifier()
34 self.clf_less1 = Less1Classifier()
35
36
37 self.data_bin_1 = Dataset(
38 samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
39 labels=[1, 1, 1, -1, -1],
40 chunks=[0, 1, 2, 2, 3])
41
68
69
71
72
73 bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
74 self.clf_sign.clone()])
75
76 self.failUnlessEqual(list(bclf.predict(self.data_bin_1.samples)),
77 list(self.data_bin_1.labels),
78 msg="Boosted classifier should work")
79 self.failUnlessEqual(bclf.predict(self.data_bin_1.samples),
80 self.clf_sign.predict(self.data_bin_1.samples),
81 msg="Boosted classifier should have the same as regular")
82
83
85 bclf = CombinedClassifier(clfs=[self.clf_sign.clone(),
86 self.clf_sign.clone()],
87 enable_states=['feature_ids'])
88
89
90 self.failUnlessEqual(self.clf_sign.states.isEnabled('feature_ids'), False)
91 self.failUnlessEqual(bclf.clfs[0].states.isEnabled('feature_ids'), True)
92
93 bclf2 = CombinedClassifier(clfs=[self.clf_sign.clone(),
94 self.clf_sign.clone()],
95 propagate_states=False,
96 enable_states=['feature_ids'])
97
98 self.failUnlessEqual(self.clf_sign.states.isEnabled('feature_ids'), False)
99 self.failUnlessEqual(bclf2.clfs[0].states.isEnabled('feature_ids'), False)
100
101
102
104 ds = Dataset(samples=[ [0,0], [0,1], [1,100], [-1,0], [-1,-3], [ 0,-10] ],
105 labels=[ 'sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
106 testdata = [ [0,0], [10,10], [-10, -1], [0.1, -0.1], [-0.2, 0.2] ]
107
108
109 clf = SameSignClassifier()
110
111
112 bclf1 = BinaryClassifier(clf=clf,
113 poslabels=['sp', 'sn'],
114 neglabels=['dp', 'dn'])
115
116 orig_labels = ds.labels[:]
117 bclf1.train(ds)
118
119 self.failUnless(bclf1.predict(testdata) ==
120 [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
121 ['dn', 'dp'], ['dn', 'dp']])
122
123 self.failUnless((ds.labels == orig_labels).all(),
124 msg="BinaryClassifier should not alter labels")
125
126
127 @sweepargs(clf=clfswh['binary'])
136
137
138 @sweepargs(clf=clfswh[:] + regrswh[:])
140 """Basic testing of the clf summary
141 """
142 summary1 = clf.summary()
143 self.failUnless('not yet trained' in summary1)
144 clf.train(datasets['uni2small'])
145 summary = clf.summary()
146
147 self.failUnless(len(summary) > len(summary1))
148 self.failUnless(not 'not yet trained' in summary)
149
150
151
153 ds = self.data_bin_1
154 clf = SplitClassifier(clf=SameSignClassifier(),
155 splitter=NFoldSplitter(1),
156 enable_states=['confusion', 'training_confusion',
157 'feature_ids'])
158 clf.train(ds)
159 error = clf.confusion.error
160 tr_error = clf.training_confusion.error
161
162 clf2 = clf.clone()
163 cv = CrossValidatedTransferError(
164 TransferError(clf2),
165 NFoldSplitter(),
166 enable_states=['confusion', 'training_confusion'])
167 cverror = cv(ds)
168 tr_cverror = cv.training_confusion.error
169
170 self.failUnlessEqual(error, cverror,
171 msg="We should get the same error using split classifier as"
172 " using CrossValidatedTransferError. Got %s and %s"
173 % (error, cverror))
174
175 self.failUnlessEqual(tr_error, tr_cverror,
176 msg="We should get the same training error using split classifier as"
177 " using CrossValidatedTransferError. Got %s and %s"
178 % (tr_error, tr_cverror))
179
180 self.failUnlessEqual(clf.confusion.percentCorrect,
181 100,
182 msg="Dummy clf should train perfectly")
183 self.failUnlessEqual(len(clf.confusion.sets),
184 len(ds.uniquechunks),
185 msg="Should have 1 confusion per each split")
186 self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
187 msg="Should have number of classifiers equal # of epochs")
188 self.failUnlessEqual(clf.predict(ds.samples), list(ds.labels),
189 msg="Should classify correctly")
190
191
192
193
194
195
196
197
198
199
200
201
202
203 summary = clf.summary()
204
205
206 @sweepargs(clf_=clfswh['binary', '!meta'])
208 clf2 = clf_.clone()
209 ds = datasets['uni2medium']
210 clf = SplitClassifier(clf=clf_,
211 splitter=NFoldSplitter(1),
212 enable_states=['confusion', 'feature_ids'])
213 clf.train(ds)
214 error = clf.confusion.error
215
216 cv = CrossValidatedTransferError(
217 TransferError(clf2),
218 NFoldSplitter(),
219 enable_states=['confusion', 'training_confusion'])
220 cverror = cv(ds)
221
222 self.failUnless(abs(error-cverror)<0.01,
223 msg="We should get the same error using split classifier as"
224 " using CrossValidatedTransferError. Got %s and %s"
225 % (error, cverror))
226
227 if cfg.getboolean('tests', 'labile', default='yes'):
228 self.failUnless(error < 0.25,
229 msg="clf should generalize more or less fine. "
230 "Got error %s" % error)
231 self.failUnlessEqual(len(clf.confusion.sets), len(ds.uniquechunks),
232 msg="Should have 1 confusion per each split")
233 self.failUnlessEqual(len(clf.clfs), len(ds.uniquechunks),
234 msg="Should have number of classifiers equal # of epochs")
235
236
237
238
239
257
258
260 samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
261 testdata3 = Dataset(samples=samples, labels=1)
262 res110 = [1, 1, 1, -1, -1]
263 res101 = [-1, 1, -1, -1, 1]
264 res011 = [-1, 1, -1, 1, -1]
265
266 clf110 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,1,0])))
267 clf101 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([1,0,1])))
268 clf011 = MappedClassifier(clf=self.clf_sign, mapper=MaskMapper(N.array([0,1,1])))
269
270 self.failUnlessEqual(clf110.predict(samples), res110)
271 self.failUnlessEqual(clf101.predict(samples), res101)
272 self.failUnlessEqual(clf011.predict(samples), res011)
273
274
276 from test_rfe import SillySensitivityAnalyzer
277 from mvpa.featsel.base import \
278 SensitivityBasedFeatureSelection
279 from mvpa.featsel.helpers import \
280 FixedNElementTailSelector
281
282
283 sens_ana = SillySensitivityAnalyzer()
284
285 sens_ana_rev = SillySensitivityAnalyzer(mult=-1)
286
287
288 feat_sel = SensitivityBasedFeatureSelection(sens_ana,
289 FixedNElementTailSelector(1, mode='discard'))
290
291 feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
292 FixedNElementTailSelector(1))
293
294 samples = N.array([ [0,0,-1], [1,0,1], [-1,-1, 1], [-1,0,1], [1, -1, 1] ])
295
296 testdata3 = Dataset(samples=samples, labels=1)
297
298 traindata = Dataset(samples=N.array([ [0, 0,-1], [1,0,1] ]), labels=[1,2])
299
300
301 res110 = [1, 1, 1, -1, -1]
302 res011 = [-1, 1, -1, 1, -1]
303
304
305 clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
306 enable_states=['feature_ids'])
307
308 self.clf_sign.states._changeTemporarily(enable_states=['values'])
309 clf011.train(traindata)
310
311 self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
312
313 self.failUnless(len(clf011.values) == len(res110),
314 msg="We need to pass values into ProxyClassifier")
315 self.clf_sign.states._resetEnabledTemporarily()
316
317 self.failUnlessEqual(len(clf011.feature_ids), 2)
318 "Feature selection classifier had to be trained on 2 features"
319
320
321 clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
322 clf011.train(traindata)
323 self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
324
352
353
355 """Basic tests for TreeClassifier
356 """
357 ds = datasets['uni4small']
358 clfs = clfswh['binary']
359
360
361 clfs = [clfs[i] for i in N.random.permutation(len(clfs))]
362
363 tclf = TreeClassifier(clfs[0], {
364 'L0+2' : (('L0', 'L2'), clfs[1]),
365 'L2+3' : ((2, 3), clfs[2])})
366 self.failUnlessRaises(ValueError, tclf.train, ds)
367 """Should raise exception since label 2 is in both"""
368
369
370 tclf = TreeClassifier(clfs[0], {
371 'L0+5' : (('L0', 'L5'), clfs[1]),
372 'L2+3' : ((2, 3), clfs[2])})
373 self.failUnlessRaises(ValueError, tclf.train, ds)
374 """Should raise exception since no group for L1"""
375
376
377 tclf = TreeClassifier(clfs[0], {
378 'L0+1' : (('L0', 1), clfs[1]),
379 'L2+3' : ((2, 3), clfs[2])})
380
381
382 cv = CrossValidatedTransferError(
383 TransferError(tclf),
384 OddEvenSplitter(),
385 enable_states=['confusion', 'training_confusion'])
386 cverror = cv(ds)
387 try:
388 rtclf = repr(tclf)
389 except:
390 self.fail(msg="Could not obtain repr for TreeClassifier")
391
392
393 self.failUnless(tclf.clfs['L0+1'] is clfs[1])
394 self.failUnless(tclf.clfs['L2+3'] is clfs[2])
395
396 cvtrc = cv.training_confusion
397 cvtc = cv.confusion
398 if cfg.getboolean('tests', 'labile', default='yes'):
399
400 self.failUnless(cvtrc != cvtc)
401 self.failUnless(cverror < 0.3)
402
403
404 tclf = TreeClassifier(clfs[0], {
405 'L0' : (('L0',), clfs[1]),
406 'L1+2+3' : ((1, 2, 3), clfs[2])})
407
408
409
410 @sweepargs(clf=clfswh[:])
427
428 @sweepargs(clf=clfswh['linear', 'svm', 'libsvm', '!meta'])
430 oldC = None
431
432
433
434 if clf.params.isKnown('C') and clf.C<0:
435 oldC = clf.C
436 clf.C = 1.0
437
438 svm, svm2 = clf, clf.clone()
439 svm2.states.enable(['training_confusion'])
440
441 mclf = MulticlassClassifier(clf=svm,
442 enable_states=['training_confusion'])
443
444 svm2.train(datasets['uni2small_train'])
445 mclf.train(datasets['uni2small_train'])
446 s1 = str(mclf.training_confusion)
447 s2 = str(svm2.training_confusion)
448 self.failUnlessEqual(s1, s2,
449 msg="Multiclass clf should provide same results as built-in "
450 "libsvm's %s. Got %s and %s" % (svm2, s1, s2))
451
452 svm2.untrain()
453
454 self.failUnless(svm2.trained == False,
455 msg="Un-Trained SVM should be untrained")
456
457 self.failUnless(N.array([x.trained for x in mclf.clfs]).all(),
458 msg="Trained Boosted classifier should have all primary classifiers trained")
459 self.failUnless(mclf.trained,
460 msg="Trained Boosted classifier should be marked as trained")
461
462 mclf.untrain()
463
464 self.failUnless(not mclf.trained,
465 msg="UnTrained Boosted classifier should not be trained")
466 self.failUnless(not N.array([x.trained for x in mclf.clfs]).any(),
467 msg="UnTrained Boosted classifier should have no primary classifiers trained")
468
469 if oldC is not None:
470 clf.C = oldC
471
472
473 @sweepargs(clf=clfswh['svm', '!meta'])
491
492
493 @sweepargs(clf=clfswh['retrainable'])
495
496 clf = clf.clone()
497 clf.states._changeTemporarily(enable_states = ['values'],
498
499
500 disable_states=['training_confusion'])
501 clf_re = clf.clone()
502
503
504 clf_re._setRetrainable(True)
505
506
507
508 dsargs = {'perlabel':50, 'nlabels':2, 'nfeatures':5, 'nchunks':1,
509 'nonbogus_features':[2,4], 'snr': 5.0}
510
511
512
513
514
515 dstrain = deepcopy(datasets['uni2large_train'])
516 dstest = deepcopy(datasets['uni2large_test'])
517
518 clf.untrain()
519 clf_re.untrain()
520 trerr, trerr_re = TransferError(clf), TransferError(clf_re)
521
522
523 err_1 = trerr(dstest, dstrain)
524 self.failUnless(err_1<0.3,
525 msg="We should test here on easy dataset. Got error of %s" % err_1)
526 values_1 = clf.values[:]
527
528 eps = 0.05
529 corrcoef_eps = 0.85
530
531
532 def batch_test(retrain=True, retest=True, closer=True):
533 err = trerr(dstest, dstrain)
534 err_re = trerr_re(dstest, dstrain)
535 corr = N.corrcoef(clf.values, clf_re.values)[0,1]
536 corr_old = N.corrcoef(values_1, clf_re.values)[0,1]
537 if __debug__:
538 debug('TEST', "Retraining stats: errors %g %g corr %g "
539 "with old error %g corr %g" %
540 (err, err_re, corr, err_1, corr_old))
541 self.failUnless(clf_re.states.retrained == retrain,
542 ("Must fully train",
543 "Must retrain instead of full training")[retrain])
544 self.failUnless(clf_re.states.repredicted == retest,
545 ("Must fully test",
546 "Must retest instead of full testing")[retest])
547 self.failUnless(corr > corrcoef_eps,
548 msg="Result must be close to the one without retraining."
549 " Got corrcoef=%s" % (corr))
550 if closer:
551 self.failUnless(corr >= corr_old,
552 msg="Result must be closer to current without retraining"
553 " than to old one. Got corrcoef=%s" % (corr_old))
554
555
556 for i in xrange(3):
557 flag = bool(i!=0)
558
559
560
561 batch_test(retrain=flag, retest=flag, closer=False)
562
563
564 if 'C' in clf.params.names:
565 clf.params.C *= 0.1
566 clf_re.params.C *= 0.1
567 batch_test()
568 elif 'sigma_noise' in clf.params.names:
569 clf.params.sigma_noise *= 100
570 clf_re.params.sigma_noise *= 100
571 batch_test()
572 else:
573 raise RuntimeError, \
574 'Please implement testing while changing some of the ' \
575 'params for clf %s' % clf
576
577
578 if hasattr(clf, 'kernel_params') and len(clf.kernel_params.names):
579 clf.kernel_params.gamma = 0.1
580 clf_re.kernel_params.gamma = 0.1
581
582
583 batch_test(retest=not('gamma' in clf.kernel_params.names))
584
585
586 oldlabels = dstrain.labels[:]
587 dstrain.permuteLabels(status=True, assure_permute=True)
588 self.failUnless((oldlabels != dstrain.labels).any(),
589 msg="We should succeed at permutting -- now got the same labels")
590 batch_test()
591
592
593 oldlabels = dstest.labels[:]
594 dstest.permuteLabels(status=True, assure_permute=True)
595 self.failUnless((oldlabels != dstest.labels).any(),
596 msg="We should succeed at permutting -- now got the same labels")
597 batch_test()
598
599
600
601 if not clf.__class__.__name__ in ['GPR']:
602 oldsamples = dstrain.samples.copy()
603 dstrain.samples[:] += dstrain.samples*0.05
604 self.failUnless((oldsamples != dstrain.samples).any())
605 batch_test(retest=False)
606 clf.states._resetEnabledTemporarily()
607
608
609
610 clf_re.retrain(dstrain); self.failUnless(clf_re.states.retrained)
611 clf_re.retrain(dstrain, labels=True); self.failUnless(clf_re.states.retrained)
612 clf_re.retrain(dstrain, traindataset=True); self.failUnless(clf_re.states.retrained)
613
614
615 clf_re.repredict(dstest.samples);
616 self.failUnless(clf_re.states.repredicted)
617 self.failUnlessRaises(RuntimeError, clf_re.repredict,
618 dstest.samples, labels=True,
619 msg="for now retesting with anything changed makes no sense")
620 clf_re._setRetrainable(False)
621
622
624 """Test all classifiers for conformant behavior
625 """
626 for clf_, traindata in \
627 [(clfswh['binary'], datasets['dumb2']),
628 (clfswh['multiclass'], datasets['dumb'])]:
629 traindata_copy = deepcopy(traindata)
630 for clf in clf_:
631 clf.train(traindata)
632 self.failUnless(
633 (traindata.samples == traindata_copy.samples).all(),
634 "Training of a classifier shouldn't change original dataset")
635
636
637
638
639
640
641 self.failUnless(str(clf) != "")
642 self.failUnless(repr(clf) != "")
643
644
645
646
647
648 @sweepargs(clf=clfswh['!smlr', '!knn', '!lars', '!meta', '!ridge'])
650 """To check if known/present Classifiers are working properly
651 with samples being first dimension. Started to worry about
652 possible problems while looking at sg where samples are 2nd
653 dimension
654 """
655
656
657
658 traindatas = [
659 Dataset(samples=N.array([ [0, 0, 1.0],
660 [1, 0, 0] ]), labels=[-1, 1]),
661 Dataset(samples=N.array([ [0, 0.0],
662 [1, 1] ]), labels=[-1, 1])]
663
664 clf.states._changeTemporarily(enable_states = ['training_confusion'])
665 for traindata in traindatas:
666 clf.train(traindata)
667 self.failUnlessEqual(clf.training_confusion.percentCorrect, 100.0,
668 "Classifier %s must have 100%% correct learning on %s. Has %f" %
669 (`clf`, traindata.samples, clf.training_confusion.percentCorrect))
670
671
672 for i in xrange(traindata.nsamples):
673 sample = traindata.samples[i,:]
674 predicted = clf.predict([sample])
675 self.failUnlessEqual([predicted], traindata.labels[i],
676 "We must be able to predict sample %s using " % sample +
677 "classifier %s" % `clf`)
678 clf.states._resetEnabledTemporarily()
679
682
683
# Script entry point: when executed directly, defer to the project-local
# `runner` module — presumably it discovers and runs the unittest suite
# defined in this file (TODO confirm; `runner` is not visible here).
684 if __name__ == '__main__':
685 import runner
686