1
2
3
4
5
6
7
8
9 """Classes for meta classifiers -- classifiers which use other classifiers
10
11 Meta Classifiers can be grouped according to their function as
12
13 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
14 SplitClassifier
15 :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier
16 FeatureSelectionClassifier
17 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
18 MaximalVote MeanPrediction
19
20 """
21
22 __docformat__ = 'restructuredtext'
23
24 import operator
25 import numpy as N
26
27 from sets import Set
28
29 from mvpa.misc.args import group_kwargs
30 from mvpa.mappers.mask import MaskMapper
31 from mvpa.datasets.splitters import NFoldSplitter
32 from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable
33
34 from mvpa.clfs.base import Classifier
35 from mvpa.misc.transformers import FirstAxisMean
36
37 from mvpa.measures.base import \
38 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
39 MappedClassifierSensitivityAnalyzer, \
40 FeatureSelectionClassifierSensitivityAnalyzer
41
42 from mvpa.base import warning
43
44 if __debug__:
45 from mvpa.base import debug
46
47
49 """Classifier containing the farm of other classifiers.
50
51 Should rarely be used directly. Use one of its childs instead
52 """
53
54
55
56 raw_predictions = StateVariable(enabled=False,
57 doc="Predictions obtained from each classifier")
58
59 raw_values = StateVariable(enabled=False,
60 doc="Values obtained from each classifier")
61
62
63 - def __init__(self, clfs=None, propagate_states=True,
64 harvest_attribs=None, copy_attribs='copy',
65 **kwargs):
66 """Initialize the instance.
67
68 :Parameters:
69 clfs : list
70 list of classifier instances to use (slave classifiers)
71 propagate_states : bool
72 either to propagate enabled states into slave classifiers.
73 It is in effect only when slaves get assigned - so if state
74 is enabled not during construction, it would not necessarily
75 propagate into slaves
76 kwargs : dict
77 dict of keyworded arguments which might get used
78 by State or Classifier
79 """
80 if clfs == None:
81 clfs = []
82
83 Classifier.__init__(self, **kwargs)
84 Harvestable.__init__(self, harvest_attribs, copy_attribs)
85
86 self.__clfs = None
87 """Pylint friendly definition of __clfs"""
88
89 self.__propagate_states = propagate_states
90 """Enable current enabled states in slave classifiers"""
91
92 self._setClassifiers(clfs)
93 """Store the list of classifiers"""
94
95
97 if self.__clfs is None or len(self.__clfs)==0:
98
99 prefix_ = []
100 else:
101 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
102 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
103
104
106 """Train `BoostedClassifier`
107 """
108 for clf in self.__clfs:
109 clf.train(dataset)
110
111
    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to so

        :Parameters:
          dataset : Dataset
            dataset the classifiers were just trained on
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            # NOTE: _harvest() inspects locals(), which exposes the current
            # slave classifier under the name 'clf' -- renaming the loop
            # variable would silently change what gets harvested
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            # invalidate the cached "data changed" bookkeeping used by the
            # retrainable machinery
            self.__changedData_isset = False
123
124
133
134
152
153
155 """Set the classifiers used by the boosted classifier
156
157 We have to allow to set list of classifiers after the object
158 was actually created. It will be used by
159 MulticlassClassifier
160 """
161 self.__clfs = clfs
162 """Classifiers to use"""
163
164 if len(clfs):
165 for flag in ['regression']:
166 values = N.array([clf.params[flag].value for clf in clfs])
167 value = values.any()
168 if __debug__:
169 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
170 "%(clfs)s with %(values)s",
171 msgargs={'flag' : flag, 'value' : value,
172 'clfs' : clfs,
173 'values' : values})
174
175 self.params[flag].value = value
176
177
178 if self.__propagate_states:
179 for clf in self.__clfs:
180 clf.states.enable(self.states.enabled, missingok=True)
181
182
183
184
185 self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
186 if len(clfs)>0:
187 self._clf_internals += self.__clfs[0]._clf_internals
188
199
205
206
207 clfs = property(fget=lambda x:x.__clfs,
208 fset=_setClassifiers,
209 doc="Used classifiers")
210
211
212
214 """Classifier which decorates another classifier
215
216 Possible uses:
217
218 - modify data somehow prior training/testing:
219 * normalization
220 * feature selection
221 * modification
222
223 - optimized classifier?
224
225 """
226
245
246
250
252 s = super(ProxyClassifier, self).summary()
253 if self.trained:
254 s += "\n Slave classifier summary:" + \
255 '\n + %s' % \
256 (self.__clf.summary().replace('\n', '\n |'))
257 return s
258
259
260
262 """Train `ProxyClassifier`
263 """
264
265
266 self.__clf.train(dataset)
267
268
269
270
271
272
273
274
275
276
278 """Predict using `ProxyClassifier`
279 """
280 clf = self.__clf
281 if self.states.isEnabled('values'):
282 clf.states.enable(['values'])
283
284 result = clf.predict(data)
285
286 self.states._copy_states_(self.__clf, ['values'], deep=False)
287 return result
288
289
296
297
298 @group_kwargs(prefixes=['slave_'], passthrough=True)
305
306
307 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
308
309
310
311
312
313
314
316 """Base class for combining decisions of multiple classifiers"""
317
318 - def train(self, clfs, dataset):
319 """PredictionsCombiner might need to be trained
320
321 :Parameters:
322 clfs : list of Classifier
323 List of classifiers to combine. Has to be classifiers (not
324 pure predictions), since combiner might use some other
325 state variables (value's) instead of pure prediction's
326 dataset : Dataset
327 training data in this case
328 """
329 pass
330
331
333 """Call function
334
335 :Parameters:
336 clfs : list of Classifier
337 List of classifiers to combine. Has to be classifiers (not
338 pure predictions), since combiner might use some other
339 state variables (value's) instead of pure prediction's
340 """
341 raise NotImplementedError
342
343
344
346 """Provides a decision using maximal vote rule"""
347
348 predictions = StateVariable(enabled=True,
349 doc="Voted predictions")
350 all_label_counts = StateVariable(enabled=False,
351 doc="Counts across classifiers for each label/sample")
352
354 """XXX Might get a parameter to use raw decision values if
355 voting is not unambigous (ie two classes have equal number of
356 votes
357 """
358 PredictionsCombiner.__init__(self)
359
360
362 """Actuall callable - perform voting
363
364 Extended functionality which might not be needed actually:
365 Since `BinaryClassifier` might return a list of possible
366 predictions (not just a single one), we should consider all of those
367
368 MaximalVote doesn't care about dataset itself
369 """
370 if len(clfs)==0:
371 return []
372
373 all_label_counts = None
374 for clf in clfs:
375
376 if not clf.states.isEnabled("predictions"):
377 raise ValueError, "MaximalVote needs classifiers (such as " + \
378 "%s) with state 'predictions' enabled" % clf
379 predictions = clf.predictions
380 if all_label_counts is None:
381 all_label_counts = [ {} for i in xrange(len(predictions)) ]
382
383
384 for i in xrange(len(predictions)):
385 prediction = predictions[i]
386 if not operator.isSequenceType(prediction):
387 prediction = (prediction,)
388 for label in prediction:
389
390
391 if not all_label_counts[i].has_key(label):
392 all_label_counts[i][label] = 0
393 all_label_counts[i][label] += 1
394
395 predictions = []
396
397 for i in xrange(len(all_label_counts)):
398 label_counts = all_label_counts[i]
399
400
401 maxk = []
402 maxv = -1
403 for k, v in label_counts.iteritems():
404 if v > maxv:
405 maxk = [k]
406 maxv = v
407 elif v == maxv:
408 maxk.append(k)
409
410 assert len(maxk) >= 1, \
411 "We should have obtained at least a single key of max label"
412
413 if len(maxk) > 1:
414 warning("We got multiple labels %s which have the " % maxk +
415 "same maximal vote %d. XXX disambiguate" % maxv)
416 predictions.append(maxk[0])
417
418 self.all_label_counts = all_label_counts
419 self.predictions = predictions
420 return predictions
421
422
423
425 """Provides a decision by taking mean of the results
426 """
427
428 predictions = StateVariable(enabled=True,
429 doc="Mean predictions")
430
432 """Actuall callable - perform meaning
433
434 """
435 if len(clfs)==0:
436 return []
437
438 all_predictions = []
439 for clf in clfs:
440
441 if not clf.states.isEnabled("predictions"):
442 raise ValueError, "MeanPrediction needs classifiers (such " \
443 " as %s) with state 'predictions' enabled" % clf
444 all_predictions.append(clf.predictions)
445
446
447 predictions = N.mean(N.asarray(all_predictions), axis=0)
448 self.predictions = predictions
449 return predictions
450
451
453 """Provides a decision using training a classifier on predictions/values
454
455 TODO: implement
456 """
457
458 predictions = StateVariable(enabled=True,
459 doc="Trained predictions")
460
461
462 - def __init__(self, clf, variables=None):
463 """Initialize `ClassifierCombiner`
464
465 :Parameters:
466 clf : Classifier
467 Classifier to train on the predictions
468 variables : list of basestring
469 List of state variables stored in 'combined' classifiers, which
470 to use as features for training this classifier
471 """
472 PredictionsCombiner.__init__(self)
473
474 self.__clf = clf
475 """Classifier to train on `variables` states of provided classifiers"""
476
477 if variables == None:
478 variables = ['predictions']
479 self.__variables = variables
480 """What state variables of the classifiers to use"""
481
482
484 """It might be needed to untrain used classifier"""
485 if self.__clf:
486 self.__clf.untrain()
487
489 """
490 """
491 if len(clfs)==0:
492 return []
493
494 raise NotImplementedError
495
496
497
499 """`BoostedClassifier` which combines predictions using some
500 `PredictionsCombiner` functor.
501 """
502
503 - def __init__(self, clfs=None, combiner=None, **kwargs):
504 """Initialize the instance.
505
506 :Parameters:
507 clfs : list of Classifier
508 list of classifier instances to use
509 combiner : PredictionsCombiner
510 callable which takes care about combining multiple
511 results into a single one (e.g. maximal vote for
512 classification, MeanPrediction for regression))
513 kwargs : dict
514 dict of keyworded arguments which might get used
515 by State or Classifier
516
517 NB: `combiner` might need to operate not on 'predictions' descrete
518 labels but rather on raw 'class' values classifiers
519 estimate (which is pretty much what is stored under
520 `values`
521 """
522 if clfs == None:
523 clfs = []
524
525 BoostedClassifier.__init__(self, clfs, **kwargs)
526
527
528 if combiner is None:
529 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
530 self.__combiner = combiner
531 """Functor destined to combine results of multiple classifiers"""
532
533
535 """Literal representation of `CombinedClassifier`.
536 """
537 return super(CombinedClassifier, self).__repr__(
538 ["combiner=%s" % repr(self.__combiner)] + prefixes)
539
540
542 """Provide summary for the `CombinedClassifier`.
543 """
544 s = super(CombinedClassifier, self).summary()
545 if self.trained:
546 s += "\n Slave classifiers summaries:"
547 for i, clf in enumerate(self.clfs):
548 s += '\n + %d clf: %s' % \
549 (i, clf.summary().replace('\n', '\n |'))
550 return s
551
552
561
568
569
590
591
592 combiner = property(fget=lambda x:x.__combiner,
593 doc="Used combiner to derive a single result")
594
595
596
598 """`TreeClassifier` which allows to create hierarchy of classifiers
599
600 Functions by grouping some labels into a single "meta-label" and training
601 classifier first to separate between meta-labels. Then
602 each group further proceeds with classification within each group.
603
604 Possible scenarios::
605
606 TreeClassifier(SVM(),
607 {'animate': ((1,2,3,4),
608 TreeClassifier(SVM(),
609 {'human': (('male', 'female'), SVM()),
610 'animals': (('monkey', 'dog'), SMLR())})),
611 'inanimate': ((5,6,7,8), SMLR())})
612
613 would create classifier which would first do binary classification
614 to separate animate from inanimate, then for animate result it
615 would separate to classify human vs animal and so on::
616
617 SVM
618 / \
619 animate inanimate
620 / \
621 SVM SMLR
622 / \ / | \ \
623 human animal 5 6 7 8
624 | |
625 SVM SVM
626 / \ / \
627 male female monkey dog
628 1 2 3 4
629
630 """
631
632 _DEV__doc = """
633 Questions:
634 * how to collect confusion matrices at a particular layer if such
635 classifier is given to SplitClassifier or CVTE
636
637 * What additional states to add, something like
638 clf_labels -- store remapped labels for the dataset
639 clf_values ...
640
641 * What do we store into values ? just values from the clfs[]
642 for corresponding samples, or top level clf values as well?
643
644 * what should be SensitivityAnalyzer? by default it would just
645 use top slave classifier (i.e. animate/inanimate)
646
647 Problems?
648 * .clf is not actually "proxied" per se, so not sure what things
649 should be taken care of yet...
650
651 TODO:
652 * Allow a group to be just a single category, so no further
653 classifier is needed, it just should stay separate from the
654 other groups
655
656 Possible TODO:
657 * Add ability to provide results of clf.values as features into
658 input of clfs[]. This way we could provide additional 'similarity'
659 information to the "other" branch
660
661 """
662
663 - def __init__(self, clf, groups, **kwargs):
664 """Initialize TreeClassifier
665
666 :Parameters:
667 clf : Classifier
668 Classifier to separate between the groups
669 groups : dict of meta-label: tuple of (tuple of labels, classifier)
670 Defines the groups of labels and their classifiers.
671 See :class:`~mvpa.clfs.meta.TreeClassifier` for example
672 """
673
674
675 ProxyClassifier.__init__(self, clf, **kwargs)
676 self._regressionIsBogus()
677
678
679
680
681
682 self._groups = groups
683 self._index2group = groups.keys()
684
685
686
687
688
689
690
691 self.clfs = dict([(gk, c) for gk, (ls, c) in groups.iteritems()])
692 """Dictionary of classifiers used by the groups"""
693
694
696 """String representation of TreeClassifier
697 """
698 prefix = "groups=%s" % repr(self._groups)
699 return super(TreeClassifier, self).__repr__([prefix] + prefixes)
700
701
703 """Provide summary for the `TreeClassifier`.
704 """
705 s = super(TreeClassifier, self).summary()
706 if self.trained:
707 s += "\n Node classifiers summaries:"
708 for i, (clfname, clf) in enumerate(self.clfs.iteritems()):
709 s += '\n + %d %s clf: %s' % \
710 (i, clfname, clf.summary().replace('\n', '\n |'))
711 return s
712
713
715 """Train TreeClassifier
716
717 First train .clf on groupped samples, then train each of .clfs
718 on a corresponding subset of samples.
719 """
720
721 clf, clfs, index2group = self.clf, self.clfs, self._index2group
722
723
724 groups = self._groups
725 labels_map = dataset.labels_map
726
727 if labels_map is None: labels_map = {}
728 groups_labels = {}
729 label2index = {}
730 known = set()
731 for gi, gk in enumerate(index2group):
732 ls = groups[gk][0]
733
734 ls_ = [labels_map.get(l, l) for l in ls]
735 known_already = known.intersection(ls_)
736 if len(known_already):
737 raise ValueError, "Grouping of labels is not appropriate. " \
738 "Got labels %s already among known in %s. " \
739 "Used labelsmap %s" % (known_already, known, labels_map)
740 groups_labels[gk] = ls_
741 for l in ls_:
742 label2index[l] = gi
743 known = known.union(ls_)
744
745
746
747
748
749 dsul = set(dataset.uniquelabels)
750 if known.intersection(dsul) != dsul:
751 raise ValueError, \
752 "Dataset %s had some labels not defined in groups: %s. " \
753 "Known are %s" % \
754 (dataset, dsul.difference(known), known)
755
756
757
758
759
760
761
762
763 ds_group = dataset.copy(deep=False)
764
765 ds_group.labels = [label2index[l] for l in dataset.labels]
766
767
768 if __debug__:
769 debug('CLFTREE', "Training primary %(clf)s on %(ds)s",
770 msgargs=dict(clf=clf, ds=ds_group))
771 clf.train(ds_group)
772
773
774
775
776
777
778
779
780
781
782
783 for gk in groups.iterkeys():
784
785 ids = dataset.idsbylabels(groups_labels[gk])
786 ds_group = dataset.selectSamples(ids)
787 if __debug__:
788 debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s",
789 msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group))
790
791 clfs[gk].train(ds_group)
792
793
800
801
803 """
804 """
805
806 clfs, index2group = self.clfs, self._index2group
807 clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data))
808
809 clf_predictions = clf_predictions.astype(int)
810
811
812
813 predictions = N.array([N.nan]*len(data))
814 for pred_group in set(clf_predictions):
815 gk = index2group[pred_group]
816 clf_ = clfs[gk]
817 group_indexes = (clf_predictions == pred_group)
818 if __debug__:
819 debug('CLFTREE', 'Predicting for group %s using %s on %d samples' %
820 (gk, clf_, N.sum(group_indexes)))
821 predictions[group_indexes] = clf_.predict(data[group_indexes])
822 return predictions
823
824
826 """`ProxyClassifier` which maps set of two labels into +1 and -1
827 """
828
829 - def __init__(self, clf, poslabels, neglabels, **kwargs):
830 """
831 :Parameters:
832 clf : Classifier
833 classifier to use
834 poslabels : list
835 list of labels which are treated as +1 category
836 neglabels : list
837 list of labels which are treated as -1 category
838 """
839
840 ProxyClassifier.__init__(self, clf, **kwargs)
841
842 self._regressionIsBogus()
843
844
845 sposlabels = Set(poslabels)
846 sneglabels = Set(neglabels)
847
848
849 overlap = sposlabels.intersection(sneglabels)
850 if len(overlap)>0:
851 raise ValueError("Sets of positive and negative labels for " +
852 "BinaryClassifier must not overlap. Got overlap " %
853 overlap)
854
855 self.__poslabels = list(sposlabels)
856 self.__neglabels = list(sneglabels)
857
858
859
860
861
862
863
864
865 if len(self.__poslabels) > 1:
866 self.__predictpos = self.__poslabels
867 else:
868 self.__predictpos = self.__poslabels[0]
869
870 if len(self.__neglabels) > 1:
871 self.__predictneg = self.__neglabels
872 else:
873 self.__predictneg = self.__neglabels[0]
874
875
877 prefix = "poslabels=%s, neglabels=%s" % (
878 repr(self.__poslabels), repr(self.__neglabels))
879 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
880
881
883 """Train `BinaryClassifier`
884 """
885 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
886 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
887
888
889 idlabels.sort()
890
891 orig_labels = None
892
893
894
895
896 if len(idlabels) == dataset.nsamples \
897 and [x[0] for x in idlabels] == range(dataset.nsamples):
898
899
900 datasetselected = dataset
901 orig_labels = dataset.labels
902 if __debug__:
903 debug('CLFBIN',
904 "Assigned all %d samples for binary " %
905 (dataset.nsamples) +
906 " classification among labels %s/+1 and %s/-1" %
907 (self.__poslabels, self.__neglabels))
908 else:
909 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
910 if __debug__:
911 debug('CLFBIN',
912 "Selected %d samples out of %d samples for binary " %
913 (len(idlabels), dataset.nsamples) +
914 " classification among labels %s/+1 and %s/-1" %
915 (self.__poslabels, self.__neglabels) +
916 ". Selected %s" % datasetselected)
917
918
919 datasetselected.labels = [ x[1] for x in idlabels ]
920
921
922 if __debug__:
923 assert((datasetselected.uniquelabels == [-1, 1]).all())
924
925 self.clf.train(datasetselected)
926
927 if not orig_labels is None:
928 dataset.labels = orig_labels
929
931 """Predict the labels for a given `data`
932
933 Predicts using binary classifier and spits out list (for each sample)
934 where with either poslabels or neglabels as the "label" for the sample.
935 If there was just a single label within pos or neg labels then it would
936 return not a list but just that single label.
937 """
938 binary_predictions = ProxyClassifier._predict(self, data)
939 self.values = binary_predictions
940 predictions = [ {-1: self.__predictneg,
941 +1: self.__predictpos}[x] for x in binary_predictions]
942 self.predictions = predictions
943 return predictions
944
945
946
948 """`CombinedClassifier` to perform multiclass using a list of
949 `BinaryClassifier`.
950
951 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which
952 is yet to think about)
953 """
954
955 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
956 """Initialize the instance
957
958 :Parameters:
959 clf : Classifier
960 classifier based on which multiple classifiers are created
961 for multiclass
962 bclf_type
963 "1-vs-1" or "1-vs-all", determines the way to generate binary
964 classifiers
965 """
966 CombinedClassifier.__init__(self, **kwargs)
967 self._regressionIsBogus()
968 if not clf is None:
969 clf._regressionIsBogus()
970
971 self.__clf = clf
972 """Store sample instance of basic classifier"""
973
974
975 if bclf_type == "1-vs-1":
976 pass
977 elif bclf_type == "1-vs-all":
978 raise NotImplementedError
979 else:
980 raise ValueError, \
981 "Unknown type of classifier %s for " % bclf_type + \
982 "BoostedMulticlassClassifier"
983 self.__bclf_type = bclf_type
984
985
986
988 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
989 repr(self.__clf))
990 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
991
992
994 """Train classifier
995 """
996
997 ulabels = dataset.uniquelabels
998 if self.__bclf_type == "1-vs-1":
999
1000 biclfs = []
1001 for i in xrange(len(ulabels)):
1002 for j in xrange(i+1, len(ulabels)):
1003 clf = self.__clf.clone()
1004 biclfs.append(
1005 BinaryClassifier(
1006 clf,
1007 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
1008 if __debug__:
1009 debug("CLFMC", "Created %d binary classifiers for %d labels" %
1010 (len(biclfs), len(ulabels)))
1011
1012 self.clfs = biclfs
1013
1014 elif self.__bclf_type == "1-vs-all":
1015 raise NotImplementedError
1016
1017
1018 CombinedClassifier._train(self, dataset)
1019
1020
1021
1023 """`BoostedClassifier` to work on splits of the data
1024
1025 """
1026
1027 """
1028 TODO: SplitClassifier and MulticlassClassifier have too much in
1029 common -- need to refactor: just need a splitter which would
1030 split dataset in pairs of class labels. MulticlassClassifier
1031 does just a tiny bit more which might be not necessary at
1032 all: map sets of labels into 2 categories...
1033 """
1034
1035
1036
1037 confusion = StateVariable(enabled=False,
1038 doc="Resultant confusion whenever classifier trained " +
1039 "on 1 part and tested on 2nd part of each split")
1040
1041 splits = StateVariable(enabled=False, doc=
1042 """Store the actual splits of the data. Can be memory expensive""")
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1057 """Initialize the instance
1058
1059 :Parameters:
1060 clf : Classifier
1061 classifier based on which multiple classifiers are created
1062 for multiclass
1063 splitter : Splitter
1064 `Splitter` to use to split the dataset prior training
1065 """
1066
1067 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
1068 self.__clf = clf
1069 """Store sample instance of basic classifier"""
1070
1071 if isinstance(splitter, type):
1072 raise ValueError, \
1073 "Please provide an instance of a splitter, not a type." \
1074 " Got %s" % splitter
1075
1076 self.__splitter = splitter
1077
1078
1080 """Train `SplitClassifier`
1081 """
1082
1083 bclfs = []
1084
1085
1086 states = self.states
1087
1088 clf_template = self.__clf
1089 if states.isEnabled('confusion'):
1090 states.confusion = clf_template._summaryClass()
1091 if states.isEnabled('training_confusion'):
1092 clf_template.states.enable(['training_confusion'])
1093 states.training_confusion = clf_template._summaryClass()
1094
1095 clf_hastestdataset = hasattr(clf_template, 'testdataset')
1096
1097
1098
1099 for split in self.__splitter.splitcfg(dataset):
1100 if __debug__:
1101 debug("CLFSPL_",
1102 "Deepcopying %(clf)s for %(sclf)s",
1103 msgargs={'clf':clf_template,
1104 'sclf':self})
1105 clf = clf_template.clone()
1106 bclfs.append(clf)
1107 self.clfs = bclfs
1108
1109 self.splits = []
1110
1111 for i, split in enumerate(self.__splitter(dataset)):
1112 if __debug__:
1113 debug("CLFSPL", "Training classifier for split %d" % (i))
1114
1115 if states.isEnabled("splits"):
1116 self.splits.append(split)
1117
1118 clf = self.clfs[i]
1119
1120
1121 if clf_hastestdataset:
1122 clf.testdataset = split[1]
1123
1124 clf.train(split[0])
1125
1126
1127 if clf_hastestdataset:
1128 clf.testdataset = None
1129
1130 if states.isEnabled("confusion"):
1131 predictions = clf.predict(split[1].samples)
1132 self.confusion.add(split[1].labels, predictions,
1133 clf.states.get('values', None))
1134 if __debug__:
1135 dact = debug.active
1136 if 'CLFSPL_' in dact:
1137 debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion))
1138 elif 'CLFSPL' in dact:
1139 debug('CLFSPL', 'Split %d error %.2f%%'
1140 % (i, self.confusion.summaries[-1].error))
1141
1142 if states.isEnabled("training_confusion"):
1143 states.training_confusion += \
1144 clf.states.training_confusion
1145
1146 try:
1147 if states.isEnabled("confusion"):
1148 states.confusion.labels_map = dataset.labels_map
1149 if states.isEnabled("training_confusion"):
1150 states.training_confusion.labels_map = dataset.labels_map
1151 except:
1152 pass
1153
1154
1155 @group_kwargs(prefixes=['slave_'], passthrough=True)
1168
1169 splitter = property(fget=lambda x:x.__splitter,
1170 doc="Splitter user by SplitClassifier")
1171
1172
1174 """`ProxyClassifier` which uses some mapper prior training/testing.
1175
1176 `MaskMapper` can be used just a subset of features to
1177 train/classify.
1178 Having such classifier we can easily create a set of classifiers
1179 for BoostedClassifier, where each classifier operates on some set
1180 of features, e.g. set of best spheres from SearchLight, set of
1181 ROIs selected elsewhere. It would be different from simply
1182 applying whole mask over the dataset, since here initial decision
1183 is made by each classifier and then later on they vote for the
1184 final decision across the set of classifiers.
1185 """
1186
1187 - def __init__(self, clf, mapper, **kwargs):
1188 """Initialize the instance
1189
1190 :Parameters:
1191 clf : Classifier
1192 classifier based on which mask classifiers is created
1193 mapper
1194 whatever `Mapper` comes handy
1195 """
1196 ProxyClassifier.__init__(self, clf, **kwargs)
1197
1198 self.__mapper = mapper
1199 """mapper to help us our with prepping data to
1200 training/classification"""
1201
1202
1204 """Train `MappedClassifier`
1205 """
1206
1207
1208
1209 self.__mapper.train(dataset)
1210
1211
1212 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
1213 ProxyClassifier._train(self, wdataset)
1214
1215
1220
1221
1222 @group_kwargs(prefixes=['slave_'], passthrough=True)
1229
1230
1231 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1232
1233
1234
1236 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
1237
1238 `FeatureSelection` is used first to select features for the classifier to
1239 use for prediction. Internally it would rely on MappedClassifier which
1240 would use created MaskMapper.
1241
1242 TODO: think about removing overhead of retraining the same classifier if
1243 feature selection was carried out with the same classifier already. It
1244 has been addressed by adding .trained property to classifier, but now
1245 we should expclitely use isTrained here if we want... need to think more
1246 """
1247
1248 _clf_internals = [ 'does_feature_selection', 'meta' ]
1249
1250 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1251 """Initialize the instance
1252
1253 :Parameters:
1254 clf : Classifier
1255 classifier based on which mask classifiers is created
1256 feature_selection : FeatureSelection
1257 whatever `FeatureSelection` comes handy
1258 testdataset : Dataset
1259 optional dataset which would be given on call to feature_selection
1260 """
1261 ProxyClassifier.__init__(self, clf, **kwargs)
1262
1263 self.__maskclf = None
1264 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""
1265
1266 self.__feature_selection = feature_selection
1267 """`FeatureSelection` to select the features prior training"""
1268
1269 self.__testdataset = testdataset
1270 """`FeatureSelection` might like to use testdataset"""
1271
1272
1274 """Untrain `FeatureSelectionClassifier`
1275
1276 Has to untrain any known classifier
1277 """
1278 if self.__feature_selection is not None:
1279 self.__feature_selection.untrain()
1280 if not self.trained:
1281 return
1282 if not self.__maskclf is None:
1283 self.__maskclf.untrain()
1284 super(FeatureSelectionClassifier, self).untrain()
1285
1286
1288 """Train `FeatureSelectionClassifier`
1289 """
1290
1291 self.__feature_selection.states._changeTemporarily(
1292 enable_states=["selected_ids"])
1293
1294 if __debug__:
1295 debug("CLFFS", "Performing feature selection using %s" %
1296 self.__feature_selection + " on %s" % dataset)
1297
1298 (wdataset, tdataset) = self.__feature_selection(dataset,
1299 self.__testdataset)
1300 if __debug__:
1301 add_ = ""
1302 if "CLFFS_" in debug.active:
1303 add_ = " Selected features: %s" % \
1304 self.__feature_selection.selected_ids
1305 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
1306 "%(dsnfeat)d features.%(app)s",
1307 msgargs={'fs':self.__feature_selection,
1308 'nfeat':wdataset.nfeatures,
1309 'dsnfeat':dataset.nfeatures,
1310 'app':add_})
1311
1312
1313
1314 mappermask = N.zeros(dataset.nfeatures)
1315 mappermask[self.__feature_selection.selected_ids] = 1
1316 mapper = MaskMapper(mappermask)
1317
1318 self.__feature_selection.states._resetEnabledTemporarily()
1319
1320
1321 self.__maskclf = MappedClassifier(self.clf, mapper)
1322
1323
1324 self.__maskclf.clf.train(wdataset)
1325
1326
1327
1328
1329
1331 """Return used feature ids for `FeatureSelectionClassifier`
1332
1333 """
1334 return self.__feature_selection.selected_ids
1335
1337 """Predict using `FeatureSelectionClassifier`
1338 """
1339 clf = self.__maskclf
1340 if self.states.isEnabled('values'):
1341 clf.states.enable(['values'])
1342
1343 result = clf._predict(data)
1344
1345 self.states._copy_states_(clf, ['values'], deep=False)
1346 return result
1347
1349 """Set testing dataset to be used for feature selection
1350 """
1351 self.__testdataset = testdataset
1352
1353 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1354 feature_selection = property(lambda x:x.__feature_selection,
1355 doc="Used `FeatureSelection`")
1356
1357 @group_kwargs(prefixes=['slave_'], passthrough=True)
1367
1368
1369
1370 testdataset = property(fget=lambda x:x.__testdataset,
1371 fset=setTestDataset)
1372