
Source Code for Module mvpa.clfs.meta

   1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
   2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
   3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   4  # 
   5  #   See COPYING file distributed along with the PyMVPA package for the 
   6  #   copyright and license terms. 
   7  # 
   8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   9  """Classes for meta classifiers -- classifiers which use other classifiers 
  10   
  11  Meta Classifiers can be grouped according to their function as 
  12   
  13  :group BoostedClassifiers: CombinedClassifier MulticlassClassifier 
  14    SplitClassifier 
  15  :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier 
  16    FeatureSelectionClassifier 
  17  :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner 
  18    MaximalVote MeanPrediction 
  19   
  20  """ 
  21   
  22  __docformat__ = 'restructuredtext' 
  23   
  24  import operator 
  25  import numpy as N 
  26   
  27  from sets import Set 
  28   
  29  from mvpa.misc.args import group_kwargs 
  30  from mvpa.mappers.mask import MaskMapper 
  31  from mvpa.datasets.splitters import NFoldSplitter 
  32  from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable 
  33   
  34  from mvpa.clfs.base import Classifier 
  35  from mvpa.misc.transformers import FirstAxisMean 
  36   
  37  from mvpa.measures.base import \ 
  38      BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \ 
  39      MappedClassifierSensitivityAnalyzer, \ 
  40      FeatureSelectionClassifierSensitivityAnalyzer 
  41   
  42  from mvpa.base import warning 
  43   
  44  if __debug__: 
  45      from mvpa.base import debug 
  46   
  47   
  48  class BoostedClassifier(Classifier, Harvestable):
  49      """Classifier containing the farm of other classifiers.
  50 
  51      Should rarely be used directly. Use one of its children instead.
  52      """
  53 
  54      # should not be needed if we have prediction_values upstairs
  55      # raw_predictions should be handled as Harvestable???
  56      raw_predictions = StateVariable(enabled=False,
  57          doc="Predictions obtained from each classifier")
  58 
  59      raw_values = StateVariable(enabled=False,
  60          doc="Values obtained from each classifier")
  61 
  62 
  63      def __init__(self, clfs=None, propagate_states=True,
  64                   harvest_attribs=None, copy_attribs='copy',
  65                   **kwargs):
  66          """Initialize the instance.
  67 
  68          :Parameters:
  69            clfs : list
  70              list of classifier instances to use (slave classifiers)
  71            propagate_states : bool
  72              whether to propagate enabled states into slave classifiers.
  73              It takes effect only when slaves get assigned -- so if a state
  74              is enabled only after construction, it would not necessarily
  75              propagate into the slaves
  76            kwargs : dict
  77              dict of keyword arguments which might get used
  78              by State or Classifier
  79          """
  80          if clfs == None:
  81              clfs = []
  82 
  83          Classifier.__init__(self, **kwargs)
  84          Harvestable.__init__(self, harvest_attribs, copy_attribs)
  85 
  86          self.__clfs = None
  87          """Pylint friendly definition of __clfs"""
  88 
  89          self.__propagate_states = propagate_states
  90          """Enable current enabled states in slave classifiers"""
  91 
  92          self._setClassifiers(clfs)
  93          """Store the list of classifiers"""
94 95
96 - def __repr__(self, prefixes=[]):
97 if self.__clfs is None or len(self.__clfs)==0: 98 #prefix_ = "clfs=%s" % repr(self.__clfs) 99 prefix_ = [] 100 else: 101 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])] 102 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
103 104
105 - def _train(self, dataset):
106 """Train `BoostedClassifier` 107 """ 108 for clf in self.__clfs: 109 clf.train(dataset)
110 111
112 - def _posttrain(self, dataset):
113 """Custom posttrain of `BoostedClassifier` 114 115 Harvest over the trained classifiers if it was asked to so 116 """ 117 Classifier._posttrain(self, dataset) 118 if self.states.isEnabled('harvested'): 119 for clf in self.__clfs: 120 self._harvest(locals()) 121 if self.params.retrainable: 122 self.__changedData_isset = False
123 124
125 - def _getFeatureIds(self):
126 """Custom _getFeatureIds for `BoostedClassifier` 127 """ 128 # return union of all used features by slave classifiers 129 feature_ids = Set([]) 130 for clf in self.__clfs: 131 feature_ids = feature_ids.union(Set(clf.feature_ids)) 132 return list(feature_ids)
133 134
135 - def _predict(self, data):
136 """Predict using `BoostedClassifier` 137 """ 138 raw_predictions = [ clf.predict(data) for clf in self.__clfs ] 139 self.raw_predictions = raw_predictions 140 assert(len(self.__clfs)>0) 141 if self.states.isEnabled("values"): 142 if N.array([x.states.isEnabled("values") 143 for x in self.__clfs]).all(): 144 values = [ clf.values for clf in self.__clfs ] 145 self.raw_values = values 146 else: 147 warning("One or more classifiers in %s has no 'values' state" % 148 self + "enabled, thus BoostedClassifier can't have" + 149 " 'raw_values' state variable defined") 150 151 return raw_predictions
152 153
154 - def _setClassifiers(self, clfs):
155 """Set the classifiers used by the boosted classifier 156 157 We have to allow to set list of classifiers after the object 158 was actually created. It will be used by 159 MulticlassClassifier 160 """ 161 self.__clfs = clfs 162 """Classifiers to use""" 163 164 if len(clfs): 165 for flag in ['regression']: 166 values = N.array([clf.params[flag].value for clf in clfs]) 167 value = values.any() 168 if __debug__: 169 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers " 170 "%(clfs)s with %(values)s", 171 msgargs={'flag' : flag, 'value' : value, 172 'clfs' : clfs, 173 'values' : values}) 174 # set flag if it needs to be trained before predicting 175 self.params[flag].value = value 176 177 # enable corresponding states in the slave-classifiers 178 if self.__propagate_states: 179 for clf in self.__clfs: 180 clf.states.enable(self.states.enabled, missingok=True) 181 182 # adhere to their capabilities + 'multiclass' 183 # XXX do intersection across all classifiers! 184 # TODO: this seems to be wrong since it can be regression etc 185 self._clf_internals = [ 'binary', 'multiclass', 'meta' ] 186 if len(clfs)>0: 187 self._clf_internals += self.__clfs[0]._clf_internals
188
189 - def untrain(self):
190 """Untrain `BoostedClassifier` 191 192 Has to untrain any known classifier 193 """ 194 if not self.trained: 195 return 196 for clf in self.clfs: 197 clf.untrain() 198 super(BoostedClassifier, self).untrain()
199
200 - def getSensitivityAnalyzer(self, **kwargs):
201 """Return an appropriate SensitivityAnalyzer""" 202 return BoostedClassifierSensitivityAnalyzer( 203 self, 204 **kwargs)
205 206 207 clfs = property(fget=lambda x:x.__clfs, 208 fset=_setClassifiers, 209 doc="Used classifiers")
210 211 212
213 -class ProxyClassifier(Classifier):
214 """Classifier which decorates another classifier 215 216 Possible uses: 217 218 - modify data somehow prior training/testing: 219 * normalization 220 * feature selection 221 * modification 222 223 - optimized classifier? 224 225 """ 226
227 - def __init__(self, clf, **kwargs):
228 """Initialize the instance 229 230 :Parameters: 231 clf : Classifier 232 classifier based on which mask classifiers is created 233 """ 234 235 Classifier.__init__(self, regression=clf.regression, **kwargs) 236 237 self.__clf = clf 238 """Store the classifier to use.""" 239 240 # adhere to slave classifier capabilities 241 # TODO: unittest 242 self._clf_internals = self._clf_internals[:] + ['meta'] 243 if clf is not None: 244 self._clf_internals += clf._clf_internals
245 246
247 - def __repr__(self, prefixes=[]):
248 return super(ProxyClassifier, self).__repr__( 249 ["clf=%s" % repr(self.__clf)] + prefixes)
250
251 - def summary(self):
252 s = super(ProxyClassifier, self).summary() 253 if self.trained: 254 s += "\n Slave classifier summary:" + \ 255 '\n + %s' % \ 256 (self.__clf.summary().replace('\n', '\n |')) 257 return s
258 259 260
261 - def _train(self, dataset):
262 """Train `ProxyClassifier` 263 """ 264 # base class does nothing much -- just proxies requests to underlying 265 # classifier 266 self.__clf.train(dataset)
267 268 # for the ease of access 269 # TODO: if to copy we should exclude some states which are defined in 270 # base Classifier (such as training_time, predicting_time) 271 # YOH: for now _copy_states_ would copy only set states variables. If 272 # anything needs to be overriden in the parent's class, it is 273 # welcome to do so 274 #self.states._copy_states_(self.__clf, deep=False) 275 276
277 - def _predict(self, data):
278 """Predict using `ProxyClassifier` 279 """ 280 clf = self.__clf 281 if self.states.isEnabled('values'): 282 clf.states.enable(['values']) 283 284 result = clf.predict(data) 285 # for the ease of access 286 self.states._copy_states_(self.__clf, ['values'], deep=False) 287 return result
288 289
290 - def untrain(self):
291 """Untrain ProxyClassifier 292 """ 293 if not self.__clf is None: 294 self.__clf.untrain() 295 super(ProxyClassifier, self).untrain()
296 297 298 @group_kwargs(prefixes=['slave_'], passthrough=True)
299 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
300 """Return an appropriate SensitivityAnalyzer""" 301 return ProxyClassifierSensitivityAnalyzer( 302 self, 303 analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs), 304 **kwargs)
305 306 307 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
308 309 310 311 # 312 # Various combiners for CombinedClassifier 313 # 314
315 -class PredictionsCombiner(ClassWithCollections):
316 """Base class for combining decisions of multiple classifiers""" 317
318 - def train(self, clfs, dataset):
319 """PredictionsCombiner might need to be trained 320 321 :Parameters: 322 clfs : list of Classifier 323 List of classifiers to combine. Has to be classifiers (not 324 pure predictions), since combiner might use some other 325 state variables (value's) instead of pure prediction's 326 dataset : Dataset 327 training data in this case 328 """ 329 pass
330 331
332 - def __call__(self, clfs, dataset):
333 """Call function 334 335 :Parameters: 336 clfs : list of Classifier 337 List of classifiers to combine. Has to be classifiers (not 338 pure predictions), since combiner might use some other 339 state variables (value's) instead of pure prediction's 340 """ 341 raise NotImplementedError
342 343 344
345 -class MaximalVote(PredictionsCombiner):
346 """Provides a decision using maximal vote rule""" 347 348 predictions = StateVariable(enabled=True, 349 doc="Voted predictions") 350 all_label_counts = StateVariable(enabled=False, 351 doc="Counts across classifiers for each label/sample") 352
353 - def __init__(self):
354 """XXX Might get a parameter to use raw decision values if 355 voting is not unambigous (ie two classes have equal number of 356 votes 357 """ 358 PredictionsCombiner.__init__(self)
359 360
361 - def __call__(self, clfs, dataset):
362 """Actuall callable - perform voting 363 364 Extended functionality which might not be needed actually: 365 Since `BinaryClassifier` might return a list of possible 366 predictions (not just a single one), we should consider all of those 367 368 MaximalVote doesn't care about dataset itself 369 """ 370 if len(clfs)==0: 371 return [] # to don't even bother 372 373 all_label_counts = None 374 for clf in clfs: 375 # Lets check first if necessary state variable is enabled 376 if not clf.states.isEnabled("predictions"): 377 raise ValueError, "MaximalVote needs classifiers (such as " + \ 378 "%s) with state 'predictions' enabled" % clf 379 predictions = clf.predictions 380 if all_label_counts is None: 381 all_label_counts = [ {} for i in xrange(len(predictions)) ] 382 383 # for every sample 384 for i in xrange(len(predictions)): 385 prediction = predictions[i] 386 if not operator.isSequenceType(prediction): 387 prediction = (prediction,) 388 for label in prediction: # for every label 389 # XXX we might have multiple labels assigned 390 # but might not -- don't remember now 391 if not all_label_counts[i].has_key(label): 392 all_label_counts[i][label] = 0 393 all_label_counts[i][label] += 1 394 395 predictions = [] 396 # select maximal vote now for each sample 397 for i in xrange(len(all_label_counts)): 398 label_counts = all_label_counts[i] 399 # lets do explicit search for max so we know 400 # if it is unique 401 maxk = [] # labels of elements with max vote 402 maxv = -1 403 for k, v in label_counts.iteritems(): 404 if v > maxv: 405 maxk = [k] 406 maxv = v 407 elif v == maxv: 408 maxk.append(k) 409 410 assert len(maxk) >= 1, \ 411 "We should have obtained at least a single key of max label" 412 413 if len(maxk) > 1: 414 warning("We got multiple labels %s which have the " % maxk + 415 "same maximal vote %d. XXX disambiguate" % maxv) 416 predictions.append(maxk[0]) 417 418 self.all_label_counts = all_label_counts 419 self.predictions = predictions 420 return predictions
421 422 423
424 -class MeanPrediction(PredictionsCombiner):
425 """Provides a decision by taking mean of the results 426 """ 427 428 predictions = StateVariable(enabled=True, 429 doc="Mean predictions") 430
431 - def __call__(self, clfs, dataset):
432 """Actuall callable - perform meaning 433 434 """ 435 if len(clfs)==0: 436 return [] # to don't even bother 437 438 all_predictions = [] 439 for clf in clfs: 440 # Lets check first if necessary state variable is enabled 441 if not clf.states.isEnabled("predictions"): 442 raise ValueError, "MeanPrediction needs classifiers (such " \ 443 " as %s) with state 'predictions' enabled" % clf 444 all_predictions.append(clf.predictions) 445 446 # compute mean 447 predictions = N.mean(N.asarray(all_predictions), axis=0) 448 self.predictions = predictions 449 return predictions
450 451
452 -class ClassifierCombiner(PredictionsCombiner):
453 """Provides a decision using training a classifier on predictions/values 454 455 TODO: implement 456 """ 457 458 predictions = StateVariable(enabled=True, 459 doc="Trained predictions") 460 461
462 - def __init__(self, clf, variables=None):
463 """Initialize `ClassifierCombiner` 464 465 :Parameters: 466 clf : Classifier 467 Classifier to train on the predictions 468 variables : list of basestring 469 List of state variables stored in 'combined' classifiers, which 470 to use as features for training this classifier 471 """ 472 PredictionsCombiner.__init__(self) 473 474 self.__clf = clf 475 """Classifier to train on `variables` states of provided classifiers""" 476 477 if variables == None: 478 variables = ['predictions'] 479 self.__variables = variables 480 """What state variables of the classifiers to use"""
481 482
483 - def untrain(self):
484 """It might be needed to untrain used classifier""" 485 if self.__clf: 486 self.__clf.untrain()
487
488 - def __call__(self, clfs, dataset):
489 """ 490 """ 491 if len(clfs)==0: 492 return [] # to don't even bother 493 494 raise NotImplementedError
495 496 497
498 -class CombinedClassifier(BoostedClassifier):
499 """`BoostedClassifier` which combines predictions using some 500 `PredictionsCombiner` functor. 501 """ 502
503 - def __init__(self, clfs=None, combiner=None, **kwargs):
504 """Initialize the instance. 505 506 :Parameters: 507 clfs : list of Classifier 508 list of classifier instances to use 509 combiner : PredictionsCombiner 510 callable which takes care about combining multiple 511 results into a single one (e.g. maximal vote for 512 classification, MeanPrediction for regression)) 513 kwargs : dict 514 dict of keyworded arguments which might get used 515 by State or Classifier 516 517 NB: `combiner` might need to operate not on 'predictions' descrete 518 labels but rather on raw 'class' values classifiers 519 estimate (which is pretty much what is stored under 520 `values` 521 """ 522 if clfs == None: 523 clfs = [] 524 525 BoostedClassifier.__init__(self, clfs, **kwargs) 526 527 # assign default combiner 528 if combiner is None: 529 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]() 530 self.__combiner = combiner 531 """Functor destined to combine results of multiple classifiers"""
532 533
534 - def __repr__(self, prefixes=[]):
535 """Literal representation of `CombinedClassifier`. 536 """ 537 return super(CombinedClassifier, self).__repr__( 538 ["combiner=%s" % repr(self.__combiner)] + prefixes)
539 540
541 - def summary(self):
542 """Provide summary for the `CombinedClassifier`. 543 """ 544 s = super(CombinedClassifier, self).summary() 545 if self.trained: 546 s += "\n Slave classifiers summaries:" 547 for i, clf in enumerate(self.clfs): 548 s += '\n + %d clf: %s' % \ 549 (i, clf.summary().replace('\n', '\n |')) 550 return s
551 552
553 - def untrain(self):
554 """Untrain `CombinedClassifier` 555 """ 556 try: 557 self.__combiner.untrain() 558 except: 559 pass 560 super(CombinedClassifier, self).untrain()
561
562 - def _train(self, dataset):
563 """Train `CombinedClassifier` 564 """ 565 BoostedClassifier._train(self, dataset) 566 # combiner might need to train as well 567 self.__combiner.train(self.clfs, dataset)
568 569
570 - def _predict(self, data):
571 """Predict using `CombinedClassifier` 572 """ 573 BoostedClassifier._predict(self, data) 574 # combiner will make use of state variables instead of only predictions 575 # returned from _predict 576 predictions = self.__combiner(self.clfs, data) 577 self.predictions = predictions 578 579 if self.states.isEnabled("values"): 580 if self.__combiner.states.isActive("values"): 581 # XXX or may be we could leave simply up to accessing .combiner? 582 self.values = self.__combiner.values 583 else: 584 if __debug__: 585 warning("Boosted classifier %s has 'values' state enabled," 586 " but combiner doesn't have 'values' active, thus " 587 " .values cannot be provided directly, access .clfs" 588 % self) 589 return predictions
590 591 592 combiner = property(fget=lambda x:x.__combiner, 593 doc="Used combiner to derive a single result")
594 595 596
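A minimal usage sketch for `CombinedClassifier` (hypothetical: two slave
classifier instances `clf1`, `clf2` and a training `dataset` are assumed to
exist; only constructors and methods defined in this module are used)::

    from mvpa.clfs.meta import CombinedClassifier, MaximalVote

    # MaximalVote is the default combiner for classification anyway;
    # it is passed explicitly here only for illustration
    cclf = CombinedClassifier(clfs=[clf1, clf2], combiner=MaximalVote())
    cclf.train(dataset)                  # trains every slave classifier
    predictions = cclf.predict(dataset.samples)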
597 -class TreeClassifier(ProxyClassifier):
598 """`TreeClassifier` which allows to create hierarchy of classifiers 599 600 Functions by grouping some labels into a single "meta-label" and training 601 classifier first to separate between meta-labels. Then 602 each group further proceeds with classification within each group. 603 604 Possible scenarios:: 605 606 TreeClassifier(SVM(), 607 {'animate': ((1,2,3,4), 608 TreeClassifier(SVM(), 609 {'human': (('male', 'female'), SVM()), 610 'animals': (('monkey', 'dog'), SMLR())})), 611 'inanimate': ((5,6,7,8), SMLR())}) 612 613 would create classifier which would first do binary classification 614 to separate animate from inanimate, then for animate result it 615 would separate to classify human vs animal and so on:: 616 617 SVM 618 / \ 619 animate inanimate 620 / \ 621 SVM SMLR 622 / \ / | \ \ 623 human animal 5 6 7 8 624 | | 625 SVM SVM 626 / \ / \ 627 male female monkey dog 628 1 2 3 4 629 630 """ 631 632 _DEV__doc = """ 633 Questions: 634 * how to collect confusion matrices at a particular layer if such 635 classifier is given to SplitClassifier or CVTE 636 637 * What additional states to add, something like 638 clf_labels -- store remapped labels for the dataset 639 clf_values ... 640 641 * What do we store into values ? just values from the clfs[] 642 for corresponding samples, or top level clf values as well? 643 644 * what should be SensitivityAnalyzer? by default it would just 645 use top slave classifier (i.e. animate/inanimate) 646 647 Problems? 648 * .clf is not actually "proxied" per se, so not sure what things 649 should be taken care of yet... 650 651 TODO: 652 * Allow a group to be just a single category, so no further 653 classifier is needed, it just should stay separate from the 654 other groups 655 656 Possible TODO: 657 * Add ability to provide results of clf.values as features into 658 input of clfs[]. This way we could provide additional 'similarity' 659 information to the "other" branch 660 661 """ 662
663 - def __init__(self, clf, groups, **kwargs):
664 """Initialize TreeClassifier 665 666 :Parameters: 667 clf : Classifier 668 Classifier to separate between the groups 669 groups : dict of meta-label: tuple of (tuple of labels, classifier) 670 Defines the groups of labels and their classifiers. 671 See :class:`~mvpa.clfs.meta.TreeClassifier` for example 672 """ 673 674 # Basic initialization 675 ProxyClassifier.__init__(self, clf, **kwargs) 676 self._regressionIsBogus() 677 678 # XXX RF: probably create internal structure with dictionary, 679 # not just a tuple, and store all information in there 680 # accordingly 681 682 self._groups = groups 683 self._index2group = groups.keys() 684 685 # All processing of groups needs to be handled within _train 686 # since labels_map is not available here and definition 687 # is allowed to carry both symbolic and numeric values for 688 # labels 689 690 # We can only assign respective classifiers 691 self.clfs = dict([(gk, c) for gk, (ls, c) in groups.iteritems()]) 692 """Dictionary of classifiers used by the groups"""
693 694
695 - def __repr__(self, prefixes=[]):
696 """String representation of TreeClassifier 697 """ 698 prefix = "groups=%s" % repr(self._groups) 699 return super(TreeClassifier, self).__repr__([prefix] + prefixes)
700 701
702 - def summary(self):
703 """Provide summary for the `TreeClassifier`. 704 """ 705 s = super(TreeClassifier, self).summary() 706 if self.trained: 707 s += "\n Node classifiers summaries:" 708 for i, (clfname, clf) in enumerate(self.clfs.iteritems()): 709 s += '\n + %d %s clf: %s' % \ 710 (i, clfname, clf.summary().replace('\n', '\n |')) 711 return s
712 713
714 - def _train(self, dataset):
715 """Train TreeClassifier 716 717 First train .clf on groupped samples, then train each of .clfs 718 on a corresponding subset of samples. 719 """ 720 # Local bindings 721 clf, clfs, index2group = self.clf, self.clfs, self._index2group 722 723 # Handle groups of labels 724 groups = self._groups 725 labels_map = dataset.labels_map 726 # just for convenience 727 if labels_map is None: labels_map = {} 728 groups_labels = {} # just groups with numeric indexes 729 label2index = {} # how to map old labels to new 730 known = set() 731 for gi, gk in enumerate(index2group): 732 ls = groups[gk][0] 733 # if mapping exists -- map 734 ls_ = [labels_map.get(l, l) for l in ls] 735 known_already = known.intersection(ls_) 736 if len(known_already): 737 raise ValueError, "Grouping of labels is not appropriate. " \ 738 "Got labels %s already among known in %s. " \ 739 "Used labelsmap %s" % (known_already, known, labels_map) 740 groups_labels[gk] = ls_ # needed? XXX 741 for l in ls_: 742 label2index[l] = gi 743 known = known.union(ls_) 744 # TODO: check if different literal labels weren't mapped into 745 # same numerical but here asked to belong to different groups 746 # yoh: actually above should catch it 747 748 # Check if none of the labels is missing from known groups 749 dsul = set(dataset.uniquelabels) 750 if known.intersection(dsul) != dsul: 751 raise ValueError, \ 752 "Dataset %s had some labels not defined in groups: %s. " \ 753 "Known are %s" % \ 754 (dataset, dsul.difference(known), known) 755 756 # We can operate on the same dataset here 757 # Nope: doesn't work nicely with the classifier like kNN 758 # which links to the dataset used in the training, 759 # so whenever if we simply restore labels back, we 760 # would get kNN confused in _predict() 761 # Therefore we need to create a shallow copy of 762 # dataset and provide it with new labels 763 ds_group = dataset.copy(deep=False) 764 # assign new labels group samples into groups of labels 765 ds_group.labels = [label2index[l] for l in dataset.labels] 766 767 # train primary classifier 768 if __debug__: 769 debug('CLFTREE', "Training primary %(clf)s on %(ds)s", 770 msgargs=dict(clf=clf, ds=ds_group)) 771 clf.train(ds_group) 772 773 # ??? should we obtain values for anything? 774 # may be we could training values of .clfs to be added 775 # as features to the next level -- i.e. .clfs 776 777 # Proceed with next 'layer' and train all .clfs on corresponding 778 # selection of samples 779 # ??? should we may be allow additional 'the other' category, to 780 # signal contain all the other categories data? probably not 781 # since then it would lead to undetermined prediction (which 782 # might be not a bad thing altogether...) 783 for gk in groups.iterkeys(): 784 # select samples per each group 785 ids = dataset.idsbylabels(groups_labels[gk]) 786 ds_group = dataset.selectSamples(ids) 787 if __debug__: 788 debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s", 789 msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group)) 790 # and train corresponding slave clf 791 clfs[gk].train(ds_group)
792 793
794 - def untrain(self):
795 """Untrain TreeClassifier 796 """ 797 super(TreeClassifier, self).untrain() 798 for clf in self.clfs.values(): 799 clf.untrain()
800 801
802 - def _predict(self, data):
803 """ 804 """ 805 # Local bindings 806 clfs, index2group = self.clfs, self._index2group 807 clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data)) 808 # assure that predictions are indexes, ie int 809 clf_predictions = clf_predictions.astype(int) 810 811 # now for predictions pointing to specific groups go into 812 # corresponding one 813 predictions = N.array([N.nan]*len(data)) 814 for pred_group in set(clf_predictions): 815 gk = index2group[pred_group] 816 clf_ = clfs[gk] 817 group_indexes = (clf_predictions == pred_group) 818 if __debug__: 819 debug('CLFTREE', 'Predicting for group %s using %s on %d samples' % 820 (gk, clf_, N.sum(group_indexes))) 821 predictions[group_indexes] = clf_.predict(data[group_indexes]) 822 return predictions
823 824
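A minimal sketch of using `TreeClassifier`, following the constructor
signature above (hypothetical: `top_clf`, `clf_low`, `clf_high` are classifier
instances and `dataset` carries numeric labels 1-8)::

    from mvpa.clfs.meta import TreeClassifier

    # first separate 'low' from 'high' labels, then classify within each group
    tclf = TreeClassifier(top_clf,
                          {'low':  ((1, 2, 3, 4), clf_low),
                           'high': ((5, 6, 7, 8), clf_high)})
    tclf.train(dataset)
    predictions = tclf.predict(dataset.samples)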
825 -class BinaryClassifier(ProxyClassifier):
826 """`ProxyClassifier` which maps set of two labels into +1 and -1 827 """ 828
829 - def __init__(self, clf, poslabels, neglabels, **kwargs):
830 """ 831 :Parameters: 832 clf : Classifier 833 classifier to use 834 poslabels : list 835 list of labels which are treated as +1 category 836 neglabels : list 837 list of labels which are treated as -1 category 838 """ 839 840 ProxyClassifier.__init__(self, clf, **kwargs) 841 842 self._regressionIsBogus() 843 844 # Handle labels 845 sposlabels = Set(poslabels) # so to remove duplicates 846 sneglabels = Set(neglabels) # so to remove duplicates 847 848 # check if there is no overlap 849 overlap = sposlabels.intersection(sneglabels) 850 if len(overlap)>0: 851 raise ValueError("Sets of positive and negative labels for " + 852 "BinaryClassifier must not overlap. Got overlap " % 853 overlap) 854 855 self.__poslabels = list(sposlabels) 856 self.__neglabels = list(sneglabels) 857 858 # define what values will be returned by predict: if there is 859 # a single label - return just it alone, otherwise - whole 860 # list 861 # Such approach might come useful if we use some classifiers 862 # over different subsets of data with some voting later on 863 # (1-vs-therest?) 864 865 if len(self.__poslabels) > 1: 866 self.__predictpos = self.__poslabels 867 else: 868 self.__predictpos = self.__poslabels[0] 869 870 if len(self.__neglabels) > 1: 871 self.__predictneg = self.__neglabels 872 else: 873 self.__predictneg = self.__neglabels[0]
874 875
876 - def __repr__(self, prefixes=[]):
877 prefix = "poslabels=%s, neglabels=%s" % ( 878 repr(self.__poslabels), repr(self.__neglabels)) 879 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
880 881
882 - def _train(self, dataset):
883 """Train `BinaryClassifier` 884 """ 885 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \ 886 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)] 887 # XXX we have to sort ids since at the moment Dataset.selectSamples 888 # doesn't take care about order 889 idlabels.sort() 890 # select the samples 891 orig_labels = None 892 893 # If we need all samples, why simply not perform on original 894 # data, an just store/restore labels. But it really should be done 895 # within Dataset.selectSamples 896 if len(idlabels) == dataset.nsamples \ 897 and [x[0] for x in idlabels] == range(dataset.nsamples): 898 # the last condition is not even necessary... just overly 899 # cautious 900 datasetselected = dataset # no selection is needed 901 orig_labels = dataset.labels # but we would need to restore labels 902 if __debug__: 903 debug('CLFBIN', 904 "Assigned all %d samples for binary " % 905 (dataset.nsamples) + 906 " classification among labels %s/+1 and %s/-1" % 907 (self.__poslabels, self.__neglabels)) 908 else: 909 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ]) 910 if __debug__: 911 debug('CLFBIN', 912 "Selected %d samples out of %d samples for binary " % 913 (len(idlabels), dataset.nsamples) + 914 " classification among labels %s/+1 and %s/-1" % 915 (self.__poslabels, self.__neglabels) + 916 ". Selected %s" % datasetselected) 917 918 # adjust the labels 919 datasetselected.labels = [ x[1] for x in idlabels ] 920 921 # now we got a dataset with only 2 labels 922 if __debug__: 923 assert((datasetselected.uniquelabels == [-1, 1]).all()) 924 925 self.clf.train(datasetselected) 926 927 if not orig_labels is None: 928 dataset.labels = orig_labels
929
930 - def _predict(self, data):
931 """Predict the labels for a given `data` 932 933 Predicts using binary classifier and spits out list (for each sample) 934 where with either poslabels or neglabels as the "label" for the sample. 935 If there was just a single label within pos or neg labels then it would 936 return not a list but just that single label. 937 """ 938 binary_predictions = ProxyClassifier._predict(self, data) 939 self.values = binary_predictions 940 predictions = [ {-1: self.__predictneg, 941 +1: self.__predictpos}[x] for x in binary_predictions] 942 self.predictions = predictions 943 return predictions
944 945 946
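A minimal sketch of mapping a multi-label problem onto +1/-1 with
`BinaryClassifier` (hypothetical: `base_clf` and a `dataset` with labels
1-4 are assumed)::

    from mvpa.clfs.meta import BinaryClassifier

    # labels 1 and 2 become the +1 category, labels 3 and 4 the -1 category
    bclf = BinaryClassifier(base_clf, poslabels=[1, 2], neglabels=[3, 4])
    bclf.train(dataset)
    # predictions are reported in the original label space: a single label
    # if only one was given for a category, otherwise the full list
    predictions = bclf.predict(dataset.samples)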
947 -class MulticlassClassifier(CombinedClassifier):
948 """`CombinedClassifier` to perform multiclass using a list of 949 `BinaryClassifier`. 950 951 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which 952 is yet to think about) 953 """ 954
955 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
956 """Initialize the instance 957 958 :Parameters: 959 clf : Classifier 960 classifier based on which multiple classifiers are created 961 for multiclass 962 bclf_type 963 "1-vs-1" or "1-vs-all", determines the way to generate binary 964 classifiers 965 """ 966 CombinedClassifier.__init__(self, **kwargs) 967 self._regressionIsBogus() 968 if not clf is None: 969 clf._regressionIsBogus() 970 971 self.__clf = clf 972 """Store sample instance of basic classifier""" 973 974 # Some checks on known ways to do multiclass 975 if bclf_type == "1-vs-1": 976 pass 977 elif bclf_type == "1-vs-all": # TODO 978 raise NotImplementedError 979 else: 980 raise ValueError, \ 981 "Unknown type of classifier %s for " % bclf_type + \ 982 "BoostedMulticlassClassifier" 983 self.__bclf_type = bclf_type
984 985 # XXX fix it up a bit... it seems that MulticlassClassifier should 986 # be actually ProxyClassifier and use BoostedClassifier internally
987 - def __repr__(self, prefixes=[]):
988 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type), 989 repr(self.__clf)) 990 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
991 992
993 - def _train(self, dataset):
994 """Train classifier 995 """ 996 # construct binary classifiers 997 ulabels = dataset.uniquelabels 998 if self.__bclf_type == "1-vs-1": 999 # generate pairs and corresponding classifiers 1000 biclfs = [] 1001 for i in xrange(len(ulabels)): 1002 for j in xrange(i+1, len(ulabels)): 1003 clf = self.__clf.clone() 1004 biclfs.append( 1005 BinaryClassifier( 1006 clf, 1007 poslabels=[ulabels[i]], neglabels=[ulabels[j]])) 1008 if __debug__: 1009 debug("CLFMC", "Created %d binary classifiers for %d labels" % 1010 (len(biclfs), len(ulabels))) 1011 1012 self.clfs = biclfs 1013 1014 elif self.__bclf_type == "1-vs-all": 1015 raise NotImplementedError 1016 1017 # perform actual training 1018 CombinedClassifier._train(self, dataset)
1019 1020 1021
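A minimal sketch for `MulticlassClassifier` (hypothetical `base_clf` and
`dataset`; one `BinaryClassifier` per label pair is cloned from `base_clf`
at training time, since "1-vs-1" is the only scheme implemented so far)::

    from mvpa.clfs.meta import MulticlassClassifier

    mclf = MulticlassClassifier(base_clf)    # bclf_type="1-vs-1" by default
    mclf.train(dataset)                      # dataset may carry >2 labels
    predictions = mclf.predict(dataset.samples)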
1022 -class SplitClassifier(CombinedClassifier):
1023 """`BoostedClassifier` to work on splits of the data 1024 1025 """ 1026 1027 """ 1028 TODO: SplitClassifier and MulticlassClassifier have too much in 1029 common -- need to refactor: just need a splitter which would 1030 split dataset in pairs of class labels. MulticlassClassifier 1031 does just a tiny bit more which might be not necessary at 1032 all: map sets of labels into 2 categories... 1033 """ 1034 1035 # TODO: unify with CrossValidatedTransferError which now uses 1036 # harvest_attribs to expose gathered attributes 1037 confusion = StateVariable(enabled=False, 1038 doc="Resultant confusion whenever classifier trained " + 1039 "on 1 part and tested on 2nd part of each split") 1040 1041 splits = StateVariable(enabled=False, doc= 1042 """Store the actual splits of the data. Can be memory expensive""") 1043 1044 # ??? couldn't be training_confusion since it has other meaning 1045 # here, BUT it is named so within CrossValidatedTransferError 1046 # -- unify 1047 # decided to go with overriding semantics tiny bit. For split 1048 # classifier training_confusion would correspond to summary 1049 # over training errors across all splits. Later on if need comes 1050 # we might want to implement global_training_confusion which would 1051 # correspond to overall confusion on full training dataset as it is 1052 # done in base Classifier 1053 #global_training_confusion = StateVariable(enabled=False, 1054 # doc="Summary over training confusions acquired at each split") 1055
1056 - def __init__(self, clf, splitter=NFoldSplitter(cvtype=1), **kwargs):
1057 """Initialize the instance 1058 1059 :Parameters: 1060 clf : Classifier 1061 classifier based on which multiple classifiers are created 1062 for multiclass 1063 splitter : Splitter 1064 `Splitter` to use to split the dataset prior training 1065 """ 1066 1067 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs) 1068 self.__clf = clf 1069 """Store sample instance of basic classifier""" 1070 1071 if isinstance(splitter, type): 1072 raise ValueError, \ 1073 "Please provide an instance of a splitter, not a type." \ 1074 " Got %s" % splitter 1075 1076 self.__splitter = splitter
1077 1078
1079 - def _train(self, dataset):
1080 """Train `SplitClassifier` 1081 """ 1082 # generate pairs and corresponding classifiers 1083 bclfs = [] 1084 1085 # local binding 1086 states = self.states 1087 1088 clf_template = self.__clf 1089 if states.isEnabled('confusion'): 1090 states.confusion = clf_template._summaryClass() 1091 if states.isEnabled('training_confusion'): 1092 clf_template.states.enable(['training_confusion']) 1093 states.training_confusion = clf_template._summaryClass() 1094 1095 clf_hastestdataset = hasattr(clf_template, 'testdataset') 1096 1097 # for proper and easier debugging - first define classifiers and then 1098 # train them 1099 for split in self.__splitter.splitcfg(dataset): 1100 if __debug__: 1101 debug("CLFSPL_", 1102 "Deepcopying %(clf)s for %(sclf)s", 1103 msgargs={'clf':clf_template, 1104 'sclf':self}) 1105 clf = clf_template.clone() 1106 bclfs.append(clf) 1107 self.clfs = bclfs 1108 1109 self.splits = [] 1110 1111 for i, split in enumerate(self.__splitter(dataset)): 1112 if __debug__: 1113 debug("CLFSPL", "Training classifier for split %d" % (i)) 1114 1115 if states.isEnabled("splits"): 1116 self.splits.append(split) 1117 1118 clf = self.clfs[i] 1119 1120 # assign testing dataset if given classifier can digest it 1121 if clf_hastestdataset: 1122 clf.testdataset = split[1] 1123 1124 clf.train(split[0]) 1125 1126 # unbind the testdataset from the classifier 1127 if clf_hastestdataset: 1128 clf.testdataset = None 1129 1130 if states.isEnabled("confusion"): 1131 predictions = clf.predict(split[1].samples) 1132 self.confusion.add(split[1].labels, predictions, 1133 clf.states.get('values', None)) 1134 if __debug__: 1135 dact = debug.active 1136 if 'CLFSPL_' in dact: 1137 debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion)) 1138 elif 'CLFSPL' in dact: 1139 debug('CLFSPL', 'Split %d error %.2f%%' 1140 % (i, self.confusion.summaries[-1].error)) 1141 1142 if states.isEnabled("training_confusion"): 1143 states.training_confusion += \ 1144 clf.states.training_confusion 1145 # hackish way -- so it should work only for ConfusionMatrix??? 1146 try: 1147 if states.isEnabled("confusion"): 1148 states.confusion.labels_map = dataset.labels_map 1149 if states.isEnabled("training_confusion"): 1150 states.training_confusion.labels_map = dataset.labels_map 1151 except: 1152 pass
1153 1154 1155 @group_kwargs(prefixes=['slave_'], passthrough=True)
1156 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1157 """Return an appropriate SensitivityAnalyzer for `SplitClassifier` 1158 1159 :Parameters: 1160 combiner 1161 If not provided, FirstAxisMean is assumed 1162 """ 1163 kwargs.setdefault('combiner', FirstAxisMean) 1164 return BoostedClassifierSensitivityAnalyzer( 1165 self, 1166 analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs), 1167 **kwargs)
 1168 
 1169      splitter = property(fget=lambda x:x.__splitter,
 1170                          doc="Splitter used by SplitClassifier")
1171 1172
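A minimal sketch for `SplitClassifier` (hypothetical `base_clf` and
`dataset`; one clone of `base_clf` is trained per split produced by the
splitter)::

    from mvpa.clfs.meta import SplitClassifier
    from mvpa.datasets.splitters import NFoldSplitter

    sclf = SplitClassifier(base_clf, splitter=NFoldSplitter(cvtype=1))
    sclf.states.enable(['confusion'])   # collect per-split testing confusion
    sclf.train(dataset)
    print sclf.confusion                # cross-validation style summary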
1173 -class MappedClassifier(ProxyClassifier):
1174 """`ProxyClassifier` which uses some mapper prior training/testing. 1175 1176 `MaskMapper` can be used just a subset of features to 1177 train/classify. 1178 Having such classifier we can easily create a set of classifiers 1179 for BoostedClassifier, where each classifier operates on some set 1180 of features, e.g. set of best spheres from SearchLight, set of 1181 ROIs selected elsewhere. It would be different from simply 1182 applying whole mask over the dataset, since here initial decision 1183 is made by each classifier and then later on they vote for the 1184 final decision across the set of classifiers. 1185 """ 1186
1187 - def __init__(self, clf, mapper, **kwargs):
1188 """Initialize the instance 1189 1190 :Parameters: 1191 clf : Classifier 1192 classifier based on which mask classifiers is created 1193 mapper 1194 whatever `Mapper` comes handy 1195 """ 1196 ProxyClassifier.__init__(self, clf, **kwargs) 1197 1198 self.__mapper = mapper 1199 """mapper to help us our with prepping data to 1200 training/classification"""
1201 1202
1203 - def _train(self, dataset):
1204 """Train `MappedClassifier` 1205 """ 1206 # first train the mapper 1207 # XXX: should training be done using whole dataset or just samples 1208 # YYY: in some cases labels might be needed, thus better full dataset 1209 self.__mapper.train(dataset) 1210 1211 # for train() we have to provide dataset -- not just samples to train! 1212 wdataset = dataset.applyMapper(featuresmapper = self.__mapper) 1213 ProxyClassifier._train(self, wdataset)
1214 1215
1216 - def _predict(self, data):
1217 """Predict using `MappedClassifier` 1218 """ 1219 return ProxyClassifier._predict(self, self.__mapper.forward(data))
1220 1221 1222 @group_kwargs(prefixes=['slave_'], passthrough=True)
1223 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1224 """Return an appropriate SensitivityAnalyzer""" 1225 return MappedClassifierSensitivityAnalyzer( 1226 self, 1227 analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs), 1228 **kwargs)
1229 1230 1231 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1232 1233 1234
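A minimal sketch for `MappedClassifier` with a `MaskMapper` (hypothetical
`base_clf` and `dataset`; the mask construction mirrors the one used in
`FeatureSelectionClassifier._train` below)::

    import numpy as N
    from mvpa.mappers.mask import MaskMapper
    from mvpa.clfs.meta import MappedClassifier

    # restrict the classifier to the first 10 features of the dataset
    mask = N.zeros(dataset.nfeatures)
    mask[:10] = 1
    mclf = MappedClassifier(base_clf, MaskMapper(mask))
    mclf.train(dataset)          # mapper is trained and applied internally
    predictions = mclf.predict(dataset.samples)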
1235 -class FeatureSelectionClassifier(ProxyClassifier):
1236 """`ProxyClassifier` which uses some `FeatureSelection` prior training. 1237 1238 `FeatureSelection` is used first to select features for the classifier to 1239 use for prediction. Internally it would rely on MappedClassifier which 1240 would use created MaskMapper. 1241 1242 TODO: think about removing overhead of retraining the same classifier if 1243 feature selection was carried out with the same classifier already. It 1244 has been addressed by adding .trained property to classifier, but now 1245 we should expclitely use isTrained here if we want... need to think more 1246 """ 1247 1248 _clf_internals = [ 'does_feature_selection', 'meta' ] 1249
1250 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1251 """Initialize the instance 1252 1253 :Parameters: 1254 clf : Classifier 1255 classifier based on which mask classifiers is created 1256 feature_selection : FeatureSelection 1257 whatever `FeatureSelection` comes handy 1258 testdataset : Dataset 1259 optional dataset which would be given on call to feature_selection 1260 """ 1261 ProxyClassifier.__init__(self, clf, **kwargs) 1262 1263 self.__maskclf = None 1264 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on.""" 1265 1266 self.__feature_selection = feature_selection 1267 """`FeatureSelection` to select the features prior training""" 1268 1269 self.__testdataset = testdataset 1270 """`FeatureSelection` might like to use testdataset"""
1271 1272
1273 - def untrain(self):
1274 """Untrain `FeatureSelectionClassifier` 1275 1276 Has to untrain any known classifier 1277 """ 1278 if self.__feature_selection is not None: 1279 self.__feature_selection.untrain() 1280 if not self.trained: 1281 return 1282 if not self.__maskclf is None: 1283 self.__maskclf.untrain() 1284 super(FeatureSelectionClassifier, self).untrain()
1285 1286
1287 - def _train(self, dataset):
1288 """Train `FeatureSelectionClassifier` 1289 """ 1290 # temporarily enable selected_ids 1291 self.__feature_selection.states._changeTemporarily( 1292 enable_states=["selected_ids"]) 1293 1294 if __debug__: 1295 debug("CLFFS", "Performing feature selection using %s" % 1296 self.__feature_selection + " on %s" % dataset) 1297 1298 (wdataset, tdataset) = self.__feature_selection(dataset, 1299 self.__testdataset) 1300 if __debug__: 1301 add_ = "" 1302 if "CLFFS_" in debug.active: 1303 add_ = " Selected features: %s" % \ 1304 self.__feature_selection.selected_ids 1305 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " + 1306 "%(dsnfeat)d features.%(app)s", 1307 msgargs={'fs':self.__feature_selection, 1308 'nfeat':wdataset.nfeatures, 1309 'dsnfeat':dataset.nfeatures, 1310 'app':add_}) 1311 1312 # create a mask to devise a mapper 1313 # TODO -- think about making selected_ids a MaskMapper 1314 mappermask = N.zeros(dataset.nfeatures) 1315 mappermask[self.__feature_selection.selected_ids] = 1 1316 mapper = MaskMapper(mappermask) 1317 1318 self.__feature_selection.states._resetEnabledTemporarily() 1319 1320 # create and assign `MappedClassifier` 1321 self.__maskclf = MappedClassifier(self.clf, mapper) 1322 # we could have called self.__clf.train(dataset), but it would 1323 # cause unnecessary masking 1324 self.__maskclf.clf.train(wdataset)
1325 1326 # for the ease of access 1327 # TODO see for ProxyClassifier 1328 #self.states._copy_states_(self.__maskclf, deep=False) 1329
1330 - def _getFeatureIds(self):
1331 """Return used feature ids for `FeatureSelectionClassifier` 1332 1333 """ 1334 return self.__feature_selection.selected_ids
1335
1336 - def _predict(self, data):
1337 """Predict using `FeatureSelectionClassifier` 1338 """ 1339 clf = self.__maskclf 1340 if self.states.isEnabled('values'): 1341 clf.states.enable(['values']) 1342 1343 result = clf._predict(data) 1344 # for the ease of access 1345 self.states._copy_states_(clf, ['values'], deep=False) 1346 return result
1347
1348 - def setTestDataset(self, testdataset):
1349 """Set testing dataset to be used for feature selection 1350 """ 1351 self.__testdataset = testdataset
1352 1353 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`") 1354 feature_selection = property(lambda x:x.__feature_selection, 1355 doc="Used `FeatureSelection`") 1356 1357 @group_kwargs(prefixes=['slave_'], passthrough=True)
1358 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1359 """Return an appropriate SensitivityAnalyzer 1360 1361 had to clone from mapped classifier??? 1362 """ 1363 return FeatureSelectionClassifierSensitivityAnalyzer( 1364 self, 1365 analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs), 1366 **kwargs)
1367 1368 1369 1370 testdataset = property(fget=lambda x:x.__testdataset, 1371 fset=setTestDataset)
1372
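A minimal sketch for `FeatureSelectionClassifier` (hypothetical `base_clf`,
`dataset` and `selector` -- any `FeatureSelection` instance would do)::

    from mvpa.clfs.meta import FeatureSelectionClassifier

    fsclf = FeatureSelectionClassifier(base_clf, selector)
    fsclf.train(dataset)        # feature selection happens inside _train()
    predictions = fsclf.predict(dataset.samples)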