
Source Code for Module mvpa.clfs.warehouse

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Collection of classifiers to ease the exploration.
"""

__docformat__ = 'restructuredtext'

from sets import Set
import operator

# Define sets of classifiers
from mvpa.clfs.meta import FeatureSelectionClassifier, SplitClassifier, \
     MulticlassClassifier
from mvpa.clfs.smlr import SMLR
from mvpa.clfs.knn import kNN
from mvpa.clfs.kernel import KernelLinear, KernelSquaredExponential

# Helpers
from mvpa.base import externals, cfg
from mvpa.measures.anova import OneWayAnova
from mvpa.misc.transformers import Absolute
from mvpa.clfs.smlr import SMLRWeights
from mvpa.featsel.helpers import FractionTailSelector, \
    FixedNElementTailSelector, RangeElementSelector

from mvpa.featsel.base import SensitivityBasedFeatureSelection

_KNOWN_INTERNALS = [ 'knn', 'binary', 'svm', 'linear',
        'smlr', 'does_feature_selection', 'has_sensitivity',
        'multiclass', 'non-linear', 'kernel-based', 'lars',
        'regression', 'libsvm', 'sg', 'meta', 'retrainable', 'gpr',
        'notrain2predict', 'ridge', 'blr', 'gnpp', 'enet', 'glmnet']

class Warehouse(object):
    """Class to keep known instantiated classifiers

    Should provide easy ways to select classifiers of the needed kind:
    clfswh['linear', 'svm'] should return all linear SVMs,
    clfswh['linear', 'multiclass'] all linear classifiers capable of
    multiclass classification
    """

    def __init__(self, known_tags=None, matches=None):
        """Initialize warehouse

        :Parameters:
          known_tags : list of basestring
            List of known tags
          matches : dict
            Optional dictionary of additional matches. E.g. since any
            regression can be used as a binary classifier,
            matches={'binary': ['regression']} would cause regressions
            to be returned as well whenever 'binary' is requested
        """
        self._known_tags = Set(known_tags)
        self.__items = []
        self.__keys = Set()
        if matches is None:
            matches = {}
        self.__matches = matches

    def __getitem__(self, *args):
        if isinstance(args[0], tuple):
            args = args[0]

        # explicitly handle [:]
        if args == (slice(None),):
            args = []

        # strip the optional rejection modifier '!'
        dargs = Set([str(x).lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(dargs) > 0:
            raise ValueError, "Unknown internals %s requested. Known are %s" % \
                  (list(dargs), list(self._known_tags))

        # dummy implementation for now
        result = []
        # check every known item
        for item in self.__items:
            good = True
            # by default each one counts
            for arg in args:
                # check for rejection first
                if arg.startswith('!'):
                    if (arg[1:] in item._clf_internals):
                        good = False
                        break
                    else:
                        continue
                # check for inclusion
                found = False
                for arg in [arg] + self.__matches.get(arg, []):
                    if (arg in item._clf_internals):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        if operator.isSequenceType(item):
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '_clf_internals'):
                raise ValueError, "Cannot register %s " % item + \
                      "which has no _clf_internals defined"
            if len(item._clf_internals) == 0:
                raise ValueError, "Cannot register %s " % item + \
                      "which has empty _clf_internals"
            clf_internals = Set(item._clf_internals)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError, 'Unknown clf internal(s) %s' % \
                      clf_internals.difference(self._known_tags)
        return self

    @property
    def internals(self):
        """Known internal tags of the classifiers
        """
        return self.__keys

    def listing(self):
        """Listing (description + internals) of registered items
        """
        return [(x.descr, x._clf_internals) for x in self.__items]

    @property
    def items(self):
        """Registered items
        """
        return self.__items

clfswh = Warehouse(known_tags=_KNOWN_INTERNALS)  # classifiers
regrswh = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions

# NB:
#  - Nu-classifiers are turned off since for the haxby DS the default nu
#    is an infeasible one
#  - Python's SMLR is turned off for the duration of development
#    since it is slow and its results should be the same as the C version's
#
clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
            SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
            #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"),
            #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"),
            #SMLR(implementation="Python", descr="SMLR(Python)")
            ]

clfswh += \
     [ MulticlassClassifier(clfswh['smlr'][0],
                            descr='Pairs+maxvote multiclass on ' + \
                            clfswh['smlr'][0].descr) ]

if externals.exists('libsvm'):
    from mvpa.clfs import libsvmc as libsvm
    clfswh._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    clfswh += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1),
               libsvm.SVM(
                   C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1),
               libsvm.SVM(
                   C=1.0, descr="libsvm.LinSVM(C=1)", probability=1),
               libsvm.SVM(svm_impl='NU_SVC',
                          descr="libsvm.LinNuSVM(nu=def)", probability=1)
               ]
    clfswh += [libsvm.SVM(kernel_type='RBF', descr="libsvm.RbfSVM()"),
               libsvm.SVM(kernel_type='RBF', svm_impl='NU_SVC',
                          descr="libsvm.RbfNuSVM(nu=def)"),
               libsvm.SVM(kernel_type='poly',
                          descr='libsvm.PolySVM()', probability=1),
               #libsvm.svm.SVM(kernel_type='sigmoid',
               #               svm_impl='C_SVC',
               #               descr='libsvm.SigmoidSVM()'),
               ]

    # regressions
    regrswh._known_tags.union_update(['EPSILON_SVR', 'NU_SVR'])
    regrswh += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR',
                           regression=True),
                libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR',
                           regression=True)]

if externals.exists('shogun'):
    from mvpa.clfs import sg
    clfswh._known_tags.union_update(sg.SVM._KNOWN_IMPLEMENTATIONS)

    # some classifiers are not yet ready to be used out-of-the-box in
    # PyMVPA, thus we don't populate the warehouse with their instances
    bad_classifiers = [
        'mpd',       # was segfaulting, now non-training on testcases and XOR;
                     # also was described as "for educational purposes", thus
                     # shouldn't be used for real data ;-)
                     # Should be a drop-in replacement for lightsvm
        'gpbt',      # fails to train for testAnalyzerWithSplitClassifier;
                     # also 'retraining' doesn't work -- fails to generalize
        'gmnp',      # would fail with 'assertion Cache_Size > 2'
                     # if shogun < 0.6.3; also refuses to train
        'svrlight',  # fails to 'generalize' as a binary classifier
                     # after 'binning'
        'krr',       # fails to generalize
        ]
    if not externals.exists('sg_fixedcachesize'):
        # would fail with 'assertion Cache_Size > 2' if shogun < 0.6.3
        bad_classifiers.append('gnpp')

    for impl in sg.SVM._KNOWN_IMPLEMENTATIONS:
        # skip the ones listed above as not yet usable
        if impl in bad_classifiers:
            continue
        clfswh += [
            sg.SVM(
                descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl),
            ]
        clfswh += [
            sg.SVM(kernel_type='RBF',
                   descr="sg.RbfSVM()/%s" % impl, svm_impl=impl),
            #sg.SVM(kernel_type='RBF',
            #       descr="sg.RbfSVM(gamma=0.1)/%s"
            #       % impl, svm_impl=impl, gamma=0.1),
            #sg.SVM(descr="sg.SigmoidSVM()/%s"
            #       % impl, svm_impl=impl, kernel_type="sigmoid"),
            ]

    for impl in ['libsvr', 'krr']: # \
        # XXX svrlight sucks in SG -- don't have time to figure it out
        #+ ([], ['svrlight'])['svrlight' in sg.SVM._KNOWN_IMPLEMENTATIONS]:
        regrswh._known_tags.union_update([impl])
        regrswh += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl,
                            regression=True),
                     #sg.SVM(svm_impl=impl, kernel_type='RBF',
                     #       descr='sg.RBFSVMR()/%s' % impl,
                     #       regression=True),
                     ]

if len(clfswh['svm', 'linear']) > 0:
    # if any SVM implementation is known, import default ones
    from mvpa.clfs.svm import *

# lars from R via RPy
if externals.exists('lars'):
    import mvpa.clfs.lars as lars
    from mvpa.clfs.lars import LARS
    for model in lars.known_models:
        # XXX create proper repository of classifiers!
        lars_clf = LARS(descr="LARS(%s)" % model, model_type=model)
        clfswh += lars_clf

        # is a regression, too
        lars_regr = LARS(descr="_LARS(%s, regression=True)" % model,
                         regression=True, model_type=model)
        regrswh += lars_regr
        # clfswh += MulticlassClassifier(lars,
        #             descr='Multiclass %s' % lars.descr)

## PBS: enet has some weird issue that causes it to fail.  GLMNET is
## better anyway, so just use that instead
## # enet from R via RPy
## if externals.exists('elasticnet'):
##     from mvpa.clfs.enet import ENET
##     clfswh += ENET(descr="ENET()")
##     regrswh += ENET(descr="ENET(regression=True)", regression=True)

# glmnet from R via RPy
if externals.exists('glmnet'):
    from mvpa.clfs.glmnet import GLMNET_C, GLMNET_R
    clfswh += GLMNET_C(descr="GLMNET_C()")
    regrswh += GLMNET_R(descr="GLMNET_R()")

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            SMLRWeights(SMLR(lm=1.0, implementation="C")),
            RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")


# GPR
if externals.exists('scipy'):
    from mvpa.clfs.gpr import GPR

    clfswh += GPR(kernel=KernelLinear(), descr="GPR(kernel='linear')")
    clfswh += GPR(kernel=KernelSquaredExponential(),
                  descr="GPR(kernel='sqexp')")

# BLR
from mvpa.clfs.blr import BLR
clfswh += BLR(descr="BLR()")


# SVM stuff

if len(clfswh['linear', 'svm']) > 0:

    linearSVMC = clfswh['linear', 'svm',
                        cfg.get('svm', 'backend', default='libsvm').lower()
                        ][0]

    # "Interesting" classifiers
    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=0.1, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=0.1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1.0, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            RbfCSVMC(),
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1.0, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="RbfSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
                FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(SVM)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
                linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
                FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")


    ### Imports which are specific to RFEs
    # from mvpa.datasets.splitters import OddEvenSplitter
    # from mvpa.clfs.transerror import TransferError
    # from mvpa.featsel.rfe import RFE
    # from mvpa.featsel.helpers import FixedErrorThresholdStopCrit
    # from mvpa.clfs.transerror import ConfusionBasedError

    # SVM with unbiased RFE -- transfer error to the other splits, or in
    # other terms leave-1-out error on the same dataset.
    # Has to be bound outside of the RFE definition since both analyzer and
    # error should use the same instance.
    rfesvm_split = SplitClassifier(linearSVMC) #clfswh['LinearSVMC'][0])

    # "Almost" classical RFE. If this works it would differ only in that
    # our transfer_error is based on internal splitting, the classifier used
    # within RFE is a split classifier, and its per-split sensitivities get
    # averaged
    #

    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(), #clfswh['LinearSVMC'][0], # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #        transfer_error=ConfusionBasedError(
    #            rfesvm_split,
    #            confusion_state="confusion"),
    #            # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #            0.2, mode='discard', tail='lower'),
    #            # remove 20% of features at each step
    #        update_sensitivity=True),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(splits_avg)' )
    #
    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),                 # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #        transfer_error=ConfusionBasedError(
    #            rfesvm_split,
    #            confusion_state="confusion"),
    #            # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #            0.2, mode='discard', tail='lower'),
    #            # remove 20% of features at each step
    #        update_sensitivity=False),
    #        # sensitivity is computed once and not updated
    #    descr='LinSVM+RFE(splits_avg,static)' )

    rfesvm = LinearCSVMC()

    # This classifier will do RFE while taking transfer error to the testing
    # set of that split. The resulting classifier is a voting classifier
    # on top of all splits; let's see what that would do ;-)
    #clfswh += \
    #  SplitClassifier(                     # which does splitting internally
    #    FeatureSelectionClassifier(
    #      clf = LinearCSVMC(),
    #      feature_selection = RFE(           # on features selected via RFE
    #          sensitivity_analyzer=\
    #              rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #          transfer_error=TransferError(rfesvm),
    #          stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #          feature_selector=FractionTailSelector(
    #              0.2, mode='discard', tail='lower'),
    #              # remove 20% of features at each step
    #          update_sensitivity=True)),
    #          # update sensitivity at each step
    #    descr='LinSVM+RFE(N-Fold)')
    #
    #
    #clfswh += \
    #  SplitClassifier(                     # which does splitting internally
    #    FeatureSelectionClassifier(
    #      clf = LinearCSVMC(),
    #      feature_selection = RFE(           # on features selected via RFE
    #          sensitivity_analyzer=\
    #              rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #          transfer_error=TransferError(rfesvm),
    #          stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #          feature_selector=FractionTailSelector(
    #              0.2, mode='discard', tail='lower'),
    #              # remove 20% of features at each step
    #          update_sensitivity=True)),
    #          # update sensitivity at each step
    #    splitter = OddEvenSplitter(),
    #    descr='LinSVM+RFE(OddEven)')
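

# A minimal exploration sketch (illustration only, not part of the
# original module): inspect what ended up in the warehouses, assuming
# the relevant externals were found at import time.
#
#   from mvpa.clfs.warehouse import clfswh, regrswh
#   print sorted(clfswh.internals)           # all tags seen so far
#   for descr, internals in clfswh.listing():
#       print descr, sorted(internals)
#   print len(regrswh.items), 'regressions registered'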