
Source Code for Module mvpa.clfs.sg.svm

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Wrap the libsvm package into a very simple class interface.""" 

__docformat__ = 'restructuredtext'


_DEV__doc__ = """

TODOs:
 * dual-license under GPL for use of SG?
 * for recent versions add ability to specify/parametrize normalization
   scheme for the kernel, and reuse 'scale' now for the normalizer
 * Add support for simplified linear classifiers (which do not require
   storing all training SVs/samples to make classification in predict())
"""

import numpy as N


# Rely on SG
from mvpa.base import externals
if externals.exists('shogun', raiseException=True):
    import shogun.Features
    import shogun.Classifier
    import shogun.Regression
    import shogun.Kernel
    import shogun.Library

import operator

from mvpa.misc.param import Parameter
from mvpa.base import warning

from mvpa.clfs.meta import MulticlassClassifier
from mvpa.clfs._svmbase import _SVM
from mvpa.misc.state import StateVariable
from mvpa.measures.base import Sensitivity

from sens import *

if __debug__:
    from mvpa.base import debug


def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, `obj`, slevel))
    obj.io.set_loglevel(sglevel)
    try:
        exec "obj.io.%s_progress()" % progressfunc
    except:
        warning("Shogun version installed has no way to enable progress" +
                " reports")

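# Usage sketch (illustrative only; assumes the 'SG_' debug target is
# registered with mvpa.base.debug, as the debug() calls in this module
# suggest): activating the target before shogun objects are created raises
# their log level from M_ERROR to M_DEBUG.
#
#     from mvpa.base import debug
#     debug.active += ['SG_']    # hypothetical activation; see mvpa.base
#     # any object passed through _setdebug() afterwards logs at M_DEBUG
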
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """

    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features

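# A minimal sketch of what _tosg() does with the usual (samples x features)
# layout -- shogun stores one sample per *column*, hence the transpose above:
#
#     data = N.array([[1., 2., 3.],
#                     [4., 5., 6.]])    # 2 samples, 3 features
#     feats = _tosg(data)               # RealFeatures built from data.T
#     # feats.get_num_vectors() should equal 2 (one vector per sample,
#     # assuming shogun's usual Features API)
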
class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            doc='Number of threads to utilize')

    # NOTE: gamma is width in SG notation for RBF(Gaussian)
    _KERNELS = {}
    if externals.exists('shogun', raiseException=True):
        _KERNELS = { "linear":   (shogun.Kernel.LinearKernel,
                                  ('scale',), LinearSVMWeights),
                     "rbf":      (shogun.Kernel.GaussianKernel,
                                  ('gamma',), None),
                     "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                                  ('gamma', 'max_shift', 'shift_step'), None),
                     "sigmoid":  (shogun.Kernel.SigmoidKernel,
                                  ('cache_size', 'gamma', 'coef0'), None),
                     }

    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    if externals.exists('sg ge 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

    # Some words of wisdom from the shogun author:
    # XXX remove after proper comments added to implementations
    """
    If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    with standard additive bias, but will L2-regularize it -- though it
    should not matter much in practice (although it will give slightly
    different solutions)). Note that SGD has no stopping criterion (you
    simply have to specify the number of iterations) and that OCAS has a
    different stopping condition than svmlight, for example, which may be
    tighter or looser depending on the problem -- I suggest 1e-2 or 1e-3
    for epsilon.

    If you would like to train kernel SVMs use libsvm/gpdt/svmlight --
    depending on the problem one is faster than the other (hard to say when;
    I *think* when your dataset is very unbalanced chunking methods like
    svmlight/gpdt are better), for smaller problems definitely libsvm.

    If you use string kernels then gpdt/svmlight have a special 'linadd'
    speedup for this (requires sg 0.6.2 -- there was some inefficiency in the
    code for python-modular before that). This is effective for big datasets
    (I trained on 10 million strings based on this).

    And yes, currently we have only implemented parallel training for
    svmlight; however all SVMs can be evaluated in parallel.
    """
    _KNOWN_IMPLEMENTATIONS = {}
    if externals.exists('shogun', raiseException=True):
        _KNOWN_IMPLEMENTATIONS = {
            "libsvm" : (shogun.Classifier.LibSVM, ('C',),
                        ('multiclass', 'binary'),
                        "LIBSVM's C-SVM (L2 soft-margin SVM)"),
            "gmnp" : (shogun.Classifier.GMNPSVM, ('C',),
                      ('multiclass', 'binary'),
                      "Generalized Nearest Point Problem SVM"),
            # XXX should have been GPDT; shogun has it fixed since some version
            "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                      "Gradient Projection Decomposition Technique for "
                      "large-scale SVM problems"),
            "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                      "Generalized Nearest Point Problem SVM"),

            ## TODO: Needs sparse features...
            # "svmlin" : (shogun.Classifier.SVMLin, ''),
            # "liblinear" : (shogun.Classifier.LibLinear, ''),
            # "subgradient" : (shogun.Classifier.SubGradientSVM, ''),
            ## good 2-class linear SVMs
            # "ocas" : (shogun.Classifier.SVMOcas, ''),
            # "sgd" : (shogun.Classifier.SVMSGD, ''),

            # regressions
            "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                       ('regression',),
                       "LIBSVM's epsilon-SVR"),
            "krr": (shogun.Regression.KRR, ('tau',), ('regression',),
                    "Kernel Ridge Regression"),
            }

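    # Illustration (a sketch, not exercised by the class itself): _KERNELS
    # maps a kernel_type literal to (SG kernel class, kernel parameter names,
    # optional sensitivity analyzer), so e.g.
    #
    #     SVM(kernel_type='rbf', svm_impl='libsvm')
    #
    # constructs shogun.Kernel.GaussianKernel in _train(), feeding it the
    # 'gamma' kernel parameter (auto-derived in _train() when left at 0.0).
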
    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """Interface class to Shogun's classifiers and regressions.

        Default implementation is 'libsvm'.
        """

        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        # init base class
        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        # Need to store original data...
        # TODO: keep 1 of them -- just __traindata or __traindataset
        # For now it is needed for computing sensitivities
        self.__traindataset = None

        # internal SG swig proxies
        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None

    def __condition_kernel(self, kernel):
        # XXX I thought that it is needed only for retrainable classifier,
        # but then krr gets confused, and svrlight needs it to provide
        # meaningful results even without 'retraining'
        if self._svm_impl in ['svrlight', 'lightsvm']:
            kernel.set_precompute_matrix(True, True)

    def _train(self, dataset):
        """Train SVM
        """
        # XXX watchout
        # self.untrain()
        newkernel, newsvm = False, False
        # local bindings for faster lookup
        retrainable = self.params.retrainable

        if retrainable:
            _changedData = self._changedData

        # LABELS
        ul = None
        self.__traindataset = dataset


        # OK -- we have to map labels since
        #   binary ones expect -1/+1
        #   Multiclass expect labels starting with 0, otherwise they puke
        #   when run from ipython... yikes
        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            ul = dataset.uniquelabels
            ul.sort()

            if len(ul) == 2:
                # assure that we have -1/+1
                _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
            elif len(ul) < 2:
                raise ValueError, \
                      "we do not have 1-class SVM brought into SG yet"
            else:
                # can't use plain enumerate since we need them swapped
                _labels_dict = dict([(ul[i], i) for i in range(len(ul))])

            # reverse labels dict for back mapping in _predict
            _labels_dict_rev = dict([(x[1], x[0])
                                     for x in _labels_dict.items()])

            # bind to instance as well
            self._labels_dict = _labels_dict
            self._labels_dict_rev = _labels_dict_rev

            # Map labels
            #
            # TODO: top level classifier should take care about labels
            # mapping if that is needed
            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([_labels_dict[x] for x in dataset.labels],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')


        # KERNEL
        if not retrainable or _changedData['traindata'] \
               or _changedData['kernel_params']:
            # If needed compute or just collect arguments for SVM and for
            # the kernel
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # XXX Unify damn automagic gamma value
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-Creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-Creating kernel since params %s have changed" %
                          _changedData['kernel_params'])

            # create training data
            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG",
                      "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = kernel = \
                            self._kernel_type(self.__traindata,
                                              self.__traindata,
                                              *kargs)

            if externals.exists('sg ge 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()  # mark them as not-changed
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                self.__kernel_args = kargs

        # TODO -- handle _changedData['params'] correctly, ie without
        #         recreating the whole SVM
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:
            # SVM
            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    # we were not given a tuple for balancing between classes
                    C = [C]

                Cs = list(C[:])             # copy
                for i in xrange(len(Cs)):
                    if Cs[i] < 0:
                        Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
                        if __debug__:
                            debug("SG_",
                                  "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

            # XXX do not jump over the head and leave it up to the user
            #     ie do not rescale automagically by the number of samples
            #if len(Cs) == 2 and not ('regression' in self._clf_internals) and len(ul) == 2:
            #    # we were given two Cs
            #    if N.max(C) < 0 and N.min(C) < 0:
            #        # and both are requested to be 'scaled' TODO :
            #        # provide proper 'features' to the parameters,
            #        # so we could specify explicitely if to scale
            #        # them by the number of samples here
            #        nl = [N.sum(labels_ == _labels_dict[l]) for l in ul]
            #        ratio = N.sqrt(float(nl[1]) / nl[0])
            #        #ratio = (float(nl[1]) / nl[0])
            #        Cs[0] *= ratio
            #        Cs[1] /= ratio
            #        if __debug__:
            #            debug("SG_", "Rescaled Cs to %s to accomodate the "
            #                  "difference in number of training samples" %
            #                  Cs)

            # Choose appropriate implementation
            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self._svm_impl in ['libsvr', 'svrlight']:
                # for regressions the constructor is a bit different
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau,
                                            self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_",
                          "Since multiple Cs are provided: %s, assign them" %
                          Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()  # mark them as not-changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')
            # Set optimization parameters
            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:          # labels were changed
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:                       # kernel was replaced
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)  # we should never get here

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.retrained = not newsvm or not newkernel

        # Train
        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        # Report on training
        if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        # Assign training confusion right away here since we are ready
        # to do so.
        # XXX TODO use some other state variable like 'trained_labels' and
        #     use it within base Classifier._posttrain to assign predictions
        #     instead of duplicating code here
        # XXX For now it can be done only for regressions since labels need
        #     to be remapped and that becomes even worse if we use regression
        #     as a classifier so mapping happens upstairs
        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)

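    # Worked example of the label mapping performed in _train() above
    # (hypothetical label values): for dataset.labels drawn from [1, 5],
    #     _labels_dict     == {1: -1.0, 5: +1.0}
    #     _labels_dict_rev == {-1.0: 1, +1.0: 5}
    # so shogun is trained on -1/+1 targets and _predict() can translate
    # decision signs back into the caller's original label values.
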
    def _predict(self, data):
        """Predict values for the data
        """

        retrainable = self.params.retrainable

        if retrainable:
            changed_testdata = self._changedData['testdata'] or \
                               self.__kernel_test is None

        if not retrainable or changed_testdata:
            testdata = _tosg(data)

        if not retrainable:
            if __debug__:
                debug("SG__",
                      "Initializing SVMs kernel of %s with "
                      "training/testing samples" % self)
            # We can just reuse kernel used for training
            self.__kernel.init(self.__traindata, testdata)
            self.__condition_kernel(self.__kernel)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                custk_args = ([self.__traindata, testdata], [])[
                    int(externals.exists('sg ge 0.6.4'))]
                if __debug__:
                    debug("SG__",
                          "Re-creating custom testing kernel giving "
                          "arguments %s" % (str(custk_args)))
                kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

                _setdebug(kernel_test_custom, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        # doesn't do any good imho although on unittests helps tiny bit... hm
        #self.__svm.init_kernel_optimization()
        values_ = self.__svm.classify()
        if values_ is None:
            raise RuntimeError, "We got an empty list of values from %s" % self

        values = values_.get_labels()

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.repredicted = repredicted = not changed_testdata
            if __debug__:
                debug("SG__",
                      "Re-assigning learning kernel. Repredicted is %s"
                      % repredicted)
            # return back original kernel
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:
            # local bindings
            _labels_dict = self._labels_dict
            _labels_dict_rev = self._labels_dict_rev

            if len(_labels_dict) == 2:
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            # assure that we have the same type
            label_type = type(_labels_dict.values()[0])

            # remap labels back adjusting their type
            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        # store state variable
        # TODO: extract values properly for multiclass SVMs --
        #   ie 1 value per label or pairs for all 1-vs-1 classifications
        self.values = values

        ## to avoid leaks with not yet properly fixed shogun
        if not retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions

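    # Sketch of the binary back-mapping in _predict() above: for raw SVM
    # outputs values == [0.7, -0.2], N.signbit(values) == [False, True], so
    # 1.0 - 2*N.signbit(values) == [1.0, -1.0], which _labels_dict_rev then
    # turns back into the two original dataset labels.
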
    def untrain(self):
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf': self})

            # to avoid leaks with not yet properly fixed shogun
            # XXX make it nice... now it is just stable ;-)
            if True:                # not self.__traindata is None:
                if True:
                # try:
                    if self.__kernel is not None:
                        del self.__kernel
                        self.__kernel = None

                    if self.__kernel_test is not None:
                        del self.__kernel_test
                        self.__kernel_test = None

                    if self.__svm is not None:
                        del self.__svm
                        self.__svm = None

                    if self.__traindata is not None:
                        # Left in for easy demonstration of the memory leak
                        # in shogun
                        #for i in xrange(10):
                        #    debug("SG__", "cachesize pre free features %s" %
                        #          (self.__svm.get_kernel().get_cache_size()))
                        self.__traindata.free_features()
                        del self.__traindata
                        self.__traindata = None

                    self.__traindataset = None


                #except:
                #    pass

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())

    def __get_implementation(self, ul):
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError, \
                      "Shogun: Implementation %s doesn't handle multiclass " \
                      "data. Got labels %s. Use some other classifier" % \
                      (self._svm_impl, self.__traindataset.uniquelabels)
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class

    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become a state variable I guess"""


# Conditionally make some of the implementations available if they are
# present in the installed shogun
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight",
          "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        exec "SVM._KNOWN_IMPLEMENTATIONS[\"%s\"] = (%s, %s, \"%s\")" % \
             (name, item, params, descr)

# Register SVM as the only legal classifier class for LinearSVMWeights
LinearSVMWeights._LEGAL_CLFS = [SVM]
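

if __name__ == '__main__':
    # Illustrative smoke test (a sketch, not part of the original module);
    # assumes PyMVPA's synthetic data helper
    # mvpa.misc.data_generators.normalFeatureDataset is available with these
    # keyword arguments.
    from mvpa.misc.data_generators import normalFeatureDataset
    ds = normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4)
    clf = SVM(kernel_type='linear', svm_impl='libsvm')
    clf.train(ds)
    predictions = clf.predict(ds.samples)
    print "Training accuracy: %.2f" \
          % N.mean(N.asarray(predictions) == ds.labels)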