1
2
3
4
5
6
7
8
9 """Wrap the shogun package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26
27
28 from mvpa.base import externals
29 if externals.exists('shogun', raiseException=True):
30 import shogun.Features
31 import shogun.Classifier
32 import shogun.Regression
33 import shogun.Kernel
34 import shogun.Library
35
36 import operator
37
38 from mvpa.misc.param import Parameter
39 from mvpa.base import warning
40
41 from mvpa.clfs.meta import MulticlassClassifier
42 from mvpa.clfs._svmbase import _SVM
43 from mvpa.misc.state import StateVariable
44 from mvpa.measures.base import Sensitivity
45 from mvpa.base import externals
46
47 from sens import *
48
49 if __debug__:
50 from mvpa.base import debug
51
52
53
54
def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    The shogun log level of `obj` is set to M_DEBUG and progress reports
    are enabled when the debug target ``SG_<PARTNAME>`` is active;
    otherwise the level is set to M_ERROR and progress reports are
    disabled.

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    # key -> (shogun log level, its name for the log, progress-method prefix)
    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    # `debug` is imported only under __debug__, hence the short-circuit
    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, repr(obj), slevel))
    obj.io.set_loglevel(sglevel)
    try:
        # Look the method up by name rather than exec-ing generated code;
        # stays best-effort, but no longer swallows KeyboardInterrupt or
        # SystemExit as the former bare `except:` did.
        getattr(obj.io, "%s_progress" % progressfunc)()
    except Exception:
        warning("Shogun version installed has no way to enable progress" +
                " reports")
82
83
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    The input is cast to double precision and transposed before it is
    wrapped into `shogun.Features.RealFeatures`; debug verbosity of the
    resulting object is configured via `_setdebug`.

    TODO: Support different datatypes
    """
    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    transposed = data.astype('double').T
    sg_features = shogun.Features.RealFeatures(transposed)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")

    _setdebug(sg_features, 'Features')
    return sg_features
99
100
102 """Support Vector Machine Classifier(s) based on Shogun
103
104 This is a simple base interface
105 """
106
# Value handed to shogun's `parallel.set_num_threads` when an SVM
# instance is created
num_threads = Parameter(1, min=1, doc='Number of threads to utilize')

# Supported kernels keyed by their literal name. Each entry is
#   (shogun kernel class,
#    names of the kernel parameters passed positionally to that class,
#    third item -- presumably the sensitivity analyzer for the kernel,
#    or None; TODO confirm)
_KERNELS = {}
if externals.exists('shogun', raiseException=True):
    _KERNELS = dict(
        linear=(shogun.Kernel.LinearKernel,
                ('scale',),
                LinearSVMWeights),
        rbf=(shogun.Kernel.GaussianKernel,
             ('gamma',),
             None),
        rbfshift=(shogun.Kernel.GaussianShiftKernel,
                  ('gamma', 'max_shift', 'shift_step'),
                  None),
        sigmoid=(shogun.Kernel.SigmoidKernel,
                 ('cache_size', 'gamma', 'coef0'),
                 None),
        )

_KNOWN_PARAMS = ['epsilon']
_KNOWN_KERNEL_PARAMS = []

_clf_internals = _SVM._clf_internals + ['sg', 'retrainable']

# With the 'sg ge 0.6.4' external present the linear kernel is created
# without any positional kernel parameter
if externals.exists('sg ge 0.6.4'):
    _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)
132
133
134 """
135 If you'd like to train linear SVMs use SGD or OCAS. These are (I am
136 serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
137 with standard additive bias, but will L2-regularize it - though it
138 should not matter much in practice (although it will give slightly
139 different solutions)). Note that SGD has no stopping criterion (you
140 simply have to specify the number of iterations) and that OCAS has a
141 different stopping condition than, for example, svmlight, which may be
142 tighter or looser depending on the problem - I suggest 1e-2 or 1e-3
143 for epsilon.
144
145 If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
146 depending on the problem one is faster than the other (hard to say when,
147 I *think* when your dataset is very unbalanced chunking methods like
148 svmlight/gpdt are better), for smaller problems definitely libsvm.
149
150 If you use string kernels then gpdt/svmlight have a special 'linadd'
151 speedup for this (requires sg 0.6.2 - there was some inefficiency in the
152 code for python-modular before that). This is effective for big datasets
153 (I trained on 10 million strings based on this).
154
155 And yes currently we only implemented parallel training for svmlight,
156 however all SVMs can be evaluated in parallel.
157 """
# Known SVM implementations keyed by name. Each entry is
#   (shogun class, names of the parameters it takes,
#    kinds of problems it handles, human-readable description)
_KNOWN_IMPLEMENTATIONS = {}
if externals.exists('shogun', raiseException=True):
    _KNOWN_IMPLEMENTATIONS = {
        "libsvm": (shogun.Classifier.LibSVM, ('C',),
                   ('multiclass', 'binary'),
                   "LIBSVM's C-SVM (L2 soft-margin SVM)"),
        # Description used to be a copy of the one for "gnpp"
        "gmnp": (shogun.Classifier.GMNPSVM, ('C',),
                 ('multiclass', 'binary'),
                 "Generalized Minimal Norm Problem SVM"),
        "gpbt": (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                 "Gradient Projection Decomposition Technique for "
                 "large-scale SVM problems"),
        "gnpp": (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                 "Generalized Nearest Point Problem SVM"),
        "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                   ('regression',),
                   "LIBSVM's epsilon-SVR"),
        "krr": (shogun.Regression.KRR, ('tau',), ('regression',),
                "Kernel Ridge Regression"),
        }
189
190
def __init__(self, kernel_type='linear', **kwargs):
    """Interface class to Shogun's classifiers and regressions.

    Default implementation is 'libsvm'.

    :Parameters:
      kernel_type : basestring
        Passed on to the base class constructor.
      kwargs
        Passed on to the base class constructor; 'svm_impl' is
        lower-cased (and defaulted to 'libsvm') beforehand.
    """
    # The implementation name is compared in lower case later on
    kwargs['svm_impl'] = kwargs.get('svm_impl', 'libsvm').lower()

    _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

    # Holds the trained svm.
    self.__svm = None

    # Dataset given to the latest training
    self.__traindataset = None

    # shogun-side objects created during training/prediction
    self.__traindata = None
    self.__kernel = None
    self.__kernel_test = None
    self.__testdata = None
218
219
def __condition_kernel(self, kernel):
    """Prepare `kernel` for implementations which need extra setup.

    For the 'svrlight' and 'lightsvm' implementations
    `kernel.set_precompute_matrix(True, True)` is called; any other
    implementation leaves the kernel untouched.
    """
    needs_precompute = self._svm_impl in ('svrlight', 'lightsvm')
    if needs_precompute:
        kernel.set_precompute_matrix(True, True)
226
227
def _train(self, dataset):
    """Train SVM

    NOTE(review): the ``def`` line and all indentation were missing from
    the text under review; name, signature and nesting were restored
    from the body -- confirm against the base class.

    Labels are remapped for shogun, then kernel and SVM instances are
    (re-)created or -- for a retrainable classifier whose corresponding
    `self._changedData` flags are unset -- reused, and the SVM is trained.

    :Parameters:
      dataset
        Training data; its `samples`, `labels` and `uniquelabels` are
        used.

    :Raises ValueError: if a classification problem comes with fewer
      than two unique labels.
    """
    newkernel, newsvm = False, False

    retrainable = self.params.retrainable

    if retrainable:
        _changedData = self._changedData

    # LABELS
    ul = None
    self.__traindataset = dataset

    if __debug__:
        debug("SG_", "Creating labels instance")

    if 'regression' in self._clf_internals:
        labels_ = N.asarray(dataset.labels, dtype='double')
    else:
        # Sorted copy: the labels stored in the dataset must not be
        # reordered in place
        ul = sorted(dataset.uniquelabels)

        if len(ul) == 2:
            # binary problems are expressed as -1/+1
            _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
        elif len(ul) < 2:
            raise ValueError(
                "we do not have 1-class SVM brought into SG yet")
        else:
            # multiclass labels become consecutive indices from 0
            _labels_dict = dict((label, index)
                                for index, label in enumerate(ul))

        # reverse mapping, used to translate predictions back
        _labels_dict_rev = dict((mapped, label)
                                for label, mapped in _labels_dict.items())

        self._labels_dict = _labels_dict
        self._labels_dict_rev = _labels_dict_rev

        if __debug__:
            debug("SG__", "Mapping labels using dict %s" % _labels_dict)
        labels_ = N.asarray([_labels_dict[x] for x in dataset.labels],
                            dtype='double')

    labels = shogun.Features.Labels(labels_)
    _setdebug(labels, 'Labels')

    # KERNEL
    if not retrainable or _changedData['traindata'] \
           or _changedData['kernel_params']:
        # Collect positional arguments for the kernel constructor
        kargs = []
        for arg in self._KERNELS[self._kernel_type_literal][1]:
            value = self.kernel_params[arg].value
            # gamma of 0.0 requests the default value
            if arg == 'gamma' and value == 0.0:
                value = self._getDefaultGamma(dataset)
            kargs.append(value)

        if retrainable and __debug__:
            if _changedData['traindata']:
                debug("SG",
                      "Re-Creating kernel since training data has changed")

            if _changedData['kernel_params']:
                debug("SG",
                      "Re-Creating kernel since params %s has changed" %
                      _changedData['kernel_params'])

        if __debug__:
            debug("SG_", "Converting input data for shogun")
        self.__traindata = _tosg(dataset.samples)

        if __debug__:
            debug("SG", "Creating kernel instance of %s giving arguments %s" %
                  (repr(self._kernel_type), kargs))

        self.__kernel = kernel = self._kernel_type(
            self.__traindata, self.__traindata, *kargs)

        if externals.exists('sg ge 0.6.4'):
            kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

        newkernel = True
        # presumably clears the "changed" flags of the kernel parameters
        self.kernel_params.reset()
        _setdebug(kernel, 'Kernels')

        self.__condition_kernel(kernel)
        if retrainable:
            if __debug__:
                debug("SG_", "Resetting test kernel for retrainable SVM")
            self.__kernel_test = None
            self.__kernel_args = kargs

    # SVM
    Cs = None
    if not retrainable or self.__svm is None or _changedData['params']:
        if self.params.isKnown('C'):
            C = self.params.C
            if not operator.isSequenceType(C):
                # a single value was given rather than a sequence
                C = [C]

            Cs = list(C)
            for i, c in enumerate(Cs):
                if c < 0:
                    # negative C scales the default C by its magnitude
                    Cs[i] = self._getDefaultC(dataset.samples) * abs(c)
                    if __debug__:
                        debug("SG_",
                              "Default C for %s was computed to be %s" %
                              (C[i], Cs[i]))

        svm_impl_class = self.__get_implementation(ul)

        if __debug__:
            debug("SG", "Creating SVM instance of %s" % repr(svm_impl_class))

        if self._svm_impl in ['libsvr', 'svrlight']:
            # these constructors take epsilon as well
            self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                        self.__kernel, labels)
        elif self._svm_impl in ['krr']:
            self.__svm = svm_impl_class(self.params.tau, self.__kernel,
                                        labels)
        else:
            self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
            self.__svm.set_epsilon(self.params.epsilon)
        if Cs is not None and len(Cs) == 2:
            if __debug__:
                debug("SG_",
                      "Since multiple Cs are provided: %s, assign them" % Cs)
            self.__svm.set_C(Cs[0], Cs[1])

        # presumably clears the "changed" flags of the parameters
        self.params.reset()
        newsvm = True
        _setdebug(self.__svm, 'SVM')

        if self.params.isKnown('tube_epsilon') and \
               hasattr(self.__svm, 'set_tube_epsilon'):
            self.__svm.set_tube_epsilon(self.params.tube_epsilon)
        self.__svm.parallel.set_num_threads(self.params.num_threads)
    else:
        if __debug__:
            debug("SG_", "SVM instance is not re-created")
        if _changedData['labels']:
            if __debug__:
                debug("SG__", "Assigning new labels")
            self.__svm.set_labels(labels)
        if newkernel:
            if __debug__:
                debug("SG__", "Assigning new kernel")
            self.__svm.set_kernel(self.__kernel)
        assert(_changedData['params'] is False)

    if retrainable:
        # this state is assigned only for retrainable classifiers
        self.states.retrained = not newsvm or not newkernel

    # TRAIN
    if __debug__ and 'SG' in debug.active:
        if not self.regression:
            lstr = " with labels %s" % dataset.uniquelabels
        else:
            lstr = ""
        debug("SG", "%sTraining %s on data%s" %
              (("", "Re-")[retrainable and self.states.retrained],
               self, lstr))

    self.__svm.train()

    if __debug__:
        debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

    # Labels on the training data are needed only for the debug report
    # and for the training confusion
    if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
        trained_labels = self.__svm.classify().get_labels()
    else:
        trained_labels = None

    if __debug__ and "SG__" in debug.active:
        debug("SG__", "Original labels: %s, Trained labels: %s" %
              (dataset.labels, trained_labels))

    # Training confusion is assigned here only for regressions
    if self.regression and self.states.isEnabled('training_confusion'):
        self.states.training_confusion = self._summaryClass(
            targets=dataset.labels,
            predictions=trained_labels)
453
def _predict(self, data):
    """Predict values for the data

    NOTE(review): the ``def`` line and all indentation were missing from
    the text under review; name, signature and nesting were restored
    from the body -- confirm against the base class.

    :Parameters:
      data
        Samples to predict; converted with `_tosg` whenever a testing
        kernel has to be (re-)computed.

    :Returns:
      The values obtained from shogun for regressions; otherwise a list
      of labels mapped back through `self._labels_dict_rev`.

    :Raises RuntimeError: if shogun's `classify()` returns None.
    """
    retrainable = self.params.retrainable

    if retrainable:
        changed_testdata = self._changedData['testdata'] or \
                           self.__kernel_test is None

    if not retrainable or changed_testdata:
        testdata = _tosg(data)

    if not retrainable:
        if __debug__:
            debug("SG__",
                  "Initializing SVMs kernel of %s with training/testing samples"
                  % self)
        # the training kernel is re-initialized with the testing data
        self.__kernel.init(self.__traindata, testdata)
        self.__condition_kernel(self.__kernel)
    else:
        if changed_testdata:
            if __debug__:
                debug("SG__",
                      "Re-creating testing kernel of %s giving "
                      "arguments %s" %
                      (repr(self._kernel_type), self.__kernel_args))
            kernel_test = self._kernel_type(self.__traindata, testdata,
                                            *self.__kernel_args)
            _setdebug(kernel_test, 'Kernels')

            # with 'sg ge 0.6.4' CustomKernel takes no arguments
            if externals.exists('sg ge 0.6.4'):
                custk_args = []
            else:
                custk_args = [self.__traindata, testdata]
            if __debug__:
                debug("SG__",
                      "Re-creating custom testing kernel giving "
                      "arguments %s" % (str(custk_args)))
            kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

            _setdebug(kernel_test_custom, 'Kernels')
            self.__kernel_test = kernel_test_custom
            self.__kernel_test.set_full_kernel_matrix_from_full(
                kernel_test.get_kernel_matrix())
        elif __debug__:
            debug("SG__", "Re-using testing kernel")

        assert(self.__kernel_test is not None)
        self.__svm.set_kernel(self.__kernel_test)

    if __debug__:
        debug("SG_", "Classifying testing data")

    values_ = self.__svm.classify()
    if values_ is None:
        raise RuntimeError("We got empty list of values from %s" % self)

    values = values_.get_labels()

    if retrainable:
        # this state is assigned only for retrainable classifiers
        self.states.repredicted = repredicted = not changed_testdata
        if __debug__:
            debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
                  % repredicted)
        # hand the training kernel back to the SVM
        self.__svm.set_kernel(self.__kernel)

    if __debug__:
        debug("SG__", "Got values %s" % values)

    if 'regression' in self._clf_internals:
        predictions = values
    else:
        _labels_dict = self._labels_dict
        _labels_dict_rev = self._labels_dict_rev

        if len(_labels_dict) == 2:
            # sign of the value selects between -1.0 and +1.0
            predictions = 1.0 - 2*N.signbit(values)
        else:
            predictions = values

        # Convert predictions to the type of the mapped labels so that
        # they can serve as keys of the reverse mapping
        label_type = type(list(_labels_dict.values())[0])

        predictions = [_labels_dict_rev[label_type(x)]
                       for x in predictions]

        if __debug__:
            debug("SG__", "Tuned predictions %s" % predictions)

    self.values = values

    if not retrainable:
        # Best-effort release of the shogun features; a failure must
        # not invalidate the predictions, but is no longer hidden
        try:
            testdata.free_features()
        except Exception:
            if __debug__:
                debug("SG__", "Failed to free features of the testing data")

    return predictions
562
563
def untrain(self):
    """Untrain the classifier and drop the shogun objects it holds.

    After the base class `untrain`, a classifier which is not
    retrainable drops its kernels, SVM, converted training data (calling
    `free_features()` on it first) and the training dataset. A
    retrainable classifier keeps all of them.
    """
    super(SVM, self).untrain()

    if self.params.retrainable:
        if __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())
        return

    if __debug__:
        debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
              msgargs={'clf':self})

    if self.__kernel is not None:
        del self.__kernel
        self.__kernel = None

    if self.__kernel_test is not None:
        del self.__kernel_test
        self.__kernel_test = None

    if self.__svm is not None:
        del self.__svm
        self.__svm = None

    if self.__traindata is not None:
        self.__traindata.free_features()
        del self.__traindata
        self.__traindata = None

    self.__traindataset = None

    if __debug__:
        debug("SG__",
              "Done untraining %(self)s and destroying sg's SVM",
              msgargs=locals())
610
611
def __get_implementation(self, ul):
    """Select the shogun class for the configured implementation.

    :Parameters:
      ul
        Unique labels of the training data; only their number is used,
        and it is not looked at for regressions.

    :Returns:
      The class registered in `SVM._KNOWN_IMPLEMENTATIONS` for
      regressions and binary problems; for multiclass problems
      `LibSVMMultiClass` ('libsvm') or `GMNPSVM` ('gmnp').

    :Raises RuntimeError: if the implementation cannot handle
      multiclass data.
    """
    if 'regression' in self._clf_internals or len(ul) == 2:
        return SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]

    impl = self._svm_impl
    if impl == 'libsvm':
        svm_impl_class = shogun.Classifier.LibSVMMultiClass
    elif impl == 'gmnp':
        svm_impl_class = shogun.Classifier.GMNPSVM
    else:
        raise RuntimeError(
            "Shogun: Implementation %s doesn't handle multiclass "
            "data. Got labels %s. Use some other classifier" %
            (impl, self.__traindataset.uniquelabels))

    if __debug__:
        debug("SG_", "Using %s for multiclass data of %s" %
              (svm_impl_class, impl))

    return svm_impl_class
630
631
@property
def svm(self):
    """Access to the SVM model."""
    return self.__svm


@property
def traindataset(self):
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""
    return self.__traindataset
639
640
641
642
643
# Register implementations which exist only in some shogun builds. The
# class is named by its dotted path and resolved only after
# `externals.exists` confirmed its presence, so nothing missing is
# touched. Entries get the same layout as the other registry entries:
# (class, parameter names, kinds of problems, description).
for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM",
          (('C',), ('binary',)),
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight",
          (('C',), ('binary',)),
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight",
          (('C', 'tube_epsilon',), ('regression',)),
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        # plain attribute lookups instead of exec-ing generated source
        impl_class = shogun
        for attr in item.split('.')[1:]:
            impl_class = getattr(impl_class, attr)
        SVM._KNOWN_IMPLEMENTATIONS[name] = (impl_class,) + params + (descr,)


# presumably the classifiers LinearSVMWeights accepts -- TODO confirm
LinearSVMWeights._LEGAL_CLFS = [SVM]
656