"""Wrap the shogun (SG) machine learning toolbox into a very simple class interface."""

__docformat__ = 'restructuredtext'


_DEV__doc__ = """

TODOs:
 * dual-license under GPL for use of SG?
 * for recent versions add ability to specify/parametrize normalization
   scheme for the kernel, and reuse 'scale' now for the normalizer
 * Add support for simplified linear classifiers (which do not require
   storing all training SVs/samples to make classification in predict())
"""

import numpy as N

from mvpa import _random_seed


from mvpa.base import externals, warning
if externals.exists('shogun', raiseException=True):
    import shogun.Features
    import shogun.Classifier
    import shogun.Regression
    import shogun.Kernel
    import shogun.Library


    if hasattr(shogun.Kernel, 'M_DEBUG'):
        _M_DEBUG = shogun.Kernel.M_DEBUG
        _M_ERROR = shogun.Kernel.M_ERROR
    elif hasattr(shogun.Kernel, 'MSG_DEBUG'):
        _M_DEBUG = shogun.Kernel.MSG_DEBUG
        _M_ERROR = shogun.Kernel.MSG_ERROR
    else:
        _M_DEBUG, _M_ERROR = None, None
        warning("Could not figure out debug IDs within shogun. "
                "No control over shogun verbosity will be provided")

    try:
        shogun.Library.Math_init_random(_random_seed)
    except Exception, e:
        warning('Shogun cannot be seeded due to %s' % (e,))

import operator

from mvpa.misc.param import Parameter
from mvpa.base import warning

from mvpa.clfs.base import FailedToTrainError
from mvpa.clfs.meta import MulticlassClassifier
from mvpa.clfs._svmbase import _SVM
from mvpa.misc.state import StateVariable
from mvpa.measures.base import Sensitivity

from sens import *

if __debug__:
    from mvpa.base import debug


def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    if _M_DEBUG is None:
        return
    debugname = "SG_%s" % partname.upper()

    switch = {True: (_M_DEBUG, 'M_DEBUG', "enable"),
              False: (_M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, `obj`, slevel))
    obj.io.set_loglevel(sglevel)
    try:
        exec "obj.io.%s_progress()" % progressfunc
    except:
        warning("Shogun version installed has no way to enable progress" +
                " reports")

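# Illustrative usage sketch (not part of the original module): verbosity of a
# shogun object is tied to a PyMVPA debug target named 'SG_<PARTNAME>', e.g.
#   debug.active += ['SG_KERNELS']     # assumed way to enable the target
#   _setdebug(kernel, 'Kernels')       # would switch kernel.io to debug level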

def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """

    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features

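# Illustrative note (not part of the original module): _tosg() expects a
# (nsamples x nfeatures) numpy array; it is cast to double and transposed
# because shogun's RealFeatures treats columns as examples, so e.g.
# N.array([[1., 2., 3.], [4., 5., 6.]]) of 2 samples becomes a 3x2 matrix.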

class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            doc='Number of threads to utilize')


    _KERNELS = {}
    if externals.exists('shogun', raiseException=True):
        _KERNELS = { "linear": (shogun.Kernel.LinearKernel,
                                ('scale',), LinearSVMWeights),
                     "rbf": (shogun.Kernel.GaussianKernel,
                             ('gamma',), None),
                     "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                                  ('gamma', 'max_shift', 'shift_step'), None),
                     "sigmoid": (shogun.Kernel.SigmoidKernel,
                                 ('cache_size', 'gamma', 'coef0'), None),
                     }

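    # Illustrative note (not part of the original sources): the tuple in each
    # _KERNELS entry names the kernel_params handed to the shogun kernel
    # constructor, so e.g. SVM(kernel_type='rbf', gamma=0.0) would later have
    # gamma resolved through _getDefaultGamma(dataset), since 0.0 asks for the
    # default value.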
    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    if externals.exists('sg ge 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)


    """
    If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    with standard additive bias, but will L2 regularize it - though it
    should not matter much in practice (although it will give slightly
    different solutions)). Note that SGD has no stopping criterion (you
    simply have to specify the number of iterations) and that OCAS has a
    different stopping condition than svmlight for example, which may be
    tighter or looser depending on the problem - I suggest 1e-2 or 1e-3
    for epsilon.

    If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
    depending on the problem one is faster than the other (hard to say when;
    I *think* when your dataset is very unbalanced chunking methods like
    svmlight/gpdt are better), for smaller problems definitely libsvm.

    If you use string kernels then gpdt/svmlight have a special 'linadd'
    speedup for this (requires sg 0.6.2 - there was some inefficiency in the
    code for python-modular before that). This is effective for big datasets
    (I trained on 10 million strings based on this).

    And yes, currently we only implemented parallel training for svmlight,
    however all SVMs can be evaluated in parallel.
    """
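    # Illustrative usage sketch (not from the original module); assumes a
    # PyMVPA dataset `ds` with .samples and .labels prepared elsewhere:
    #   clf = SVM(kernel_type='linear', svm_impl='libsvm', C=-1.0)
    #   clf.train(ds)                     # a negative C requests the default C
    #   predictions = clf.predict(ds.samples)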
    _KNOWN_IMPLEMENTATIONS = {}
    if externals.exists('shogun', raiseException=True):
        _KNOWN_IMPLEMENTATIONS = {
            "libsvm" : (shogun.Classifier.LibSVM, ('C',),
                        ('multiclass', 'binary'),
                        "LIBSVM's C-SVM (L2 soft-margin SVM)"),
            "gmnp" : (shogun.Classifier.GMNPSVM, ('C',),
                      ('multiclass', 'binary'),
                      "Generalized Nearest Point Problem SVM"),

            "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                      "Gradient Projection Decomposition Technique for "
                      "large-scale SVM problems"),
            "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                      "Generalized Nearest Point Problem SVM"),

            "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                       ('regression',),
                       "LIBSVM's epsilon-SVR"),
            }


    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """Interface class to Shogun's classifiers and regressions.

        Default implementation is 'libsvm'.
        """

        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        self.__traindataset = None

        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None


    def __condition_kernel(self, kernel):
        if self._svm_impl in ['svrlight', 'lightsvm']:
            try:
                kernel.set_precompute_matrix(True, True)
            except Exception, e:
                if __debug__:
                    debug('SG_', "Failed call to set_precompute_matrix for %s: %s"
                          % (self, e))


    def _train(self, dataset):
        """Train SVM
        """

        newkernel, newsvm = False, False

        retrainable = self.params.retrainable

        if retrainable:
            _changedData = self._changedData

        ul = None
        self.__traindataset = dataset

        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            ul = dataset.uniquelabels
            ul.sort()

            if len(ul) == 2:
                _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
            elif len(ul) < 2:
                raise FailedToTrainError, \
                      "We do not have 1-class SVM brought into SG yet"
            else:
                _labels_dict = dict([(ul[i], i) for i in range(len(ul))])

            _labels_dict_rev = dict([(x[1], x[0])
                                     for x in _labels_dict.items()])

            self._labels_dict = _labels_dict
            self._labels_dict_rev = _labels_dict_rev

            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([_labels_dict[x] for x in dataset.labels],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')

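        # Illustrative example (not from the original code) of the mapping
        # built above: for binary labels ['rest', 'task'] (after sorting)
        # _labels_dict is {'rest': -1.0, 'task': +1.0}; for three labels
        # [1, 2, 3] it is {1: 0, 2: 1, 3: 2}, and _labels_dict_rev holds the
        # inverse mapping that _predict() uses to translate values back.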

        if not retrainable or _changedData['traindata'] or _changedData['kernel_params']:

            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value

                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-creating kernel since params %s have changed" %
                          _changedData['kernel_params'])

            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = kernel = \
                            self._kernel_type(self.__traindata, self.__traindata,
                                              *kargs)

            if externals.exists('sg ge 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                self.__kernel_args = kargs

        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:

            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    C = [C]

                Cs = list(C[:])
                for i in xrange(len(Cs)):
                    if Cs[i] < 0:
                        Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
                        if __debug__:
                            debug("SG_", "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

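            # Illustrative note (not from the original code): a negative C acts
            # as a multiplier of the automatically determined default, e.g.
            # C=-5.0 becomes 5.0 * self._getDefaultC(dataset.samples); a pair
            # such as C=[1.0, 2.0] is later passed to set_C() as per-class
            # values.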
            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self._svm_impl in ['libsvr', 'svrlight']:
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)

            if self.params.isKnown('shrinking'):
                shrinking = self.params.shrinking
                if __debug__:
                    debug("SG_", "Setting shrinking to %s" % shrinking)
                self.__svm.set_shrinking_enabled(shrinking)

            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()
            newsvm = True
            _setdebug(self.__svm, 'SVM')

            if self.params.isKnown('tube_epsilon') and \
               hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)

        if retrainable:
            self.states.retrained = not newsvm or not newkernel

        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)


    def _predict(self, data):
        """Predict values for the data
        """

        retrainable = self.params.retrainable

        if retrainable:
            changed_testdata = self._changedData['testdata'] or \
                               self.__kernel_test is None

        if not retrainable or changed_testdata:
            testdata = _tosg(data)

        if not retrainable:
            if __debug__:
                debug("SG__",
                      "Initializing SVMs kernel of %s with training/testing samples"
                      % self)

            self.__kernel.init(self.__traindata, testdata)
            self.__condition_kernel(self.__kernel)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                custk_args = ([self.__traindata, testdata], [])[
                    int(externals.exists('sg ge 0.6.4'))]
                if __debug__:
                    debug("SG__",
                          "Re-creating custom testing kernel giving "
                          "arguments %s" % (str(custk_args)))
                kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

                _setdebug(kernel_test_custom, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

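            # Illustrative note (not from the original code): with shogun
            # >= 0.6.4 CustomKernel() is constructed without arguments and the
            # precomputed train/test matrix is copied in afterwards through
            # set_full_kernel_matrix_from_full(); older versions received the
            # feature objects directly, which is what the custk_args selection
            # above encodes.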

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        values_ = self.__svm.classify()
        if values_ is None:
            raise RuntimeError, "We got empty list of values from %s" % self

        values = values_.get_labels()

        if retrainable:
            self.states.repredicted = repredicted = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learning kernel. Repredicted is %s"
                      % repredicted)
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:

            _labels_dict = self._labels_dict
            _labels_dict_rev = self._labels_dict_rev

            if len(_labels_dict) == 2:
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            label_type = type(_labels_dict.values()[0])

            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]
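
            # Illustrative example (not from the original code): for a binary
            # problem trained on labels ['rest', 'task'], a raw value of -0.7
            # maps through signbit to -1.0 and then via _labels_dict_rev back
            # to 'rest', while +0.3 maps to +1.0 and back to 'task'.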

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        self.values = values

        if not retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions


    def untrain(self):
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf':self})

            if self.__kernel is not None:
                del self.__kernel
                self.__kernel = None

            if self.__kernel_test is not None:
                del self.__kernel_test
                self.__kernel_test = None

            if self.__svm is not None:
                del self.__svm
                self.__svm = None

            if self.__traindata is not None:
                self.__traindata.free_features()
                del self.__traindata
                self.__traindata = None

            self.__traindataset = None

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())


    def __get_implementation(self, ul):
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError, \
                      "Shogun: Implementation %s doesn't handle multiclass " \
                      "data. Got labels %s. Use some other classifier" % \
                      (self._svm_impl, self.__traindataset.uniquelabels)
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class


    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""


for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/"),
         ('krr', "shogun.Regression.KRR", "('tau',), ('regression',)",
          "Kernel Ridge Regression"),
         ]:
    if externals.exists('shogun.%s' % name):
        exec "SVM._KNOWN_IMPLEMENTATIONS[\"%s\"] = (%s, %s, \"%s\")" % \
             (name, item, params, descr)

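# Illustrative expansion of the loop above (not part of the original code):
# when shogun.Classifier.SVMLight is available, the exec statement amounts to
#   SVM._KNOWN_IMPLEMENTATIONS["lightsvm"] = (
#       shogun.Classifier.SVMLight, ('C',), ('binary',),
#       "SVMLight classification http://svmlight.joachims.org/")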

LinearSVMWeights._LEGAL_CLFS = [SVM]