9 """Wrap the libsvm package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26
27
28 import shogun.Features
29 import shogun.Classifier
30 import shogun.Regression
31 import shogun.Kernel
32 import shogun.Library
33
34 import operator
35
36 from mvpa.misc.param import Parameter
37 from mvpa.base import warning
38
39 from mvpa.clfs.base import MulticlassClassifier
40 from mvpa.clfs._svmbase import _SVM
41 from mvpa.misc.state import StateVariable
42 from mvpa.clfs.base import Classifier, MulticlassClassifier
43 from mvpa.measures.base import Sensitivity
44 from mvpa.base import externals
45
46 from sens import *
47
48 if __debug__:
49 from mvpa.base import debug
50
51
52
53
55 """Helper to set level of debugging output for SG
56 :Parameters:
57 obj
58 In SG debug output seems to be set per every object
59 partname : basestring
60 For what kind of object we are talking about... could be automated
61 later on (TODO)
62 """
63 debugname = "SG_%s" % partname.upper()
64
65 switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
66 False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}
67
68 key = __debug__ and debugname in debug.active
69
70 sglevel, slevel, progressfunc = switch[key]
71
72 if __debug__:
73 debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
74 (partname, `obj`, slevel))
75 obj.io.set_loglevel(sglevel)
76 try:
77 exec "obj.io.%s_progress()" % progressfunc
78 except:
79 warning("Shogun version installed has no way to enable progress" +
80 " reports")
81
82
84 """Draft helper function to convert data we have into SG suitable format
85
86 TODO: Support different datatypes
87 """
88
89 if __debug__:
90 debug("SG_", "Converting data for shogun into RealFeatures")
91
92 features = shogun.Features.RealFeatures(data.astype('double').T)
93
94 if __debug__:
95 debug("SG__", "Done converting data for shogun into RealFeatures")
96 _setdebug(features, 'Features')
97 return features
98
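# Illustration (added; not part of the original module): _tosg() takes a
# samples x features array and hands shogun its transpose, since
# RealFeatures stores one column per sample.  A minimal sketch, assuming the
# python-modular shogun bindings are importable:
#
#   X = N.array([[1., 2., 3.],
#                [4., 5., 6.]])          # 2 samples, 3 features
#   feats = _tosg(X)
#   # feats.get_num_features() == 3; feats.get_num_vectors() == 2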


class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            descr='Number of threads to utilize')

    _KERNELS = {
        "linear":   (shogun.Kernel.LinearKernel,        ('scale',), LinearSVMWeights),
        "rbf":      (shogun.Kernel.GaussianKernel,      ('gamma',), None),
        "rbfshift": (shogun.Kernel.GaussianShiftKernel, ('gamma', 'max_shift', 'shift_step'), None),
        "sigmoid":  (shogun.Kernel.SigmoidKernel,       ('cache_size', 'gamma', 'coef0'), None),
        }

    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    if externals.exists('sg >= 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

127 """
128 If you'd like to train linear SVMs use SGD or OCAS. These are (I am
129 serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
130 with standard additive bias, but will L2 reqularize it - though it
131 should not matter much in practice (although it will give slightly
132 different solutions)). Note that SGD has no stopping criterion (you
133 simply have to specify the number of iterations) and that OCAS has a
134 different stopping condition than svmlight for example which may be more
135 tight and more loose depending on the problem - I sugeest 1e-2 or 1e-3
136 for epsilon.
137
138 If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
139 depending on the problem one is faster than the other (hard to say when,
140 I *think* when your dataset is very unbalanced chunking methods like
141 svmlight/gpdt are better), for smaller problems definitely libsvm.
142
143 If you use string kernels then gpdt/svmlight have a special 'linadd'
144 speedup for this (requires sg 0.6.2 - there was some inefficiency in the
145 code for python-modular before that). This is effective for big datasets
146 and (I trained on 10 million strings based on this).
147
148 And yes currently we only implemented parallel training for svmlight,
149 however all SVMs can be evaluated in parallel.
150 """
    _KNOWN_IMPLEMENTATIONS = {
        "libsvm": (shogun.Classifier.LibSVM, ('C',),
                   ('multiclass', 'binary'), ''),
        "gmnp":   (shogun.Classifier.GMNPSVM, ('C',),
                   ('multiclass', 'binary'), ''),
        "gpbt":   (shogun.Classifier.GPBTSVM, ('C',), ('binary',), ''),
        "gnpp":   (shogun.Classifier.GNPPSVM, ('C',), ('binary',), ''),

        "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                   ('regression',), ''),
        "krr":    (shogun.Regression.KRR, ('tau',), ('regression',), ''),
        }

    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """This is the base class for all classifiers that utilize (so far
        just) the SVM classifiers provided by shogun.

        TODO: Documentation if this all works ;-)
        """

        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        self.__traindataset = None

        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None


    def __condition_kernel(self, kernel):
        if self._svm_impl in ['svrlight', 'lightsvm']:
            kernel.set_precompute_matrix(True, True)

210 """Train SVM
211 """
212
213
214 newkernel, newsvm = False, False
215
216 retrainable = self.params.retrainable
217
218 if retrainable:
219 _changedData = self._changedData
220
221
222 ul = None
223 self.__traindataset = dataset
224
225
226
227
228
229
230 if __debug__:
231 debug("SG_", "Creating labels instance")
232
233 if 'regression' in self._clf_internals:
234 labels_ = N.asarray(dataset.labels, dtype='double')
235 else:
236 ul = dataset.uniquelabels
237 ul.sort()
238
239 if len(ul) == 2:
240
241 _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
242 elif len(ul) < 2:
243 raise ValueError, "we do not have 1-class SVM brought into SG yet"
244 else:
245
246 _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])
247
248
249 _labels_dict_rev = dict([(x[1], x[0])
250 for x in _labels_dict.items()])
251
252
253 self._labels_dict = _labels_dict
254 self._labels_dict_rev = _labels_dict_rev
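            # Illustration (added comment): for a binary problem with labels
            # [1, 5] the mapping becomes {1: -1.0, 5: +1.0}; for a multiclass
            # problem with labels [1, 2, 3] it becomes {1: 0, 2: 1, 3: 2}.
            # _labels_dict_rev inverts that mapping so predictions can be
            # translated back to the original dataset labels in _predict().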

            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([_labels_dict[x] for x in dataset.labels],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')


        if not retrainable or _changedData['traindata'] \
               or _changedData['kernel_params']:

            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-creating kernel since training data has changed")
                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-creating kernel since params %s have changed" %
                          _changedData['kernel_params'])

            if __debug__:
                debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = kernel = \
                self._kernel_type(self.__traindata, self.__traindata,
                                  *kargs)

            if externals.exists('sg >= 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                self.__kernel_args = kargs

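        # Illustration (added comment): a negative C below is interpreted as
        # a scaling factor for the automatically computed default C, e.g.
        # C=-2.0 requests twice the value returned by _getDefaultC(); two Cs
        # may be given to weight the two classes separately via set_C().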
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:

            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    C = [C]

                Cs = list(C[:])
                for i in xrange(len(Cs)):
                    if Cs[i] < 0:
                        Cs[i] = self._getDefaultC(dataset.samples) * abs(Cs[i])
                        if __debug__:
                            debug("SG_", "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self._svm_impl in ['libsvr', 'svrlight']:
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel,
                                            labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_", "Since multiple Cs are provided: %s, "
                          "assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()
            newsvm = True
            _setdebug(self.__svm, 'SVM')

            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)

        if retrainable:
            self.states.retrained = not newsvm or not newkernel

        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        if (__debug__ and 'SG__' in debug.active) or \
               self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)

436 """Predict values for the data
437 """
438
439 retrainable = self.params.retrainable
440
441 if retrainable:
442 changed_testdata = self._changedData['testdata'] or \
443 self.__kernel_test is None
444
445 if not retrainable or changed_testdata:
446 testdata = _tosg(data)
447
448 if not retrainable:
449 if __debug__:
450 debug("SG__",
451 "Initializing SVMs kernel of %s with training/testing samples"
452 % self)
453
454 self.__kernel.init(self.__traindata, testdata)
455 self.__condition_kernel(self.__kernel)
456 else:
457 if changed_testdata:
458 if __debug__:
459 debug("SG__",
460 "Re-creating testing kernel of %s giving "
461 "arguments %s" %
462 (`self._kernel_type`, self.__kernel_args))
463 kernel_test = self._kernel_type(self.__traindata, testdata,
464 *self.__kernel_args)
465 _setdebug(kernel_test, 'Kernels')
466
467 custk_args = ([self.__traindata, testdata], [])[
468 int(externals.exists('sg >= 0.6.4'))]
469 if __debug__:
470 debug("SG__",
471 "Re-creating custom testing kernel giving "
472 "arguments %s" % (str(custk_args)))
473 kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)
474
475 _setdebug(kernel_test_custom, 'Kernels')
476 self.__kernel_test = kernel_test_custom
477 self.__kernel_test.set_full_kernel_matrix_from_full(
478 kernel_test.get_kernel_matrix())
479 elif __debug__:
480 debug("SG__", "Re-using testing kernel")
481
482 assert(self.__kernel_test is not None)
483 self.__svm.set_kernel(self.__kernel_test)
484
485 if __debug__:
486 debug("SG_", "Classifying testing data")
487
488
489
490 values_ = self.__svm.classify()
491 if values_ is None:
492 raise RuntimeError, "We got empty list of values from %s" % self
493
494 values = values_.get_labels()
495
496 if retrainable:
497
498 self.states.repredicted = not changed_testdata
499 if __debug__:
500 debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
501 % self.states.repredicted)
502
503 self.__svm.set_kernel(self.__kernel)
504
505 if __debug__:
506 debug("SG__", "Got values %s" % values)
507
508 if ('regression' in self._clf_internals):
509 predictions = values
510 else:
511
512 _labels_dict = self._labels_dict
513 _labels_dict_rev = self._labels_dict_rev
514
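            # Illustration (added comment): for binary problems shogun
            # returns signed decision values, so the sign is mapped onto the
            # -1/+1 training labels below and then translated back to the
            # original dataset labels via _labels_dict_rev.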
            if len(_labels_dict) == 2:
                predictions = 1.0 - 2 * N.signbit(values)
            else:
                predictions = values

            label_type = type(_labels_dict.values()[0])

            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        self.values = values

        if not retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions

    def untrain(self):
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf': self})

            if self.__kernel is not None:
                del self.__kernel
                self.__kernel = None

            if self.__kernel_test is not None:
                del self.__kernel_test
                self.__kernel_test = None

            if self.__svm is not None:
                del self.__svm
                self.__svm = None

            if self.__traindata is not None:
                self.__traindata.free_features()
                del self.__traindata
                self.__traindata = None

            self.__traindataset = None

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())


    def __get_implementation(self, ul):
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError, \
                      "Shogun: Implementation %s doesn't handle multiclass " \
                      "data. Got labels %s. Use some other classifier" % \
                      (self._svm_impl, self.__traindataset.uniquelabels)
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class

    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""


for name, item, params, descr in \
        [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
          "MPD classifier from shogun"),
         ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        exec "SVM._KNOWN_IMPLEMENTATIONS[\"%s\"] = (%s, %s, \"%s\")" % \
             (name, item, params, descr)

LinearSVMWeights._LEGAL_CLFS = [SVM]

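# A minimal end-to-end usage sketch (added; not part of the original module,
# and the Dataset import path is an assumption about the surrounding mvpa
# package):
#
#   from mvpa.datasets import Dataset
#   X = N.random.randn(20, 5)                       # 20 samples, 5 features
#   y = [0] * 10 + [1] * 10
#   ds = Dataset(samples=X, labels=y)
#   clf = SVM(kernel_type='linear', svm_impl='libsvm')
#   clf.train(ds)
#   predictions = clf.predict(X)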