9 """Wrap the libsvm package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15
16
17
18 import shogun.Features
19 import shogun.Classifier
20 import shogun.Regression
21 import shogun.Kernel
22 import shogun.Library
23
24
25 from mvpa.misc.param import Parameter
26 from mvpa.misc import warning
27
28 from mvpa.clfs.base import MulticlassClassifier
29 from mvpa.clfs._svmbase import _SVM
30 from mvpa.misc.state import StateVariable
31 from mvpa.misc.support import idhash
32 from mvpa.clfs.base import Classifier, MulticlassClassifier
33 from mvpa.measures.base import Sensitivity
34 from mvpa.base import externals
35
36 from sens import *
37
38 if __debug__:
39 from mvpa.misc import debug
40
41

"""
If you'd like to train linear SVMs, use SGD or OCAS. These are (I am
serious) the fastest linear SVM solvers to date. (OCAS cannot do SVMs
with a standard additive bias, but will L2-regularize it -- though that
should not matter much in practice, although it will give slightly
different solutions.) Note that SGD has no stopping criterion (you
simply have to specify the number of iterations) and that OCAS has a
different stopping condition than, for example, svmlight, which may be
tighter or looser depending on the problem -- I suggest 1e-2 or 1e-3
for epsilon.

If you would like to train kernel SVMs, use libsvm/gpdt/svmlight --
depending on the problem one is faster than the other (hard to say when;
I *think* chunking methods like svmlight/gpdt are better when your
dataset is very unbalanced), and for smaller problems definitely libsvm.

If you use string kernels, then gpdt/svmlight have a special 'linadd'
speedup for this (requires sg 0.6.2 -- there was some inefficiency in the
code for python-modular before that). This is effective for big datasets
(I trained on 10 million strings with it).

Currently we only implemented parallel training for svmlight; however,
all SVMs can be evaluated in parallel.
"""
known_svm_impl = { "libsvm" : (shogun.Classifier.LibSVM, ''),
                   "gmnp" : (shogun.Classifier.GMNPSVM, ''),
                   "mpd" : (shogun.Classifier.MPDSVM, ''),
                   "gpbt" : (shogun.Classifier.GPBTSVM, ''),
                   "gnpp" : (shogun.Classifier.GNPPSVM, ''),
                   # regressions
                   "libsvr": (shogun.Regression.LibSVR, ''),
                   "krr": (shogun.Regression.KRR, ''),
                   }


def _get_implementation(svm_impl, nl):
    """Return the Shogun SVM class to use for implementation `svm_impl`
    given `nl` unique labels."""
    if nl > 2:
        if svm_impl == 'libsvm':
            svm_impl_class = shogun.Classifier.LibSVMMultiClass
        elif svm_impl == 'gmnp':
            svm_impl_class = shogun.Classifier.GMNPSVM
        else:
            raise RuntimeError, \
                  "Shogun: Implementation %s doesn't handle multiclass " \
                  "data (got %d labels). Use some other classifier" % \
                  (svm_impl, nl)
        if __debug__:
            debug("SG_", "Using %s for multiclass data of %s" %
                  (svm_impl_class, svm_impl))
    else:
        svm_impl_class = known_svm_impl[svm_impl][0]
    return svm_impl_class
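
# For example, _get_implementation('libsvm', 2) yields shogun.Classifier.LibSVM,
# whereas _get_implementation('libsvm', 3) switches to
# shogun.Classifier.LibSVMMultiClass; implementations without a multiclass
# variant raise RuntimeError for more than two labels.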


for name, item, descr in \
        [('lightsvm', "shogun.Classifier.SVMLight",
          "SVMLight classification http://svmlight.joachims.org/"),
         ('svrlight', "shogun.Regression.SVRLight",
          "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        exec "known_svm_impl[\"%s\"] = (%s, \"%s\")" % (name, item, descr)


def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, `obj`, slevel))
    obj.io.set_loglevel(sglevel)
    try:
        exec "obj.io.%s_progress()" % progressfunc
    except:
        warning("Shogun version installed has no way to enable progress" +
                " reports")


def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """

    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    # shogun stores one sample per column, hence the transpose of the
    # (samples x features) array
    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features
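
# Shape sketch (the samples x features layout is assumed from the rest of
# this module; the accessor names below follow shogun's python-modular API
# of that era and should be treated as an assumption):
#
#     data = N.array([[1., 2., 3.],
#                     [4., 5., 6.]])       # 2 samples x 3 features
#     feats = _tosg(data)
#     # feats.get_num_vectors() -> 2, feats.get_num_features() -> 3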


class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            descr='Number of threads to utilize')

    _KERNELS = { "linear": (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
                 "rbf": (shogun.Kernel.GaussianKernel, ('gamma',), None),
                 "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                              ('gamma', 'max_shift', 'shift_step'), None),
                 "sigmoid": (shogun.Kernel.SigmoidKernel,
                             ('cache_size', 'gamma', 'coef0'), None),
                 }

    _KNOWN_PARAMS = [ 'C', 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    def __init__(self,
                 kernel_type='linear',
                 svm_impl="libsvm",
                 **kwargs):
        """This is the base class for all classifiers that utilize the SVM
        classifiers provided by shogun.

        TODO Documentation if this all works ;-)
        """

        svm_impl = svm_impl.lower()
        if svm_impl == 'krr':
            self._KNOWN_PARAMS = self._KNOWN_PARAMS[:] + ['tau']
        if svm_impl in ['svrlight', 'libsvr']:
            self._KNOWN_PARAMS = self._KNOWN_PARAMS[:] + ['tube_epsilon']

        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        if svm_impl in known_svm_impl:
            self.__svm_impl = svm_impl
        else:
            raise ValueError, "Unknown SVM implementation %s" % svm_impl

        self._clf_internals.append(
            {True: 'multiclass', False: 'binary'}[
                svm_impl in ['gmnp', 'libsvm']])
        if svm_impl in ['svrlight', 'libsvr', 'krr']:
            self._clf_internals += [ 'regression' ]

        # Dataset which was used for training (also exposed as .traindataset)
        self.__traindataset = None

        # Shogun objects created during training/testing
        self.__traindata = None
        self.__kernel = None
        self.__testdata = None

        # idhashes of training samples, labels and testing samples, used
        # by __wasChanged to detect changes for retrainable SVMs
        self.__idhash = [None, None, None]

        if __debug__:
            if 'RETRAIN' in debug.active:
                # verbatim copies of the same entries, used to cross-check
                # that idhash-based change detection is not too weak
                self.__trained = [None, None, None]


    def __repr__(self):
        repr_ = super(SVM, self).__repr__()
        return repr_.replace("(kern", "(svm_impl='%s', kern" % self.__svm_impl)


    def __wasChanged(self, descr, i, entry):
        """Check if the given entry was changed from what was known prior.
        If so -- store the new state."""
        idhash_ = idhash(entry)
        changed = self.__idhash[i] != idhash_
        if __debug__ and 'RETRAIN' in debug.active:
            changed2 = entry != self.__trained[i]
            if isinstance(changed2, N.ndarray):
                changed2 = changed2.any()
            if changed != changed2:
                raise RuntimeError, \
                      'hashid found to be weak for %s. Though hashid %s!=%s %s, ' \
                      'values %s!=%s %s' % \
                      (descr, idhash_, self.__idhash[i], changed,
                       entry, self.__trained[i], changed2)
            self.__trained[i] = entry
        if __debug__ and changed:
            debug('SG__', "Changed %s from %s to %s"
                  % (descr, self.__idhash[i], idhash_))
        self.__idhash[i] = idhash_
        return changed
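
    # The retrainable code path below relies on this check: slots 0, 1 and 2
    # track training samples, labels and testing samples respectively, so
    # e.g. the (expensive) kernel is only recomputed in _train when the
    # samples or the kernel parameters have actually changed.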


    def _train(self, dataset):
        """Train SVM
        """

        if self.retrainable:
            changed_params = self.params.whichSet()
            changed_kernel_params = self.kernel_params.whichSet()

        newkernel, newsvm = False, False
        if self.retrainable:
            if __debug__:
                debug('SG__', "IDHashes are %s" % (self.__idhash))
            changed_samples = self.__wasChanged('samples', 0, dataset.samples)
            changed_labels = self.__wasChanged('labels', 1, dataset.labels)

        ul = dataset.uniquelabels
        ul.sort()

        self.__traindataset = dataset

        # LABELS
        if __debug__:
            debug("SG_", "Creating labels instance")
        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            if len(ul) == 2:
                # map the two labels onto the binary -1.0/+1.0 used by shogun
                self._labels_dict = {ul[0]: -1.0,
                                     ul[1]: +1.0}
            elif len(ul) < 2:
                raise ValueError, \
                      "We do not have 1-class SVM brought into SG yet"
            else:
                # multiclass: map original labels onto 0, 1, ..., len(ul)-1
                self._labels_dict = dict([(ul[i], i) for i in range(len(ul))])

            # reverse mapping, used to translate predictions back
            self._labels_dict_rev = dict([(x[1], x[0])
                                          for x in self._labels_dict.items()])

            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % self._labels_dict)
            labels_ = N.asarray([ self._labels_dict[x] for x in dataset.labels ],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')

        # KERNEL
        if not self.retrainable or changed_samples or changed_kernel_params:
            # collect arguments for the kernel from kernel_params
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # gamma of 0.0 requests the data-driven default
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if self.retrainable and __debug__:
                if changed_samples:
                    debug("SG",
                          "Re-Creating kernel since samples have changed")

                if changed_kernel_params:
                    debug("SG",
                          "Re-Creating kernel since params %s have changed" %
                          changed_kernel_params)

            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = self._kernel_type(self.__traindata, self.__traindata,
                                              *kargs)
            newkernel = True
            self.kernel_params.reset()  # mark them as not changed
            if self.retrainable:
                self.__kernel.set_precompute_matrix(True, True)
                self.__kernel_test = None
                self.__kernel_args = kargs
            _setdebug(self.__kernel, 'Kernels')

        # SVM
        if not self.retrainable or self.__svm is None or changed_params:
            # compute effective C: negative C means default C scaled by abs(C)
            C = self.params.C
            if C < 0:
                C = self._getDefaultC(dataset.samples) * abs(C)
                if __debug__:
                    debug("SG_", "Default C for %s was computed to be %s" %
                          (self.params.C, C))

            # select the appropriate implementation class
            svm_impl_class = _get_implementation(self.__svm_impl, len(ul))

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self.__svm_impl in ['libsvr', 'svrlight']:
                # regression constructors take epsilon explicitly
                self.__svm = svm_impl_class(C, self.params.epsilon,
                                            self.__kernel, labels)
            elif self.__svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(C, self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            self.params.reset()  # mark them as not changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')

            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if changed_labels:          # labels were changed
                self.__svm.set_labels(labels)
            if newkernel:               # kernel was created anew
                self.__svm.set_kernel(self.__kernel)
            if changed_params:
                raise NotImplementedError, \
                      "Implement handling of changing params of SVM"

        if self.retrainable:
            # assign the state only if the classifier is retrainable
            self.states.retrained = not newsvm or not newkernel

        # Train
        if __debug__:
            debug("SG", "%sTraining %s on data with labels %s" %
                  (("", "Re-")[self.retrainable and self.states.retrained],
                   self, dataset.uniquelabels))

        self.__svm.train()

        # Report on training
        if __debug__:
            debug("SG_", "Done training SG_SVM %s on data with labels %s" %
                  (self._kernel_type, dataset.uniquelabels))
            if "SG__" in debug.active:
                trained_labels = self.__svm.classify().get_labels()
                debug("SG__", "Original labels: %s, Trained labels: %s" %
                      (dataset.labels, trained_labels))


    def _predict(self, data):
        """Predict values for the data
        """

        if __debug__:
            debug("SG_", "Initializing kernel with training/testing data")

        if self.retrainable:
            changed_testdata = self.__wasChanged('test_samples', 2, data) or \
                               self.__kernel_test is None

        if not self.retrainable or changed_testdata:
            testdata = _tosg(data)

        if not self.retrainable:
            # initialize the kernel with training vs. testing data
            self.__kernel.init(self.__traindata, testdata)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                # copy the computed matrix into a CustomKernel, so it can be
                # reused until the testing data changes again
                kernel_test_custom = shogun.Kernel.CustomKernel(self.__traindata,
                                                                testdata)
                _setdebug(kernel_test_custom, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        values_ = self.__svm.classify()

        values = values_.get_labels()

        if self.retrainable:
            # assign the state only if the classifier is retrainable
            self.states.retested = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learning kernel. Retested is %s"
                      % self.states.retested)
            # restore the training kernel
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if 'regression' in self._clf_internals:
            predictions = values
        else:
            if len(self._labels_dict) == 2:
                # binary case: map the sign of the value onto -1.0/+1.0
                predictions = 1.0 - 2 * N.signbit(values)
            else:
                predictions = values

            # assure the type matches the keys of the reverse mapping
            label_type = type(self._labels_dict.values()[0])

            # remap into the original labels
            predictions = [self._labels_dict_rev[label_type(x)]
                           for x in predictions]

        if __debug__:
            debug("SG__", "Tuned predictions %s" % predictions)

        # store state variable
        self.values = values

        if not self.retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions


    def untrain(self):
        super(SVM, self).untrain()

        if not self.retrainable:
            if __debug__:
                debug("SG__", "Untraining %s and destroying sg's SVM" % self)

            # reset the state used for retraining
            self.__idhash = [None, None, None]

            if not self.__traindata is None:
                try:
                    try:
                        self.__traindata.free_features()
                    except:
                        pass
                    if __debug__:
                        if 'RETRAIN' in debug.active:
                            self.__trained = [None, None, None]
                    self.__traindataset = None
                    del self.__kernel
                    self.__kernel = None
                    self.__kernel_test = None
                    del self.__traindata
                    self.__traindata = None
                    del self.__svm
                    self.__svm = None
                except:
                    pass

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())


    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""