
Source Code for Module mvpa.clfs.sg.svm

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13   
 14  _DEV__doc__ = """ 
 15   
 16  TODOs: 
 17   * dual-license under GPL for use of SG? 
 18   * for recent versions add ability to specify/parametrize normalization 
 19     scheme for the kernel, and reuse 'scale' now for the normalizer 
 20   * Add support for simplified linear classifiers (which do not require 
 21     storing all training SVs/samples to make classification in predict()) 
 22  """ 
 23   
 24  import numpy as N 
 25   
 26   
 27  # Rely on SG 
 28  import shogun.Features 
 29  import shogun.Classifier 
 30  import shogun.Regression 
 31  import shogun.Kernel 
 32  import shogun.Library 
 33   
 34  import operator 
 35   
 36  from mvpa.misc.param import Parameter 
 37  from mvpa.base import warning 
 38   
 39  from mvpa.clfs.base import MulticlassClassifier 
 40  from mvpa.clfs._svmbase import _SVM 
 41  from mvpa.misc.state import StateVariable 
 42  from mvpa.clfs.base import Classifier, MulticlassClassifier 
 43  from mvpa.measures.base import Sensitivity 
 44  from mvpa.base import externals 
 45   
 46  from sens import * 
 47   
 48  if __debug__: 
 49      from mvpa.base import debug 
 50   
 51   
 52   
 53   
def _setdebug(obj, partname):
    """Helper to set the level of debugging output for SG

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    debugname = "SG_%s" % partname.upper()

    switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"),
              False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")}

    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, `obj`, slevel))
    obj.io.set_loglevel(sglevel)
    try:
        # call io.enable_progress() / io.disable_progress() if the installed
        # shogun provides it
        getattr(obj.io, '%s_progress' % progressfunc)()
    except AttributeError:
        warning("Shogun version installed has no way to %s progress"
                " reports" % progressfunc)


def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """

    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    features = shogun.Features.RealFeatures(data.astype('double').T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")
    _setdebug(features, 'Features')
    return features

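# Editorial note (not in the original source): a hedged sketch of what
# `_tosg` yields, assuming the python-modular shogun bindings of this era.
# RealFeatures store samples column-wise, hence the transpose above.
#
#   >>> samples = N.array([[1., 2., 3.],
#   ...                    [4., 5., 6.]])   # 2 samples x 3 features
#   >>> feats = _tosg(samples)
#   >>> feats.get_num_vectors()             # number of samples
#   2
#   >>> feats.get_num_features()            # dimensionality
#   3
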
class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            descr='Number of threads to utilize')

    # NOTE: gamma is width in SG notation for RBF(Gaussian)
    _KERNELS = {
        "linear":   (shogun.Kernel.LinearKernel, ('scale',), LinearSVMWeights),
        "rbf":      (shogun.Kernel.GaussianKernel, ('gamma',), None),
        "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                     ('gamma', 'max_shift', 'shift_step'), None),
        "sigmoid":  (shogun.Kernel.SigmoidKernel,
                     ('cache_size', 'gamma', 'coef0'), None),
        }

    _KNOWN_PARAMS = [ 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    if externals.exists('sg >= 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

    # Some words of wisdom from the shogun author:
    # XXX remove after proper comments added to implementations
    """
    If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    with standard additive bias, but will L2 regularize it - though it
    should not matter much in practice (although it will give slightly
    different solutions)). Note that SGD has no stopping criterion (you
    simply have to specify the number of iterations) and that OCAS has a
    different stopping condition than svmlight, for example, which may be
    tighter or looser depending on the problem - I suggest 1e-2 or 1e-3
    for epsilon.

    If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
    depending on the problem one is faster than the other (hard to say when;
    I *think* when your dataset is very unbalanced chunking methods like
    svmlight/gpdt are better), for smaller problems definitely libsvm.

    If you use string kernels then gpdt/svmlight have a special 'linadd'
    speedup for this (requires sg 0.6.2 - there was some inefficiency in the
    code for python-modular before that). This is effective for big datasets
    (I trained on 10 million strings based on this).

    And yes, currently we only implemented parallel training for svmlight,
    however all SVMs can be evaluated in parallel.
    """
    _KNOWN_IMPLEMENTATIONS = {
        "libsvm" : (shogun.Classifier.LibSVM, ('C',),
                    ('multiclass', 'binary'), ''),
        "gmnp" : (shogun.Classifier.GMNPSVM, ('C',),
                  ('multiclass', 'binary'), ''),
        "gpbt" : (shogun.Classifier.GPBTSVM, ('C',), ('binary',), ''),
        "gnpp" : (shogun.Classifier.GNPPSVM, ('C',), ('binary',), ''),

        ## TODO: Needs sparse features...
        # "svmlin" : (shogun.Classifier.SVMLin, ''),
        # "liblinear" : (shogun.Classifier.LibLinear, ''),
        # "subgradient" : (shogun.Classifier.SubGradientSVM, ''),
        ## good 2-class linear SVMs
        # "ocas" : (shogun.Classifier.SVMOcas, ''),
        # "sgd" : (shogun.Classifier.SVMSGD, ''),

        # regressions
        "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                   ('regression',), ''),
        "krr": (shogun.Regression.KRR, ('tau',), ('regression',), ''),
        }
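    # Editorial note (not in the original source): the keys of _KERNELS and
    # _KNOWN_IMPLEMENTATIONS above are the literals accepted by the
    # constructor, e.g. (illustrative only):
    #
    #   >>> clf = SVM(kernel_type='rbf', svm_impl='gmnp')       # multiclass
    #   >>> reg = SVM(kernel_type='linear', svm_impl='libsvr')  # regression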
    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """Base class for all classifiers that so far utilize just the SVM
        implementations provided by shogun.

        TODO Documentation if this all works ;-)
        """

        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        # init base class
        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        # Need to store original data...
        # TODO: keep 1 of them -- just __traindata or __traindataset
        # For now it is needed for computing sensitivities
        self.__traindataset = None

        # internal SG swig proxies
        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None

    def __condition_kernel(self, kernel):
        # XXX I thought that it is needed only for retrainable classifier,
        #     but then krr gets confused, and svrlight needs it to provide
        #     meaningful results even without 'retraining'
        if self._svm_impl in ['svrlight', 'lightsvm']:
            kernel.set_precompute_matrix(True, True)

    def _train(self, dataset):
        """Train SVM
        """
        # XXX watchout
        # self.untrain()
        newkernel, newsvm = False, False
        # local bindings for faster lookup
        retrainable = self.params.retrainable

        if retrainable:
            _changedData = self._changedData

        # LABELS
        ul = None
        self.__traindataset = dataset


        # OK -- we have to map labels since
        #   binary ones expect -1/+1
        #   Multiclass expect labels starting with 0, otherwise they puke
        #   when run from ipython... yikes
        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            ul = dataset.uniquelabels
            ul.sort()

            if len(ul) == 2:
                # assure that we have -1/+1
                _labels_dict = {ul[0]:-1.0, ul[1]:+1.0}
            elif len(ul) < 2:
                raise ValueError, "we do not have 1-class SVM brought into SG yet"
            else:
                # can't use plain enumerate since we need them swapped
                _labels_dict = dict([ (ul[i], i) for i in range(len(ul))])

            # reverse labels dict for back mapping in _predict
            _labels_dict_rev = dict([(x[1], x[0])
                                     for x in _labels_dict.items()])

            # bind to instance as well
            self._labels_dict = _labels_dict
            self._labels_dict_rev = _labels_dict_rev

            # Map labels
            #
            # TODO: top level classifier should take care about labels
            # mapping if that is needed
            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([ _labels_dict[x] for x in dataset.labels ],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')


        # KERNEL
        if not retrainable or _changedData['traindata'] \
               or _changedData['kernel_params']:
            # If needed compute or just collect arguments for SVM and for
            # the kernel
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # XXX Unify damn automagic gamma value
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-Creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-Creating kernel since params %s have changed" %
                          _changedData['kernel_params'])

            # create training data
            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = kernel = \
                            self._kernel_type(self.__traindata, self.__traindata,
                                              *kargs)

            if externals.exists('sg >= 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()  # mark them as not-changed
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                self.__kernel_args = kargs

        # TODO -- handle _changedData['params'] correctly, ie without
        #         recreating the whole SVM
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:
            # SVM
            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    # we were not given a tuple for balancing between classes
                    C = [C]

                Cs = list(C[:])  # copy
                for i in xrange(len(Cs)):
                    if Cs[i] < 0:
                        Cs[i] = self._getDefaultC(dataset.samples)*abs(Cs[i])
                        if __debug__:
                            debug("SG_", "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

            # XXX do not jump over the head and leave it up to the user
            #     ie do not rescale automagically by the number of samples
            #if len(Cs) == 2 and not ('regression' in self._clf_internals) and len(ul) == 2:
            #    # we were given two Cs
            #    if N.max(C) < 0 and N.min(C) < 0:
            #        # and both are requested to be 'scaled' TODO :
            #        # provide proper 'features' to the parameters,
            #        # so we could specify explicitly if to scale
            #        # them by the number of samples here
            #        nl = [N.sum(labels_ == _labels_dict[l]) for l in ul]
            #        ratio = N.sqrt(float(nl[1]) / nl[0])
            #        #ratio = (float(nl[1]) / nl[0])
            #        Cs[0] *= ratio
            #        Cs[1] /= ratio
            #        if __debug__:
            #            debug("SG_", "Rescaled Cs to %s to accommodate the "
            #                  "difference in number of training samples" %
            #                  Cs)

            # Choose appropriate implementation
            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self._svm_impl in ['libsvr', 'svrlight']:
                # for regressions the constructor is a bit different
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()  # mark them as not-changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')
            # Set optimization parameters
            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:          # labels were changed
                if __debug__: debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:                       # kernel was replaced
                if __debug__: debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)  # we should never get here

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.retrained = not newsvm or not newkernel

        # Train
        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        # Report on training
        if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        # Assign training confusion right away here since we are ready
        # to do so.
        # XXX TODO use some other state variable like 'trained_labels' and
        #     use it within base Classifier._posttrain to assign predictions
        #     instead of duplicating code here
        # XXX For now it can be done only for regressions since labels need to
        #     be remapped and that becomes even worse if we use regression
        #     as a classifier so mapping happens upstairs
        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)

    def _predict(self, data):
        """Predict values for the data
        """

        retrainable = self.params.retrainable

        if retrainable:
            changed_testdata = self._changedData['testdata'] or \
                               self.__kernel_test is None

        if not retrainable or changed_testdata:
            testdata = _tosg(data)

        if not retrainable:
            if __debug__:
                debug("SG__",
                      "Initializing SVMs kernel of %s with training/testing samples"
                      % self)
            # We can just reuse the kernel used for training
            self.__kernel.init(self.__traindata, testdata)
            self.__condition_kernel(self.__kernel)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                custk_args = ([self.__traindata, testdata], [])[
                    int(externals.exists('sg >= 0.6.4'))]
                if __debug__:
                    debug("SG__",
                          "Re-creating custom testing kernel giving "
                          "arguments %s" % (str(custk_args)))
                kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

                _setdebug(kernel_test_custom, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        # doesn't do any good imho although on unittests helps tiny bit... hm
        #self.__svm.init_kernel_optimization()
        values_ = self.__svm.classify()
        if values_ is None:
            raise RuntimeError, "We got empty list of values from %s" % self

        values = values_.get_labels()

        if retrainable:
            # we must assign it only if it is retrainable
            self.states.repredicted = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learning kernel. Repredicted is %s"
                      % self.states.repredicted)
            # return back original kernel
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:
            # local bindings
            _labels_dict = self._labels_dict
            _labels_dict_rev = self._labels_dict_rev

            if len(_labels_dict) == 2:
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            # assure that we have the same type
            label_type = type(_labels_dict.values()[0])

            # remap labels back adjusting their type
            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        # store state variable
        # TODO: extract values properly for multiclass SVMs --
        #       ie 1 value per label or pairs for all 1-vs-1 classifications
        self.values = values

        ## to avoid leaks with not yet properly fixed shogun
        if not retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions

    def untrain(self):
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf':self})

            # to avoid leaks with not yet properly fixed shogun
            # XXX make it nice... now it is just stable ;-)
            if self.__kernel is not None:
                del self.__kernel
                self.__kernel = None

            if self.__kernel_test is not None:
                del self.__kernel_test
                self.__kernel_test = None

            if self.__svm is not None:
                del self.__svm
                self.__svm = None

            if self.__traindata is not None:
                # Left in for easy demonstration of the memory leak in shogun
                #for i in xrange(10):
                #    debug("SG__", "cachesize pre free features %s" %
                #          (self.__svm.get_kernel().get_cache_size()))
                self.__traindata.free_features()
                del self.__traindata
                self.__traindata = None

            self.__traindataset = None

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())

    def __get_implementation(self, ul):
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError, \
                      "Shogun: Implementation %s doesn't handle multiclass " \
                      "data. Got labels %s. Use some other classifier" % \
                      (self._svm_impl, self.__traindataset.uniquelabels)
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class


    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""


# Conditionally make some of the implementations available only if they are
# present in the installed shogun build
for name, item, params, descr in \
    [('mpd', "shogun.Classifier.MPDSVM", "('C',), ('binary',)",
      "MPD classifier from shogun"),
     ('lightsvm', "shogun.Classifier.SVMLight", "('C',), ('binary',)",
      "SVMLight classification http://svmlight.joachims.org/"),
     ('svrlight', "shogun.Regression.SVRLight", "('C','tube_epsilon',), ('regression',)",
      "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        exec "SVM._KNOWN_IMPLEMENTATIONS[\"%s\"] = (%s, %s, \"%s\")" % \
             (name, item, params, descr)

# Assign SVM class to the limited set of classifiers LinearSVMWeights
# can work with
LinearSVMWeights._LEGAL_CLFS = [SVM]
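
# ---------------------------------------------------------------------------
# Editorial usage sketch (not part of the original module): a minimal,
# hedged example of driving this class, assuming a PyMVPA `Dataset` instance
# named `dataset` with `.samples` and `.labels`; parameter values are purely
# illustrative.
#
#   >>> from mvpa.clfs.sg.svm import SVM
#   >>> clf = SVM(kernel_type='linear', svm_impl='libsvm', C=-1.0)
#   >>> clf.train(dataset)                  # builds kernel and shogun SVM
#   >>> predictions = clf.predict(dataset.samples)
#   >>> clf.untrain()                       # release shogun-side resources
#
# A negative C requests scaling of the default C computed from the training
# data (see the handling of Cs in _train above).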