Package mvpa :: Package clfs :: Package sg :: Module svm
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.sg.svm

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Wrap the libsvm package into a very simple class interface.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import numpy as N 
 14   
 15   
 16  # Rely on SG 
 17  # TODO: XXX dual-license under GPL for use of SG? 
 18  import shogun.Features 
 19  import shogun.Classifier 
 20  import shogun.Regression 
 21  import shogun.Kernel 
 22  import shogun.Library 
 23   
 24   
 25  from mvpa.misc.param import Parameter 
 26  from mvpa.misc import warning 
 27   
 28  from mvpa.clfs.base import MulticlassClassifier 
 29  from mvpa.clfs._svmbase import _SVM 
 30  from mvpa.misc.state import StateVariable 
 31  from mvpa.misc.support import idhash 
 32  from mvpa.clfs.base import Classifier, MulticlassClassifier 
 33  from mvpa.measures.base import Sensitivity 
 34  from mvpa.base import externals 
 35   
 36  from sens import * 
 37   
 38  if __debug__: 
 39      from mvpa.misc import debug 
 40   
 41  # Some words of wisdom from shogun author: 
 42  # XXX remove after proper comments added to implementations 
 43  """ 
 44  If you'd like to train linear SVMs use SGD or OCAS. These are (I am 
 45  serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs 
 46  with standard additive bias, but will L2 reqularize it - though it 
 47  should not matter much in practice (although it will give slightly 
 48  different solutions)). Note that SGD has no stopping criterion (you 
 49  simply have to specify the number of iterations) and that OCAS has a 
 50  different stopping condition than svmlight for example which may be more 
 51  tight and more loose depending on the problem - I sugeest 1e-2 or 1e-3 
 52  for epsilon. 
 53   
 54  If you would like to train kernel SVMs use libsvm/gpdt/svmlight - 
 55  depending on the problem one is faster than the other (hard to say when, 
 56  I *think* when your dataset is very unbalanced chunking methods like 
 57  svmlight/gpdt are better), for smaller problems definitely libsvm. 
 58   
 59  If you use string kernels then gpdt/svmlight have a special 'linadd' 
 60  speedup for this (requires sg 0.6.2 - there was some inefficiency in the 
 61  code for python-modular before that). This is effective for big datasets 
 62  and (I trained on 10 million strings based on this). 
 63   
 64  And yes currently we only implemented parallel training for svmlight, 
 65  however all SVMs can be evaluated in parallel. 
 66  """ 
# Registry of SVM implementation literal -> (shogun class, description).
# Only implementations working with dense RealFeatures are enabled here;
# sparse-feature and 2-class linear solvers remain commented out below.
# Optional implementations (lightsvm, svrlight) are appended later at
# import time if the installed shogun provides them.
known_svm_impl = { "libsvm" : (shogun.Classifier.LibSVM, ''),
                   "gmnp" : (shogun.Classifier.GMNPSVM, ''),
                   "mpd"  : (shogun.Classifier.MPDSVM, ''),
                   "gpbt" : (shogun.Classifier.GPBTSVM, ''),
                   "gnpp" : (shogun.Classifier.GNPPSVM, ''),

                   ## TODO: Needs sparse features...
                   # "svmlin" : (shogun.Classifier.SVMLin, ''),
                   # "liblinear" : (shogun.Classifier.LibLinear, ''),
                   # "subgradient" : (shogun.Classifier.SubGradientSVM, ''),
                   ## good 2-class linear SVMs
                   # "ocas" : (shogun.Classifier.SVMOcas, ''),
                   # "sgd" : ( shogun.Classifier.SVMSGD, ''),

                   # regressions
                   "libsvr": (shogun.Regression.LibSVR, ''),
                   "krr": (shogun.Regression.KRR, ''),
                   }
 85   
86 -def _get_implementation(svm_impl, nl):
87 if nl > 2: 88 if svm_impl == 'libsvm': 89 svm_impl_class = shogun.Classifier.LibSVMMultiClass 90 elif svm_impl == 'gmnp': 91 svm_impl_class = shogun.Classifier.GMNPSVM 92 else: 93 raise RuntimeError, \ 94 "Shogun: Implementation %s doesn't handle multiclass " \ 95 "data. Got labels %s. Use some other classifier" % \ 96 (svm_impl, ul) 97 if __debug__: 98 debug("SG_", "Using %s for multiclass data of %s" % 99 (svm_impl_class, svm_impl)) 100 else: 101 svm_impl_class = known_svm_impl[svm_impl][0] 102 return svm_impl_class
# Conditionally make some of the implementations available if they are
# present in the present shogun
for name, item, descr in \
    [('lightsvm', "shogun.Classifier.SVMLight",
      "SVMLight classification http://svmlight.joachims.org/"),
     ('svrlight', "shogun.Regression.SVRLight",
      "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        # IDIOM FIX: assign into the dict directly instead of exec'ing a
        # source string built by interpolation (fragile quoting, slower).
        # eval() is still needed to resolve the dotted class name lazily,
        # so merely importing this module does not fail when the optional
        # implementation is absent from the installed shogun.
        known_svm_impl[name] = (eval(item), descr)
115 -def _setdebug(obj, partname):
116 """Helper to set level of debugging output for SG 117 :Parameters: 118 obj 119 In SG debug output seems to be set per every object 120 partname : basestring 121 For what kind of object we are talking about... could be automated 122 later on (TODO) 123 """ 124 debugname = "SG_%s" % partname.upper() 125 126 switch = {True: (shogun.Kernel.M_DEBUG, 'M_DEBUG', "enable"), 127 False: (shogun.Kernel.M_ERROR, 'M_ERROR', "disable")} 128 129 key = __debug__ and debugname in debug.active 130 131 sglevel, slevel, progressfunc = switch[key] 132 133 if __debug__: 134 debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" % 135 (partname, `obj`, slevel)) 136 obj.io.set_loglevel(sglevel) 137 try: 138 exec "obj.io.%s_progress()" % progressfunc 139 except: 140 warning("Shogun version installed has no way to enable progress" + 141 " reports")
142 143
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    TODO: Support different datatypes
    """
    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    # shogun stores samples as columns and works in double precision,
    # hence the cast + transpose before wrapping
    samples_as_columns = data.astype('double').T
    features = shogun.Features.RealFeatures(samples_as_columns)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")

    _setdebug(features, 'Features')
    return features
159 160
class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    # how many threads shogun should use while training/classifying
    num_threads = Parameter(1,
                            min=1,
                            descr='Number of threads to utilize')

    # NOTE: gamma is width in SG notation for RBF(Gaussian)
    # kernel literal -> (shogun kernel class, kernel parameter names,
    #                    sensitivity analyzer class or None)
    _KERNELS = { "linear": (shogun.Kernel.LinearKernel, (), LinearSVMWeights),
                 "rbf" : (shogun.Kernel.GaussianKernel, ('gamma',), None),
                 "rbfshift" : (shogun.Kernel.GaussianShiftKernel, ('gamma', 'max_shift', 'shift_step'), None),
                 "sigmoid" : (shogun.Kernel.SigmoidKernel, ('cache_size', 'gamma', 'coef0'), None),
                 }

    _KNOWN_PARAMS = [ 'C', 'epsilon' ]
    _KNOWN_KERNEL_PARAMS = [ ]

    _clf_internals = _SVM._clf_internals + [ 'sg', 'retrainable' ]

    def __init__(self,
                 kernel_type='linear',
                 svm_impl="libsvm",
                 **kwargs):
        """Interface to SVM classifiers (and regressions) provided by shogun.

        :Parameters:
          kernel_type : basestring
            Kernel literal -- one of the keys of `SVM._KERNELS`
            (e.g. 'linear', 'rbf').
          svm_impl : basestring
            Implementation literal -- one of the keys of `known_svm_impl`
            (matched case-insensitively), e.g. 'libsvm', 'gmnp', 'libsvr'.

        TODO Documentation if this all works ;-)
        """

        svm_impl = svm_impl.lower()
        # some implementations take additional tunable parameters;
        # copy the class-level list before extending to avoid mutating it
        if svm_impl == 'krr':
            self._KNOWN_PARAMS = self._KNOWN_PARAMS[:] + ['tau']
        if svm_impl in ['svrlight', 'libsvr']:
            self._KNOWN_PARAMS = self._KNOWN_PARAMS[:] + ['tube_epsilon']

        # init base class
        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        self.__svm = None
        """Holds the trained svm."""

        # assign default params
        if svm_impl in known_svm_impl:
            self.__svm_impl = svm_impl
        else:
            raise ValueError, "Unknown SVM implementation %s" % svm_impl

        self._clf_internals.append(
            {True: 'multiclass', False:'binary'}[
                svm_impl in ['gmnp', 'libsvm']])
        if svm_impl in ['svrlight', 'libsvr', 'krr']:
            self._clf_internals += [ 'regression' ]

        # Need to store original data...
        # TODO: keep 1 of them -- just __traindata or __traindataset
        # For now it is needed for computing sensitivities
        self.__traindataset = None

        # internal SG swig proxies
        self.__traindata = None
        self.__kernel = None
        self.__testdata = None

        # if we do retraining -- store hashes
        # samples, labels, test_samples
        self.__idhash = [None, None, None]

        if __debug__:
            if 'RETRAIN' in debug.active:
                # XXX it is not clear though if idhash is faster than
                # simple comparison of (dataset != __traineddataset).any(),
                # but if we like to get rid of __traineddataset then we should
                # use idhash anyways

                # XXX now we keep 2 copies of the data -- __traineddataset
                # has it in SG format... uff
                #
                # samples, labels, test_samples, trainsamples_intest
                self.__trained = [None, None, None]


    def __repr__(self):
        # adjust representation a bit to report SVM backend
        repr_ = super(SVM, self).__repr__()
        return repr_.replace("(kern", "(svm_impl='%s', kern" % self.__svm_impl)


    def __wasChanged(self, descr, i, entry):
        """Check if given entry was changed from what known prior. If so -- store

        `descr` is only used for diagnostics, `i` indexes into
        `self.__idhash` (0: samples, 1: labels, 2: test samples).
        Returns True if `entry`'s idhash differs from the stored one;
        the new hash is always stored.
        """
        idhash_ = idhash(entry)
        changed = self.__idhash[i] != idhash_
        if __debug__ and 'RETRAIN' in debug.active:
            # cross-check the cheap idhash against a full value comparison
            # to catch hash collisions while debugging retraining
            changed2 = entry != self.__trained[i]
            if isinstance(changed2, N.ndarray):
                changed2 = changed2.any()
            if changed != changed2:# and not changed:
                raise RuntimeError, \
                      'hashid found to be weak for %s. Though hashid %s!=%s %s, '\
                      'values %s!=%s %s' % \
                      (descr, idhash_, self.__idhash[i], changed,
                       entry, self.__trained[i], changed2)
            self.__trained[i] = entry
        if __debug__ and changed:
            debug('SG__', "Changed %s from %s to %s"
                  % (descr, self.__idhash[i], idhash_))
        self.__idhash[i] = idhash_
        return changed


    def _train(self, dataset):
        """Train SVM
        """
        # XXX might get up in hierarchy
        if self.retrainable:
            changed_params = self.params.whichSet()
            changed_kernel_params = self.kernel_params.whichSet()

        # XXX watchout
        # self.untrain()
        newkernel, newsvm = False, False
        if self.retrainable:
            if __debug__:
                debug('SG__', "IDHashes are %s" % (self.__idhash))
            changed_samples = self.__wasChanged('samples', 0, dataset.samples)
            changed_labels = self.__wasChanged('labels', 1, dataset.labels)

        ul = dataset.uniquelabels
        ul.sort()

        self.__traindataset = dataset

        # LABELS

        # OK -- we have to map labels since
        #  binary ones expect -1/+1
        #  Multiclass expect labels starting with 0, otherwise they puke
        #  when ran from ipython... yikes
        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            # regressions take target values as-is (as doubles)
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            if len(ul) == 2:
                # assure that we have -1/+1
                self._labels_dict = {ul[0]:-1.0,
                                     ul[1]:+1.0}
            elif len(ul) < 2:
                raise ValueError, "we do not have 1-class SVM brought into SG yet"
            else:
                # can't use plain enumerate since we need them swapped
                self._labels_dict = dict([ (ul[i], i) for i in range(len(ul))])

            # reverse labels dict for back mapping in _predict
            self._labels_dict_rev = dict([(x[1], x[0])
                                          for x in self._labels_dict.items()])

            # Map labels
            #
            # TODO: top level classifier should take care about labels
            # mapping if that is needed
            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % self._labels_dict)
            labels_ = N.asarray([ self._labels_dict[x] for x in dataset.labels ], dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')


        # KERNEL
        # (re)create kernel only if not retrainable, or training samples
        # or kernel parameters changed since the previous training
        if not self.retrainable or changed_samples or changed_kernel_params:
            # If needed compute or just collect arguments for SVM and for
            # the kernel
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # XXX Unify damn automagic gamma value
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if self.retrainable and __debug__:
                if changed_samples:
                    debug("SG",
                          "Re-Creating kernel since samples has changed")

                if changed_kernel_params:
                    debug("SG",
                          "Re-Creating kernel since params %s has changed" %
                          changed_kernel_params)

            # create training data
            if __debug__: debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (`self._kernel_type`, kargs))

            self.__kernel = self._kernel_type(self.__traindata, self.__traindata,
                                              *kargs)
            newkernel = True
            self.kernel_params.reset()  # mark them as not-changed
            if self.retrainable:
                # precompute the kernel matrix so retraining can reuse it;
                # testing kernel and its args are cached for _predict
                self.__kernel.set_precompute_matrix(True, True)
                self.__kernel_test = None
                self.__kernel_args = kargs
            _setdebug(self.__kernel, 'Kernels')

        # TODO -- handle changed_params correctly, ie without recreating
        # whole SVM
        if not self.retrainable or self.__svm is None or changed_params:
            # SVM
            C = self.params.C
            if C<0:
                # negative C requests scaling of the data-derived default
                C = self._getDefaultC(dataset.samples)*abs(C)
                if __debug__:
                    debug("SG_", "Default C for %s was computed to be %s" %
                          (self.params.C, C))

            # Choose appropriate implementation
            svm_impl_class = _get_implementation(self.__svm_impl, len(ul))

            if __debug__:
                debug("SG", "Creating SVM instance of %s" % `svm_impl_class`)

            if self.__svm_impl in ['libsvr', 'svrlight']:
                # for regressions constructor a bit different
                self.__svm = svm_impl_class(C, self.params.epsilon, self.__kernel, labels)
            elif self.__svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(C, self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            self.params.reset()  # mark them as not-changed
            newsvm = True
            _setdebug(self.__svm, 'SVM')
            # Set optimization parameters
            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            # retrainable and SVM instance survives -- just update pieces
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if changed_labels:          # labels were changed
                self.__svm.set_labels(labels)
            if newkernel:               # kernel was replaced
                self.__svm.set_kernel(self.__kernel)
            if changed_params:
                raise NotImplementedError, \
                      "Implement handling of changing params of SVM"

        if self.retrainable:
            # we must assign it only if it is retrainable
            self.states.retrained = not newsvm or not newkernel

        # Train
        if __debug__:
            debug("SG", "%sTraining %s on data with labels %s" %
                  (("","Re-")[self.retrainable and self.states.retrained], self,
                   dataset.uniquelabels))

        self.__svm.train()

        # Report on training
        if __debug__:
            debug("SG_", "Done training SG_SVM %s on data with labels %s" %
                  (self._kernel_type, dataset.uniquelabels))
            if "SG__" in debug.active:
                trained_labels = self.__svm.classify().get_labels()
                debug("SG__", "Original labels: %s, Trained labels: %s" %
                      (dataset.labels, trained_labels))


    def _predict(self, data):
        """Predict values for the data
        """

        if __debug__:
            debug("SG_", "Initializing kernel with training/testing data")

        if self.retrainable:
            changed_testdata = self.__wasChanged('test_samples', 2, data) or \
                               self.__kernel_test is None

        if not self.retrainable or changed_testdata:
            testdata = _tosg(data)

        if not self.retrainable:
            # We can just reuse kernel used for training
            self.__kernel.init(self.__traindata, testdata)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (`self._kernel_type`, self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')
                # wrap into a CustomKernel holding the precomputed matrix
                # so it can be reused across repeated predictions
                kernel_test_custom = shogun.Kernel.CustomKernel(self.__traindata, testdata)
                _setdebug(kernel_test, 'Kernels')
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        # doesn't do any good imho although on unittests helps tiny bit... hm
        #self.__svm.init_kernel_optimization()
        values_ = self.__svm.classify()
        #if self.retrainable and not changed_testdata:
        #    import pydb
        #    pydb.debugger()
        values = values_.get_labels()

        if self.retrainable:
            # we must assign it only if it is retrainable
            self.states.retested = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learing kernel. Retested is %s"
                      % self.states.retested)
            # return back original kernel
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:
            if len(self._labels_dict) == 2:
                # binary: sign of the raw value decides the class (-1/+1)
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            # assure that we have the same type
            label_type = type(self._labels_dict.values()[0])

            # remap labels back adjusting their type
            predictions = [self._labels_dict_rev[label_type(x)]
                           for x in predictions]

        if __debug__:
            debug("SG__", "Tuned predictions %s" % predictions)

        # store state variable
        self.values = values

        ## to avoid leaks with not yet properly fixed shogun
        if not self.retrainable:
            try:
                testdata.free_features()
            except:
                pass

        return predictions


    def untrain(self):
        """Untrain and free shogun-side resources (unless retrainable)."""
        super(SVM, self).untrain()

        if not self.retrainable:
            if __debug__:
                debug("SG__", "Untraining %s and destroying sg's SVM" % self)

            self.__idhash = [None, None, None]  # samples, labels

            # to avoid leaks with not yet properly fixed shogun
            # XXX make it nice... now it is just stable ;-)
            if not self.__traindata is None:
                try:
                    try:
                        self.__traindata.free_features()
                    except:
                        pass
                    if __debug__:
                        if 'RETRAIN' in debug.active:
                            self.__trained = [None, None, None]
                    self.__traindataset = None
                    del self.__kernel
                    self.__kernel = None
                    self.__kernel_test = None
                    del self.__traindata
                    self.__traindata = None
                    del self.__svm
                    self.__svm = None
                except:
                    pass

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs=locals())
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs=locals())


    svm = property(fget=lambda self: self.__svm)
    """Access to the SVM model."""

    traindataset = property(fget=lambda self: self.__traindataset)
    """Dataset which was used for training

    TODO -- might better become state variable I guess"""
578