
Source Code for Module mvpa.clfs.base

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Base classes for all classifiers.

Base classifiers can be grouped according to their function as

:group Basic Classifiers: Classifier BoostedClassifier ProxyClassifier
:group BoostedClassifiers: CombinedClassifier MulticlassClassifier
  SplitClassifier
:group ProxyClassifiers: BinaryClassifier MappedClassifier
  FeatureSelectionClassifier
:group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
  MaximalVote MeanPrediction

"""

__docformat__ = 'restructuredtext'

import operator, sys
import numpy as N

# We have to use deepcopy from python 2.5, since otherwise it fails to
# copy sensitivity analyzers with assigned combiners which are just
# functions, not functors
if sys.version_info[0] > 2 or sys.version_info[1] > 4:
    from copy import deepcopy
else:
    from mvpa.misc.copy import deepcopy

import time
from sets import Set

from mvpa.misc.args import group_kwargs
from mvpa.misc.support import idhash
from mvpa.mappers.mask import MaskMapper
from mvpa.datasets.splitter import NFoldSplitter
from mvpa.misc.state import StateVariable, Stateful, Harvestable, Parametrized
from mvpa.misc.param import Parameter

from mvpa.clfs.transerror import ConfusionMatrix, RegressionStatistics
from mvpa.misc.transformers import FirstAxisMean, SecondAxisSumOfAbs

from mvpa.measures.base import \
    BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
    MappedClassifierSensitivityAnalyzer
from mvpa.base import warning

if __debug__:
    import traceback
    from mvpa.base import debug


def _deepcopyclf(clf):
    """Deepcopy a classifier.

    If deepcopy fails -- try to untrain it first so that there are no
    swig bindings attached.
    """
    try:
        return deepcopy(clf)
    except:
        clf.untrain()
        return deepcopy(clf)
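

# The fallback above in isolation: deepcopy can choke on objects holding
# non-copyable handles (e.g. swig bindings), but often succeeds once those
# handles are dropped. A minimal standalone sketch; _Handle and _Clf below
# are illustrative stand-ins, not part of PyMVPA.
def _example_deepcopy_fallback():
    import copy

    class _Handle(object):
        """Stands in for a non-copyable resource such as a swig binding"""
        def __deepcopy__(self, memo):
            raise TypeError("cannot deepcopy a live handle")

    class _Clf(object):
        def __init__(self):
            self._model = _Handle()     # would be acquired by train()
        def untrain(self):
            self._model = None          # drop the offending handle

    clf = _Clf()
    try:
        clf2 = copy.deepcopy(clf)
    except TypeError:
        clf.untrain()
        clf2 = copy.deepcopy(clf)       # succeeds once the handle is gone
    return clf2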


class Classifier(Parametrized):
    """Abstract classifier class to be inherited by all classifiers
    """

    # Kept separate from doc so as not to pollute help(clf), especially if
    # we include help for the parent class
    _DEV__doc__ = """
    Required behavior:

    For every classifier it has to be possible to instantiate it without
    having to specify the training pattern.

    Repeated calls to the train() method with different training data have to
    result in a valid classifier, trained for the particular dataset.

    It must be possible to specify all classifier parameters as keyword
    arguments to the constructor.

    Recommended behavior:

    Derived classifiers should provide access to *values* -- i.e. the
    information that is finally used to determine the predicted class label.

    Michael: Maybe it works well if each classifier provides a 'values'
    state member. This variable is a list as long as, and in the same order
    as, Dataset.uniquelabels (training data). Each item in the list
    corresponds to the likelihood of a sample belonging to the respective
    class. However, the semantics might differ between classifiers, e.g.
    kNN would probably store distances to class-neighbors, whereas PLR
    would store the raw function value of the logistic function. So in the
    case of kNN low is predictive and for PLR high is predictive. Don't
    know if there is a need to unify that.

    As the storage and/or computation of this information might be
    demanding, its collection should be switchable and off by default.

    Nomenclature
     * predictions : corresponds to the quantized labels if the classifier
       spits out labels by .predict()
     * values : might be different from predictions if a classifier's
       predict() makes a decision based on some internal value such as
       probability or a distance.
    """
    # Dict that contains the parameters of a classifier.
    # This shall provide an interface to plug generic parameter optimizers
    # (e.g. grid- or line-search optimizers) into all classifiers.
    # A dictionary is used because Michael thinks that access by name is nicer.
    # Additionally Michael thinks ATM that additional information might be
    # necessary in some situations (e.g. reasonably predefined parameter range,
    # minimal iteration stepsize, ...), therefore the value for each key should
    # also be a dict, or we should use mvpa.misc.param.Parameter...

    trained_labels = StateVariable(enabled=True,
        doc="Set of unique labels it has been trained on")

    trained_dataset = StateVariable(enabled=False,
        doc="The dataset it has been trained on")

    training_confusion = StateVariable(enabled=False,
        doc="Confusion matrix of learning performance")

    predictions = StateVariable(enabled=True,
        doc="Most recent set of predictions")

    values = StateVariable(enabled=True,
        doc="Internal classifier values the most recent "
            "predictions are based on")

    training_time = StateVariable(enabled=True,
        doc="Time (in seconds) it took to train the classifier")

    predicting_time = StateVariable(enabled=True,
        doc="Time (in seconds) it took the classifier to predict")

    feature_ids = StateVariable(enabled=False,
        doc="Feature IDs which were used for the actual training.")

    _clf_internals = []
    """Describes some specifics about the classifier -- e.g. that it is
    doing regression, for instance..."""

    regression = Parameter(False, allowedtype='bool',
        doc="""Whether to use 'regression' as regression. By default any
        Classifier-derived class serves as a classifier, so regression
        does binary classification. TODO:""")

    retrainable = Parameter(False, allowedtype='bool',
        doc="""Whether to enable retraining for a 'retrainable' classifier.
        TODO: make it available only for actually retrainable classifiers""")

    def __init__(self, **kwargs):
        """Cheap initialization.
        """
        Parametrized.__init__(self, **kwargs)


        self.__trainednfeatures = None
        """Stores the number of features the classifier was trained on.
        If None -- it wasn't trained at all"""

        self._setRetrainable(self.params.retrainable, force=True)

        if self.params.regression:
            for statevar in [ "trained_labels"]: #, "training_confusion" ]:
                if self.states.isEnabled(statevar):
                    if __debug__:
                        debug("CLF",
                              "Disabling state %s since doing regression, " %
                              statevar + "not classification")
                    self.states.disable(statevar)
            self._summaryClass = RegressionStatistics
        else:
            self._summaryClass = ConfusionMatrix
            if 'regression' in self._clf_internals:
                # regressions are used as binary classifiers if not asked
                # to perform regression explicitly
                self._clf_internals.append('binary')

        # deprecate
        #self.__trainedidhash = None
        #"""Stores id of the dataset on which it was trained to signal
        #in trained() if it was trained already on the same dataset"""

    def __str__(self):
        if __debug__ and 'CLF_' in debug.active:
            return "%s / %s" % (repr(self), super(Classifier, self).__str__())
        else:
            return repr(self)

    def __repr__(self, prefixes=[]):
        return super(Classifier, self).__repr__(prefixes=prefixes)


    def _pretrain(self, dataset):
        """Functionality prior to training
        """
        # So we reset all state variables and maybe free up some memory
        # explicitly
        params = self.params
        if not params.retrainable:
            self.untrain()
        else:
            # just reset the states, do not untrain
            self.states.reset()
            if not self.__changedData_isset:
                self.__resetChangedData()
                _changedData = self._changedData
                __idhashes = self.__idhashes
                __invalidatedChangedData = self.__invalidatedChangedData

                # if we don't know what was changed we need to figure
                # it out
                if __debug__:
                    debug('CLF_', "IDHashes are %s" % (__idhashes))

                # Look at the data if any was changed
                for key, data_ in (('traindata', dataset.samples),
                                   ('labels', dataset.labels)):
                    _changedData[key] = self.__wasDataChanged(key, data_)
                    # if those idhashes were invalidated by retraining
                    # we need to adjust _changedData accordingly
                    if __invalidatedChangedData.get(key, False):
                        if __debug__ and not _changedData[key]:
                            debug('CLF_', 'Found that idhash for %s was '
                                  'invalidated by retraining' % key)
                        _changedData[key] = True

                # Look at the parameters
                for col in self._paramscols:
                    changedParams = self._collections[col].whichSet()
                    if len(changedParams):
                        _changedData[col] = changedParams

            self.__invalidatedChangedData = {} # reset it on training

            if __debug__:
                debug('CLF_', "Obtained _changedData is %s" % (self._changedData))

        if not params.regression and 'regression' in self._clf_internals \
           and not self.states.isEnabled('trained_labels'):
            # if the classifier internally does regression we need to have
            # the labels it was trained on
            if __debug__:
                debug("CLF", "Enabling trained_labels state since it is needed")
            self.states.enable('trained_labels')


    def _posttrain(self, dataset):
        """Functionality post training

        For instance -- computing confusion matrix
        :Parameters:
          dataset : Dataset
            Data which was used for training
        """
        if self.states.isEnabled('trained_labels'):
            self.trained_labels = dataset.uniquelabels

        self.trained_dataset = dataset

        # needs to be assigned first since below we use predict
        self.__trainednfeatures = dataset.nfeatures

        # XXX seems to be not even needed
        # self.__trained_labels_map = dataset.labels_map

        if __debug__ and 'CHECK_TRAINED' in debug.active:
            self.__trainedidhash = dataset.idhash

        if self.states.isEnabled('training_confusion') and \
               not self.states.isSet('training_confusion'):
            # we should not store predictions for training data,
            # it is confusing imho (yoh)
            self.states._changeTemporarily(
                disable_states=["predictions"])
            if self.params.retrainable:
                # we would need to recheck if data is the same,
                # XXX think if there is a way to make this all
                # efficient. For now, probably, retrainable
                # classifiers have no chance but not to use
                # training_confusion... sad
                self.__changedData_isset = False
            predictions = self.predict(dataset.samples)
            self.states._resetEnabledTemporarily()
            self.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=predictions)

            try:
                self.training_confusion.labels_map = dataset.labels_map
            except:
                pass

        if self.states.isEnabled('feature_ids'):
            self.feature_ids = self._getFeatureIds()


    def _getFeatureIds(self):
        """Virtual method to return feature_ids used while training

        Is not intended to be called anywhere but from _posttrain,
        thus the classifier is assumed to be trained at this point
        """
        # By default all features are used
        return range(self.__trainednfeatures)


    def summary(self):
        """Provide a summary of the classifier"""

        s = "Classifier %s" % self
        states = self.states
        states_enabled = states.enabled

        if self.trained:
            s += "\n trained"
            if states.isSet('training_time'):
                s += ' in %.3g sec' % states.training_time
            s += ' on data with'
            if states.isSet('trained_labels'):
                s += ' labels:%s' % list(states.trained_labels)
            if states.isSet('trained_dataset'):
                td = states.trained_dataset
                s += ' #samples:%d #chunks:%d' % (td.nsamples,
                                                  len(td.uniquechunks))
            s += " #features:%d" % self.__trainednfeatures
            if states.isSet('feature_ids'):
                s += ", used #features:%d" % len(states.feature_ids)
            if states.isSet('training_confusion'):
                s += ", training error:%.3g" % states.training_confusion.error
        else:
            s += "\n not yet trained"

        if len(states_enabled):
            s += "\n enabled states:%s" % ', '.join([str(states[x])
                                                     for x in states_enabled])
        return s


    def _train(self, dataset):
        """Function to be actually overridden in derived classes
        """
        raise NotImplementedError


    def train(self, dataset):
        """Train classifier on a dataset

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so
        """
        if __debug__:
            debug("CLF", "Training classifier %(clf)s on dataset %(dataset)s",
                  msgargs={'clf':self, 'dataset':dataset})
            #if 'CLF_TB' in debug.active:
            #    tb = traceback.extract_stack(limit=5)
            #    debug("CLF_TB", "Traceback: %s" % tb)

        self._pretrain(dataset)

        # remember the time when training started
        t0 = time.time()

        if dataset.nfeatures > 0:
            result = self._train(dataset)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("CLF",
                      "No features present for training, no actual training "
                      "is called")
            result = None

        self.training_time = time.time() - t0
        self._posttrain(dataset)
        return result


    def _prepredict(self, data):
        """Functionality prior to prediction
        """
        if not ('notrain2predict' in self._clf_internals):
            # check if the classifier was trained if that is needed
            if not self.trained:
                raise ValueError, \
                      "Classifier %s wasn't yet trained, therefore can't " \
                      "predict" % self
            nfeatures = data.shape[1]
            # check if the number of features is the same as in the data
            # it was trained on
            if nfeatures != self.__trainednfeatures:
                raise ValueError, \
                      "Classifier %s was trained on data with %d features, " % \
                      (self, self.__trainednfeatures) + \
                      "thus can't predict for %d features" % nfeatures


        if self.params.retrainable:
            if not self.__changedData_isset:
                self.__resetChangedData()
                _changedData = self._changedData
                _changedData['testdata'] = \
                    self.__wasDataChanged('testdata', data)
                if __debug__:
                    debug('CLF_', "prepredict: Obtained _changedData is %s"
                          % (_changedData))


    def _postpredict(self, data, result):
        """Functionality after prediction is computed
        """
        self.predictions = result
        if self.params.retrainable:
            self.__changedData_isset = False

    def _predict(self, data):
        """Actual prediction
        """
        raise NotImplementedError


    def predict(self, data):
        """Predict using the classifier on data

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so. Also, subclasses wanting to invoke the parent's
        prediction from within their _predict should call _predict
        instead of predict(), since the latter would loop
        """
        data = N.asarray(data)
        if __debug__:
            debug("CLF", "Predicting classifier %(clf)s on data %(data)s",
                  msgargs={'clf':self, 'data':data.shape})
            #if 'CLF_TB' in debug.active:
            #    tb = traceback.extract_stack(limit=5)
            #    debug("CLF_TB", "Traceback: %s" % tb)

        # remember the time when computing predictions started
        t0 = time.time()

        states = self.states
        # to assure that those are reset (could be set due to testing
        # post-training)
        states.reset(['values', 'predictions'])

        self._prepredict(data)

        if self.__trainednfeatures > 0 \
               or 'notrain2predict' in self._clf_internals:
            result = self._predict(data)
        else:
            warning("Trying to predict using classifier trained on no features")
            if __debug__:
                debug("CLF",
                      "No features were present for training, prediction is "
                      "bogus")
            result = [None]*data.shape[0]

        states.predicting_time = time.time() - t0

        if 'regression' in self._clf_internals and not self.params.regression:
            # We need to convert regression values into labels
            # XXX maybe unify labels -> internal_labels conversion.
            #if len(self.trained_labels) != 2:
            #    raise RuntimeError, "XXX Ask developer to implement for " \
            #        "multiclass mapping from regression into classification"

            # must be N.array so we copy it to assign labels directly
            # into labels.
            # XXX or should we just recreate "result"
            result_ = N.array(result)
            if states.isEnabled('values'):
                # values could be set by now, so assigning 'result' would
                # be misleading
                if not states.isSet('values'):
                    states.values = result_.copy()
                else:
                    # it might be that the values are pointing to result at
                    # the moment, so lets assure this silly way that
                    # they do not overlap
                    states.values = states.values.copy()

            trained_labels = self.trained_labels
            for i, value in enumerate(result):
                dists = N.abs(value - trained_labels)
                result[i] = trained_labels[N.argmin(dists)]

            if __debug__:
                debug("CLF_", "Converted regression result %(result_)s "
                      "into labels %(result)s for %(self_)s",
                      msgargs={'result_':result_, 'result':result,
                               'self_': self})

        self._postpredict(data, result)
        return result

    # XXX deprecate ?
    def isTrained(self, dataset=None):
        """Whether the classifier was already trained.

        MUST BE USED WITH CARE IF EVER"""
        if dataset is None:
            # simply return if it was trained on anything
            return not self.__trainednfeatures is None
        else:
            res = (self.__trainednfeatures == dataset.nfeatures)
            if __debug__ and 'CHECK_TRAINED' in debug.active:
                res2 = (self.__trainedidhash == dataset.idhash)
                if res2 != res:
                    raise RuntimeError, \
                          "isTrained is weak and shouldn't be relied upon. " \
                          "Got result %s although comparing of idhash says %s" \
                          % (res, res2)
            return res


    def _regressionIsBogus(self):
        """Some classifiers like BinaryClassifier can't be used for
        regression"""

        if self.params.regression:
            raise ValueError, "Regression mode is meaningless for %s" % \
                  self.__class__.__name__ + " thus don't enable it"


    @property
    def trained(self):
        """Whether the classifier was already trained"""
        return self.isTrained()

    def untrain(self):
        """Reset trained state"""
        self.__trainednfeatures = None
        # probably not needed... a retrainable classifier shouldn't be fully
        # untrained. XXX or should it be?
        #if self.params.retrainable:
        #    # XXX don't duplicate the code ;-)
        #    self.__idhashes = {'traindata': None, 'labels': None,
        #                       'testdata': None, 'testtraindata': None}
        super(Classifier, self).reset()


    def getSensitivityAnalyzer(self, **kwargs):
        """Factory method to return an appropriate sensitivity analyzer for
        the respective classifier."""
        raise NotImplementedError


    #
    # Methods which are needed for retrainable classifiers
    #
    def _setRetrainable(self, value, force=False):
        """Assign value of the retrainable parameter

        If the retrainable flag is to be changed, the classifier has to be
        untrained. Also internal attributes such as _changedData,
        __changedData_isset, and __idhashes should be initialized if
        it becomes retrainable
        """
        pretrainable = self.params['retrainable']
        if (force or value != pretrainable.value) \
               and 'retrainable' in self._clf_internals:
            if __debug__:
                debug("CLF_", "Setting retrainable to %s" % value)
            if 'meta' in self._clf_internals:
                warning("Retrainability is not yet crafted/tested for "
                        "meta classifiers. Unpredictable behavior might occur")
            # assure that we don't drag anything behind
            if self.trained:
                self.untrain()
            states = self.states
            if not value and states.isKnown('retrained'):
                states.remove('retrained')
                states.remove('repredicted')
            if value:
                if not 'retrainable' in self._clf_internals:
                    warning("Setting of flag retrainable for %s has no effect"
                            " since classifier has no such capability. It would"
                            " just lead to resources consumption and slowdown"
                            % self)
                states.add(StateVariable(enabled=True,
                        name='retrained',
                        doc="Whether the retrainable classifier was retrained"))
                states.add(StateVariable(enabled=True,
                        name='repredicted',
                        doc="Whether the retrainable classifier was repredicted"))

            pretrainable.value = value

            # if retrainable we need to keep track of things
            if value:
                self.__idhashes = {'traindata': None, 'labels': None,
                                   'testdata': None} #, 'testtraindata': None}
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    # XXX it is not clear though if idhash is faster than
                    # simple comparison of (dataset != __traineddataset).any(),
                    # but if we like to get rid of __traineddataset then we
                    # should use idhash anyways
                    self.__trained = self.__idhashes.copy() # just the same Nones
                self.__resetChangedData()
                self.__invalidatedChangedData = {}
            elif 'retrainable' in self._clf_internals:
                #self.__resetChangedData()
                self.__changedData_isset = False
                self._changedData = None
                self.__idhashes = None
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    self.__trained = None

    def __resetChangedData(self):
        """For retrainable classifiers we keep track of what was changed.
        This function resets that dictionary
        """
        if __debug__:
            debug('CLF_',
                  'Resetting flags on whether data was changed (for retrainable)')
        keys = self.__idhashes.keys() + self._paramscols
        # XXX we might like just to reinit values to False
        #_changedData = self._changedData
        #if isinstance(_changedData, dict):
        #    for key in _changedData.keys():
        #        _changedData[key] = False
        self._changedData = dict(zip(keys, [False]*len(keys)))
        self.__changedData_isset = False


    def __wasDataChanged(self, key, entry, update=True):
        """Check if a given entry was changed from what was known prior.
        If so -- store the new hash

        Needed only for the retrainable beastie
        """
        idhash_ = idhash(entry)
        __idhashes = self.__idhashes

        changed = __idhashes[key] != idhash_
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            __trained = self.__trained
            changed2 = entry != __trained[key]
            if isinstance(changed2, N.ndarray):
                changed2 = changed2.any()
            if changed != changed2 and not changed:
                raise RuntimeError, \
                      'idhash found to be weak for %s. Though hashid %s!=%s %s, '\
                      'values %s!=%s %s' % \
                      (key, idhash_, __idhashes[key], changed,
                       entry, __trained[key], changed2)
            if update:
                __trained[key] = entry

        if __debug__ and changed:
            debug('CLF_', "Changed %s from %s to %s.%s"
                  % (key, __idhashes[key], idhash_,
                     ('','updated')[int(update)]))
        if update:
            __idhashes[key] = idhash_

        return changed


#    def __updateHashIds(self, key, data):
#        """Is a twofold operation: updates hashid if it was said that it
#        changed.
#
#        or if it wasn't said that data changed, but CHECK_RETRAIN is active
#        and it is found to be changed -- raise an Exception
#        """
#
#        check_retrain = __debug__ and 'CHECK_RETRAIN' in debug.active
#        chd = self._changedData
#
#        # we need to update the idhashes
#        if chd[key] or check_retrain:
#            keychanged = self.__wasDataChanged(key, data)
#            if check_retrain and keychanged and not chd[key]:
#                raise RuntimeError, \
#                      "Data %s found changed although wasn't " \
#                      "labeled as such" % key


    #
    # Additional API which is specific only to retrainable classifiers.
    # For now it would just puke if asked from a not retrainable one XXX
    #
    # Might come useful and efficient for statistics testing, so if just
    # the labels of the dataset changed, then
    #   self.retrain(dataset, labels=True)
    # would cause efficient retraining (no kernels recomputed etc)
    # and subsequent self.repredict(data) should be also quite fast ;-)
    #
    def retrain(self, dataset, **kwargs):
        """Helper to avoid checking if data was actually changed

        Useful if just some aspects of the classifier were changed since
        its previous training. For instance if the dataset wasn't changed
        but only classifier parameters, then the kernel matrix does not
        have to be computed.

        Words of caution: the classifier must be previously trained, and
        results should always first be compared to the results of a not
        'retrainable' classifier (without calling retrain). Some
        additional checks are enabled if debug id 'CHECK_RETRAIN' is
        enabled, to guard against obvious mistakes.

        :Parameters:
          kwargs
            that is what _changedData gets updated with. So, smth like
            ``(params=['C'], labels=True)`` if parameter C and labels
            got changed
        """
        # Note that it also demolishes anything for repredicting,
        # which should be ok in most of the cases
        if __debug__:
            if not self.params.retrainable:
                raise RuntimeError, \
                      "Do not use retrain/repredict on non-retrainable classifiers"

            if kwargs.has_key('params') or kwargs.has_key('kernel_params'):
                raise ValueError, \
                      "Retraining for changed params is not yet working"

        self.__resetChangedData()

        # local bindings
        chd = self._changedData
        ichd = self.__invalidatedChangedData

        chd.update(kwargs)
        # mark for future 'train()' items which are explicitly
        # mentioned as changed
        for key, value in kwargs.iteritems():
            if value: ichd[key] = True
        self.__changedData_isset = True

        # To check if we are not fooled
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            for key, data_ in (('traindata', dataset.samples),
                               ('labels', dataset.labels)):
                # so it wasn't told to be invalid
                if not chd[key] and not ichd.get(key, False):
                    if self.__wasDataChanged(key, data_, update=False):
                        raise RuntimeError, \
                              "Data %s found changed although wasn't " \
                              "labeled as such" % key

        # TODO: parameters of classifiers... for now there is explicit
        # 'forbidance' above

        # The check below should be superseded by the check above, thus
        # never occur. XXX remove later on
        if __debug__ and 'CHECK_RETRAIN' in debug.active and self.trained \
               and not self._changedData['traindata'] \
               and self.__trained['traindata'].shape != dataset.samples.shape:
            raise ValueError, \
                  "In retrain got dataset with %s size, " \
                  "whereas previously it was trained on %s size" \
                  % (dataset.samples.shape, self.__trained['traindata'].shape)
        self.train(dataset)


    def repredict(self, data, **kwargs):
        """Helper to avoid checking if data was actually changed

        Useful if the classifier was (re)trained but with the same data
        (so just parameters were changed), so that it could be
        repredicted easily (on the same data as before) without
        recomputing, for instance, the train/test kernel matrix. Should be
        used with caution and always compared to the results of a not
        'retrainable' classifier. Some additional checks are enabled
        if debug id 'CHECK_RETRAIN' is enabled, to guard against
        obvious mistakes.

        :Parameters:
          data
            data which is conventionally given to predict
          kwargs
            that is what _changedData gets updated with. So, smth like
            ``(params=['C'], labels=True)`` if parameter C and labels
            got changed
        """
        if len(kwargs)>0:
            raise RuntimeError, \
                  "repredict for now should be used without params since " \
                  "it makes little sense to repredict if anything got changed"
        if __debug__ and not self.params.retrainable:
            raise RuntimeError, \
                  "Do not use retrain/repredict on non-retrainable classifiers"

        self.__resetChangedData()
        chd = self._changedData
        chd.update(**kwargs)
        self.__changedData_isset = True


        # check if we are attempting to perform on the same data
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            for key, data_ in (('testdata', data),):
                # so it wasn't told to be invalid
                #if not chd[key]:# and not ichd.get(key, False):
                if self.__wasDataChanged(key, data_, update=False):
                    raise RuntimeError, \
                          "Data %s found changed although wasn't " \
                          "labeled as such" % key

        # Should be superseded by the check above
        # XXX remove in future
        if __debug__ and 'CHECK_RETRAIN' in debug.active \
               and not self._changedData['testdata'] \
               and self.__trained['testdata'].shape != data.shape:
            raise ValueError, \
                  "In repredict got dataset with %s size, " \
                  "whereas previously it was trained on %s size" \
                  % (data.shape, self.__trained['testdata'].shape)

        return self.predict(data)


    # TODO: callback into retrainable parameter
    #retrainable = property(fget=_getRetrainable, fset=_setRetrainable,
    #    doc="Specifies whether the classifier should be retrainable")
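

# A minimal sketch (numpy only) of the regression-to-label conversion done
# in Classifier.predict above: each raw regression value is mapped onto the
# closest label seen during training. The function name and sample values
# are illustrative.
def _example_nearest_label_quantization():
    values = N.array([0.1, 0.9, 1.6, -0.4])     # raw regression output
    trained_labels = N.array([-1, 1])           # labels seen during training
    return [trained_labels[N.argmin(N.abs(v - trained_labels))]
            for v in values]                    # -> [1, 1, 1, -1]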


#
# Base classifiers of various kinds
#

class BoostedClassifier(Classifier, Harvestable):
    """Classifier containing a farm of other classifiers.

    Should rarely be used directly. Use one of its children instead
    """

    # should not be needed if we have prediction_values upstairs
    # TODO : should be handled as Harvestable or smth like that
    raw_predictions = StateVariable(enabled=False,
        doc="Predictions obtained from each classifier")

    raw_values = StateVariable(enabled=False,
        doc="Values obtained from each classifier")

    def __init__(self, clfs=None, propagate_states=True,
                 harvest_attribs=None, copy_attribs='copy',
                 **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list
            list of classifier instances to use (slave classifiers)
          propagate_states : bool
            whether to propagate enabled states into slave classifiers.
            It is in effect only when slaves get assigned -- so if a state
            is enabled after construction, it would not necessarily
            propagate into the slaves
          harvest_attribs : list of basestr
            what attributes of the call to store and return within
            the harvested state variable
          copy_attribs : None or basestr
            force copying values of attributes on harvesting
          kwargs : dict
            dict of keyword arguments which might get used
            by State or Classifier
        """
        if clfs == None:
            clfs = []

        Classifier.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        self.__clfs = None
        """Pylint friendly definition of __clfs"""

        self.__propagate_states = propagate_states
        """Enable current enabled states in slave classifiers"""

        self._setClassifiers(clfs)
        """Store the list of classifiers"""


    def __repr__(self, prefixes=[]):
        if self.__clfs is None or len(self.__clfs)==0:
            #prefix_ = "clfs=%s" % repr(self.__clfs)
            prefix_ = []
        else:
            prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
        return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)


    def _train(self, dataset):
        """Train `BoostedClassifier`
        """
        for clf in self.__clfs:
            clf.train(dataset)


    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if asked to do so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            self.__changedData_isset = False


    def _getFeatureIds(self):
        """Custom _getFeatureIds for `BoostedClassifier`
        """
        # return the union of all features used by the slave classifiers
        feature_ids = Set([])
        for clf in self.__clfs:
            feature_ids = feature_ids.union(Set(clf.feature_ids))
        return list(feature_ids)


    def _predict(self, data):
        """Predict using `BoostedClassifier`
        """
        raw_predictions = [ clf.predict(data) for clf in self.__clfs ]
        self.raw_predictions = raw_predictions
        assert(len(self.__clfs)>0)
        if self.states.isEnabled("values"):
            # XXX pylint complains that numpy has no array member... weird
            if N.array([x.states.isEnabled("values")
                        for x in self.__clfs]).all():
                values = [ clf.values for clf in self.__clfs ]
                self.raw_values = values
            else:
                warning("One or more classifiers in %s has no 'values' state "
                        % self + "enabled, thus BoostedClassifier can't have "
                        "the 'raw_values' state variable defined")

        return raw_predictions


    def _setClassifiers(self, clfs):
        """Set the classifiers used by the boosted classifier

        We have to allow setting the list of classifiers after the object
        was actually created. It will be used by
        MulticlassClassifier
        """
        self.__clfs = clfs
        """Classifiers to use"""

        for flag in ['regression']:
            values = N.array([clf.params[flag].value for clf in self.__clfs])
            value = values.any()
            if __debug__:
                debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
                      "%(clfs)s with %(values)s",
                      msgargs={'flag' : flag, 'value' : value,
                               'clfs' : self.__clfs,
                               'values' : values})
            # set flag if it needs to be trained before predicting
            self.params[flag].value = value

        # enable corresponding states in the slave-classifiers
        if self.__propagate_states:
            for clf in self.__clfs:
                clf.states.enable(self.states.enabled, missingok=True)

        # adhere to their capabilities + 'multiclass'
        # XXX do intersection across all classifiers!
        self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
        if len(clfs)>0:
            self._clf_internals += self.__clfs[0]._clf_internals

    def untrain(self):
        """Untrain `BoostedClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        for clf in self.clfs:
            clf.untrain()
        super(BoostedClassifier, self).untrain()

    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return BoostedClassifierSensitivityAnalyzer(
                self,
                **kwargs)


    clfs = property(fget=lambda x:x.__clfs,
                    fset=_setClassifiers,
                    doc="Used classifiers")
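

# A minimal sketch of the "farm" pattern BoostedClassifier implements: every
# slave is trained on the same dataset and raw per-slave predictions are
# collected for later combination. _DummyClf below is illustrative only.
def _example_classifier_farm():
    class _DummyClf(object):
        def __init__(self, answer):
            self.answer = answer
        def train(self, dataset):
            pass                        # a real slave would fit itself here
        def predict(self, data):
            return [self.answer] * len(data)

    clfs = [_DummyClf(0), _DummyClf(1), _DummyClf(1)]
    data = [[0.0], [1.0]]
    for clf in clfs:
        clf.train(data)
    # analogous to the raw_predictions state above
    return [clf.predict(data) for clf in clfs]   # [[0, 0], [1, 1], [1, 1]]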


class ProxyClassifier(Classifier):
    """Classifier which decorates another classifier

    Possible uses:

     - modify data somehow prior to training/testing:
       * normalization
       * feature selection
       * modification

     - optimized classifier?

    """
    def __init__(self, clf, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier to proxy, i.e. the slave classifier to decorate
        """

        Classifier.__init__(self, regression=clf.regression, **kwargs)

        self.__clf = clf
        """Store the classifier to use."""

        # adhere to slave classifier capabilities
        # XXX test test test
        self._clf_internals = self._clf_internals[:] + ['meta']
        if clf is not None:
            self._clf_internals += clf._clf_internals


    def __repr__(self, prefixes=[]):
        return super(ProxyClassifier, self).__repr__(
            ["clf=%s" % repr(self.__clf)] + prefixes)

    def summary(self):
        s = super(ProxyClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifier summary:" + \
                 '\n + %s' % \
                 (self.__clf.summary().replace('\n', '\n |'))
        return s



    def _train(self, dataset):
        """Train `ProxyClassifier`
        """
        # base class does nothing much -- just proxies requests to the
        # underlying classifier
        self.__clf.train(dataset)

        # for the ease of access
        # TODO: if to copy we should exclude some states which are defined in
        #       base Classifier (such as training_time, predicting_time)
        # YOH: for now _copy_states_ would copy only set state variables. If
        #      anything needs to be overridden in the parent's class, it is
        #      welcome to do so
        #self.states._copy_states_(self.__clf, deep=False)


    def _predict(self, data):
        """Predict using `ProxyClassifier`
        """
        clf = self.__clf
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf.predict(data)
        # for the ease of access
        self.states._copy_states_(self.__clf, ['values'], deep=False)
        return result


    def untrain(self):
        """Untrain `ProxyClassifier`
        """
        if not self.__clf is None:
            self.__clf.untrain()
        super(ProxyClassifier, self).untrain()


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return ProxyClassifierSensitivityAnalyzer(
                self,
                analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs),
                **kwargs)


    clf = property(lambda x:x.__clf, doc="Used `Classifier`")



#
# Various combiners for CombinedClassifier
#

class PredictionsCombiner(Stateful):
    """Base class for combining decisions of multiple classifiers"""

    def train(self, clfs, dataset):
        """PredictionsCombiner might need to be trained

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since the combiner might use some other
            state variables (values) instead of pure predictions
          dataset : Dataset
            training data in this case
        """
        pass


    def __call__(self, clfs, dataset):
        """Call function

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since the combiner might use some other
            state variables (values) instead of pure predictions
        """
        raise NotImplementedError



class MaximalVote(PredictionsCombiner):
    """Provides a decision using the maximal vote rule"""

    predictions = StateVariable(enabled=True,
        doc="Voted predictions")
    all_label_counts = StateVariable(enabled=False,
        doc="Counts across classifiers for each label/sample")

    def __init__(self):
        """XXX Might get a parameter to use raw decision values if
        voting is not unambiguous (i.e. two classes have an equal number
        of votes)
        """
        PredictionsCombiner.__init__(self)


    def __call__(self, clfs, dataset):
        """Actual callable -- perform voting

        Extended functionality which might not actually be needed:
        Since `BinaryClassifier` might return a list of possible
        predictions (not just a single one), we should consider all of those

        MaximalVote doesn't care about the dataset itself
        """
        if len(clfs)==0:
            return []                   # don't even bother then

        all_label_counts = None
        for clf in clfs:
            # Lets check first if the necessary state variable is enabled
            if not clf.states.isEnabled("predictions"):
                raise ValueError, "MaximalVote needs classifiers (such as " + \
                      "%s) with state 'predictions' enabled" % clf
            predictions = clf.predictions
            if all_label_counts is None:
                all_label_counts = [ {} for i in xrange(len(predictions)) ]

            # for every sample
            for i in xrange(len(predictions)):
                prediction = predictions[i]
                if not operator.isSequenceType(prediction):
                    prediction = (prediction,)
                for label in prediction: # for every label
                    # XXX we might have multiple labels assigned
                    # but might not -- don't remember now
                    if not all_label_counts[i].has_key(label):
                        all_label_counts[i][label] = 0
                    all_label_counts[i][label] += 1

        predictions = []
        # select the maximal vote now for each sample
        for i in xrange(len(all_label_counts)):
            label_counts = all_label_counts[i]
            # lets do an explicit search for max so we know
            # if it is unique
            maxk = []                   # labels of elements with max vote
            maxv = -1
            for k, v in label_counts.iteritems():
                if v > maxv:
                    maxk = [k]
                    maxv = v
                elif v == maxv:
                    maxk.append(k)

            assert len(maxk) >= 1, \
                   "We should have obtained at least a single key of max label"

            if len(maxk) > 1:
                warning("We got multiple labels %s which have the " % maxk +
                        "same maximal vote %d. XXX disambiguate" % maxv)
            predictions.append(maxk[0])

        self.all_label_counts = all_label_counts
        self.predictions = predictions
        return predictions
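

# The voting rule above reduced to its core: per sample, count the label each
# classifier predicted and keep the most frequent one (ties broken
# arbitrarily, as above). A standalone sketch over plain prediction lists:
def _example_maximal_vote():
    per_clf_predictions = [[1, 2, 2],   # predictions of classifier 1
                           [1, 2, 1],   # predictions of classifier 2
                           [2, 2, 1]]   # predictions of classifier 3
    voted = []
    for sample_preds in zip(*per_clf_predictions):
        counts = {}
        for label in sample_preds:
            counts[label] = counts.get(label, 0) + 1
        best_label, best_count = None, -1
        for label, count in counts.items():
            if count > best_count:
                best_label, best_count = label, count
        voted.append(best_label)
    return voted                        # -> [1, 2, 1]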


class MeanPrediction(PredictionsCombiner):
    """Provides a decision by taking the mean of the results
    """

    predictions = StateVariable(enabled=True,
        doc="Mean predictions")

    def __call__(self, clfs, dataset):
        """Actual callable -- compute the mean
        """
        if len(clfs)==0:
            return []                   # don't even bother then

        all_predictions = []
        for clf in clfs:
            # Lets check first if the necessary state variable is enabled
            if not clf.states.isEnabled("predictions"):
                raise ValueError, "MeanPrediction needs classifiers (such as " + \
                      "%s) with state 'predictions' enabled" % clf
            all_predictions.append(clf.predictions)

        # compute mean
        predictions = N.mean(N.asarray(all_predictions), axis=0)
        self.predictions = predictions
        return predictions
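

# The regression counterpart of voting: per sample, average the predictions
# of all classifiers. A one-step standalone sketch with numpy:
def _example_mean_prediction():
    all_predictions = N.array([[0.2, 1.0],      # predictions of classifier 1
                               [0.4, 0.8]])     # predictions of classifier 2
    return N.mean(all_predictions, axis=0)      # -> [ 0.3  0.9]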


class ClassifierCombiner(PredictionsCombiner):
    """Provides a decision by training a classifier on predictions/values

    TODO
    """

    predictions = StateVariable(enabled=True,
        doc="Trained predictions")


    def __init__(self, clf, variables=None):
        """Initialize `ClassifierCombiner`

        :Parameters:
          clf : Classifier
            Classifier to train on the predictions
          variables : list of basestring
            List of state variables stored in the 'combined' classifiers,
            which are to be used as features for training this classifier
        """
        PredictionsCombiner.__init__(self)

        self.__clf = clf
        """Classifier to train on `variables` states of provided classifiers"""

        if variables == None:
            variables = ['predictions']
        self.__variables = variables
        """What state variables of the classifiers to use"""


    def untrain(self):
        """It might be needed to untrain the used classifier"""
        if self.__clf:
            self.__clf.untrain()

    def __call__(self, clfs, dataset):
        """
        """
        if len(clfs)==0:
            return []                   # don't even bother then

        # XXX TODO
        raise NotImplementedError



class CombinedClassifier(BoostedClassifier):
    """`BoostedClassifier` which combines predictions using some
    `PredictionsCombiner` functor.
    """

    def __init__(self, clfs=None, combiner=None, **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list of Classifier
            list of classifier instances to use
          combiner : PredictionsCombiner
            callable which takes care of combining multiple
            results into a single one (e.g. maximal vote for
            classification, MeanPrediction for regression)
          kwargs : dict
            dict of keyword arguments which might get used
            by State or Classifier

        NB: `combiner` might need to operate not on the 'predictions'
            discrete labels but rather on the raw 'class' values the
            classifiers estimate (which is pretty much what is stored
            under `values`)
        """
        if clfs == None:
            clfs = []

        BoostedClassifier.__init__(self, clfs, **kwargs)

        # assign default combiner
        if combiner is None:
            combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
        self.__combiner = combiner
        """Functor destined to combine results of multiple classifiers"""


    def __repr__(self, prefixes=[]):
        return super(CombinedClassifier, self).__repr__(
            ["combiner=%s" % repr(self.__combiner)] + prefixes)


    def summary(self):
        s = super(CombinedClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifiers summaries:"
            for i, clf in enumerate(self.clfs):
                s += '\n + %d clf: %s' % \
                     (i, clf.summary().replace('\n', '\n |'))
        return s


    def untrain(self):
        """Untrain `CombinedClassifier`
        """
        try:
            self.__combiner.untrain()
        except:
            pass
        super(CombinedClassifier, self).untrain()

    def _train(self, dataset):
        """Train `CombinedClassifier`
        """
        BoostedClassifier._train(self, dataset)
        # the combiner might need to train as well
        self.__combiner.train(self.clfs, dataset)


    def _predict(self, data):
        """Predict using `CombinedClassifier`
        """
        BoostedClassifier._predict(self, data)
        # the combiner will make use of state variables instead of only
        # the predictions returned from _predict
        predictions = self.__combiner(self.clfs, data)
        self.predictions = predictions

        if self.states.isEnabled("values"):
            if self.__combiner.states.isActive("values"):
                # XXX or maybe we could leave it simply up to accessing
                # .combiner?
                self.values = self.__combiner.values
            else:
                if __debug__:
                    warning("Boosted classifier %s has 'values' state" % self +
                            " enabled, but the combiner doesn't have it"
                            " active, thus no values could be provided"
                            " directly, access .clfs")
        return predictions


    combiner = property(fget=lambda x:x.__combiner,
                        doc="Combiner used to derive a single result")



class BinaryClassifier(ProxyClassifier):
    """`ProxyClassifier` which maps a set of two labels into +1 and -1
    """

    def __init__(self, clf, poslabels, neglabels, **kwargs):
        """
        :Parameters:
          clf : Classifier
            classifier to use
          poslabels : list
            list of labels which are treated as the +1 category
          neglabels : list
            list of labels which are treated as the -1 category
        """

        ProxyClassifier.__init__(self, clf, **kwargs)

        self._regressionIsBogus()

        # Handle labels
        sposlabels = Set(poslabels)     # so to remove duplicates
        sneglabels = Set(neglabels)     # so to remove duplicates

        # check if there is no overlap
        overlap = sposlabels.intersection(sneglabels)
        if len(overlap)>0:
            raise ValueError("Sets of positive and negative labels for "
                             "BinaryClassifier must not overlap. Got "
                             "overlap %s" % overlap)

        self.__poslabels = list(sposlabels)
        self.__neglabels = list(sneglabels)

        # define what values will be returned by predict: if there is
        # a single label -- return just it alone, otherwise -- the whole
        # list
        # Such an approach might come in useful if we use some classifiers
        # over different subsets of data with some voting later on
        # (1-vs-therest?)

        if len(self.__poslabels) > 1:
            self.__predictpos = self.__poslabels
        else:
            self.__predictpos = self.__poslabels[0]

        if len(self.__neglabels) > 1:
            self.__predictneg = self.__neglabels
        else:
            self.__predictneg = self.__neglabels[0]


    def __repr__(self, prefixes=[]):
        prefix = "poslabels=%s, neglabels=%s" % (
            repr(self.__poslabels), repr(self.__neglabels))
        return super(BinaryClassifier, self).__repr__([prefix] + prefixes)


    def _train(self, dataset):
        """Train `BinaryClassifier`
        """
        idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
                   [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
        # XXX we have to sort ids since at the moment Dataset.selectSamples
        #     doesn't take care about order
        idlabels.sort()
        # select the samples
        orig_labels = None

        # If we need all samples, why not simply perform on the original
        # data and just store/restore labels. But it really should be done
        # within Dataset.selectSamples
        if len(idlabels) == dataset.nsamples \
           and [x[0] for x in idlabels] == range(dataset.nsamples):
            # the last condition is not even necessary... just overly
            # cautious
            datasetselected = dataset   # no selection is needed
            orig_labels = dataset.labels # but we would need to restore labels
            if __debug__:
                debug('CLFBIN',
                      "Assigned all %d samples for binary " %
                      (dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels))
        else:
            datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
            if __debug__:
                debug('CLFBIN',
                      "Selected %d samples out of %d samples for binary " %
                      (len(idlabels), dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels) +
                      ". Selected %s" % datasetselected)

        # adjust the labels
        datasetselected.labels = [ x[1] for x in idlabels ]

        # now we got a dataset with only 2 labels
        if __debug__:
            assert((datasetselected.uniquelabels == [-1, 1]).all())

        self.clf.train(datasetselected)

        if not orig_labels is None:
            dataset.labels = orig_labels

    def _predict(self, data):
        """Predict the labels for a given `data`

        Predicts using the binary classifier and spits out a list (for each
        sample) with either poslabels or neglabels as the "label" for the
        sample. If there was just a single label within pos or neg labels
        then it would return not a list but just that single label.
        """
        binary_predictions = ProxyClassifier._predict(self, data)
        self.values = binary_predictions
        predictions = [ {-1: self.__predictneg,
                         +1: self.__predictpos}[x] for x in binary_predictions]
        self.predictions = predictions
        return predictions
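

# A standalone sketch of the two-step relabeling BinaryClassifier performs:
# original labels are collapsed to -1/+1 for training, and -1/+1 predictions
# are expanded back into the original labels afterwards. The label values
# below are made up for illustration.
def _example_binary_relabeling():
    poslabels, neglabels = ['faces'], ['houses']
    labels = ['faces', 'houses', 'faces']
    # forward: original labels -> -1/+1
    binary = [(-1, +1)[label in poslabels] for label in labels]
    # backward: -1/+1 predictions -> original labels
    mapping = {+1: poslabels[0], -1: neglabels[0]}
    restored = [mapping[b] for b in binary]
    assert restored == labels
    return binary                       # -> [1, -1, 1]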


class MulticlassClassifier(CombinedClassifier):
    """`CombinedClassifier` to perform multiclass classification using a
    list of `BinaryClassifier`,

    such as 1-vs-1 (i.e. in pairs, like libsvm does) or 1-vs-all (which
    is yet to be thought through)
    """

    def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          bclf_type
            "1-vs-1" or "1-vs-all", determines the way to generate binary
            classifiers
        """
        CombinedClassifier.__init__(self, **kwargs)
        self._regressionIsBogus()
        if not clf is None:
            clf._regressionIsBogus()

        self.__clf = clf
        """Store sample instance of basic classifier"""

        # Some checks on known ways to do multiclass
        if bclf_type == "1-vs-1":
            pass
        elif bclf_type == "1-vs-all":   # TODO
            raise NotImplementedError
        else:
            raise ValueError, \
                  "Unknown type of classifier %s for " % bclf_type + \
                  "BoostedMulticlassClassifier"
        self.__bclf_type = bclf_type

    # XXX fix it up a bit... it seems that MulticlassClassifier should
    # actually be a ProxyClassifier and use BoostedClassifier internally
    def __repr__(self, prefixes=[]):
        prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
                                           repr(self.__clf))
        return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)


    def _train(self, dataset):
        """Train classifier
        """
        # construct binary classifiers
        ulabels = dataset.uniquelabels
        if self.__bclf_type == "1-vs-1":
            # generate pairs and corresponding classifiers
            biclfs = []
            for i in xrange(len(ulabels)):
                for j in xrange(i+1, len(ulabels)):
                    clf = _deepcopyclf(self.__clf)
                    biclfs.append(
                        BinaryClassifier(
                            clf,
                            poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
            if __debug__:
                debug("CLFMC", "Created %d binary classifiers for %d labels" %
                      (len(biclfs), len(ulabels)))

            self.clfs = biclfs

        elif self.__bclf_type == "1-vs-all":
            raise NotImplementedError

        # perform actual training
        CombinedClassifier._train(self, dataset)
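

# A minimal sketch of the 1-vs-1 scheme used above: one binary problem per
# unordered pair of labels, i.e. len(ulabels)*(len(ulabels)-1)/2 classifiers.
def _example_one_vs_one_pairs():
    ulabels = [0, 1, 2, 3]
    pairs = [(ulabels[i], ulabels[j])
             for i in xrange(len(ulabels))
             for j in xrange(i + 1, len(ulabels))]
    return pairs    # 6 pairs: (0,1), (0,2), (0,3), (1,2), (1,3), (2,3)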


class SplitClassifier(CombinedClassifier):
    """`BoostedClassifier` to work on splits of the data

    """

    """
    TODO: SplitClassifier and MulticlassClassifier have too much in
          common -- need to refactor: just need a splitter which would
          split the dataset in pairs of class labels. MulticlassClassifier
          does just a tiny bit more which might not be necessary at
          all: map sets of labels into 2 categories...
    """

    # TODO: unify with CrossValidatedTransferError which now uses
    # harvest_attribs to expose gathered attributes
    confusion = StateVariable(enabled=False,
        doc="Resultant confusion whenever the classifier is trained "
            "on 1 part and tested on the 2nd part of each split")

    splits = StateVariable(enabled=False, doc=
        """Store the actual splits of the data. Can be memory expensive""")

    # XXX couldn't be training_confusion since it has another meaning
    #     here, BUT it is named so within CrossValidatedTransferError
    #     -- unify
    # YYY decided to go with overriding semantics a tiny bit. For the split
    #     classifier training_confusion would correspond to a summary
    #     over training errors across all splits. Later on if the need comes
    #     we might want to implement global_training_confusion which would
    #     correspond to the overall confusion on the full training dataset
    #     as it is done in base Classifier
    #global_training_confusion = StateVariable(enabled=False,
    #    doc="Summary over training confusions acquired at each split")

    def __init__(self, clf, splitter=NFoldSplitter(cvtype=1), **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          splitter : Splitter
            `Splitter` to use to split the dataset prior to training
        """

        CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
        self.__clf = clf
        """Store sample instance of basic classifier"""

        if isinstance(splitter, type):
            raise ValueError, \
                  "Please provide an instance of a splitter, not a type." \
                  " Got %s" % splitter

        self.__splitter = splitter


    def _train(self, dataset):
        """Train `SplitClassifier`
        """
        # generate pairs and corresponding classifiers
        bclfs = []

        # local binding
        states = self.states

        clf_template = self.__clf
        if states.isEnabled('confusion'):
            states.confusion = clf_template._summaryClass()
        if states.isEnabled('training_confusion'):
            clf_template.states.enable(['training_confusion'])
            states.training_confusion = clf_template._summaryClass()

        clf_hastestdataset = hasattr(clf_template, 'testdataset')

        # for proper and easier debugging -- first define classifiers and
        # then train them
        for split in self.__splitter.splitcfg(dataset):
            if __debug__:
                debug("CLFSPL",
                      "Deepcopying %(clf)s for %(sclf)s",
                      msgargs={'clf':clf_template,
                               'sclf':self})
            clf = _deepcopyclf(clf_template)
            bclfs.append(clf)
        self.clfs = bclfs

        self.splits = []

        for i, split in enumerate(self.__splitter(dataset)):
            if __debug__:
                debug("CLFSPL", "Training classifier for split %d" % (i))

            if states.isEnabled("splits"):
                self.splits.append(split)

            clf = self.clfs[i]

            # assign testing dataset if the given classifier can digest it
            if clf_hastestdataset:
                clf.testdataset = split[1]

            clf.train(split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset:
                clf.testdataset = None

            if states.isEnabled("confusion"):
                predictions = clf.predict(split[1].samples)
                self.confusion.add(split[1].labels, predictions,
                                   clf.states.get('values', None))
            if states.isEnabled("training_confusion"):
                states.training_confusion += \
                    clf.states.training_confusion
        # XXX hackish way -- so it should work only for ConfusionMatrix
        try:
            if states.isEnabled("confusion"):
                states.confusion.labels_map = dataset.labels_map
            if states.isEnabled("training_confusion"):
                states.training_confusion.labels_map = dataset.labels_map
        except:
            pass


    @group_kwargs(prefixes=['slave_'], passthrough=True)
1703 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1704 """Return an appropriate SensitivityAnalyzer for `SplitClassifier` 1705 1706 :Parameters: 1707 combiner 1708 If not provided, FirstAxisMean is assumed 1709 """ 1710 kwargs.setdefault('combiner', FirstAxisMean) 1711 return BoostedClassifierSensitivityAnalyzer( 1712 self, 1713 analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs), 1714 **kwargs)
1715
1716 splitter = property(fget=lambda x:x.__splitter,
1717 doc="Splitter used by `SplitClassifier`")
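A minimal usage sketch (assuming a preconstructed `dataset` with labels and chunks; `kNN` and the `enable_states` constructor argument come from the surrounding package and are used here illustratively, not prescriptively):

    from mvpa.clfs.knn import kNN
    from mvpa.datasets.splitter import NFoldSplitter

    # one classifier is deep-copied and trained per leave-one-out split;
    # 'confusion' collects the test predictions across all splits
    sclf = SplitClassifier(kNN(), splitter=NFoldSplitter(cvtype=1),
                           enable_states=['confusion'])
    sclf.train(dataset)
    print sclf.confusion       # summary confusion over all splits

    # sensitivities of the per-split classifiers, combined by FirstAxisMean
    sensana = sclf.getSensitivityAnalyzer()
    sens = sensana(dataset)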
1718
1719 1720 -class MappedClassifier(ProxyClassifier):
1721 """`ProxyClassifier` which uses some mapper prior training/testing. 1722 1723 `MaskMapper` can be used just a subset of features to 1724 train/classify. 1725 Having such classifier we can easily create a set of classifiers 1726 for BoostedClassifier, where each classifier operates on some set 1727 of features, e.g. set of best spheres from SearchLight, set of 1728 ROIs selected elsewhere. It would be different from simply 1729 applying whole mask over the dataset, since here initial decision 1730 is made by each classifier and then later on they vote for the 1731 final decision across the set of classifiers. 1732 """ 1733
1734 - def __init__(self, clf, mapper, **kwargs):
1735 """Initialize the instance 1736 1737 :Parameters: 1738 clf : Classifier 1739 classifier based on which mask classifiers is created 1740 mapper 1741 whatever `Mapper` comes handy 1742 """ 1743 ProxyClassifier.__init__(self, clf, **kwargs) 1744 1745 self.__mapper = mapper 1746 """mapper to help us our with prepping data to 1747 training/classification"""
1748 1749
1750 - def _train(self, dataset):
1751 """Train `MappedClassifier` 1752 """ 1753 # first train the mapper 1754 # XXX: should training be done using whole dataset or just samples 1755 # YYY: in some cases labels might be needed, thus better full dataset 1756 self.__mapper.train(dataset) 1757 1758 # for train() we have to provide dataset -- not just samples to train! 1759 wdataset = dataset.applyMapper(featuresmapper = self.__mapper) 1760 ProxyClassifier._train(self, wdataset)
1761 1762
1763 - def _predict(self, data):
1764 """Predict using `MappedClassifier` 1765 """ 1766 return ProxyClassifier._predict(self, self.__mapper.forward(data))
1767 1768 1769 @group_kwargs(prefixes=['slave_'], passthrough=True)
1770 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1771 """Return an appropriate SensitivityAnalyzer""" 1772 return MappedClassifierSensitivityAnalyzer( 1773 self, 1774 analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs), 1775 **kwargs)
1776 1777 1778 mapper = property(lambda x:x.__mapper, doc="Used mapper")
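A minimal sketch of restricting a classifier to a feature subset via `MaskMapper` (assumes a preconstructed `dataset` with four features; `kNN` is again only illustrative):

    import numpy as N
    from mvpa.clfs.knn import kNN
    from mvpa.mappers.mask import MaskMapper

    # non-zero mask entries select the features the classifier will see
    mclf = MappedClassifier(kNN(), MaskMapper(N.array([1, 1, 0, 0])))
    mclf.train(dataset)              # trains on the two selected features
    predictions = mclf.predict(dataset.samples)  # data is forward-mapped first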
1779
1780 1781 1782 -class FeatureSelectionClassifier(ProxyClassifier):
1783 """`ProxyClassifier` which uses some `FeatureSelection` prior training. 1784 1785 `FeatureSelection` is used first to select features for the classifier to 1786 use for prediction. Internally it would rely on MappedClassifier which 1787 would use created MaskMapper. 1788 1789 TODO: think about removing overhead of retraining the same classifier if 1790 feature selection was carried out with the same classifier already. It 1791 has been addressed by adding .trained property to classifier, but now 1792 we should expclitely use isTrained here if we want... need to think more 1793 """ 1794 1795 _clf_internals = [ 'does_feature_selection', 'meta' ] 1796
1797 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1798 """Initialize the instance 1799 1800 :Parameters: 1801 clf : Classifier 1802 classifier based on which mask classifiers is created 1803 feature_selection : FeatureSelection 1804 whatever `FeatureSelection` comes handy 1805 testdataset : Dataset 1806 optional dataset which would be given on call to feature_selection 1807 """ 1808 ProxyClassifier.__init__(self, clf, **kwargs) 1809 1810 self.__maskclf = None 1811 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on.""" 1812 1813 self.__feature_selection = feature_selection 1814 """`FeatureSelection` to select the features prior training""" 1815 1816 self.__testdataset = testdataset 1817 """`FeatureSelection` might like to use testdataset"""
1818 1819
1820 - def untrain(self):
1821 """Untrain `FeatureSelectionClassifier` 1822 1823 Has to untrain any known classifier 1824 """ 1825 if not self.trained: 1826 return 1827 if not self.__maskclf is None: 1828 self.__maskclf.untrain() 1829 super(FeatureSelectionClassifier, self).untrain()
1830 1831
1832 - def _train(self, dataset):
1833 """Train `FeatureSelectionClassifier` 1834 """ 1835 # temporarily enable selected_ids 1836 self.__feature_selection.states._changeTemporarily( 1837 enable_states=["selected_ids"]) 1838 1839 if __debug__: 1840 debug("CLFFS", "Performing feature selection using %s" % 1841 self.__feature_selection + " on %s" % dataset) 1842 1843 (wdataset, tdataset) = self.__feature_selection(dataset, 1844 self.__testdataset) 1845 if __debug__: 1846 add_ = "" 1847 if "CLFFS_" in debug.active: 1848 add_ = " Selected features: %s" % \ 1849 self.__feature_selection.selected_ids 1850 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " + 1851 "%(dsnfeat)d features.%(app)s", 1852 msgargs={'fs':self.__feature_selection, 1853 'nfeat':wdataset.nfeatures, 1854 'dsnfeat':dataset.nfeatures, 1855 'app':add_}) 1856 1857 # create a mask to devise a mapper 1858 # TODO -- think about making selected_ids a MaskMapper 1859 mappermask = N.zeros(dataset.nfeatures) 1860 mappermask[self.__feature_selection.selected_ids] = 1 1861 mapper = MaskMapper(mappermask) 1862 1863 self.__feature_selection.states._resetEnabledTemporarily() 1864 1865 # create and assign `MappedClassifier` 1866 self.__maskclf = MappedClassifier(self.clf, mapper) 1867 # we could have called self.__clf.train(dataset), but it would 1868 # cause unnecessary masking 1869 self.__maskclf.clf.train(wdataset)
1870 1871 # for the ease of access 1872 # TODO see for ProxyClassifier 1873 #self.states._copy_states_(self.__maskclf, deep=False) 1874
1875 - def _getFeatureIds(self):
1876 """Return used feature ids for `FeatureSelectionClassifier` 1877 1878 """ 1879 return self.__feature_selection.selected_ids
1880
1881 - def _predict(self, data):
1882 """Predict using `FeatureSelectionClassifier` 1883 """ 1884 clf = self.__maskclf 1885 if self.states.isEnabled('values'): 1886 clf.states.enable(['values']) 1887 1888 result = clf._predict(data) 1889 # for the ease of access 1890 self.states._copy_states_(clf, ['values'], deep=False) 1891 return result
1892
1893 - def setTestDataset(self, testdataset):
1894 """Set testing dataset to be used for feature selection 1895 """ 1896 self.__testdataset = testdataset
1897 1898 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`") 1899 feature_selection = property(lambda x:x.__feature_selection, 1900 doc="Used `FeatureSelection`") 1901 1902 @group_kwargs(prefixes=['slave_'], passthrough=True)
1903 - def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
1904 """Return an appropriate SensitivityAnalyzer 1905 1906 TODO: had to clone from mapped classifier... XXX 1907 """ 1908 return MappedClassifierSensitivityAnalyzer( 1909 self, 1910 analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs), 1911 **kwargs)
1912 1913 1914 1915 testdataset = property(fget=lambda x:x.__testdataset, 1916 fset=setTestDataset)
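A minimal sketch of wrapping a classifier into an ANOVA-based feature selection (module paths, selector arguments, and the `feature_ids` state follow this release's layout but should be treated as assumptions):

    from mvpa.clfs.knn import kNN
    from mvpa.measures.anova import OneWayAnova
    from mvpa.featsel.base import SensitivityBasedFeatureSelection
    from mvpa.featsel.helpers import FractionTailSelector

    # keep the 5% highest-scoring features, then train kNN on just those
    fsel = SensitivityBasedFeatureSelection(
        OneWayAnova(),
        FractionTailSelector(0.05, mode='select', tail='upper'))
    fclf = FeatureSelectionClassifier(kNN(), fsel,
                                      enable_states=['feature_ids'])
    fclf.train(dataset)        # selection happens inside _train()
    print fclf.feature_ids     # ids of the features actually used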
1917