
Source Code for Module mvpa.clfs.base

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Base classes for all classifiers.

Base classifiers can be grouped according to their function as

:group Basic Classifiers: Classifier BoostedClassifier ProxyClassifier
:group BoostedClassifiers: CombinedClassifier MulticlassClassifier
  SplitClassifier
:group ProxyClassifiers: BinaryClassifier MappedClassifier
  FeatureSelectionClassifier
:group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
  MaximalVote

"""

__docformat__ = 'restructuredtext'

import operator, sys
import numpy as N

# We have to use deepcopy from python 2.5, since otherwise it fails to
# copy sensitivity analyzers with assigned combiners which are just
# functions, not functors
if sys.version_info[0] > 2 or sys.version_info[1] > 4:
    from copy import deepcopy
else:
    from mvpa.misc.copy import deepcopy

from sets import Set
from time import time

from mvpa.mappers import MaskMapper
from mvpa.datasets.splitter import NFoldSplitter
from mvpa.misc.state import StateVariable, Stateful, Harvestable

from mvpa.clfs.transerror import ConfusionMatrix

from mvpa.measures.base import \
    BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer
from mvpa.misc import warning

if __debug__:
    import traceback
    from mvpa.misc import debug

def _deepcopyclf(clf):
    """Deepcopy a classifier.

    If deepcopy fails -- try to untrain it first, so that there are no
    swig bindings attached.
    """
    try:
        return deepcopy(clf)
    except:
        clf.untrain()
        return deepcopy(clf)

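# A minimal usage sketch (not part of the original module): classifiers
# wrapping extension code (e.g. an SVM from mvpa.clfs.svm; names here are
# assumptions) may hold swig-bound objects that plain deepcopy chokes on,
# so the helper untrains first and copies the now "clean" instance.
#
#   from mvpa.clfs.svm import SVM
#   clf = SVM()
#   clf.train(dataset)            # attaches swig-bound model state
#   clf_copy = _deepcopyclf(clf)  # untrains on failure, then deepcopies
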

class Classifier(Stateful):
    """Abstract classifier class to be inherited by all classifiers

    Required behavior:

    For every classifier it has to be possible to be instantiated without
    having to specify the training pattern.

    Repeated calls to the train() method with different training data have to
    result in a valid classifier, trained for the particular dataset.

    It must be possible to specify all classifier parameters as keyword
    arguments to the constructor.

    Recommended behavior:

    Derived classifiers should provide access to *values* -- i.e. the
    information that is finally used to determine the predicted class label.

    Michael: Maybe it works well if each classifier provides a 'values'
    state member. This variable is a list as long as, and in the same order
    as, Dataset.uniquelabels (training data). Each item in the list
    corresponds to the likelihood of a sample belonging to the
    respective class. However, the semantics might differ between
    classifiers, e.g. kNN would probably store distances to class-
    neighbours, whereas PLR would store the raw function value of the
    logistic function. So in the case of kNN low is predictive and for
    PLR high is predictive. Don't know if there is the need to unify
    that.

    As the storage and/or computation of this information might be
    demanding, its collection should be switchable and off by default.

    Nomenclature
     * predictions : corresponds to the quantized labels if the classifier
       spits out labels by .predict()
     * values : might be different from predictions if a classifier's
       predict() makes a decision based on some internal value such as
       probability or a distance.
    """
    # Dict that contains the parameters of a classifier.
    # This shall provide an interface to plug generic parameter optimizers
    # on all classifiers (e.g. grid- or line-search optimizer)
    # A dictionary is used because Michael thinks that access by name is nicer.
    # Additionally Michael thinks ATM that additional information might be
    # necessary in some situations (e.g. reasonably predefined parameter range,
    # minimal iteration stepsize, ...), therefore the value to each key should
    # also be a dict or we should use mvpa.misc.param.Parameter...

    trained_labels = StateVariable(enabled=True,
        doc="Set of unique labels it has been trained on")

    trained_dataset = StateVariable(enabled=False,
        doc="The dataset it has been trained on")

    training_confusion = StateVariable(enabled=False,
        doc="Confusion matrix of learning performance")

    predictions = StateVariable(enabled=True,
        doc="Most recent set of predictions")

    values = StateVariable(enabled=False,
        doc="Internal classifier values the most recent "
            "predictions are based on")

    training_time = StateVariable(enabled=True,
        doc="Time (in seconds) it took the classifier to train")

    predicting_time = StateVariable(enabled=True,
        doc="Time (in seconds) it took the classifier to predict")

    feature_ids = StateVariable(enabled=False,
        doc="Feature IDs which were used for the actual training. "
            "Some classifiers might internally do feature selection (SMLR)")

    _clf_internals = []
    """Describes some specifics about the classifier -- e.g. whether it is
    doing regression, for instance..."""

    def __init__(self, train2predict=True, regression=False, retrainable=False,
                 **kwargs):
        """Cheap initialization.
        """
        Stateful.__init__(self, **kwargs)

        self._train2predict = train2predict
        """Some classifiers might not need to be trained to predict"""

        self.__trainednfeatures = None
        """Stores number of features for which classifier was trained.
        If None -- it wasn't trained at all"""

        self._regression = regression
        """If True -- perform regression, not classification"""

        self.__retrainable = None
        """If True -- store anything necessary for efficient retrain"""
        self._setRetrainable(retrainable)

        if self._regression:
            for statevar in [ "trained_labels", "training_confusion" ]:
                if self.states.isEnabled(statevar):
                    if __debug__:
                        debug("CLF",
                              "Disabling state %s since doing regression, " %
                              statevar + "not classification")
                    self.states.disable(statevar)

        self.__trainedidhash = None
        """Stores id of the dataset on which it was trained to signal
        in trained() if it was trained already on the same dataset"""


    def __str__(self):
        if __debug__ and 'CLF_' in debug.active:
            return "%s / %s" % (`self`, Stateful.__str__(self))
        else:
            return `self`

    #XXX that is a bad idea since the object seems to be deallocated by here
    #def __del__(self):
    #    if __debug__:
    #        debug('CLF_', 'Destroying classifier %s' % `self`)
    #    self.untrain()

    def _pretrain(self, dataset):
        """Functionality prior to training
        """
        # So we reset all state variables and maybe free up some memory
        # explicitly
        if not self.__retrainable:
            self.untrain()
        else:
            # just reset the states
            self.states.reset()

        if not self._regression and 'regression' in self._clf_internals \
           and not self.states.isEnabled('trained_labels'):
            # if classifier internally does regression we need to have
            # labels it was trained on
            if __debug__:
                debug("CLF", "Enabling trained_labels state since it is needed")
            self.states.enable('trained_labels')


    def _posttrain(self, dataset):
        """Functionality post training

        For instance -- computing confusion matrix

        :Parameters:
          dataset : Dataset
            Data which was used for training
        """
        if self.states.isEnabled('trained_labels'):
            self.trained_labels = Set(dataset.uniquelabels)

        self.trained_dataset = dataset

        # needs to be assigned first since below we use predict
        self.__trainednfeatures = dataset.nfeatures
        self.__trainedidhash = dataset.idhash

        if self.states.isEnabled('training_confusion'):
            # we should not store predictions for training data,
            # it is confusing imho (yoh)
            self.states._changeTemporarily(
                disable_states=["predictions"])
            predictions = self.predict(dataset.samples)
            self.states._resetEnabledTemporarily()
            self.training_confusion = ConfusionMatrix(
                labels=dataset.uniquelabels, targets=dataset.labels,
                predictions=predictions)

        if self.states.isEnabled('feature_ids'):
            self.feature_ids = self._getFeatureIds()

    def _getFeatureIds(self):
        """Virtual method to return feature_ids used while training

        Is not intended to be called anywhere but from _posttrain,
        thus the classifier is assumed to be trained at this point
        """
        # By default all features are used
        return range(self.__trainednfeatures)


    def _train(self, dataset):
        """Function to be actually overridden in derived classes
        """
        raise NotImplementedError

    def train(self, dataset):
        """Train classifier on a dataset

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so
        """
        if __debug__:
            debug("CLF", "Training classifier %s on dataset %s" % \
                  (`self`, `dataset`))
            if 'CLF_TB' in debug.active:
                tb = traceback.extract_stack(limit=5)
                debug("CLF_TB", "Traceback: %s" % tb)

        self._pretrain(dataset)

        # remember the time when started training
        t0 = time()

        if dataset.nfeatures > 0:
            result = self._train(dataset)
        else:
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("CLF",
                      "No features present for training, no actual training "
                      "is called")
            result = None

        self.training_time = time() - t0
        self._posttrain(dataset)
        return result


    def _prepredict(self, data):
        """Functionality prior to prediction
        """
        if self._train2predict:
            # check if classifier was trained if that is needed
            if not self.trained:
                raise ValueError, \
                      "Classifier %s wasn't yet trained, therefore can't " \
                      "predict" % `self`
            nfeatures = data.shape[1]
            # check if number of features is the same as in the data
            # it was trained on
            if nfeatures != self.__trainednfeatures:
                raise ValueError, \
                      "Classifier %s was trained on data with %d features, " % \
                      (`self`, self.__trainednfeatures) + \
                      "thus can't predict for %d features" % nfeatures

    def _postpredict(self, data, result):
        """Functionality after prediction is computed
        """
        self.predictions = result


    def _predict(self, data):
        """Actual prediction
        """
        raise NotImplementedError

    def predict(self, data):
        """Predict classifier on data

        Shouldn't be overridden in subclasses unless explicitly needed
        to do so. Also, subclasses trying to call the super class's predict
        should call _predict from within _predict instead of predict(),
        since otherwise it would loop
        """
        data = N.asarray(data)
        if __debug__:
            debug("CLF", "Predicting classifier %s on data %s" \
                  % (`self`, `data.shape`))
            if 'CLF_TB' in debug.active:
                tb = traceback.extract_stack(limit=5)
                debug("CLF_TB", "Traceback: %s" % tb)

        # remember the time when started computing predictions
        t0 = time()

        self._prepredict(data)
        if self.__trainednfeatures > 0 or not self._train2predict:
            result = self._predict(data)
        else:
            warning("Trying to predict using classifier trained on no features")
            if __debug__:
                debug("CLF",
                      "No features were present for training, prediction is "
                      "bogus")
            result = [None]*data.shape[0]

        self.predicting_time = time() - t0

        if 'regression' in self._clf_internals and not self._regression:
            # We need to convert regression values into labels
            # XXX unify may be labels -> internal_labels conversion.
            #if len(self.trained_labels) != 2:
            #    raise RuntimeError, "XXX Ask developer to implement for " \
            #          "multiclass mapping from regression into classification"

            # must be N.array so we copy it to assign labels directly
            # into labels.
            # XXX or should we just recreate "result"?
            result_ = N.array(result)
            self.values = result_
            trained_labels = N.asarray(list(self.trained_labels))
            for i, value in enumerate(result):
                dists = N.abs(value - trained_labels)
                result[i] = trained_labels[N.argmin(dists)]

            if __debug__:
                debug("CLF_", "Converted regression result %s into labels "
                      "%s for %s" % (result_, result, self))

        self._postpredict(data, result)
        return result

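    # Illustrative sketch (not from the module) of the nearest-label
    # conversion predict() applies when a regression-capable classifier is
    # used for classification: each raw value is snapped to the closest
    # trained label.
    #
    #   import numpy as N
    #   trained_labels = N.asarray([-1, 1])
    #   values = [0.3, -2.1, 0.9]
    #   labels = [trained_labels[N.argmin(N.abs(v - trained_labels))]
    #             for v in values]
    #   # -> [1, -1, 1]
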
    def isTrained(self, dataset=None):
        """Whether the classifier was already trained.

        MUST BE USED WITH CARE IF EVER"""
        if dataset is None:
            # simply return if it was trained on anything
            return not self.__trainednfeatures is None
        else:
            return (self.__trainednfeatures == dataset.nfeatures) \
                   and (self.__trainedidhash == dataset.idhash)


    @property
    def regression(self):
        return self._regression

    def _regressionIsBogus(self):
        """Some classifiers like BinaryClassifier can't be used for
        regression"""

        if self.regression:
            raise ValueError, "Regression mode is meaningless for %s" % \
                  self.__class__.__name__ + ", thus don't enable it"


    @property
    def trained(self):
        """Whether the classifier was already trained"""
        return self.isTrained()

    def untrain(self):
        """Reset trained state"""
        self.__trainednfeatures = None
        Stateful.reset(self)


    @property
    def train2predict(self):
        """Whether the classifier has to be trained in order to predict"""
        return self._train2predict

    def getSensitivityAnalyzer(self, **kwargs):
        """Factory method to return an appropriate sensitivity analyzer for
        the respective classifier."""
        raise NotImplementedError

    def _getRetrainable(self):
        return self.__retrainable

    def _setRetrainable(self, value):
        if value != self.__retrainable:
            # assure that we don't drag anything behind
            if self.trained:
                self.untrain()
            states = self.states
            if not value and states.isKnown('retrained'):
                states.remove('retrained')
                states.remove('retested')
            if value:
                if not 'retrainable' in self._clf_internals:
                    warning("Setting of flag retrainable for %s has no effect"
                            " since classifier has no such capability" % self)
                states.add(StateVariable(enabled=True,
                        name='retrained',
                        doc="Whether the retrainable classifier was retrained"))
                states.add(StateVariable(enabled=True,
                        name='retested',
                        doc="Whether the retrainable classifier was retested"))

        self.__retrainable = value

    retrainable = property(fget=_getRetrainable, fset=_setRetrainable,
                           doc="Specifies whether the classifier should be "
                               "retrainable")
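
# A minimal sketch (hypothetical, not part of the module) of the contract the
# Classifier base class expects from subclasses: implement _train()/_predict()
# and let train()/predict() handle the surrounding bookkeeping. This toy
# "mean classifier" predicts the training label whose class-mean is closest.
#
#   class MeanClassifier(Classifier):
#       def _train(self, dataset):
#           self._means = dict([(l, dataset.samples[dataset.labels == l].mean())
#                               for l in dataset.uniquelabels])
#       def _predict(self, data):
#           return [min([(abs(s.mean() - m), l)
#                        for l, m in self._means.items()])[1] for s in data]
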

#
# Base classifiers of various kinds
#

class BoostedClassifier(Classifier, Harvestable):
    """Classifier containing a farm of other classifiers.

    Should rarely be used directly. Use one of its children instead
    """

    # should not be needed if we have prediction_values upstairs
    # TODO : should be handled as Harvestable or smth like that
    raw_predictions = StateVariable(enabled=False,
        doc="Predictions obtained from each classifier")

    raw_values = StateVariable(enabled=False,
        doc="Values obtained from each classifier")

    def __init__(self, clfs=None, propagate_states=True,
                 harvest_attribs=None, copy_attribs='copy',
                 **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list
            list of classifier instances to use (slave classifiers)
          propagate_states : bool
            whether to propagate enabled states into slave classifiers.
            It takes effect only when slaves get assigned -- so if a state
            is enabled after construction, it would not necessarily
            propagate into the slaves
          harvest_attribs : list of basestr
            What attributes of the call to store and return within
            the harvested state variable
          copy_attribs : None or basestr
            Force copying values of attributes on harvesting
          kwargs : dict
            dict of keyword arguments which might get used
            by State or Classifier
        """
        if clfs == None:
            clfs = []

        Classifier.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        self.__clfs = None
        """Pylint friendly definition of __clfs"""

        self.__propagate_states = propagate_states
        """Enable current enabled states in slave classifiers"""

        self._setClassifiers(clfs)
        """Store the list of classifiers"""

    def __repr__(self):
        return "<%s(%d classifiers)>" \
               % (self.__class__.__name__, len(self.clfs))


    def _train(self, dataset):
        """Train `BoostedClassifier`
        """
        for clf in self.__clfs:
            clf.train(dataset)

    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to do so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            for clf in self.__clfs:
                self._harvest(locals())

    def _getFeatureIds(self):
        """Custom _getFeatureIds for `BoostedClassifier`
        """
        # return union of all features used by slave classifiers
        feature_ids = Set([])
        for clf in self.__clfs:
            feature_ids = feature_ids.union(Set(clf.feature_ids))
        return list(feature_ids)

    def _predict(self, data):
        """Predict using `BoostedClassifier`
        """
        raw_predictions = [ clf.predict(data) for clf in self.__clfs ]
        self.raw_predictions = raw_predictions
        assert(len(self.__clfs) > 0)
        if self.states.isEnabled("values"):
            # XXX pylint complains that numpy has no array member... weird
            if N.array([x.states.isEnabled("values")
                        for x in self.__clfs]).all():
                values = [ clf.values for clf in self.__clfs ]
                self.raw_values = values
            else:
                warning("One or more classifiers in %s has no 'values' state "
                        "enabled, thus BoostedClassifier can't have the "
                        "'raw_values' state variable defined" % `self`)

        return raw_predictions

    def _setClassifiers(self, clfs):
        """Set the classifiers used by the boosted classifier

        We have to allow setting the list of classifiers after the object
        was actually created. It will be used by
        BoostedMulticlassClassifier
        """
        self.__clfs = clfs
        """Classifiers to use"""

        for flag in ['_train2predict', '_regression']:
            values = N.array([clf.__dict__[flag] for clf in self.__clfs])
            value = values.any()
            if __debug__:
                debug("CLFBST", "Setting %s=%s for classifiers "
                      "%s with %s"
                      % (flag, str(value), `self.__clfs`, str(values)))
            # set flag if it needs to be trained before predicting
            self.__dict__[flag] = value

        # enable corresponding states in the slave-classifiers
        if self.__propagate_states:
            for clf in self.__clfs:
                clf.states.enable(self.states.enabled, missingok=True)

        # adhere to their capabilities + 'multiclass'
        # XXX do intersection across all classifiers!
        self._clf_internals = [ 'multiclass', 'meta' ]
        if len(clfs) > 0:
            self._clf_internals += self.__clfs[0]._clf_internals

    def untrain(self):
        """Untrain `BoostedClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        for clf in self.clfs:
            clf.untrain()
        super(BoostedClassifier, self).untrain()

    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return BoostedClassifierSensitivityAnalyzer(
                    self,
                    **kwargs)


    clfs = property(fget=lambda x:x.__clfs,
                    fset=_setClassifiers,
                    doc="Used classifiers")


class ProxyClassifier(Classifier):
    """Classifier which decorates another classifier

    Possible uses:

     - modify data somehow prior to training/testing:
       * normalization
       * feature selection
       * modification

     - optimized classifier?

    """

    def __init__(self, clf, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier to base this proxy classifier on
        """
        Classifier.__init__(self, train2predict=clf.train2predict, **kwargs)

        self.__clf = clf
        """Store the classifier to use."""

        self._regression = clf.regression
        """Do regression if base classifier does"""

        # adhere to slave classifier capabilities
        # XXX test test test
        self._clf_internals = self._clf_internals[:] + ['meta']
        if clf is not None:
            self._clf_internals += clf._clf_internals

    def _train(self, dataset):
        """Train `ProxyClassifier`
        """
        # base class does nothing much -- just proxies requests to the
        # underlying classifier
        self.__clf.train(dataset)

        # for the ease of access
        # TODO: if to copy we should exclude some states which are defined in
        #       base Classifier (such as training_time, predicting_time)
        #self.states._copy_states_(self.__clf, deep=False)


    def _predict(self, data):
        """Predict using `ProxyClassifier`
        """
        result = self.__clf.predict(data)
        # for the ease of access
        #self.states._copy_states_(self.__clf, deep=False)
        return result

    def untrain(self):
        """Untrain `ProxyClassifier`
        """
        if not self.__clf is None:
            self.__clf.untrain()
        super(ProxyClassifier, self).untrain()


    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return ProxyClassifierSensitivityAnalyzer(
                    self,
                    analyzer=self.__clf.getSensitivityAnalyzer(**kwargs),
                    **kwargs)


    clf = property(lambda x:x.__clf, doc="Used `Classifier`")


#
# Various combiners for CombinedClassifier
#

class PredictionsCombiner(Stateful):
    """Base class for combining decisions of multiple classifiers"""

    def train(self, clfs, dataset):
        """PredictionsCombiner might need to be trained

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since the combiner might use some other
            state variables (e.g. values) instead of pure predictions
          dataset : Dataset
            training data in this case
        """
        pass


    def __call__(self, clfs, dataset):
        """Call function

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since the combiner might use some other
            state variables (e.g. values) instead of pure predictions
        """
        raise NotImplementedError


class MaximalVote(PredictionsCombiner):
    """Provides a decision using the maximal vote rule"""

    predictions = StateVariable(enabled=True,
        doc="Voted predictions")
    all_label_counts = StateVariable(enabled=False,
        doc="Counts across classifiers for each label/sample")

    def __init__(self):
        """XXX Might get a parameter to use raw decision values if
        voting is not unambiguous (i.e. two classes have an equal number of
        votes)
        """
        PredictionsCombiner.__init__(self)

    def __call__(self, clfs, dataset):
        """Actual callable -- perform voting

        Extended functionality which might not be needed actually:
        Since `BinaryClassifier` might return a list of possible
        predictions (not just a single one), we should consider all of those.

        MaximalVote doesn't care about the dataset itself
        """
        if len(clfs) == 0:
            return []                   # don't even bother

        all_label_counts = None
        for clf in clfs:
            # Lets check first if the necessary state variable is enabled
            if not clf.states.isEnabled("predictions"):
                raise ValueError, "MaximalVote needs classifiers (such as " + \
                      "%s) with state 'predictions' enabled" % clf
            predictions = clf.predictions
            if all_label_counts is None:
                all_label_counts = [ {} for i in xrange(len(predictions)) ]

            # for every sample
            for i in xrange(len(predictions)):
                prediction = predictions[i]
                if not operator.isSequenceType(prediction):
                    prediction = (prediction,)
                for label in prediction: # for every label
                    # we might have multiple labels assigned XXX
                    # but might not -- don't remember now
                    if not all_label_counts[i].has_key(label):
                        all_label_counts[i][label] = 0
                    all_label_counts[i][label] += 1

        predictions = []
        # select maximal vote now for each sample
        for i in xrange(len(all_label_counts)):
            label_counts = all_label_counts[i]
            # lets do explicit search for max so we know
            # if it is unique
            maxk = []                   # labels of elements with max vote
            maxv = -1
            for k, v in label_counts.iteritems():
                if v > maxv:
                    maxk = [k]
                    maxv = v
                elif v == maxv:
                    maxk.append(k)

            assert len(maxk) >= 1, \
                   "We should have obtained at least a single key of max label"

            if len(maxk) > 1:
                warning("We got multiple labels %s which have the " % `maxk` +
                        "same maximal vote %d. XXX disambiguate" % maxv)
            predictions.append(maxk[0])

        self.all_label_counts = all_label_counts
        self.predictions = predictions
        return predictions

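# Standalone sketch (illustrative only, not from the module) of the voting
# logic above: count each classifier's per-sample prediction and keep the
# label with the most votes; ties resolve to one of the tied labels.
#
#   per_clf_predictions = [[0, 1, 1],   # classifier 1
#                          [0, 1, 0],   # classifier 2
#                          [1, 1, 1]]   # classifier 3
#   voted = []
#   for i in xrange(len(per_clf_predictions[0])):
#       counts = {}
#       for preds in per_clf_predictions:
#           counts[preds[i]] = counts.get(preds[i], 0) + 1
#       voted.append(sorted(counts.items(), key=lambda kv: kv[1])[-1][0])
#   # voted == [0, 1, 1]
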

class ClassifierCombiner(PredictionsCombiner):
    """Provides a decision by training a classifier on predictions/values

    TODO
    """

    predictions = StateVariable(enabled=True,
        doc="Trained predictions")


    def __init__(self, clf, variables=None):
        """Initialize `ClassifierCombiner`

        :Parameters:
          clf : Classifier
            Classifier to train on the predictions
          variables : list of basestring
            List of state variables stored in the 'combined' classifiers,
            which are to be used as features for training this classifier
        """
        PredictionsCombiner.__init__(self)

        self.__clf = clf
        """Classifier to train on `variables` states of provided classifiers"""

        if variables == None:
            variables = ['predictions']
        self.__variables = variables
        """What state variables of the classifiers to use"""


    def untrain(self):
        """It might be needed to untrain the used classifier"""
        if self.__clf:
            self.__clf.untrain()

    def __call__(self, clfs, dataset):
        """
        """
        if len(clfs) == 0:
            return []                   # don't even bother

        # XXX What is it, Exception or Return?
        raise NotImplementedError


class CombinedClassifier(BoostedClassifier):
    """`BoostedClassifier` which combines predictions using some
    `PredictionsCombiner` functor.
    """

    def __init__(self, clfs=None, combiner=MaximalVote(), **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list of Classifier
            list of classifier instances to use
          combiner : PredictionsCombiner
            callable which takes care of combining multiple
            results into a single one (e.g. maximal vote)
          kwargs : dict
            dict of keyword arguments which might get used
            by State or Classifier

        NB: `combiner` might need to operate not on 'predictions' discrete
            labels but rather on the raw 'class' values classifiers
            estimate (which is pretty much what is stored under `values`)
        """
        if clfs == None:
            clfs = []

        BoostedClassifier.__init__(self, clfs, **kwargs)

        self.__combiner = combiner
        """Functor destined to combine results of multiple classifiers"""

    def __repr__(self):
        return "<%s(%d classifiers, combiner %s)>" \
               % (self.__class__.__name__, len(self.clfs), `self.__combiner`)

    def untrain(self):
        try:
            self.__combiner.untrain()
        except:
            pass
        super(CombinedClassifier, self).untrain()

    def _train(self, dataset):
        """Train `CombinedClassifier`
        """
        BoostedClassifier._train(self, dataset)
        # combiner might need to train as well
        self.__combiner.train(self.clfs, dataset)

    def _predict(self, data):
        """Predict using `CombinedClassifier`
        """
        BoostedClassifier._predict(self, data)
        # combiner will make use of state variables instead of only the
        # predictions returned from _predict
        predictions = self.__combiner(self.clfs, data)
        self.predictions = predictions

        if self.states.isEnabled("values"):
            if self.__combiner.states.isActive("values"):
                # XXX or maybe we could leave it simply up to accessing
                #     .combiner?
                self.values = self.__combiner.values
            else:
                if __debug__:
                    warning("Boosted classifier %s has 'values' state " % `self` +
                            "enabled, but the combiner doesn't have it active, "
                            "thus no values could be provided directly; "
                            "access .clfs")
        return predictions


    combiner = property(fget=lambda x:x.__combiner,
                        doc="Used combiner to derive a single result")

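# Usage sketch (hypothetical; assumes two trainable classifiers `clf1` and
# `clf2` and a `dataset` are available): predictions of all slave classifiers
# get merged by the combiner, here the default MaximalVote.
#
#   cclf = CombinedClassifier(clfs=[clf1, clf2], combiner=MaximalVote())
#   cclf.train(dataset)
#   predictions = cclf.predict(dataset.samples)
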

class BinaryClassifier(ProxyClassifier):
    """`ProxyClassifier` which maps a set of two labels into +1 and -1
    """

    def __init__(self, clf, poslabels, neglabels, **kwargs):
        """
        :Parameters:
          clf : Classifier
            classifier to use
          poslabels : list
            list of labels which are treated as the +1 category
          neglabels : list
            list of labels which are treated as the -1 category
        """

        ProxyClassifier.__init__(self, clf, **kwargs)

        self._regressionIsBogus()

        # Handle labels
        sposlabels = Set(poslabels)     # so to remove duplicates
        sneglabels = Set(neglabels)     # so to remove duplicates

        # check if there is no overlap
        overlap = sposlabels.intersection(sneglabels)
        if len(overlap) > 0:
            raise ValueError("Sets of positive and negative labels for "
                             "BinaryClassifier must not overlap. Got "
                             "overlap %s" % overlap)

        self.__poslabels = list(sposlabels)
        self.__neglabels = list(sneglabels)

        # define what values will be returned by predict: if there is
        # a single label -- return just it alone, otherwise -- the whole
        # list
        # Such an approach might come useful if we use some classifiers
        # over different subsets of data with some voting later on
        # (1-vs-the-rest?)

        if len(self.__poslabels) > 1:
            self.__predictpos = self.__poslabels
        else:
            self.__predictpos = self.__poslabels[0]

        if len(self.__neglabels) > 1:
            self.__predictneg = self.__neglabels
        else:
            self.__predictneg = self.__neglabels[0]

    def __str__(self):
        return "BinaryClassifier +1: %s -1: %s" % (
            `self.__poslabels`, `self.__neglabels`)

    def _train(self, dataset):
        """Train `BinaryClassifier`
        """
        idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
                   [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
        # XXX we have to sort ids since at the moment Dataset.selectSamples
        #     doesn't take care about order
        idlabels.sort()
        # select the samples
        orig_labels = None

        # If we need all samples, why not simply perform on the original
        # data and just store/restore labels. But it really should be done
        # within Dataset.selectSamples
        if len(idlabels) == dataset.nsamples \
           and [x[0] for x in idlabels] == range(dataset.nsamples):
            # the last condition is not even necessary... just overly
            # cautious
            datasetselected = dataset    # no selection is needed
            orig_labels = dataset.labels # but we would need to restore labels
            if __debug__:
                debug('CLFBIN',
                      "Assigned all %d samples for binary " %
                      (dataset.nsamples) +
                      "classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels))
        else:
            datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
            if __debug__:
                debug('CLFBIN',
                      "Selected %d samples out of %d samples for binary " %
                      (len(idlabels), dataset.nsamples) +
                      "classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels) +
                      ". Selected %s" % datasetselected)

        # adjust the labels
        datasetselected.labels = [ x[1] for x in idlabels ]

        # now we got a dataset with only 2 labels
        if __debug__:
            assert((datasetselected.uniquelabels == [-1, 1]).all())

        self.clf.train(datasetselected)

        if not orig_labels is None:
            dataset.labels = orig_labels

    def _predict(self, data):
        """Predict the labels for the given `data`

        Predicts using the binary classifier and spits out a list (one item
        per sample) with either poslabels or neglabels as the "label" for
        each sample. If there was just a single label within pos or neg
        labels, then it would return not a list but just that single label.
        """
        binary_predictions = ProxyClassifier._predict(self, data)
        self.values = binary_predictions
        predictions = [ {-1: self.__predictneg,
                         +1: self.__predictpos}[x] for x in binary_predictions]
        self.predictions = predictions
        return predictions

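# Usage sketch (hypothetical; assumes a base classifier `clf` and a dataset
# with labels 'faces', 'houses' and 'scrambled'): the proxy trains `clf` on
# a +1/-1 relabeling and maps predictions back to the original labels.
#
#   bclf = BinaryClassifier(clf,
#                           poslabels=['faces'],
#                           neglabels=['houses', 'scrambled'])
#   bclf.train(dataset)
#   # each prediction is 'faces', or the list ['houses', 'scrambled']
#   predictions = bclf.predict(dataset.samples)
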

class MulticlassClassifier(CombinedClassifier):
    """`CombinedClassifier` to perform multiclass classification using a
    list of `BinaryClassifier` instances,

    such as 1-vs-1 (i.e. in pairs, like libsvm does) or 1-vs-all (which
    is yet to be thought about)
    """

    def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          bclf_type
            "1-vs-1" or "1-vs-all", determines the way to generate binary
            classifiers
        """
        CombinedClassifier.__init__(self, **kwargs)
        self._regressionIsBogus()
        if not clf is None:
            clf._regressionIsBogus()

        self.__clf = clf
        """Store sample instance of basic classifier"""

        # XXX such logic below might go under train....
        if bclf_type == "1-vs-1":
            pass
        elif bclf_type == "1-vs-all":
            raise NotImplementedError
        else:
            raise ValueError, \
                  "Unknown type of classifier %s for " % bclf_type + \
                  "BoostedMulticlassClassifier"
        self.__bclf_type = bclf_type

    def _train(self, dataset):
        """Train classifier
        """
        # construct binary classifiers
        ulabels = dataset.uniquelabels
        if self.__bclf_type == "1-vs-1":
            # generate pairs and corresponding classifiers
            biclfs = []
            for i in xrange(len(ulabels)):
                for j in xrange(i+1, len(ulabels)):
                    clf = _deepcopyclf(self.__clf)
                    biclfs.append(
                        BinaryClassifier(
                            clf,
                            poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
            if __debug__:
                debug("CLFMC", "Created %d binary classifiers for %d labels" %
                      (len(biclfs), len(ulabels)))

            self.clfs = biclfs

        elif self.__bclf_type == "1-vs-all":
            raise NotImplementedError

        # perform actual training
        CombinedClassifier._train(self, dataset)

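# Illustrative sketch (not from the module): 1-vs-1 training builds one
# binary classifier per unordered label pair, i.e. n*(n-1)/2 of them, which
# is what the nested loop above enumerates.
#
#   from itertools import combinations   # available since Python 2.6
#   ulabels = ['faces', 'houses', 'scrambled']
#   pairs = list(combinations(ulabels, 2))
#   # [('faces', 'houses'), ('faces', 'scrambled'), ('houses', 'scrambled')]
#   assert len(pairs) == len(ulabels) * (len(ulabels) - 1) // 2
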

class SplitClassifier(CombinedClassifier):
    """`BoostedClassifier` to work on splits of the data

    TODO: SplitClassifier and MulticlassClassifier have too much in
          common -- need to refactor: just need a splitter which would
          split the dataset in pairs of class labels. MulticlassClassifier
          does just a tiny bit more, which might not be necessary at
          all: map sets of labels into 2 categories...
    """

    # Todo: unify with CrossValidatedTransferError which now uses
    # harvest_attribs to expose gathered attributes
    training_confusions = StateVariable(enabled=False,
        doc="Resultant confusion matrices whenever the classifier is "
            "trained on one part and tested on the other part of each split")

    def __init__(self, clf, splitter=NFoldSplitter(cvtype=1), **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created,
            one per each split
          splitter : Splitter
            `Splitter` to use to split the dataset prior to training
        """
        CombinedClassifier.__init__(self, **kwargs)
        self.__clf = clf
        """Store sample instance of basic classifier"""
        self.__splitter = splitter

    def _train(self, dataset):
        """Train `SplitClassifier`
        """
        # generate pairs and corresponding classifiers
        bclfs = []
        if self.states.isEnabled('training_confusions'):
            self.training_confusions = \
                ConfusionMatrix(labels=dataset.uniquelabels)

        # for proper and easier debugging -- first define classifiers and
        # then train them
        for split in self.__splitter(dataset):
            if __debug__:
                debug("CLFSPL",
                      "Deepcopying %s for %s" % (`self.__clf`, `self`))
            clf = _deepcopyclf(self.__clf)
            bclfs.append(clf)
        self.clfs = bclfs

        i = 0
        for split in self.__splitter(dataset):
            if __debug__:
                debug("CLFSPL", "Training classifier for split %d" % (i))

            clf = self.clfs[i]

            # assign testing dataset if the given classifier can digest it
            if hasattr(clf, 'testdataset'):
                clf.testdataset = split[1]

            clf.train(split[0])
            if self.states.isEnabled("training_confusions"):
                predictions = clf.predict(split[1].samples)
                self.training_confusions.add(split[1].labels, predictions)
            i += 1

    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return BoostedClassifierSensitivityAnalyzer(
                    self,
                    analyzer=self.__clf.getSensitivityAnalyzer(**kwargs),
                    **kwargs)

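# Usage sketch (hypothetical; assumes a base classifier `clf`, a dataset
# with a `chunks` attribute, and that Stateful accepts an `enable_states`
# keyword): one clone of `clf` is trained per NFold split, and per-split
# generalization lands in `training_confusions`.
#
#   sclf = SplitClassifier(clf, splitter=NFoldSplitter(cvtype=1),
#                          enable_states=['training_confusions'])
#   sclf.train(dataset)           # trains one clone per chunk-based split
#   print sclf.training_confusions
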

class MappedClassifier(ProxyClassifier):
    """`ProxyClassifier` which uses some mapper prior to training/testing.

    `MaskMapper` can be used to train/classify on just a subset of
    features.
    Having such a classifier we can easily create a set of classifiers
    for BoostedClassifier, where each classifier operates on some set
    of features, e.g. a set of best spheres from SearchLight, or a set of
    ROIs selected elsewhere. It would be different from simply applying a
    whole mask over the dataset, since here an initial decision is made by
    each classifier and only later on do they vote for the final decision
    across the set of classifiers.
    """

    def __init__(self, clf, mapper, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier to use on the mapped data
          mapper
            whatever `Mapper` comes handy
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        self.__mapper = mapper
        """mapper to help us out with prepping data for
        training/classification"""

    def _train(self, dataset):
        """Train `MappedClassifier`
        """
        # first train the mapper
        # XXX: should training be done using whole dataset or just samples?
        self.__mapper.train(dataset)

        # for train() we have to provide dataset -- not just samples to train!
        wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
        ProxyClassifier._train(self, wdataset)


    def _predict(self, data):
        """Predict using `MappedClassifier`
        """
        return ProxyClassifier._predict(self, self.__mapper.forward(data))


    mapper = property(lambda x:x.__mapper, doc="Used mapper")

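# Usage sketch (hypothetical; assumes a base classifier `clf` and a
# `dataset`): a MaskMapper built from a binary mask restricts training and
# prediction to the first two features; forward() applies the same mapping
# at predict time.
#
#   mask = N.zeros(dataset.nfeatures)
#   mask[[0, 1]] = 1                  # keep features 0 and 1
#   mclf = MappedClassifier(clf, MaskMapper(mask))
#   mclf.train(dataset)
#   predictions = mclf.predict(dataset.samples)
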

class FeatureSelectionClassifier(ProxyClassifier):
    """`ProxyClassifier` which uses some `FeatureSelection` prior to training.

    `FeatureSelection` is used first to select features for the classifier
    to use for prediction. Internally it relies on `MappedClassifier`,
    which uses the created `MaskMapper`.

    TODO: think about removing the overhead of retraining the same
          classifier if feature selection was carried out with the same
          classifier already. It has been addressed by adding a .trained
          property to the classifier, but now we should explicitly use
          isTrained here if we want... need to think more
    """

    _clf_internals = [ 'does_feature_selection', 'meta' ]

    def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier to train on the selected features
          feature_selection : FeatureSelection
            whatever `FeatureSelection` comes handy
          testdataset : Dataset
            optional dataset which would be given on call to feature_selection
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        self.__maskclf = None
        """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""

        self.__feature_selection = feature_selection
        """`FeatureSelection` to select the features prior to training"""

        self.__testdataset = testdataset
        """`FeatureSelection` might like to use the testdataset"""

    def untrain(self):
        """Untrain `FeatureSelectionClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        if not self.__maskclf is None:
            self.__maskclf.untrain()
        super(FeatureSelectionClassifier, self).untrain()

    def _train(self, dataset):
        """Train `FeatureSelectionClassifier`
        """
        # temporarily enable selected_ids
        self.__feature_selection.states._changeTemporarily(
            enable_states=["selected_ids"])

        if __debug__:
            debug("CLFFS", "Performing feature selection using %s" %
                  self.__feature_selection + " on %s" % dataset)

        (wdataset, tdataset) = self.__feature_selection(dataset,
                                                        self.__testdataset)
        if __debug__:
            add_ = ""
            if "CLFFS_" in debug.active:
                add_ = " Selected features: %s" % \
                       self.__feature_selection.selected_ids
            debug("CLFFS", "{%s} selected %d out of %d features.%s" %
                  (`self.__feature_selection`, wdataset.nfeatures,
                   dataset.nfeatures, add_))

        # create a mask to devise a mapper
        # TODO -- think about making selected_ids a MaskMapper
        mappermask = N.zeros(dataset.nfeatures)
        mappermask[self.__feature_selection.selected_ids] = 1
        mapper = MaskMapper(mappermask)

        self.__feature_selection.states._resetEnabledTemporarily()

        # create and assign `MappedClassifier`
        self.__maskclf = MappedClassifier(self.clf, mapper)
        # we could have called self.__clf.train(dataset), but it would
        # cause unnecessary masking
        self.__maskclf.clf.train(wdataset)

        # for the ease of access
        # TODO see for ProxyClassifier
        #self.states._copy_states_(self.__maskclf, deep=False)

    def _getFeatureIds(self):
        """Return used feature ids for `FeatureSelectionClassifier`
        """
        return self.__feature_selection.selected_ids

    def _predict(self, data):
        """Predict using `FeatureSelectionClassifier`
        """
        result = self.__maskclf._predict(data)
        # for the ease of access
        #self.states._copy_states_(self.__maskclf, deep=False)
        return result

    def setTestDataset(self, testdataset):
        """Set testing dataset to be used for feature selection
        """
        self.__testdataset = testdataset

    # XXX Shouldn't that be mappedclf ?
    # YYY yoh: not sure... by nature it is mappedclf, by purpose it
    #     is maskclf using MaskMapper
    maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
    feature_selection = property(lambda x:x.__feature_selection,
                                 doc="Used `FeatureSelection`")


    testdataset = property(fget=lambda x:x.__testdataset,
                           fset=setTestDataset)
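
# Illustrative sketch (not from the module) of how _train() turns the
# selected feature ids into the MaskMapper handed to MappedClassifier:
# a zero mask with ones at the selected positions.
#
#   import numpy as N
#   nfeatures = 10
#   selected_ids = [2, 5, 7]          # as produced by a FeatureSelection
#   mappermask = N.zeros(nfeatures)
#   mappermask[selected_ids] = 1      # 1 marks features kept by the mapper
#   # MaskMapper(mappermask) then forwards only features 2, 5 and 7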