Package mvpa :: Package clfs :: Module meta
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.meta

   1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
   2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
   3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   4  # 
   5  #   See COPYING file distributed along with the PyMVPA package for the 
   6  #   copyright and license terms. 
   7  # 
   8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
   9  """Classes for meta classifiers -- classifiers which use other classifiers 
  10   
  11  Meta Classifiers can be grouped according to their function as 
  12   
  13  :group BoostedClassifiers: CombinedClassifier MulticlassClassifier 
  14    SplitClassifier 
  15  :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier 
  16    FeatureSelectionClassifier 
  17  :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner 
  18    MaximalVote MeanPrediction 
  19   
  20  """ 
  21   
  22  __docformat__ = 'restructuredtext' 
  23   
  24  import operator 
  25  import numpy as N 
  26   
  27  from mvpa.misc.args import group_kwargs 
  28  from mvpa.mappers.mask import MaskMapper 
  29  from mvpa.datasets.splitters import NFoldSplitter 
  30  from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable 
  31   
  32  from mvpa.clfs.base import Classifier 
  33  from mvpa.misc.transformers import FirstAxisMean 
  34   
  35  from mvpa.measures.base import \ 
  36      BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \ 
  37      MappedClassifierSensitivityAnalyzer, \ 
  38      FeatureSelectionClassifierSensitivityAnalyzer 
  39   
  40  from mvpa.base import warning 
  41   
  42  if __debug__: 
  43      from mvpa.base import debug 
class BoostedClassifier(Classifier, Harvestable):
    """Classifier containing the farm of other classifiers.

    Trains every slave classifier on the same dataset and collects
    their raw predictions. Should rarely be used directly. Use one of
    its childs instead.
    """

    # should not be needed if we have prediction_values upstairs
    # raw_predictions should be handled as Harvestable???
    raw_predictions = StateVariable(enabled=False,
        doc="Predictions obtained from each classifier")

    raw_values = StateVariable(enabled=False,
        doc="Values obtained from each classifier")


    def __init__(self, clfs=None, propagate_states=True,
                 harvest_attribs=None, copy_attribs='copy',
                 **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list
            list of classifier instances to use (slave classifiers)
          propagate_states : bool
            either to propagate enabled states into slave classifiers.
            It is in effect only when slaves get assigned - so if state
            is enabled not during construction, it would not necessarily
            propagate into slaves
          harvest_attribs : list of basestring
            attributes to harvest while looping over trained slave
            classifiers in `_posttrain` (see `Harvestable`)
          copy_attribs : basestring
            how to store harvested attributes (see `Harvestable`)
          kwargs : dict
            dict of keyworded arguments which might get used
            by State or Classifier
        """
        # IDIOM: compare to None with 'is', not '=='
        if clfs is None:
            clfs = []

        Classifier.__init__(self, **kwargs)
        Harvestable.__init__(self, harvest_attribs, copy_attribs)

        self.__clfs = None
        """Pylint friendly definition of __clfs"""

        self.__propagate_states = propagate_states
        """Enable current enabled states in slave classifiers"""

        self._setClassifiers(clfs)
        """Store the list of classifiers"""


    def __repr__(self, prefixes=[]):
        """Literal representation; only the first slave classifier is
        listed to keep the output compact."""
        if self.__clfs is None or len(self.__clfs) == 0:
            prefix_ = []
        else:
            prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
        return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)


    def _train(self, dataset):
        """Train `BoostedClassifier` -- simply train every slave
        classifier on the very same dataset.
        """
        for clf in self.__clfs:
            clf.train(dataset)


    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            # harvest once per slave classifier; `clf` is picked up by
            # Harvestable from locals()
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            # NOTE(review): name mangling makes this
            # _BoostedClassifier__changedData_isset -- presumably intended
            # to shadow base Classifier's retraining flag; confirm against
            # Classifier's retrainable machinery
            self.__changedData_isset = False


    def _getFeatureIds(self):
        """Custom _getFeatureIds for `BoostedClassifier`
        """
        # return union of all used features by slave classifiers
        feature_ids = set([])
        for clf in self.__clfs:
            feature_ids = feature_ids.union(set(clf.feature_ids))
        return list(feature_ids)


    def _predict(self, data):
        """Predict using `BoostedClassifier` -- returns the list of
        per-slave predictions.
        """
        raw_predictions = [ clf.predict(data) for clf in self.__clfs ]
        self.raw_predictions = raw_predictions
        assert(len(self.__clfs)>0)
        if self.states.isEnabled("values"):
            # raw_values can only be provided if every slave exposes 'values'
            if N.array([x.states.isEnabled("values")
                        for x in self.__clfs]).all():
                values = [ clf.values for clf in self.__clfs ]
                self.raw_values = values
            else:
                # BUGFIX: original message was missing the space between
                # "state" and "enabled"
                warning("One or more classifiers in %s has no 'values' state" %
                        self + " enabled, thus BoostedClassifier can't have" +
                        " 'raw_values' state variable defined")

        return raw_predictions


    def _setClassifiers(self, clfs):
        """Set the classifiers used by the boosted classifier

        We have to allow to set list of classifiers after the object
        was actually created. It will be used by
        MulticlassClassifier
        """
        self.__clfs = clfs
        """Classifiers to use"""

        if len(clfs):
            # become a regression if any of the slaves is a regression
            for flag in ['regression']:
                values = N.array([clf.params[flag].value for clf in clfs])
                value = values.any()
                if __debug__:
                    debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
                          "%(clfs)s with %(values)s",
                          msgargs={'flag' : flag, 'value' : value,
                                   'clfs' : clfs,
                                   'values' : values})
                # set flag if it needs to be trained before predicting
                self.params[flag].value = value

        # enable corresponding states in the slave-classifiers
        if self.__propagate_states:
            for clf in self.__clfs:
                clf.states.enable(self.states.enabled, missingok=True)

        # adhere to their capabilities + 'multiclass'
        # XXX do intersection across all classifiers!
        # TODO: this seems to be wrong since it can be regression etc
        self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
        if len(clfs) > 0:
            self._clf_internals += self.__clfs[0]._clf_internals


    def untrain(self):
        """Untrain `BoostedClassifier`

        Has to untrain any known classifier
        """
        if not self.trained:
            return
        for clf in self.clfs:
            clf.untrain()
        super(BoostedClassifier, self).untrain()


    def getSensitivityAnalyzer(self, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return BoostedClassifierSensitivityAnalyzer(
            self,
            **kwargs)


    clfs = property(fget=lambda x:x.__clfs,
                    fset=_setClassifiers,
                    doc="Used classifiers")
208
class ProxyClassifier(Classifier):
    """Classifier which decorates another classifier

    Possible uses:

    - modify data somehow prior training/testing:
      * normalization
      * feature selection
      * modification

    - optimized classifier?

    """

    def __init__(self, clf, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier to wrap -- all train/predict requests are
            proxied to it
        """

        # inherit the regression flag from the slave classifier
        Classifier.__init__(self, regression=clf.regression, **kwargs)

        self.__clf = clf
        """Store the classifier to use."""

        # adhere to slave classifier capabilities
        # TODO: unittest
        self._clf_internals = self._clf_internals[:] + ['meta']
        if clf is not None:
            self._clf_internals += clf._clf_internals


    def __repr__(self, prefixes=[]):
        """Literal representation including the wrapped classifier."""
        return super(ProxyClassifier, self).__repr__(
            ["clf=%s" % repr(self.__clf)] + prefixes)


    def summary(self):
        """Provide summary, appending the slave classifier's summary
        once trained."""
        s = super(ProxyClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifier summary:" + \
                 '\n + %s' % \
                 (self.__clf.summary().replace('\n', '\n |'))
        return s


    def _train(self, dataset):
        """Train `ProxyClassifier`
        """
        # base class does nothing much -- just proxies requests to underlying
        # classifier
        self.__clf.train(dataset)

        # for the ease of access
        # TODO: if to copy we should exclude some states which are defined in
        #       base Classifier (such as training_time, predicting_time)
        # YOH: for now _copy_states_ would copy only set states variables. If
        #      anything needs to be overriden in the parent's class, it is
        #      welcome to do so
        #self.states._copy_states_(self.__clf, deep=False)


    def _predict(self, data):
        """Predict using `ProxyClassifier`
        """
        clf = self.__clf
        # make the slave compute 'values' whenever we were asked for them
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf.predict(data)
        # for the ease of access
        self.states._copy_states_(self.__clf, ['values'], deep=False)
        return result


    def untrain(self):
        """Untrain ProxyClassifier
        """
        # IDIOM: 'is not None' instead of 'not ... is None'
        if self.__clf is not None:
            self.__clf.untrain()
        super(ProxyClassifier, self).untrain()


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        return ProxyClassifierSensitivityAnalyzer(
            self,
            analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)


    clf = property(lambda x:x.__clf, doc="Used `Classifier`")
306
#
# Various combiners for CombinedClassifier
#

class PredictionsCombiner(ClassWithCollections):
    """Base class for combining decisions of multiple classifiers"""

    def train(self, clfs, dataset):
        """Train the combiner, if any training is needed at all.

        The base implementation is a no-op; stateful combiners (such as
        `ClassifierCombiner`) override it.

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's
          dataset : Dataset
            training data in this case
        """
        pass


    def __call__(self, clfs, dataset):
        """Derive a single set of predictions from `clfs`.

        Must be implemented in a derived class.

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's
        """
        raise NotImplementedError
340
341 342 343 -class MaximalVote(PredictionsCombiner):
344 """Provides a decision using maximal vote rule""" 345 346 predictions = StateVariable(enabled=True, 347 doc="Voted predictions") 348 all_label_counts = StateVariable(enabled=False, 349 doc="Counts across classifiers for each label/sample") 350
351 - def __init__(self):
352 """XXX Might get a parameter to use raw decision values if 353 voting is not unambigous (ie two classes have equal number of 354 votes 355 """ 356 PredictionsCombiner.__init__(self)
357 358
359 - def __call__(self, clfs, dataset):
360 """Actuall callable - perform voting 361 362 Extended functionality which might not be needed actually: 363 Since `BinaryClassifier` might return a list of possible 364 predictions (not just a single one), we should consider all of those 365 366 MaximalVote doesn't care about dataset itself 367 """ 368 if len(clfs)==0: 369 return [] # to don't even bother 370 371 all_label_counts = None 372 for clf in clfs: 373 # Lets check first if necessary state variable is enabled 374 if not clf.states.isEnabled("predictions"): 375 raise ValueError, "MaximalVote needs classifiers (such as " + \ 376 "%s) with state 'predictions' enabled" % clf 377 predictions = clf.predictions 378 if all_label_counts is None: 379 all_label_counts = [ {} for i in xrange(len(predictions)) ] 380 381 # for every sample 382 for i in xrange(len(predictions)): 383 prediction = predictions[i] 384 if not operator.isSequenceType(prediction): 385 prediction = (prediction,) 386 for label in prediction: # for every label 387 # XXX we might have multiple labels assigned 388 # but might not -- don't remember now 389 if not all_label_counts[i].has_key(label): 390 all_label_counts[i][label] = 0 391 all_label_counts[i][label] += 1 392 393 predictions = [] 394 # select maximal vote now for each sample 395 for i in xrange(len(all_label_counts)): 396 label_counts = all_label_counts[i] 397 # lets do explicit search for max so we know 398 # if it is unique 399 maxk = [] # labels of elements with max vote 400 maxv = -1 401 for k, v in label_counts.iteritems(): 402 if v > maxv: 403 maxk = [k] 404 maxv = v 405 elif v == maxv: 406 maxk.append(k) 407 408 assert len(maxk) >= 1, \ 409 "We should have obtained at least a single key of max label" 410 411 if len(maxk) > 1: 412 warning("We got multiple labels %s which have the " % maxk + 413 "same maximal vote %d. XXX disambiguate" % maxv) 414 predictions.append(maxk[0]) 415 416 self.all_label_counts = all_label_counts 417 self.predictions = predictions 418 return predictions
419
420 421 422 -class MeanPrediction(PredictionsCombiner):
423 """Provides a decision by taking mean of the results 424 """ 425 426 predictions = StateVariable(enabled=True, 427 doc="Mean predictions") 428
429 - def __call__(self, clfs, dataset):
430 """Actuall callable - perform meaning 431 432 """ 433 if len(clfs)==0: 434 return [] # to don't even bother 435 436 all_predictions = [] 437 for clf in clfs: 438 # Lets check first if necessary state variable is enabled 439 if not clf.states.isEnabled("predictions"): 440 raise ValueError, "MeanPrediction needs classifiers (such " \ 441 " as %s) with state 'predictions' enabled" % clf 442 all_predictions.append(clf.predictions) 443 444 # compute mean 445 predictions = N.mean(N.asarray(all_predictions), axis=0) 446 self.predictions = predictions 447 return predictions
448
class ClassifierCombiner(PredictionsCombiner):
    """Provides a decision using training a classifier on predictions/values

    TODO: implement
    """

    predictions = StateVariable(enabled=True,
        doc="Trained predictions")


    def __init__(self, clf, variables=None):
        """Initialize `ClassifierCombiner`

        :Parameters:
          clf : Classifier
            Classifier to train on the predictions
          variables : list of basestring
            List of state variables stored in 'combined' classifiers, which
            to use as features for training this classifier
        """
        PredictionsCombiner.__init__(self)

        self.__clf = clf
        """Classifier to train on `variables` states of provided classifiers"""

        # IDIOM: compare to None with 'is', not '=='
        if variables is None:
            variables = ['predictions']
        self.__variables = variables
        """What state variables of the classifiers to use"""


    def untrain(self):
        """It might be needed to untrain used classifier"""
        if self.__clf:
            self.__clf.untrain()


    def __call__(self, clfs, dataset):
        """Not implemented yet -- `ClassifierCombiner` is a placeholder.
        """
        if len(clfs) == 0:
            return []                   # to don't even bother

        raise NotImplementedError
493
class CombinedClassifier(BoostedClassifier):
    """`BoostedClassifier` which combines predictions using some
    `PredictionsCombiner` functor.
    """

    def __init__(self, clfs=None, combiner=None, **kwargs):
        """Initialize the instance.

        :Parameters:
          clfs : list of Classifier
            list of classifier instances to use
          combiner : PredictionsCombiner
            callable which takes care about combining multiple
            results into a single one (e.g. maximal vote for
            classification, MeanPrediction for regression))
          kwargs : dict
            dict of keyworded arguments which might get used
            by State or Classifier

        NB: `combiner` might need to operate not on 'predictions' descrete
            labels but rather on raw 'class' values classifiers
            estimate (which is pretty much what is stored under
            `values`
        """
        # IDIOM: compare to None with 'is', not '=='
        if clfs is None:
            clfs = []

        BoostedClassifier.__init__(self, clfs, **kwargs)

        # assign default combiner: vote for classification, mean for
        # regression
        if combiner is None:
            combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
        self.__combiner = combiner
        """Functor destined to combine results of multiple classifiers"""


    def __repr__(self, prefixes=[]):
        """Literal representation of `CombinedClassifier`.
        """
        return super(CombinedClassifier, self).__repr__(
            ["combiner=%s" % repr(self.__combiner)] + prefixes)


    def summary(self):
        """Provide summary for the `CombinedClassifier`.
        """
        s = super(CombinedClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifiers summaries:"
            for i, clf in enumerate(self.clfs):
                s += '\n + %d clf: %s' % \
                     (i, clf.summary().replace('\n', '\n |'))
        return s


    def untrain(self):
        """Untrain `CombinedClassifier`
        """
        # best-effort: combiner might not implement untrain at all.
        # ROBUSTNESS: narrowed from a bare 'except:' so that
        # KeyboardInterrupt/SystemExit are no longer swallowed
        try:
            self.__combiner.untrain()
        except Exception:
            pass
        super(CombinedClassifier, self).untrain()


    def _train(self, dataset):
        """Train `CombinedClassifier`
        """
        BoostedClassifier._train(self, dataset)
        # combiner might need to train as well
        self.__combiner.train(self.clfs, dataset)


    def _predict(self, data):
        """Predict using `CombinedClassifier`
        """
        BoostedClassifier._predict(self, data)
        # combiner will make use of state variables instead of only predictions
        # returned from _predict
        predictions = self.__combiner(self.clfs, data)
        self.predictions = predictions

        if self.states.isEnabled("values"):
            if self.__combiner.states.isActive("values"):
                # XXX or may be we could leave simply up to accessing .combiner?
                self.values = self.__combiner.values
            else:
                if __debug__:
                    warning("Boosted classifier %s has 'values' state enabled,"
                            " but combiner doesn't have 'values' active, thus "
                            " .values cannot be provided directly, access .clfs"
                            % self)
        return predictions


    combiner = property(fget=lambda x:x.__combiner,
                        doc="Used combiner to derive a single result")
592
class TreeClassifier(ProxyClassifier):
    """`TreeClassifier` which allows to create hierarchy of classifiers

    Functions by grouping some labels into a single "meta-label" and training
    classifier first to separate between meta-labels. Then
    each group further proceeds with classification within each group.

    Possible scenarios::

      TreeClassifier(SVM(),
          {'animate':   ((1, 2, 3, 4),
                         TreeClassifier(SVM(),
                             {'human':   (('male', 'female'), SVM()),
                              'animals': (('monkey', 'dog'), SMLR())})),
           'inanimate': ((5, 6, 7, 8), SMLR())})

    would create classifier which would first do binary classification
    to separate animate from inanimate, then for animate result it
    would separate to classify human vs animal and so on::

                       SVM
                      /   \\
               animate   inanimate
                /             \\
              SVM            SMLR
             /   \\         / | \\ \\
         human  animal    5  6  7  8
           |       |
          SVM     SVM
          / \\    /  \\
       male female monkey dog
        1    2      3      4

    """

    _DEV__doc = """
    Questions:
     * how to collect confusion matrices at a particular layer if such
       classifier is given to SplitClassifier or CVTE

     * What additional states to add, something like
        clf_labels  -- store remapped labels for the dataset
        clf_values  ...

     * What do we store into values ? just values from the clfs[]
       for corresponding samples, or top level clf values as well?

     * what should be SensitivityAnalyzer?  by default it would just
       use top slave classifier (i.e. animate/inanimate)

    Problems?
     * .clf is not actually "proxied" per se, so not sure what things
       should be taken care of yet...

    TODO:
     * Allow a group to be just a single category, so no further
        classifier is needed, it just should stay separate from the
        other groups

    Possible TODO:
     *  Add ability to provide results of clf.values as features into
        input of clfs[]. This way we could provide additional 'similarity'
        information to the "other" branch

    """

    def __init__(self, clf, groups, **kwargs):
        """Initialize TreeClassifier

        :Parameters:
          clf : Classifier
            Classifier to separate between the groups
          groups : dict of meta-label: tuple of (tuple of labels, classifier)
            Defines the groups of labels and their classifiers.
            See :class:`~mvpa.clfs.meta.TreeClassifier` for example
        """

        # Basic initialization
        ProxyClassifier.__init__(self, clf, **kwargs)
        self._regressionIsBogus()

        # XXX RF: probably create internal structure with dictionary,
        # not just a tuple, and store all information in there
        # accordingly

        # mapping meta-label -> (labels, classifier); group index order
        # is fixed by _index2group and reused in _predict
        self._groups = groups
        self._index2group = groups.keys()

        # All processing of groups needs to be handled within _train
        # since labels_map is not available here and definition
        # is allowed to carry both symbolic and numeric values for
        # labels

        # We can only assign respective classifiers
        self.clfs = dict([(gk, c) for gk, (ls, c) in groups.iteritems()])
        """Dictionary of classifiers used by the groups"""


    def __repr__(self, prefixes=[]):
        """String representation of TreeClassifier
        """
        prefix = "groups=%s" % repr(self._groups)
        return super(TreeClassifier, self).__repr__([prefix] + prefixes)


    def summary(self):
        """Provide summary for the `TreeClassifier`.
        """
        s = super(TreeClassifier, self).summary()
        if self.trained:
            s += "\n Node classifiers summaries:"
            for i, (clfname, clf) in enumerate(self.clfs.iteritems()):
                s += '\n + %d %s clf: %s' % \
                     (i, clfname, clf.summary().replace('\n', '\n |'))
        return s


    def _train(self, dataset):
        """Train TreeClassifier

        First train .clf on groupped samples, then train each of .clfs
        on a corresponding subset of samples.
        """
        # Local bindings
        clf, clfs, index2group = self.clf, self.clfs, self._index2group

        # Handle groups of labels
        groups = self._groups
        labels_map = dataset.labels_map
        # just for convenience
        if labels_map is None: labels_map = {}
        groups_labels = {}              # just groups with numeric indexes
        label2index = {}                # how to map old labels to new
        known = set()                   # all labels seen so far across groups
        for gi, gk in enumerate(index2group):
            ls = groups[gk][0]
            # if mapping exists -- map (literal labels may have been
            # converted to numeric ones by the dataset)
            ls_ = [labels_map.get(l, l) for l in ls]
            # a label may belong to only one group
            known_already = known.intersection(ls_)
            if len(known_already):
                raise ValueError, "Grouping of labels is not appropriate. " \
                      "Got labels %s already among known in %s. " \
                      "Used labelsmap %s" % (known_already, known, labels_map)
            groups_labels[gk] = ls_     # needed? XXX
            for l in ls_:
                label2index[l] = gi
            known = known.union(ls_)
        # TODO: check if different literal labels weren't mapped into
        #       same numerical but here asked to belong to different groups
        #  yoh: actually above should catch it

        # Check if none of the labels is missing from known groups
        dsul = set(dataset.uniquelabels)
        if known.intersection(dsul) != dsul:
            raise ValueError, \
                  "Dataset %s had some labels not defined in groups: %s. " \
                  "Known are %s" % \
                  (dataset, dsul.difference(known), known)

        # We can operate on the same dataset here
        # Nope: doesn't work nicely with the classifier like kNN
        #       which links to the dataset used in the training,
        #       so whenever if we simply restore labels back, we
        #       would get kNN confused in _predict()
        #       Therefore we need to create a shallow copy of
        #       dataset and provide it with new labels
        ds_group = dataset.copy(deep=False)
        # assign new labels group samples into groups of labels
        ds_group.labels = [label2index[l] for l in dataset.labels]

        # train primary classifier
        if __debug__:
            debug('CLFTREE', "Training primary %(clf)s on %(ds)s",
                  msgargs=dict(clf=clf, ds=ds_group))
        clf.train(ds_group)

        # ??? should we obtain values for anything?
        #     may be we could training values of .clfs to be added
        #     as features to the next level -- i.e. .clfs

        # Proceed with next 'layer' and train all .clfs on corresponding
        # selection of samples
        # ??? should we may be allow additional 'the other' category, to
        #     signal contain all the other categories data? probably not
        #     since then it would lead to undetermined prediction (which
        #     might be not a bad thing altogether...)
        for gk in groups.iterkeys():
            # select samples per each group
            ids = dataset.idsbylabels(groups_labels[gk])
            ds_group = dataset.selectSamples(ids)
            if __debug__:
                debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s",
                      msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group))
            # and train corresponding slave clf
            clfs[gk].train(ds_group)


    def untrain(self):
        """Untrain TreeClassifier
        """
        super(TreeClassifier, self).untrain()
        # untrain all group (slave) classifiers as well
        for clf in self.clfs.values():
            clf.untrain()


    def _predict(self, data):
        """Predict: first route each sample to a group via the primary
        classifier, then predict within that group's slave classifier.
        """
        # Local bindings
        clfs, index2group = self.clfs, self._index2group
        clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data))
        # assure that predictions are indexes, ie int
        clf_predictions = clf_predictions.astype(int)

        # now for predictions pointing to specific groups go into
        # corresponding one
        # NaN marks samples not (yet) predicted by any group
        predictions = N.array([N.nan]*len(data))
        for pred_group in set(clf_predictions):
            gk = index2group[pred_group]
            clf_ = clfs[gk]
            group_indexes = (clf_predictions == pred_group)
            if __debug__:
                debug('CLFTREE', 'Predicting for group %s using %s on %d samples' %
                      (gk, clf_, N.sum(group_indexes)))
            predictions[group_indexes] = clf_.predict(data[group_indexes])
        return predictions
821
class BinaryClassifier(ProxyClassifier):
    """`ProxyClassifier` which maps set of two labels into +1 and -1
    """

    def __init__(self, clf, poslabels, neglabels, **kwargs):
        """
        :Parameters:
          clf : Classifier
            classifier to use
          poslabels : list
            list of labels which are treated as +1 category
          neglabels : list
            list of labels which are treated as -1 category
        """

        ProxyClassifier.__init__(self, clf, **kwargs)

        self._regressionIsBogus()

        # Handle labels
        sposlabels = set(poslabels)     # so to remove duplicates
        sneglabels = set(neglabels)     # so to remove duplicates

        # check if there is no overlap
        overlap = sposlabels.intersection(sneglabels)
        if len(overlap) > 0:
            # BUGFIX: original format string had no %s placeholder, so
            # formatting with 'overlap' raised TypeError instead of the
            # intended ValueError
            raise ValueError("Sets of positive and negative labels for " +
                             "BinaryClassifier must not overlap. Got overlap %s" %
                             overlap)

        self.__poslabels = list(sposlabels)
        self.__neglabels = list(sneglabels)

        # define what values will be returned by predict: if there is
        # a single label - return just it alone, otherwise - whole
        # list
        # Such approach might come useful if we use some classifiers
        # over different subsets of data with some voting later on
        # (1-vs-therest?)

        if len(self.__poslabels) > 1:
            self.__predictpos = self.__poslabels
        else:
            self.__predictpos = self.__poslabels[0]

        if len(self.__neglabels) > 1:
            self.__predictneg = self.__neglabels
        else:
            self.__predictneg = self.__neglabels[0]


    def __repr__(self, prefixes=[]):
        """Literal representation including the pos/neg label sets."""
        prefix = "poslabels=%s, neglabels=%s" % (
            repr(self.__poslabels), repr(self.__neglabels))
        return super(BinaryClassifier, self).__repr__([prefix] + prefixes)


    def _train(self, dataset):
        """Train `BinaryClassifier` -- relabel the dataset to -1/+1 and
        train the slave classifier on it.
        """
        idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
                   [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
        # XXX we have to sort ids since at the moment Dataset.selectSamples
        #     doesn't take care about order
        idlabels.sort()
        # select the samples
        orig_labels = None

        # If we need all samples, why simply not perform on original
        # data, an just store/restore labels. But it really should be done
        # within Dataset.selectSamples
        if len(idlabels) == dataset.nsamples \
            and [x[0] for x in idlabels] == range(dataset.nsamples):
            # the last condition is not even necessary... just overly
            # cautious
            datasetselected = dataset   # no selection is needed
            orig_labels = dataset.labels # but we would need to restore labels
            if __debug__:
                debug('CLFBIN',
                      "Assigned all %d samples for binary " %
                      (dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels))
        else:
            datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
            if __debug__:
                debug('CLFBIN',
                      "Selected %d samples out of %d samples for binary " %
                      (len(idlabels), dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels) +
                      ". Selected %s" % datasetselected)

        # adjust the labels
        datasetselected.labels = [ x[1] for x in idlabels ]

        # now we got a dataset with only 2 labels
        if __debug__:
            assert((datasetselected.uniquelabels == [-1, 1]).all())

        self.clf.train(datasetselected)

        # restore the original labels if we trained on the dataset itself
        if orig_labels is not None:
            dataset.labels = orig_labels


    def _predict(self, data):
        """Predict the labels for a given `data`

        Predicts using binary classifier and spits out list (for each sample)
        where with either poslabels or neglabels as the "label" for the sample.
        If there was just a single label within pos or neg labels then it would
        return not a list but just that single label.
        """
        binary_predictions = ProxyClassifier._predict(self, data)
        self.values = binary_predictions
        predictions = [ {-1: self.__predictneg,
                         +1: self.__predictpos}[x] for x in binary_predictions]
        self.predictions = predictions
        return predictions
942
943 944 945 -class MulticlassClassifier(CombinedClassifier):
946 """`CombinedClassifier` to perform multiclass using a list of 947 `BinaryClassifier`. 948 949 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which 950 is yet to think about) 951 """ 952
953 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
954 """Initialize the instance 955 956 :Parameters: 957 clf : Classifier 958 classifier based on which multiple classifiers are created 959 for multiclass 960 bclf_type 961 "1-vs-1" or "1-vs-all", determines the way to generate binary 962 classifiers 963 """ 964 CombinedClassifier.__init__(self, **kwargs) 965 self._regressionIsBogus() 966 if not clf is None: 967 clf._regressionIsBogus() 968 969 self.__clf = clf 970 """Store sample instance of basic classifier""" 971 972 # Some checks on known ways to do multiclass 973 if bclf_type == "1-vs-1": 974 pass 975 elif bclf_type == "1-vs-all": # TODO 976 raise NotImplementedError 977 else: 978 raise ValueError, \ 979 "Unknown type of classifier %s for " % bclf_type + \ 980 "BoostedMulticlassClassifier" 981 self.__bclf_type = bclf_type
982 983 # XXX fix it up a bit... it seems that MulticlassClassifier should 984 # be actually ProxyClassifier and use BoostedClassifier internally
985 - def __repr__(self, prefixes=[]):
986 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type), 987 repr(self.__clf)) 988 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
989 990
991 - def _train(self, dataset):
992 """Train classifier 993 """ 994 # construct binary classifiers 995 ulabels = dataset.uniquelabels 996 if self.__bclf_type == "1-vs-1": 997 # generate pairs and corresponding classifiers 998 biclfs = [] 999 for i in xrange(len(ulabels)): 1000 for j in xrange(i+1, len(ulabels)): 1001 clf = self.__clf.clone() 1002 biclfs.append( 1003 BinaryClassifier( 1004 clf, 1005 poslabels=[ulabels[i]], neglabels=[ulabels[j]])) 1006 if __debug__: 1007 debug("CLFMC", "Created %d binary classifiers for %d labels" % 1008 (len(biclfs), len(ulabels))) 1009 1010 self.clfs = biclfs 1011 1012 elif self.__bclf_type == "1-vs-all": 1013 raise NotImplementedError 1014 1015 # perform actual training 1016 CombinedClassifier._train(self, dataset)
1017
class SplitClassifier(CombinedClassifier):
    """`BoostedClassifier` to work on splits of the data

    """

    """
    TODO: SplitClassifier and MulticlassClassifier have too much in
    common -- need to refactor: just need a splitter which would
    split dataset in pairs of class labels. MulticlassClassifier
    does just a tiny bit more which might be not necessary at
    all: map sets of labels into 2 categories...
    """

    # TODO: unify with CrossValidatedTransferError which now uses
    # harvest_attribs to expose gathered attributes
    confusion = StateVariable(enabled=False,
        doc="Resultant confusion whenever classifier trained " +
            "on 1 part and tested on 2nd part of each split")

    splits = StateVariable(enabled=False, doc=
       """Store the actual splits of the data. Can be memory expensive""")

    # ??? couldn't be training_confusion since it has other meaning
    #     here, BUT it is named so within CrossValidatedTransferError
    #     -- unify
    #     decided to go with overriding semantics tiny bit. For split
    #     classifier training_confusion would correspond to summary
    #     over training errors across all splits. Later on if need comes
    #     we might want to implement global_training_confusion which would
    #     correspond to overall confusion on full training dataset as it is
    #     done in base Classifier
    #global_training_confusion = StateVariable(enabled=False,
    #    doc="Summary over training confusions acquired at each split")

    def __init__(self, clf, splitter=None, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          splitter : Splitter
            `Splitter` to use to split the dataset prior training.
            If None, a fresh `NFoldSplitter(cvtype=1)` is used.
        """

        CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
        # sample instance of basic classifier, cloned once per split
        self.__clf = clf

        if isinstance(splitter, type):
            raise ValueError(
                "Please provide an instance of a splitter, not a type."
                " Got %s" % splitter)

        # NOTE: previously the default was `splitter=NFoldSplitter(cvtype=1)`
        # in the signature -- a default argument is evaluated once at
        # definition time, so a single splitter instance was silently shared
        # by every SplitClassifier; create a fresh one per instance instead
        if splitter is None:
            splitter = NFoldSplitter(cvtype=1)

        self.__splitter = splitter


    def _train(self, dataset):
        """Train `SplitClassifier`

        Clones the slave classifier once per split, trains each clone on
        the training part of its split, and (if the corresponding states
        are enabled) accumulates testing/training confusions.
        """
        # generate pairs and corresponding classifiers
        bclfs = []

        # local binding
        states = self.states

        clf_template = self.__clf
        if states.isEnabled('confusion'):
            states.confusion = clf_template._summaryClass()
        if states.isEnabled('training_confusion'):
            clf_template.states.enable(['training_confusion'])
            states.training_confusion = clf_template._summaryClass()

        # some classifiers can make use of the testing part of the split
        clf_hastestdataset = hasattr(clf_template, 'testdataset')

        # for proper and easier debugging - first define classifiers and then
        # train them
        for split in self.__splitter.splitcfg(dataset):
            if __debug__:
                debug("CLFSPL_",
                      "Deepcopying %(clf)s for %(sclf)s",
                      msgargs={'clf':clf_template,
                               'sclf':self})
            clf = clf_template.clone()
            bclfs.append(clf)
        self.clfs = bclfs

        self.splits = []

        for i, split in enumerate(self.__splitter(dataset)):
            if __debug__:
                debug("CLFSPL", "Training classifier for split %d" % (i))

            if states.isEnabled("splits"):
                self.splits.append(split)

            clf = self.clfs[i]

            # assign testing dataset if given classifier can digest it
            if clf_hastestdataset:
                clf.testdataset = split[1]

            clf.train(split[0])

            # unbind the testdataset from the classifier
            if clf_hastestdataset:
                clf.testdataset = None

            if states.isEnabled("confusion"):
                predictions = clf.predict(split[1].samples)
                self.confusion.add(split[1].labels, predictions,
                                   clf.states.get('values', None))
                if __debug__:
                    dact = debug.active
                    if 'CLFSPL_' in dact:
                        debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion))
                    elif 'CLFSPL' in dact:
                        debug('CLFSPL', 'Split %d error %.2f%%'
                              % (i, self.confusion.summaries[-1].error))

            if states.isEnabled("training_confusion"):
                states.training_confusion += \
                    clf.states.training_confusion
        # hackish way -- so it should work only for ConfusionMatrix???
        # Best-effort propagation of the labels mapping; not every
        # confusion/dataset flavor provides labels_map.  Narrowed from a
        # bare "except:" so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            if states.isEnabled("confusion"):
                states.confusion.labels_map = dataset.labels_map
            if states.isEnabled("training_confusion"):
                states.training_confusion.labels_map = dataset.labels_map
        except Exception:
            pass


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer for `SplitClassifier`

        :Parameters:
          combiner
            If not provided, FirstAxisMean is assumed
        """
        kwargs.setdefault('combiner', FirstAxisMean)
        return BoostedClassifierSensitivityAnalyzer(
                self,
                analyzer=self.__clf.getSensitivityAnalyzer(**slave_kwargs),
                **kwargs)

    splitter = property(fget=lambda x:x.__splitter,
                        doc="Splitter used by SplitClassifier")
1169
class MappedClassifier(ProxyClassifier):
    """`ProxyClassifier` which applies a mapper to the data prior
    training/testing.

    With a `MaskMapper` only a subset of the features takes part in
    training/classification.  Such classifiers make it easy to populate a
    BoostedClassifier where every member works on its own feature set,
    e.g. a set of best spheres from SearchLight or ROIs selected
    elsewhere.  This differs from simply masking the whole dataset up
    front: here each classifier reaches its own decision first, and the
    members then vote for the final decision across the set of
    classifiers.
    """

    def __init__(self, clf, mapper, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which mask classifiers is created
          mapper
            whatever `Mapper` comes handy
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        # mapper which preps the data for training/classification
        self.__mapper = mapper


    def _train(self, dataset):
        """Train `MappedClassifier`
        """
        # the mapper gets trained first
        # XXX: should training be done using whole dataset or just samples
        # YYY: in some cases labels might be needed, thus better full dataset
        self.__mapper.train(dataset)

        # the slave's train() needs a dataset -- not bare samples -- so map
        # the dataset as a whole before delegating
        mapped_dataset = dataset.applyMapper(featuresmapper=self.__mapper)
        ProxyClassifier._train(self, mapped_dataset)


    def _predict(self, data):
        """Predict using `MappedClassifier`
        """
        # forward-map the samples, then let the slave classifier decide
        mapped_data = self.__mapper.forward(data)
        return ProxyClassifier._predict(self, mapped_data)


    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer"""
        slave_analyzer = self.clf.getSensitivityAnalyzer(**slave_kwargs)
        return MappedClassifierSensitivityAnalyzer(
            self, analyzer=slave_analyzer, **kwargs)


    mapper = property(lambda self: self.__mapper, doc="Used mapper")
1230
class FeatureSelectionClassifier(ProxyClassifier):
    """`ProxyClassifier` which uses some `FeatureSelection` prior training.

    `FeatureSelection` is used first to select features for the classifier to
    use for prediction. Internally it would rely on MappedClassifier which
    would use created MaskMapper.

    TODO: think about removing overhead of retraining the same classifier if
    feature selection was carried out with the same classifier already. It
    has been addressed by adding .trained property to classifier, but now
    we should explicitly use isTrained here if we want... need to think more
    """

    # capability tags: this meta classifier performs feature selection itself
    _clf_internals = [ 'does_feature_selection', 'meta' ]

    def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which mask classifiers is created
          feature_selection : FeatureSelection
            whatever `FeatureSelection` comes handy
          testdataset : Dataset
            optional dataset which would be given on call to feature_selection
        """
        ProxyClassifier.__init__(self, clf, **kwargs)

        self.__maskclf = None
        """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""

        self.__feature_selection = feature_selection
        """`FeatureSelection` to select the features prior training"""

        self.__testdataset = testdataset
        """`FeatureSelection` might like to use testdataset"""


    def untrain(self):
        """Untrain `FeatureSelectionClassifier`

        Has to untrain any known classifier
        """
        # the feature selection may carry trained state of its own
        if self.__feature_selection is not None:
            self.__feature_selection.untrain()
        # nothing more to reset if we were never trained
        if not self.trained:
            return
        if not self.__maskclf is None:
            self.__maskclf.untrain()
        super(FeatureSelectionClassifier, self).untrain()


    def _train(self, dataset):
        """Train `FeatureSelectionClassifier`

        Runs the feature selection on `dataset` (and the optional
        testdataset), builds a `MaskMapper` over the selected feature ids,
        and trains the slave classifier on the reduced dataset via a
        freshly created `MappedClassifier`.
        """
        # temporarily enable selected_ids
        self.__feature_selection.states._changeTemporarily(
            enable_states=["selected_ids"])

        if __debug__:
            debug("CLFFS", "Performing feature selection using %s" %
                  self.__feature_selection + " on %s" % dataset)

        # wdataset: dataset reduced to the selected features;
        # tdataset: correspondingly reduced testdataset (unused below)
        (wdataset, tdataset) = self.__feature_selection(dataset,
                                                        self.__testdataset)
        if __debug__:
            add_ = ""
            if "CLFFS_" in debug.active:
                add_ = " Selected features: %s" % \
                       self.__feature_selection.selected_ids
            debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
                  "%(dsnfeat)d features.%(app)s",
                  msgargs={'fs':self.__feature_selection,
                           'nfeat':wdataset.nfeatures,
                           'dsnfeat':dataset.nfeatures,
                           'app':add_})

        # create a mask to devise a mapper
        # TODO -- think about making selected_ids a MaskMapper
        mappermask = N.zeros(dataset.nfeatures)
        mappermask[self.__feature_selection.selected_ids] = 1
        mapper = MaskMapper(mappermask)

        # restore the state-enabling done at the top of this method
        self.__feature_selection.states._resetEnabledTemporarily()

        # create and assign `MappedClassifier`
        self.__maskclf = MappedClassifier(self.clf, mapper)
        # we could have called self.__clf.train(dataset), but it would
        # cause unnecessary masking
        self.__maskclf.clf.train(wdataset)

        # for the ease of access
        # TODO see for ProxyClassifier
        #self.states._copy_states_(self.__maskclf, deep=False)

    def _getFeatureIds(self):
        """Return used feature ids for `FeatureSelectionClassifier`

        """
        return self.__feature_selection.selected_ids

    def _predict(self, data):
        """Predict using `FeatureSelectionClassifier`

        Delegates to the internal `MappedClassifier`, which forward-maps
        `data` through the mask mapper before predicting.
        """
        clf = self.__maskclf
        # mirror our own 'values' request onto the slave so it gets computed
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf._predict(data)
        # for the ease of access
        self.states._copy_states_(clf, ['values'], deep=False)
        return result

    def setTestDataset(self, testdataset):
        """Set testing dataset to be used for feature selection
        """
        self.__testdataset = testdataset

    maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
    feature_selection = property(lambda x:x.__feature_selection,
                                 doc="Used `FeatureSelection`")

    @group_kwargs(prefixes=['slave_'], passthrough=True)
    def getSensitivityAnalyzer(self, slave_kwargs, **kwargs):
        """Return an appropriate SensitivityAnalyzer

        had to clone from mapped classifier???
        """
        return FeatureSelectionClassifierSensitivityAnalyzer(
            self,
            analyzer=self.clf.getSensitivityAnalyzer(**slave_kwargs),
            **kwargs)



    testdataset = property(fget=lambda x:x.__testdataset,
                           fset=setTestDataset)
1370