1
2
3
4
5
6
7
8
9 """Base classes for all classifiers.
10
11 Base Classifiers can be grouped according to their function as
12
13 :group Basic Classifiers: Classifier BoostedClassifier ProxyClassifier
14 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
15 SplitClassifier
16 :group ProxyClassifiers: BinaryClassifier MappedClassifier
17 FeatureSelectionClassifier
18 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
19 MaximalVote MeanPrediction
20
21 """
22
23 __docformat__ = 'restructuredtext'
24
25 import operator, sys
26 import numpy as N
27
28
29
30
# Both branches of the former ``sys.version_info`` check imported the
# very same name, so the conditional was dead -- import unconditionally.
from mvpa.misc.copy import deepcopy
35
36 import time
37 from sets import Set
38
39 from mvpa.misc.args import group_kwargs
40 from mvpa.misc.support import idhash
41 from mvpa.mappers.mask import MaskMapper
42 from mvpa.datasets.splitter import NFoldSplitter
43 from mvpa.misc.state import StateVariable, Stateful, Harvestable, Parametrized
44 from mvpa.misc.param import Parameter
45
46 from mvpa.clfs.transerror import ConfusionMatrix, RegressionStatistics
47 from mvpa.misc.transformers import FirstAxisMean, SecondAxisSumOfAbs
48
49 from mvpa.measures.base import \
50 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
51 MappedClassifierSensitivityAnalyzer
52 from mvpa.base import warning
53
54 if __debug__:
55 import traceback
56 from mvpa.base import debug
60 """Deepcopying of a classifier.
61
62 If deepcopy fails -- tries to untrain it first so that there is no
63 swig bindings attached
64 """
65 try:
66 return deepcopy(clf)
67 except:
68 clf.untrain()
69 return deepcopy(clf)
70
73 """Abstract classifier class to be inherited by all classifiers
74 """
75
76
77
78 _DEV__doc__ = """
79 Required behavior:
80
81 For every classifier is has to be possible to be instanciated without
82 having to specify the training pattern.
83
84 Repeated calls to the train() method with different training data have to
85 result in a valid classifier, trained for the particular dataset.
86
87 It must be possible to specify all classifier parameters as keyword
88 arguments to the constructor.
89
90 Recommended behavior:
91
92 Derived classifiers should provide access to *values* -- i.e. that
93 information that is finally used to determine the predicted class label.
94
95 Michael: Maybe it works well if each classifier provides a 'values'
96 state member. This variable is a list as long as and in same order
97 as Dataset.uniquelabels (training data). Each item in the list
98 corresponds to the likelyhood of a sample to belong to the
99 respective class. However the sematics might differ between
100 classifiers, e.g. kNN would probably store distances to class-
101 neighbours, where PLR would store the raw function value of the
102 logistic function. So in the case of kNN low is predictive and for
103 PLR high is predictive. Don't know if there is the need to unify
104 that.
105
106 As the storage and/or computation of this information might be
107 demanding its collection should be switchable and off be default.
108
109 Nomenclature
110 * predictions : corresponds to the quantized labels if classifier spits
111 out labels by .predict()
112 * values : might be different from predictions if a classifier's predict()
113 makes a decision based on some internal value such as
114 probability or a distance.
115 """
116
117
118
119
120
121
122
123
124
    # State variables populated during train()/predict().  The `enabled`
    # defaults reflect how costly each one is to collect.

    trained_labels = StateVariable(enabled=True,
        doc="Set of unique labels it has been trained on")

    trained_dataset = StateVariable(enabled=False,
        doc="The dataset it has been trained on")

    training_confusion = StateVariable(enabled=False,
        doc="Confusion matrix of learning performance")

    predictions = StateVariable(enabled=True,
        doc="Most recent set of predictions")

    values = StateVariable(enabled=True,
        doc="Internal classifier values the most recent " +
            "predictions are based on")

    training_time = StateVariable(enabled=True,
        doc="Time (in seconds) which took classifier to train")

    predicting_time = StateVariable(enabled=True,
        doc="Time (in seconds) which took classifier to predict")

    feature_ids = StateVariable(enabled=False,
        doc="Feature IDS which were used for the actual training.")

    # NOTE(review): this is a class-level (shared) list; __init__ appends
    # 'binary' to it in-place, which mutates it for the whole class --
    # confirm that subclasses always rebind rather than rely on it.
    _clf_internals = []
    """Describes some specifics about the classifier -- is that it is
    doing regression for instance...."""

    regression = Parameter(False, allowedtype='bool',
        doc="""Either to use 'regression' as regression. By default any
        Classifier-derived class serves as a classifier, so regression
        does binary classification. TODO:""")

    retrainable = Parameter(False, allowedtype='bool',
        doc="""Either to enable retraining for 'retrainable' classifier.
        TODO: make it available only for actually retrainable classifiers""")
163
    def __init__(self, **kwargs):
        """Cheap initialization.

        :Parameters:
          kwargs : dict
            Passed on to `Parametrized` (parameters and state arguments).
        """
        Parametrized.__init__(self, **kwargs)

        self.__trainednfeatures = None
        """Stores number of features for which classifier was trained.
        If None -- it wasn't trained at all"""

        # initialize the retrainability machinery; force=True since this
        # is the very first assignment of the parameter
        self._setRetrainable(self.params.retrainable, force=True)

        if self.params.regression:
            # regression has no finite label set -- disable
            # classification-only states
            for statevar in [ "trained_labels"]:
                if self.states.isEnabled(statevar):
                    if __debug__:
                        debug("CLF",
                              "Disabling state %s since doing regression, " %
                              statevar + "not classification")
                    self.states.disable(statevar)
            self._summaryClass = RegressionStatistics
        else:
            self._summaryClass = ConfusionMatrix
            if 'regression' in self._clf_internals:
                # regression-capable classifier used as a classifier can
                # do only binary classification.
                # NOTE(review): appends to the class-level _clf_internals
                # list in-place -- confirm that is intended
                self._clf_internals.append('binary')
197
    def __str__(self):
        """String representation; verbose (repr plus states) only when
        the 'CLF_' debug target is active."""
        if __debug__ and 'CLF_' in debug.active:
            return "%s / %s" % (repr(self), super(Classifier, self).__str__())
        else:
            return repr(self)
203
206
207
    def _pretrain(self, dataset):
        """Functionality prior to training

        :Parameters:
          dataset : Dataset
            Data the classifier is about to be trained on
        """
        params = self.params
        if not params.retrainable:
            # plain classifier -- drop any previous trained state
            self.untrain()
        else:
            # retrainable -- just reset states and figure out what (if
            # anything) changed since the previous training
            self.states.reset()
            if not self.__changedData_isset:
                self.__resetChangedData()
                _changedData = self._changedData
                __idhashes = self.__idhashes
                __invalidatedChangedData = self.__invalidatedChangedData

                if __debug__:
                    debug('CLF_', "IDHashes are %s" % (__idhashes))

                # detect changes in training samples and labels by idhash
                for key, data_ in (('traindata', dataset.samples),
                                   ('labels', dataset.labels)):
                    _changedData[key] = self.__wasDataChanged(key, data_)
                    # data explicitly invalidated by a previous retrain()
                    # counts as changed even if the idhash matches
                    if __invalidatedChangedData.get(key, False):
                        if __debug__ and not _changedData[key]:
                            debug('CLF_', 'Found that idhash for %s was '
                                  'invalidated by retraining' % key)
                        _changedData[key] = True

                # detect changed parameters per parameter collection
                for col in self._paramscols:
                    changedParams = self._collections[col].whichSet()
                    if len(changedParams):
                        _changedData[col] = changedParams

                # invalidation notes are consumed by this training
                self.__invalidatedChangedData = {}

                if __debug__:
                    debug('CLF_', "Obtained _changedData is %s" % (self._changedData))

        if not params.regression and 'regression' in self._clf_internals \
           and not self.states.isEnabled('trained_labels'):
            # a regression-capable classifier used for classification
            # needs the labels it was trained on to quantize predictions
            if __debug__:
                debug("CLF", "Enabling trained_labels state since it is needed")
            self.states.enable('trained_labels')
260
261
262 - def _posttrain(self, dataset):
263 """Functionality post training
264
265 For instance -- computing confusion matrix
266 :Parameters:
267 dataset : Dataset
268 Data which was used for training
269 """
270 if self.states.isEnabled('trained_labels'):
271 self.trained_labels = dataset.uniquelabels
272
273 self.trained_dataset = dataset
274
275
276 self.__trainednfeatures = dataset.nfeatures
277
278
279
280
281 if __debug__ and 'CHECK_TRAINED' in debug.active:
282 self.__trainedidhash = dataset.idhash
283
284 if self.states.isEnabled('training_confusion') and \
285 not self.states.isSet('training_confusion'):
286
287
288 self.states._changeTemporarily(
289 disable_states=["predictions"])
290 if self.params.retrainable:
291
292
293
294
295
296 self.__changedData_isset = False
297 predictions = self.predict(dataset.samples)
298 self.states._resetEnabledTemporarily()
299 self.training_confusion = self._summaryClass(
300 targets=dataset.labels,
301 predictions=predictions)
302
303 try:
304 self.training_confusion.labels_map = dataset.labels_map
305 except:
306 pass
307
308 if self.states.isEnabled('feature_ids'):
309 self.feature_ids = self._getFeatureIds()
310
311
    def _getFeatureIds(self):
        """Virtual method to return feature_ids used while training

        Is not intended to be called anywhere but from _posttrain,
        thus classifier is assumed to be trained at this point
        """
        # by default -- all features are used
        return range(self.__trainednfeatures)
320
321
    def summary(self):
        """Providing summary over the classifier

        Returns a human-readable multi-line string describing training
        status and whichever states happen to be set.
        """
        s = "Classifier %s" % self
        states = self.states
        states_enabled = states.enabled

        if self.trained:
            s += "\n trained"
            if states.isSet('training_time'):
                s += ' in %.3g sec' % states.training_time
            s += ' on data with'
            if states.isSet('trained_labels'):
                s += ' labels:%s' % list(states.trained_labels)
            if states.isSet('trained_dataset'):
                td = states.trained_dataset
                s += ' #samples:%d #chunks:%d' % (td.nsamples, len(td.uniquechunks))
            s += " #features:%d" % self.__trainednfeatures
            if states.isSet('feature_ids'):
                s += ", used #features:%d" % len(states.feature_ids)
            if states.isSet('training_confusion'):
                s += ", training error:%.3g" % states.training_confusion.error
        else:
            s += "\n not yet trained"

        if len(states_enabled):
            s += "\n enabled states:%s" % ', '.join([str(states[x]) for x in states_enabled])
        return s
350
351
    def _train(self, dataset):
        """Function to be actually overriden in derived classes
        """
        raise NotImplementedError
356
357
    def train(self, dataset):
        """Train classifier on a dataset

        Shouldn't be overriden in subclasses unless explicitely needed
        to do so

        :Parameters:
          dataset : Dataset
            Data to train on
        """
        if __debug__:
            debug("CLF", "Training classifier %(clf)s on dataset %(dataset)s",
                  msgargs={'clf':self, 'dataset':dataset})

        self._pretrain(dataset)

        # remember the time when training started
        t0 = time.time()

        if dataset.nfeatures > 0:
            result = self._train(dataset)
        else:
            # degenerate dataset -- warn and skip the actual training
            warning("Trying to train on dataset with no features present")
            if __debug__:
                debug("CLF",
                      "No features present for training, no actual training " \
                      "is called")
            result = None

        self.training_time = time.time() - t0
        self._posttrain(dataset)
        return result
389
390
    def _prepredict(self, data):
        """Functionality prior prediction

        :Parameters:
          data : array-like
            Samples to be predicted (2D: samples x features)
        """
        if not ('notrain2predict' in self._clf_internals):
            # check if classifier was trained at all
            if not self.trained:
                raise ValueError, \
                      "Classifier %s wasn't yet trained, therefore can't " \
                      "predict" % self
            nfeatures = data.shape[1]
            # dimensionality must match the training data
            if nfeatures != self.__trainednfeatures:
                raise ValueError, \
                      "Classifier %s was trained on data with %d features, " % \
                      (self, self.__trainednfeatures) + \
                      "thus can't predict for %d features" % nfeatures

        if self.params.retrainable:
            # track whether the test data changed since last predict
            if not self.__changedData_isset:
                self.__resetChangedData()
                _changedData = self._changedData
                _changedData['testdata'] = \
                    self.__wasDataChanged('testdata', data)
                if __debug__:
                    debug('CLF_', "prepredict: Obtained _changedData is %s" % (_changedData))
418
419
    def _postpredict(self, data, result):
        """Functionality after prediction is computed

        Stores `result` in the 'predictions' state.
        """
        self.predictions = result
        if self.params.retrainable:
            # the next predict() must re-evaluate what changed
            self.__changedData_isset = False
426
    def _predict(self, data):
        """Actual prediction

        To be overriden in derived classes.
        """
        raise NotImplementedError
431
432
    def predict(self, data):
        """Predict classifier on data

        Shouldn't be overriden in subclasses unless explicitely needed
        to do so. Also subclasses trying to call super class's predict
        should call _predict if within _predict instead of predict()
        since otherwise it would loop

        :Parameters:
          data : array-like
            Samples to be predicted (2D: samples x features)
        """
        data = N.asarray(data)
        if __debug__:
            debug("CLF", "Predicting classifier %(clf)s on data %(data)s",
                  msgargs={'clf':self, 'data':data.shape})

        # remember when prediction started
        t0 = time.time()

        states = self.states
        # assure that those are reset (could have been set e.g. while
        # computing training_confusion)
        states.reset(['values', 'predictions'])

        self._prepredict(data)

        if self.__trainednfeatures > 0 \
               or 'notrain2predict' in self._clf_internals:
            result = self._predict(data)
        else:
            warning("Trying to predict using classifier trained on no features")
            if __debug__:
                debug("CLF",
                      "No features were present for training, prediction is " \
                      "bogus")
            result = [None]*data.shape[0]

        states.predicting_time = time.time() - t0

        if 'regression' in self._clf_internals and not self.params.regression:
            # regression-capable classifier used as a classifier:
            # quantize raw regression output into the labels seen at
            # training time (nearest trained label wins)
            result_ = N.array(result)
            if states.isEnabled('values'):
                if not states.isSet('values'):
                    states.values = result_.copy()
                else:
                    # NOTE(review): presumably a guard against 'values'
                    # being a non-copied view into result -- confirm
                    states.values = states.values.copy()

            trained_labels = self.trained_labels
            for i, value in enumerate(result):
                dists = N.abs(value - trained_labels)
                result[i] = trained_labels[N.argmin(dists)]

            if __debug__:
                debug("CLF_", "Converted regression result %(result_)s "
                      "into labels %(result)s for %(self_)s",
                      msgargs={'result_':result_, 'result':result,
                               'self_': self})

        self._postpredict(data, result)
        return result
507
508
510 """Either classifier was already trained.
511
512 MUST BE USED WITH CARE IF EVER"""
513 if dataset is None:
514
515 return not self.__trainednfeatures is None
516 else:
517 res = (self.__trainednfeatures == dataset.nfeatures)
518 if __debug__ and 'CHECK_TRAINED' in debug.active:
519 res2 = (self.__trainedidhash == dataset.idhash)
520 if res2 != res:
521 raise RuntimeError, \
522 "isTrained is weak and shouldn't be relied upon. " \
523 "Got result %b although comparing of idhash says %b" \
524 % (res, res2)
525 return res
526
527
    def _regressionIsBogus(self):
        """Some classifiers like BinaryClassifier can't be used for
        regression"""

        if self.params.regression:
            raise ValueError, "Regression mode is meaningless for %s" % \
                  self.__class__.__name__ + " thus don't enable it"
535
536
    @property
    def trained(self):
        """Either classifier was already trained"""
        return self.isTrained()
541
    def untrain(self):
        """Reset trained state"""
        self.__trainednfeatures = None
        # NOTE(review): the retrainable bookkeeping (idhashes etc) is not
        # cleared here, only states via the parent's reset -- confirm
        # that is intended
        super(Classifier, self).reset()
552
553
    def getSensitivityAnalyzer(self, **kwargs):
        """Factory method to return an appropriate sensitivity analyzer for
        the respective classifier."""
        raise NotImplementedError
558
559
560
561
562
    def _setRetrainable(self, value, force=False):
        """Assign value of retrainable parameter

        If retrainable flag is to be changed, classifier has to be
        untrained.  Also internal attributes such as _changedData,
        __changedData_isset, and __idhashes should be initialized if
        it becomes retrainable

        :Parameters:
          value : bool
            New value for the 'retrainable' parameter
          force : bool
            Perform (re)initialization even if value did not change
            (used from the constructor)
        """
        pretrainable = self.params['retrainable']
        if (force or value != pretrainable.value) and 'retrainable' in self._clf_internals:
            if __debug__:
                debug("CLF_", "Setting retrainable to %s" % value)
            if 'meta' in self._clf_internals:
                warning("Retrainability is not yet crafted/tested for "
                        "meta classifiers. Unpredictable behavior might occur")
            # do not drag any stale trained state behind
            if self.trained:
                self.untrain()
            states = self.states
            if not value and states.isKnown('retrained'):
                states.remove('retrained')
                states.remove('repredicted')
            if value:
                if not 'retrainable' in self._clf_internals:
                    # NOTE(review): this branch looks unreachable -- the
                    # enclosing condition already requires 'retrainable'
                    # in _clf_internals; confirm intent
                    warning("Setting of flag retrainable for %s has no effect"
                            " since classifier has no such capability. It would"
                            " just lead to resources consumption and slowdown"
                            % self)
                states.add(StateVariable(enabled=True,
                        name='retrained',
                        doc="Either retrainable classifier was retrained"))
                states.add(StateVariable(enabled=True,
                        name='repredicted',
                        doc="Either retrainable classifier was repredicted"))

            pretrainable.value = value

            # if retrainable we need bookkeeping of what was seen
            if value:
                self.__idhashes = {'traindata': None, 'labels': None,
                                   'testdata': None}
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    # keep copies of the data themselves so changes can be
                    # verified by value, not just by (weak) idhash
                    self.__trained = self.__idhashes.copy()
                self.__resetChangedData()
                self.__invalidatedChangedData = {}
            elif 'retrainable' in self._clf_internals:
                # drop the bookkeeping attributes
                self.__changedData_isset = False
                self._changedData = None
                self.__idhashes = None
                if __debug__ and 'CHECK_RETRAIN' in debug.active:
                    self.__trained = None
619
    def __resetChangedData(self):
        """For retrainable classifier we keep track of what was changed
        This function resets that dictionary
        """
        if __debug__:
            debug('CLF_', 'Resetting flags on either data was changed (for retrainable)')
        # one flag per tracked data entry plus per parameter collection
        keys = self.__idhashes.keys() + self._paramscols
        self._changedData = dict(zip(keys, [False]*len(keys)))
        self.__changedData_isset = False
634
635
    def __wasDataChanged(self, key, entry, update=True):
        """Check if given entry was changed from what known prior. If so -- store

        needed only for retrainable beastie

        :Parameters:
          key : basestring
            Which tracked entry ('traindata', 'labels', 'testdata')
          entry
            The actual data to compare against the stored idhash
          update : bool
            Whether to store the new idhash (and, under CHECK_RETRAIN,
            the data itself) for future comparisons
        """
        idhash_ = idhash(entry)
        __idhashes = self.__idhashes

        changed = __idhashes[key] != idhash_
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            # cross-check the (weak) idhash verdict against a full
            # comparison with the stored copy of the data
            __trained = self.__trained
            changed2 = entry != __trained[key]
            if isinstance(changed2, N.ndarray):
                changed2 = changed2.any()
            if changed != changed2 and not changed:
                raise RuntimeError, \
                  'idhash found to be weak for %s. Though hashid %s!=%s %s, '\
                  'values %s!=%s %s' % \
                  (key, idhash_, __idhashes[key], changed,
                   entry, __trained[key], changed2)
            if update:
                __trained[key] = entry

        if __debug__ and changed:
            debug('CLF_', "Changed %s from %s to %s.%s"
                      % (key, __idhashes[key], idhash_,
                         ('','updated')[int(update)]))
        if update:
            __idhashes[key] = idhash_

        return changed
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
    def retrain(self, dataset, **kwargs):
        """Helper to avoid check if data was changed actually changed

        Useful if just some aspects of classifier were changed since
        its previous training. For instance if dataset wasn't changed
        but only classifier parameters, then kernel matrix does not
        have to be computed.

        Words of caution: classifier must be previousely trained,
        results always should first be compared to the results on not
        'retrainable' classifier (without calling retrain). Some
        additional checks are enabled if debug id 'CHECK_RETRAIN' is
        enabled, to guard against obvious mistakes.

        :Parameters:
          kwargs
            that is what _changedData gets updated with. So, smth like
            ``(params=['C'], labels=True)`` if parameter C and labels
            got changed
        """
        if __debug__:
            if not self.params.retrainable:
                raise RuntimeError, \
                      "Do not use retrain/repredict on non-retrainable classifiers"

            if kwargs.has_key('params') or kwargs.has_key('kernel_params'):
                raise ValueError, "Retraining for changed params yet not working"

        self.__resetChangedData()

        # local bindings
        chd = self._changedData
        ichd = self.__invalidatedChangedData

        # caller tells us explicitly what changed
        chd.update(kwargs)

        # mark everything the caller declared changed as invalidated, so
        # a subsequent plain train() does not trust a matching idhash
        for key, value in kwargs.iteritems():
            if value: ichd[key] = True
        self.__changedData_isset = True

        # guard against the caller fooling us: verify that data claimed
        # unchanged really did not change
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            for key, data_ in (('traindata', dataset.samples),
                               ('labels', dataset.labels)):
                if not chd[key] and not ichd.get(key, False):
                    if self.__wasDataChanged(key, data_, update=False):
                        raise RuntimeError, \
                              "Data %s found changed although wasn't " \
                              "labeled as such" % key

        # check that shapes at least agree with the previous training
        if __debug__ and 'CHECK_RETRAIN' in debug.active and self.trained \
               and not self._changedData['traindata'] \
               and self.__trained['traindata'].shape != dataset.samples.shape:
            raise ValueError, "In retrain got dataset with %s size, " \
                  "whenever previousely was trained on %s size" \
                  % (dataset.samples.shape, self.__trained['traindata'].shape)
        self.train(dataset)
764
765
    def repredict(self, data, **kwargs):
        """Helper to avoid check if data was changed actually changed

        Useful if classifier was (re)trained but with the same data
        (so just parameters were changed), so that it could be
        repredicted easily (on the same data as before) without
        recomputing for instance train/test kernel matrix. Should be
        used with caution and always compared to the results on not
        'retrainable' classifier. Some additional checks are enabled
        if debug id 'CHECK_RETRAIN' is enabled, to guard against
        obvious mistakes.

        :Parameters:
          data
            data which is conventionally given to predict
          kwargs
            that is what _changedData gets updated with. So, smth like
            ``(params=['C'], labels=True)`` if parameter C and labels
            got changed
        """
        if len(kwargs)>0:
            raise RuntimeError, \
                  "repredict for now should be used without params since " \
                  "it makes little sense to repredict if anything got changed"
        if __debug__ and not self.params.retrainable:
            raise RuntimeError, \
                  "Do not use retrain/repredict on non-retrainable classifiers"

        self.__resetChangedData()
        chd = self._changedData
        chd.update(**kwargs)
        self.__changedData_isset = True

        # guard against being fooled: the test data must really be the
        # same as before
        if __debug__ and 'CHECK_RETRAIN' in debug.active:
            for key, data_ in (('testdata', data),):
                if self.__wasDataChanged(key, data_, update=False):
                    raise RuntimeError, \
                          "Data %s found changed although wasn't " \
                          "labeled as such" % key

        # check that the shape agrees with the previous prediction
        if __debug__ and 'CHECK_RETRAIN' in debug.active \
               and not self._changedData['testdata'] \
               and self.__trained['testdata'].shape != data.shape:
            raise ValueError, "In repredict got dataset with %s size, " \
                  "whenever previousely was trained on %s size" \
                  % (data.shape, self.__trained['testdata'].shape)

        return self.predict(data)
820
821
822
823
824
825
826
827
828
829
830
class BoostedClassifier(Classifier, Harvestable):
    """Classifier containing the farm of other classifiers.

    Should rarely be used directly. Use one of its childs instead
    """

    # per-slave results, collected by subclasses' _predict
    raw_predictions = StateVariable(enabled=False,
        doc="Predictions obtained from each classifier")

    raw_values = StateVariable(enabled=False,
        doc="Values obtained from each classifier")
845
846 - def __init__(self, clfs=None, propagate_states=True,
847 harvest_attribs=None, copy_attribs='copy',
848 **kwargs):
849 """Initialize the instance.
850
851 :Parameters:
852 clfs : list
853 list of classifier instances to use (slave classifiers)
854 propagate_states : bool
855 either to propagate enabled states into slave classifiers.
856 It is in effect only when slaves get assigned - so if state
857 is enabled not during construction, it would not necessarily
858 propagate into slaves
859 harvest_attribs : list of basestr
860 What attributes of call to store and return within
861 harvested state variable
862 copy_attribs : None or basestr
863 Force copying values of attributes on harvesting
864 kwargs : dict
865 dict of keyworded arguments which might get used
866 by State or Classifier
867 """
868 if clfs == None:
869 clfs = []
870
871 Classifier.__init__(self, **kwargs)
872 Harvestable.__init__(self, harvest_attribs, copy_attribs)
873
874 self.__clfs = None
875 """Pylint friendly definition of __clfs"""
876
877 self.__propagate_states = propagate_states
878 """Enable current enabled states in slave classifiers"""
879
880 self._setClassifiers(clfs)
881 """Store the list of classifiers"""
882
883
885 if self.__clfs is None or len(self.__clfs)==0:
886
887 prefix_ = []
888 else:
889 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
890 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
891
892
894 """Train `BoostedClassifier`
895 """
896 for clf in self.__clfs:
897 clf.train(dataset)
898
899
    def _posttrain(self, dataset):
        """Custom posttrain of `BoostedClassifier`

        Harvest over the trained classifiers if it was asked to so
        """
        Classifier._posttrain(self, dataset)
        if self.states.isEnabled('harvested'):
            # harvest once per slave; `clf` is picked up via locals()
            for clf in self.__clfs:
                self._harvest(locals())
        if self.params.retrainable:
            # NOTE(review): due to name mangling this assigns
            # _BoostedClassifier__changedData_isset, not the parent
            # Classifier's private flag -- confirm this is intended
            self.__changedData_isset = False
911
912
921
922
941
942
    def _setClassifiers(self, clfs):
        """Set the classifiers used by the boosted classifier

        We have to allow to set list of classifiers after the object
        was actually created. It will be used by
        MulticlassClassifier
        """
        self.__clfs = clfs
        """Classifiers to use"""

        # propagate relevant parameters from the slaves: if any slave
        # does regression, so does the boosted classifier
        for flag in ['regression']:
            values = N.array([clf.params[flag].value for clf in self.__clfs])
            value = values.any()
            if __debug__:
                debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
                      "%(clfs)s with %(values)s",
                      msgargs={'flag' : flag, 'value' : value,
                               'clfs' : self.__clfs,
                               'values' : values})
            self.params[flag].value = value

        # propagate currently enabled states into the slaves
        if self.__propagate_states:
            for clf in self.__clfs:
                clf.states.enable(self.states.enabled, missingok=True)

        # adopt capabilities of the first slave
        # NOTE(review): taking the first slave's internals only --
        # presumably an intersection across all slaves was intended
        self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
        if len(clfs)>0:
            self._clf_internals += self.__clfs[0]._clf_internals
986
992
993
    # read/write access to the slaves; the setter re-derives flags and
    # capabilities via _setClassifiers
    clfs = property(fget=lambda x:x.__clfs,
                    fset=_setClassifiers,
                    doc="Used classifiers")
997
1001 """Classifier which decorates another classifier
1002
1003 Possible uses:
1004
1005 - modify data somehow prior training/testing:
1006 * normalization
1007 * feature selection
1008 * modification
1009
1010 - optimized classifier?
1011
1012 """
1013
1032
1033
1037
    def summary(self):
        """Provide the summary, extended with the slave classifier's one."""
        s = super(ProxyClassifier, self).summary()
        if self.trained:
            s += "\n Slave classifier summary:" + \
                 '\n + %s' % \
                 (self.__clf.summary().replace('\n', '\n |'))
        return s
1045
1046
1047
    def _train(self, dataset):
        """Train `ProxyClassifier`

        Simply delegates the training to the slave classifier.
        """
        self.__clf.train(dataset)
1063
    def _predict(self, data):
        """Predict using `ProxyClassifier`

        Delegates to the slave classifier and mirrors its 'values'
        state for the ease of access.
        """
        clf = self.__clf
        if self.states.isEnabled('values'):
            # slave must collect values if we are asked for them
            clf.states.enable(['values'])

        result = clf.predict(data)

        self.states._copy_states_(self.__clf, ['values'], deep=False)
        return result
1075
1076
1083
1084
1085 @group_kwargs(prefixes=['slave_'], passthrough=True)
1092
1093
1094 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
1095
1103 """Base class for combining decisions of multiple classifiers"""
1104
    def train(self, clfs, dataset):
        """PredictionsCombiner might need to be trained

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's
          dataset : Dataset
            training data in this case
        """
        # nothing to be done by default
        pass
1117
1118
    def __call__(self, clfs, dataset):
        """Call function

        :Parameters:
          clfs : list of Classifier
            List of classifiers to combine. Has to be classifiers (not
            pure predictions), since combiner might use some other
            state variables (value's) instead of pure prediction's
        """
        raise NotImplementedError
1129
1133 """Provides a decision using maximal vote rule"""
1134
1135 predictions = StateVariable(enabled=True,
1136 doc="Voted predictions")
1137 all_label_counts = StateVariable(enabled=False,
1138 doc="Counts across classifiers for each label/sample")
1139
    def __init__(self):
        """XXX Might get a parameter to use raw decision values if
        voting is not unambigous (ie two classes have equal number of
        votes
        """
        PredictionsCombiner.__init__(self)
1147
    def __call__(self, clfs, dataset):
        """Actuall callable - perform voting

        Extended functionality which might not be needed actually:
        Since `BinaryClassifier` might return a list of possible
        predictions (not just a single one), we should consider all of those

        MaximalVote doesn't care about dataset itself
        """
        if len(clfs)==0:
            return []

        all_label_counts = None
        for clf in clfs:
            # check first if the necessary state variable is enabled
            if not clf.states.isEnabled("predictions"):
                raise ValueError, "MaximalVote needs classifiers (such as " + \
                      "%s) with state 'predictions' enabled" % clf
            predictions = clf.predictions
            if all_label_counts is None:
                # one vote-count dict per sample
                all_label_counts = [ {} for i in xrange(len(predictions)) ]

            # tally votes for every sample
            for i in xrange(len(predictions)):
                prediction = predictions[i]
                if not operator.isSequenceType(prediction):
                    prediction = (prediction,)
                for label in prediction:
                    if not all_label_counts[i].has_key(label):
                        all_label_counts[i][label] = 0
                    all_label_counts[i][label] += 1

        predictions = []
        # select the maximal vote per sample; explicit search so we can
        # tell whether the winner is unique
        for i in xrange(len(all_label_counts)):
            label_counts = all_label_counts[i]
            maxk = []                   # labels sharing the max vote
            maxv = -1
            for k, v in label_counts.iteritems():
                if v > maxv:
                    maxk = [k]
                    maxv = v
                elif v == maxv:
                    maxk.append(k)

            assert len(maxk) >= 1, \
                   "We should have obtained at least a single key of max label"

            if len(maxk) > 1:
                # ties are resolved arbitrarily (first winner wins)
                warning("We got multiple labels %s which have the " % maxk +
                        "same maximal vote %d. XXX disambiguate" % maxv)
            predictions.append(maxk[0])

        self.all_label_counts = all_label_counts
        self.predictions = predictions
        return predictions
1208
1212 """Provides a decision by taking mean of the results
1213 """
1214
1215 predictions = StateVariable(enabled=True,
1216 doc="Mean predictions")
1217
1219 """Actuall callable - perform meaning
1220
1221 """
1222 if len(clfs)==0:
1223 return []
1224
1225 all_predictions = []
1226 for clf in clfs:
1227
1228 if not clf.states.isEnabled("predictions"):
1229 raise ValueError, "MeanPrediction needs classifiers (such as " + \
1230 "%s) with state 'predictions' enabled" % clf
1231 all_predictions.append(clf.predictions)
1232
1233
1234 predictions = N.mean(N.asarray(all_predictions), axis=0)
1235 self.predictions = predictions
1236 return predictions
1237
1240 """Provides a decision using training a classifier on predictions/values
1241
1242 TODO
1243 """
1244
1245 predictions = StateVariable(enabled=True,
1246 doc="Trained predictions")
1247
1248
1249 - def __init__(self, clf, variables=None):
1250 """Initialize `ClassifierCombiner`
1251
1252 :Parameters:
1253 clf : Classifier
1254 Classifier to train on the predictions
1255 variables : list of basestring
1256 List of state variables stored in 'combined' classifiers, which
1257 to use as features for training this classifier
1258 """
1259 PredictionsCombiner.__init__(self)
1260
1261 self.__clf = clf
1262 """Classifier to train on `variables` states of provided classifiers"""
1263
1264 if variables == None:
1265 variables = ['predictions']
1266 self.__variables = variables
1267 """What state variables of the classifiers to use"""
1268
1269
    def untrain(self):
        """It might be needed to untrain used classifier"""
        if self.__clf:
            self.__clf.untrain()
1274
    def __call__(self, clfs, dataset):
        """Combine by training/applying the internal classifier.

        Not implemented yet.
        """
        if len(clfs)==0:
            return []

        raise NotImplementedError
1283
1287 """`BoostedClassifier` which combines predictions using some
1288 `PredictionsCombiner` functor.
1289 """
1290
1291 - def __init__(self, clfs=None, combiner=None, **kwargs):
1292 """Initialize the instance.
1293
1294 :Parameters:
1295 clfs : list of Classifier
1296 list of classifier instances to use
1297 combiner : PredictionsCombiner
1298 callable which takes care about combining multiple
1299 results into a single one (e.g. maximal vote for
1300 classification, MeanPrediction for regression))
1301 kwargs : dict
1302 dict of keyworded arguments which might get used
1303 by State or Classifier
1304
1305 NB: `combiner` might need to operate not on 'predictions' descrete
1306 labels but rather on raw 'class' values classifiers
1307 estimate (which is pretty much what is stored under
1308 `values`
1309 """
1310 if clfs == None:
1311 clfs = []
1312
1313 BoostedClassifier.__init__(self, clfs, **kwargs)
1314
1315
1316 if combiner is None:
1317 combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
1318 self.__combiner = combiner
1319 """Functor destined to combine results of multiple classifiers"""
1320
1321
1325
1326
    # NOTE(review): the `def summary(self):` line appears elided from this
    # extract (epydoc line 1327); body kept verbatim.
        s = super(CombinedClassifier, self).summary()
        # Append one indented summary per slave classifier, but only once
        # trained -- untrained slaves have nothing meaningful to report.
        if self.trained:
            s += "\n Slave classifiers summaries:"
            for i, clf in enumerate(self.clfs):
                s += '\n + %d clf: %s' % \
                     (i, clf.summary().replace('\n', '\n |'))
        return s


    # NOTE(review): several method definitions (epydoc lines 1336-1374)
    # are elided from this extract.

    combiner = property(fget=lambda x:x.__combiner,
                        doc="Used combiner to derive a single result")
    # NOTE(review): the enclosing `class BinaryClassifier(...)` statement
    # appears to be elided from this extract.
    """`ProxyClassifier` which maps set of two labels into +1 and -1
    """
1384 - def __init__(self, clf, poslabels, neglabels, **kwargs):
1385 """
1386 :Parameters:
1387 clf : Classifier
1388 classifier to use
1389 poslabels : list
1390 list of labels which are treated as +1 category
1391 neglabels : list
1392 list of labels which are treated as -1 category
1393 """
1394
1395 ProxyClassifier.__init__(self, clf, **kwargs)
1396
1397 self._regressionIsBogus()
1398
1399
1400 sposlabels = Set(poslabels)
1401 sneglabels = Set(neglabels)
1402
1403
1404 overlap = sposlabels.intersection(sneglabels)
1405 if len(overlap)>0:
1406 raise ValueError("Sets of positive and negative labels for " +
1407 "BinaryClassifier must not overlap. Got overlap " %
1408 overlap)
1409
1410 self.__poslabels = list(sposlabels)
1411 self.__neglabels = list(sneglabels)
1412
1413
1414
1415
1416
1417
1418
1419
1420 if len(self.__poslabels) > 1:
1421 self.__predictpos = self.__poslabels
1422 else:
1423 self.__predictpos = self.__poslabels[0]
1424
1425 if len(self.__neglabels) > 1:
1426 self.__predictneg = self.__neglabels
1427 else:
1428 self.__predictneg = self.__neglabels[0]
1429
1430
    # NOTE(review): the `def __repr__(self, prefixes=...)` line appears
    # elided from this extract (`prefixes` is referenced below).
        prefix = "poslabels=%s, neglabels=%s" % (
            repr(self.__poslabels), repr(self.__neglabels))
        return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
1436
    # NOTE(review): the `def _train(self, dataset):` line appears elided
    # from this extract (epydoc line 1437); body kept verbatim.
        """Train `BinaryClassifier`
        """
        # Pair each selected sample id with its binary label: +1 for any
        # of the positive labels, -1 for any of the negative ones.
        idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
                   [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]

        # Restore original sample order.
        idlabels.sort()

        orig_labels = None

        # If every sample participates and ids are already 0..N-1, skip the
        # cost of selectSamples() and relabel the dataset in place (the
        # original labels are stashed and restored after training).
        if len(idlabels) == dataset.nsamples \
            and [x[0] for x in idlabels] == range(dataset.nsamples):
            datasetselected = dataset
            orig_labels = dataset.labels
            if __debug__:
                debug('CLFBIN',
                      "Assigned all %d samples for binary " %
                      (dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels))
        else:
            datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
            if __debug__:
                debug('CLFBIN',
                      "Selected %d samples out of %d samples for binary " %
                      (len(idlabels), dataset.nsamples) +
                      " classification among labels %s/+1 and %s/-1" %
                      (self.__poslabels, self.__neglabels) +
                      ". Selected %s" % datasetselected)

        # Relabel to the canonical -1/+1 scheme the slave classifier sees.
        datasetselected.labels = [ x[1] for x in idlabels ]

        if __debug__:
            assert((datasetselected.uniquelabels == [-1, 1]).all())

        self.clf.train(datasetselected)

        # Undo the in-place relabeling if we trained on `dataset` directly.
        if not orig_labels is None:
            dataset.labels = orig_labels
1484
    # NOTE(review): the `def _predict(self, data):` line appears elided
    # from this extract (epydoc line 1485); body kept verbatim.
        """Predict the labels for a given `data`

        Predicts using binary classifier and spits out list (for each sample)
        where with either poslabels or neglabels as the "label" for the sample.
        If there was just a single label within pos or neg labels then it would
        return not a list but just that single label.
        """
        binary_predictions = ProxyClassifier._predict(self, data)
        self.values = binary_predictions
        # Map the slave's -1/+1 outputs back to the user-provided labels.
        predictions = [ {-1: self.__predictneg,
                         +1: self.__predictpos}[x] for x in binary_predictions]
        self.predictions = predictions
        return predictions
    # NOTE(review): the enclosing `class MulticlassClassifier(...)`
    # statement appears to be elided from this extract.
    """`CombinedClassifier` to perform multiclass using a list of
    `BinaryClassifier`.

    such as 1-vs-1 (i.e. in pairs like libsvm does) or 1-vs-all (which
    is yet to think about)
    """
1510 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
1511 """Initialize the instance
1512
1513 :Parameters:
1514 clf : Classifier
1515 classifier based on which multiple classifiers are created
1516 for multiclass
1517 bclf_type
1518 "1-vs-1" or "1-vs-all", determines the way to generate binary
1519 classifiers
1520 """
1521 CombinedClassifier.__init__(self, **kwargs)
1522 self._regressionIsBogus()
1523 if not clf is None:
1524 clf._regressionIsBogus()
1525
1526 self.__clf = clf
1527 """Store sample instance of basic classifier"""
1528
1529
1530 if bclf_type == "1-vs-1":
1531 pass
1532 elif bclf_type == "1-vs-all":
1533 raise NotImplementedError
1534 else:
1535 raise ValueError, \
1536 "Unknown type of classifier %s for " % bclf_type + \
1537 "BoostedMulticlassClassifier"
1538 self.__bclf_type = bclf_type
1539
1540
1541
    # NOTE(review): the `def __repr__(self, prefixes=...)` line appears
    # elided from this extract (`prefixes` is referenced below).
        prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
                                           repr(self.__clf))
        return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
1547
    # NOTE(review): the `def _train(self, dataset):` line appears elided
    # from this extract (epydoc line 1548); body kept verbatim.
        """Train classifier
        """
        ulabels = dataset.uniquelabels
        # 1-vs-1 scheme: one BinaryClassifier per unordered pair of unique
        # labels, each wrapping its own deep copy of the base classifier.
        if self.__bclf_type == "1-vs-1":
            biclfs = []
            for i in xrange(len(ulabels)):
                for j in xrange(i+1, len(ulabels)):
                    clf = _deepcopyclf(self.__clf)
                    biclfs.append(
                        BinaryClassifier(
                            clf,
                            poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
            if __debug__:
                debug("CLFMC", "Created %d binary classifiers for %d labels" %
                      (len(biclfs), len(ulabels)))

            self.clfs = biclfs

        elif self.__bclf_type == "1-vs-all":
            raise NotImplementedError

        # Actual training of the constructed set is delegated upward.
        CombinedClassifier._train(self, dataset)
    # NOTE(review): the enclosing `class SplitClassifier(...)` statement
    # appears to be elided from this extract.
    """`BoostedClassifier` to work on splits of the data

    """

    """
    TODO: SplitClassifier and MulticlassClassifier have too much in
          common -- need to refactor: just need a splitter which would
          split dataset in pairs of class labels. MulticlassClassifier
          does just a tiny bit more which might be not necessary at
          all: map sets of labels into 2 categories...
    """

    # State variables, disabled by default -- both can be expensive to keep.
    confusion = StateVariable(enabled=False,
        doc="Resultant confusion whenever classifier trained " +
            "on 1 part and tested on 2nd part of each split")

    splits = StateVariable(enabled=False, doc=
        """Store the actual splits of the data. Can be memory expensive""")
1610
    # NOTE(review): the `def __init__(self, clf, splitter=..., **kwargs):`
    # line appears elided from this extract (epydoc line 1611); body kept
    # verbatim.
        """Initialize the instance

        :Parameters:
          clf : Classifier
            classifier based on which multiple classifiers are created
            for multiclass
          splitter : Splitter
            `Splitter` to use to split the dataset prior training
        """

        CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
        self.__clf = clf
        """Store sample instance of basic classifier"""

        # Guard against the common mistake of passing the Splitter class
        # itself rather than an instance of it.
        if isinstance(splitter, type):
            raise ValueError, \
                  "Please provide an instance of a splitter, not a type." \
                  " Got %s" % splitter

        self.__splitter = splitter
1633
1700
1701
    # NOTE(review): _train/untrain (epydoc lines 1633-1699) and the method
    # decorated below (lines 1703-1714, presumably getSensitivityAnalyzer)
    # are elided from this extract.
    @group_kwargs(prefixes=['slave_'], passthrough=True)

    splitter = property(fget=lambda x:x.__splitter,
                        doc="Splitter user by SplitClassifier")
    # NOTE(review): the enclosing `class MappedClassifier(...)` statement
    # appears to be elided from this extract.
    """`ProxyClassifier` which uses some mapper prior training/testing.

    `MaskMapper` can be used to select just a subset of features to
    train/classify.
    Having such classifier we can easily create a set of classifiers
    for BoostedClassifier, where each classifier operates on some set
    of features, e.g. set of best spheres from SearchLight, set of
    ROIs selected elsewhere. It would be different from simply
    applying whole mask over the dataset, since here initial decision
    is made by each classifier and then later on they vote for the
    final decision across the set of classifiers.
    """
1733
1734 - def __init__(self, clf, mapper, **kwargs):
1735 """Initialize the instance
1736
1737 :Parameters:
1738 clf : Classifier
1739 classifier based on which mask classifiers is created
1740 mapper
1741 whatever `Mapper` comes handy
1742 """
1743 ProxyClassifier.__init__(self, clf, **kwargs)
1744
1745 self.__mapper = mapper
1746 """mapper to help us our with prepping data to
1747 training/classification"""
1748
1749
    # NOTE(review): the `def _train(self, dataset):` line appears elided
    # from this extract (epydoc line 1750); body kept verbatim.
        """Train `MappedClassifier`
        """
        # Train the mapper first -- it may need to see the data before it
        # can forward-map it.
        self.__mapper.train(dataset)

        # Train the slave classifier on the mapped (e.g. masked) dataset.
        wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
        ProxyClassifier._train(self, wdataset)
1762
1767
1768
    # NOTE(review): untrain (epydoc lines 1763-1767) and the method
    # decorated below (lines 1770-1776, presumably getSensitivityAnalyzer)
    # are elided from this extract.
    @group_kwargs(prefixes=['slave_'], passthrough=True)

    mapper = property(lambda x:x.__mapper, doc="Used mapper")
    # NOTE(review): the enclosing `class FeatureSelectionClassifier(...)`
    # statement appears to be elided from this extract.
    """`ProxyClassifier` which uses some `FeatureSelection` prior training.

    `FeatureSelection` is used first to select features for the classifier to
    use for prediction. Internally it would rely on MappedClassifier which
    would use created MaskMapper.

    TODO: think about removing overhead of retraining the same classifier if
    feature selection was carried out with the same classifier already. It
    has been addressed by adding .trained property to classifier, but now
    we should explicitly use isTrained here if we want... need to think more
    """

    # Tag this classifier as a meta-classifier that performs its own
    # feature selection.
    _clf_internals = [ 'does_feature_selection', 'meta' ]
1797 - def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
1798 """Initialize the instance
1799
1800 :Parameters:
1801 clf : Classifier
1802 classifier based on which mask classifiers is created
1803 feature_selection : FeatureSelection
1804 whatever `FeatureSelection` comes handy
1805 testdataset : Dataset
1806 optional dataset which would be given on call to feature_selection
1807 """
1808 ProxyClassifier.__init__(self, clf, **kwargs)
1809
1810 self.__maskclf = None
1811 """Should become `MappedClassifier`(mapper=`MaskMapper`) later on."""
1812
1813 self.__feature_selection = feature_selection
1814 """`FeatureSelection` to select the features prior training"""
1815
1816 self.__testdataset = testdataset
1817 """`FeatureSelection` might like to use testdataset"""
1818
1819
    # NOTE(review): the `def untrain(self):` line appears elided from this
    # extract (epydoc line 1820); body kept verbatim.
        """Untrain `FeatureSelectionClassifier`

        Has to untrain any known classifier
        """
        # Nothing to undo if we were never trained.
        if not self.trained:
            return
        # Untrain the internally constructed MappedClassifier, if any.
        if not self.__maskclf is None:
            self.__maskclf.untrain()
        super(FeatureSelectionClassifier, self).untrain()
1831
    # NOTE(review): the `def _train(self, dataset):` line appears elided
    # from this extract (epydoc line 1832); body kept verbatim.
        """Train `FeatureSelectionClassifier`
        """
        # Temporarily enable 'selected_ids' so the mask mapper can be built
        # from it below; reset at the end of training.
        self.__feature_selection.states._changeTemporarily(
            enable_states=["selected_ids"])

        if __debug__:
            debug("CLFFS", "Performing feature selection using %s" %
                  self.__feature_selection + " on %s" % dataset)

        (wdataset, tdataset) = self.__feature_selection(dataset,
                                                        self.__testdataset)
        if __debug__:
            add_ = ""
            if "CLFFS_" in debug.active:
                add_ = " Selected features: %s" % \
                       self.__feature_selection.selected_ids
            debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
                  "%(dsnfeat)d features.%(app)s",
                  msgargs={'fs':self.__feature_selection,
                           'nfeat':wdataset.nfeatures,
                           'dsnfeat':dataset.nfeatures,
                           'app':add_})

        # Build a binary mask over the original feature space from the
        # selected ids and wrap it into a MaskMapper for later predictions.
        mappermask = N.zeros(dataset.nfeatures)
        mappermask[self.__feature_selection.selected_ids] = 1
        mapper = MaskMapper(mappermask)

        self.__feature_selection.states._resetEnabledTemporarily()

        # Wrap the slave classifier with the mapper...
        self.__maskclf = MappedClassifier(self.clf, mapper)

        # ...and train it directly on the already feature-selected
        # (working) dataset, bypassing the mapper for this pass.
        self.__maskclf.clf.train(wdataset)
    # NOTE(review): the `def _getFeatureIds(self):` line appears elided
    # from this extract (epydoc line 1875); body kept verbatim.
        """Return used feature ids for `FeatureSelectionClassifier`

        """
        return self.__feature_selection.selected_ids
    # NOTE(review): the `def _predict(self, data):` line appears elided
    # from this extract (epydoc line 1881); body kept verbatim.
        """Predict using `FeatureSelectionClassifier`
        """
        clf = self.__maskclf
        # Propagate an enabled 'values' state down to the slave classifier...
        if self.states.isEnabled('values'):
            clf.states.enable(['values'])

        result = clf._predict(data)
        # ...and copy the resulting values back up (shallow copy suffices).
        self.states._copy_states_(clf, ['values'], deep=False)
        return result
1892
    # NOTE(review): the `def setTestDataset(self, testdataset):` line
    # appears elided from this extract (epydoc line 1893); body kept
    # verbatim.  Used as the fset of the `testdataset` property below.
        """Set testing dataset to be used for feature selection
        """
        self.__testdataset = testdataset
1897
    maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
    feature_selection = property(lambda x:x.__feature_selection,
                                 doc="Used `FeatureSelection`")

    # NOTE(review): the method decorated below (epydoc lines 1903-1912,
    # presumably getSensitivityAnalyzer) is elided from this extract.
    @group_kwargs(prefixes=['slave_'], passthrough=True)

    testdataset = property(fget=lambda x:x.__testdataset,
                           fset=setTestDataset)