1
2
3
4
5
6
7
8
9 """Base classes for all classifiers.
10
11 Base Classifiers can be grouped according to their function as
12
13 :group Basic Classifiers: Classifier BoostedClassifier ProxyClassifier
14 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
15 SplitClassifier
16 :group ProxyClassifiers: BinaryClassifier MappedClassifier
17 FeatureSelectionClassifier
18 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
19 MaximalVote
20
21 """
22
23 __docformat__ = 'restructuredtext'
24
25 import operator, sys
26 import numpy as N
27
28
29
30
31 if sys.version_info[0] > 2 or sys.version_info[1] > 4:
32 from copy import deepcopy
33 else:
34 from mvpa.misc.copy import deepcopy
35
36 from sets import Set
37 from time import time
38
39 from mvpa.mappers import MaskMapper
40 from mvpa.datasets.splitter import NFoldSplitter
41 from mvpa.misc.state import StateVariable, Stateful, Harvestable
42
43 from mvpa.clfs.transerror import ConfusionMatrix
44
45 from mvpa.measures.base import \
46 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer
47 from mvpa.misc import warning
48
49 if __debug__:
50 import traceback
51 from mvpa.misc import debug
55 """Deepcopying of a classifier.
56
57 If deepcopy fails -- tries to untrain it first so that there is no
58 swig bindings attached
59 """
60 try:
61 return deepcopy(clf)
62 except:
63 clf.untrain()
64 return deepcopy(clf)
65
68 """Abstract classifier class to be inherited by all classifiers
69
70 Required behavior:
71
72 For every classifier is has to be possible to be instanciated without
73 having to specify the training pattern.
74
75 Repeated calls to the train() method with different training data have to
76 result in a valid classifier, trained for the particular dataset.
77
78 It must be possible to specify all classifier parameters as keyword
79 arguments to the constructor.
80
81 Recommended behavior:
82
83 Derived classifiers should provide access to *values* -- i.e. that
84 information that is finally used to determine the predicted class label.
85
86 Michael: Maybe it works well if each classifier provides a 'values'
87 state member. This variable is a list as long as and in same order
88 as Dataset.uniquelabels (training data). Each item in the list
89 corresponds to the likelyhood of a sample to belong to the
90 respective class. However the sematics might differ between
91 classifiers, e.g. kNN would probably store distances to class-
92 neighbours, where PLR would store the raw function value of the
93 logistic function. So in the case of kNN low is predictive and for
94 PLR high is predictive. Don't know if there is the need to unify
95 that.
96
97 As the storage and/or computation of this information might be
98 demanding its collection should be switchable and off be default.
99
100 Nomenclature
101 * predictions : corresponds to the quantized labels if classifier spits
102 out labels by .predict()
103 * values : might be different from predictions if a classifier's predict()
104 makes a decision based on some internal value such as
105 probability or a distance.
106 """
107
108
109
110
111
112
113
114
115
116 trained_labels = StateVariable(enabled=True,
117 doc="Set of unique labels it has been trained on")
118
119 trained_dataset = StateVariable(enabled=False,
120 doc="The dataset it has been trained on")
121
122 training_confusion = StateVariable(enabled=False,
123 doc="Confusion matrix of learning performance")
124
125 predictions = StateVariable(enabled=True,
126 doc="Most recent set of predictions")
127
128 values = StateVariable(enabled=False,
129 doc="Internal classifier values the most recent " +
130 "predictions are based on")
131
132 training_time = StateVariable(enabled=True,
133 doc="Time (in seconds) which took classifier to train")
134
135 predicting_time = StateVariable(enabled=True,
136 doc="Time (in seconds) which took classifier to predict")
137
138 feature_ids = StateVariable(enabled=False,
139 doc="Feature IDS which were used for the actual training." +
140 " Some classifiers might internally do feature selection (SMLR)")
141
142 _clf_internals = []
143 """Describes some specifics about the classifier -- is that it is
144 doing regression for instance...."""
145
146
def __init__(self, train2predict=True, regression=False, retrainable=False,
             **kwargs):
    """Cheap initialization: only bookkeeping attributes are set up.

    :Parameters:
      train2predict : bool
        Stored as `_train2predict`.  Some classifiers might not need to
        be trained to predict.
      regression : bool
        If True - perform regression, not classification.  The states
        `trained_labels` and `training_confusion` get disabled in that
        case if they were enabled.
      retrainable : bool
        If True - store anything necessary for efficient retrain.
        Applied through `_setRetrainable`.
      kwargs : dict
        Forwarded to `Stateful.__init__`.
    """
    Stateful.__init__(self, **kwargs)

    # Some classifiers might not need to be trained to predict
    self._train2predict = train2predict

    # Number of features for which classifier was trained.
    # If None -- it wasn't trained at all
    self.__trainednfeatures = None

    # If True - perform regression, not classification
    self._regression = regression

    # If True - store anything necessary for efficient retrain.
    # The attribute has to exist before _setRetrainable is invoked.
    self.__retrainable = None
    self._setRetrainable(retrainable)

    if self._regression:
        for name in ("trained_labels", "training_confusion"):
            if not self.states.isEnabled(name):
                continue
            if __debug__:
                debug("CLF",
                      "Disabling state %s since doing regression, "
                      "not classification" % name)
            self.states.disable(name)

    # Stores id of the dataset on which it was trained to signal
    # in trained() if it was trained already on the same dataset
    self.__trainedidhash = None
179
180
186
187
188
189
190
191
192
193
195 """Functionality prior to training
196 """
197
198
199 if not self.__retrainable:
200 self.untrain()
201 else:
202
203 self.states.reset()
204
205 if not self._regression and 'regression' in self._clf_internals \
206 and not self.states.isEnabled('trained_labels'):
207
208
209 if __debug__:
210 debug("CLF", "Enabling trained_labels state since it is needed")
211 self.states.enable('trained_labels')
212
213
def _posttrain(self, dataset):
    """Functionality post training

    Records what the classifier was trained on and fills in the
    training-related states which are enabled (for instance -- computes
    the confusion matrix on the training data).

    :Parameters:
      dataset : Dataset
        Data which was used for training
    """
    if self.states.isEnabled('trained_labels'):
        self.trained_labels = Set(dataset.uniquelabels)

    self.trained_dataset = dataset

    # remember what we were trained on: number of features and dataset id
    self.__trainednfeatures = dataset.nfeatures
    self.__trainedidhash = dataset.idhash

    if self.states.isEnabled('training_confusion'):
        # Re-predict the training samples with 'predictions' state
        # switched off for the duration of the call.
        self.states._changeTemporarily(
            disable_states=["predictions"])
        try:
            predictions = self.predict(dataset.samples)
        finally:
            # always undo the temporary change, even if predict() raised,
            # so the classifier is not left with altered states
            self.states._resetEnabledTemporarily()
        self.training_confusion = ConfusionMatrix(
            labels=dataset.uniquelabels, targets=dataset.labels,
            predictions=predictions)

    if self.states.isEnabled('feature_ids'):
        self.feature_ids = self._getFeatureIds()
244
245
247 """Virtual method to return feature_ids used while training
248
249 Is not intended to be called anywhere but from _posttrain,
250 thus classifier is assumed to be trained at this point
251 """
252
253 return range(self.__trainednfeatures)
254
255
257 """Function to be actually overriden in derived classes
258 """
259 raise NotImplementedError
260
261
def train(self, dataset):
    """Train classifier on a dataset

    Runs `_pretrain`, then `_train` (skipped with a warning if the
    dataset has no features), stores the elapsed time in
    `training_time` and finally runs `_posttrain`.

    Shouldn't be overriden in subclasses unless explicitely needed
    to do so

    :Parameters:
      dataset : Dataset
        Data to train on

    :Returns:
      Whatever `_train` returned, or None if there were no features.
    """
    if __debug__:
        debug("CLF", "Training classifier %s on dataset %s" %
              (repr(self), repr(dataset)))
        if 'CLF_TB' in debug.active:
            debug("CLF_TB",
                  "Traceback: %s" % traceback.extract_stack(limit=5))

    self._pretrain(dataset)

    # time only the actual training step
    started = time()

    if not dataset.nfeatures > 0:
        warning("Trying to train on dataset with no features present")
        if __debug__:
            debug("CLF",
                  "No features present for training, no actual training is called")
        outcome = None
    else:
        outcome = self._train(dataset)

    self.training_time = time() - started
    self._posttrain(dataset)
    return outcome
292
293
295 """Functionality prior prediction
296 """
297 if self._train2predict:
298
299 if not self.trained:
300 raise ValueError, \
301 "Classifier %s wasn't yet trained, therefore can't " \
302 "predict" % `self`
303 nfeatures = data.shape[1]
304
305
306 if nfeatures != self.__trainednfeatures:
307 raise ValueError, \
308 "Classifier %s was trained on data with %d features, " % \
309 (`self`, self.__trainednfeatures) + \
310 "thus can't predict for %d features" % nfeatures
311
312
313 - def _postpredict(self, data, result):
314 """Functionality after prediction is computed
315 """
316 self.predictions = result
317
318
320 """Actual prediction
321 """
322 raise NotImplementedError
323
324
326 """Predict classifier on data
327
328 Shouldn't be overriden in subclasses unless explicitely needed
329 to do so. Also subclasses trying to call super class's predict
330 should call _predict if within _predict instead of predict()
331 since otherwise it would loop
332 """
333 data = N.asarray(data)
334 if __debug__:
335 debug("CLF", "Predicting classifier %s on data %s" \
336 % (`self`, `data.shape`))
337 tb = traceback.extract_stack(limit=5)
338 debug("CLF_TB", "Traceback: %s" % tb)
339
340
341 t0 = time()
342
343 self._prepredict(data)
344 if self.__trainednfeatures > 0 or not self._train2predict:
345 result = self._predict(data)
346 else:
347 warning("Trying to predict using classifier trained on no features")
348 if __debug__:
349 debug("CLF",
350 "No features were present for training, prediction is bogus")
351 result = [None]*data.shape[0]
352
353 self.predicting_time = time() - t0
354
355 if 'regression' in self._clf_internals and not self._regression:
356
357
358
359
360
361
362
363
364 result_ = N.array(result)
365 self.values = result_
366 trained_labels = N.asarray(list(self.trained_labels))
367 for i,value in enumerate(result):
368 dists = N.abs(value - trained_labels)
369 result[i] = trained_labels[N.argmin(dists)]
370
371 if __debug__:
372 debug("CLF_", "Converted regression result %s into labels %s for %s" % (result_, result, self))
373
374 self._postpredict(data, result)
375 return result
376
378 """Either classifier was already trained.
379
380 MUST BE USED WITH CARE IF EVER"""
381 if dataset is None:
382
383 return not self.__trainednfeatures is None
384 else:
385 return (self.__trainednfeatures == dataset.nfeatures) \
386 and (self.__trainedidhash == dataset.idhash)
387
388 @property
390 return self._regression
391
392
394 """Some classifiers like BinaryClassifier can't be used for regression"""
395
396 if self.regression:
397 raise ValueError, "Regression mode is meaningless for %s" % \
398 self.__class__.__name__ + " thus don't enable it"
399
400
401 @property
403 """Either classifier was already trained"""
404 return self.isTrained()
405
407 """Reset trained state"""
408 self.__trainednfeatures = None
409 Stateful.reset(self)
410
411
412 @property
414 """Either classifier has to be trained to predict"""
415 return self._train2predict
416
417
419 """Factory method to return an appropriate sensitivity analyzer for
420 the respective classifier."""
421 raise NotImplementedError
422
424 return self.__retrainable
425
427 if value != self.__retrainable:
428
429 if self.trained:
430 self.untrain()
431 states = self.states
432 if not value and states.isKnown('retrained'):
433 states.remove('retrained')
434 states.remove('retested')
435 if value:
436 if not 'retrainable' in self._clf_internals:
437 warning("Setting of flag retrainable for %s has no effect"
438 " since classifier has no such capability" % self)
439 states.add(StateVariable(enabled=True,
440 name='retrained',
441 doc="Either retrainable classifier was retrained"))
442 states.add(StateVariable(enabled=True,
443 name='retested',
444 doc="Either retrainable classifier was retested"))
445
446 self.__retrainable = value
447
448
449 retrainable = property(fget=_getRetrainable, fset=_setRetrainable,
450 doc="Specifies either classifier should be retrainable")
451
458 """Classifier containing the farm of other classifiers.
459
460 Should rarely be used directly. Use one of its childs instead
461 """
462
463
464
465 raw_predictions = StateVariable(enabled=False,
466 doc="Predictions obtained from each classifier")
467
468 raw_values = StateVariable(enabled=False,
469 doc="Values obtained from each classifier")
470
471
def __init__(self, clfs=None, propagate_states=True,
             harvest_attribs=None, copy_attribs='copy',
             **kwargs):
    """Initialize the instance.

    :Parameters:
      clfs : list
        list of classifier instances to use (slave classifiers).
        None stands for an empty list
      propagate_states : bool
        either to propagate enabled states into slave classifiers.
        It is in effect only when slaves get assigned - so if state
        is enabled not during construction, it would not necessarily
        propagate into slaves
      harvest_attribs : list of basestr
        What attributes of call to store and return within
        harvested state variable
      copy_attribs : None or basestr
        Force copying values of attributes on harvesting
      kwargs : dict
        dict of keyworded arguments which might get used
        by State or Classifier
    """
    # Identity test: `== None` would call the argument's own __eq__,
    # which for array-like containers compares element-wise.
    if clfs is None:
        clfs = []

    Classifier.__init__(self, **kwargs)
    Harvestable.__init__(self, harvest_attribs, copy_attribs)

    # Pylint friendly definition of __clfs
    self.__clfs = None

    # Enable current enabled states in slave classifiers
    self.__propagate_states = propagate_states

    # Store the list of classifiers
    self._setClassifiers(clfs)
508
509
511 return "<%s(%d classifiers)>" \
512 % (self.__class__.__name__, len(self.clfs))
513
514
516 """Train `BoostedClassifier`
517 """
518 for clf in self.__clfs:
519 clf.train(dataset)
520
521
def _posttrain(self, dataset):
    """Custom posttrain of `BoostedClassifier`

    After the regular `Classifier._posttrain`, harvests over the
    trained classifiers if state `harvested` is enabled.

    :Parameters:
      dataset : Dataset
        Data which was used for training
    """
    Classifier._posttrain(self, dataset)
    if not self.states.isEnabled('harvested'):
        return
    # NB: locals() is handed over to _harvest, so the local names
    #     (self, dataset, clf) are deliberately left as they are
    for clf in self.__clfs:
        self._harvest(locals())
531
532
541
542
561
562
564 """Set the classifiers used by the boosted classifier
565
566 We have to allow to set list of classifiers after the object
567 was actually created. It will be used by
568 BoostedMulticlassClassifier
569 """
570 self.__clfs = clfs
571 """Classifiers to use"""
572
573 for flag in ['_train2predict', '_regression']:
574 values = N.array([clf.__dict__[flag] for clf in self.__clfs])
575 value = values.any()
576 if __debug__:
577 debug("CLFBST", "Setting %s=%s for classifiers " \
578 "%s with %s" \
579 % (flag, str(value), `self.__clfs`, str(values)))
580
581 self.__dict__[flag] = value
582
583
584 if self.__propagate_states:
585 for clf in self.__clfs:
586 clf.states.enable(self.states.enabled, missingok=True)
587
588
589
590 self._clf_internals = [ 'multiclass', 'meta' ]
591 if len(clfs)>0:
592 self._clf_internals += self.__clfs[0]._clf_internals
593
604
610
611
612 clfs = property(fget=lambda x:x.__clfs,
613 fset=_setClassifiers,
614 doc="Used classifiers")
615
619 """Classifier which decorates another classifier
620
621 Possible uses:
622
623 - modify data somehow prior training/testing:
624 * normalization
625 * feature selection
626 * modification
627
628 - optimized classifier?
629
630 """
631
652
653
655 """Train `ProxyClassifier`
656 """
657
658
659 self.__clf.train(dataset)
660
661
662
663
664
665
666
668 """Predict using `ProxyClassifier`
669 """
670 result = self.__clf.predict(data)
671
672
673 return result
674
675
682
683
690
691
692 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
693
701 """Base class for combining decisions of multiple classifiers"""
702
def train(self, clfs, dataset):
    """Training hook for the combiner; does nothing in this base class.

    PredictionsCombiner might need to be trained

    :Parameters:
      clfs : list of Classifier
        List of classifiers to combine. Has to be classifiers (not
        pure predictions), since combiner might use some other
        state variables (value's) instead of pure prediction's
      dataset : Dataset
        training data in this case
    """
    return None
715
716
718 """Call function
719
720 :Parameters:
721 clfs : list of Classifier
722 List of classifiers to combine. Has to be classifiers (not
723 pure predictions), since combiner might use some other
724 state variables (value's) instead of pure prediction's
725 """
726 raise NotImplementedError
727
731 """Provides a decision using maximal vote rule"""
732
733 predictions = StateVariable(enabled=True,
734 doc="Voted predictions")
735 all_label_counts = StateVariable(enabled=False,
736 doc="Counts across classifiers for each label/sample")
737
739 """XXX Might get a parameter to use raw decision values if
740 voting is not unambigous (ie two classes have equal number of
741 votes
742 """
743 PredictionsCombiner.__init__(self)
744
745
747 """Actuall callable - perform voting
748
749 Extended functionality which might not be needed actually:
750 Since `BinaryClassifier` might return a list of possible
751 predictions (not just a single one), we should consider all of those
752
753 MaximalVote doesn't care about dataset itself
754 """
755 if len(clfs)==0:
756 return []
757
758 all_label_counts = None
759 for clf in clfs:
760
761 if not clf.states.isEnabled("predictions"):
762 raise ValueError, "MaximalVote needs classifiers (such as " + \
763 "%s) with state 'predictions' enabled" % clf
764 predictions = clf.predictions
765 if all_label_counts is None:
766 all_label_counts = [ {} for i in xrange(len(predictions)) ]
767
768
769 for i in xrange(len(predictions)):
770 prediction = predictions[i]
771 if not operator.isSequenceType(prediction):
772 prediction = (prediction,)
773 for label in prediction:
774
775
776 if not all_label_counts[i].has_key(label):
777 all_label_counts[i][label] = 0
778 all_label_counts[i][label] += 1
779
780 predictions = []
781
782 for i in xrange(len(all_label_counts)):
783 label_counts = all_label_counts[i]
784
785
786 maxk = []
787 maxv = -1
788 for k, v in label_counts.iteritems():
789 if v > maxv:
790 maxk = [k]
791 maxv = v
792 elif v == maxv:
793 maxk.append(k)
794
795 assert len(maxk) >= 1, \
796 "We should have obtained at least a single key of max label"
797
798 if len(maxk) > 1:
799 warning("We got multiple labels %s which have the " % `maxk` +
800 "same maximal vote %d. XXX disambiguate" % maxv)
801 predictions.append(maxk[0])
802
803 self.all_label_counts = all_label_counts
804 self.predictions = predictions
805 return predictions
806
810 """Provides a decision using training a classifier on predictions/values
811
812 TODO
813 """
814
815 predictions = StateVariable(enabled=True,
816 doc="Trained predictions")
817
818
def __init__(self, clf, variables=None):
    """Initialize `ClassifierCombiner`

    :Parameters:
      clf : Classifier
        Classifier to train on the predictions
      variables : list of basestring
        List of state variables stored in 'combined' classifiers, which
        to use as features for training this classifier.  None stands
        for ['predictions']
    """
    PredictionsCombiner.__init__(self)

    # Classifier to train on `variables` states of provided classifiers
    self.__clf = clf

    # Identity test: `== None` would call the argument's own __eq__,
    # which for array-like containers compares element-wise.
    if variables is None:
        variables = ['predictions']

    # What state variables of the classifiers to use
    self.__variables = variables
838
839
841 """It might be needed to untrain used classifier"""
842 if self.__clf:
843 self.__clf.untrain()
844
846 """
847 """
848 if len(clfs)==0:
849 return []
850
851
852 raise NotImplementedError
853
857 """`BoostedClassifier` which combines predictions using some `PredictionsCombiner`
858 functor.
859 """
860
862 """Initialize the instance.
863
864 :Parameters:
865 clfs : list of Classifier
866 list of classifier instances to use
867 combiner : PredictionsCombiner
868 callable which takes care about combining multiple
869 results into a single one (e.g. maximal vote)
870 kwargs : dict
871 dict of keyworded arguments which might get used
872 by State or Classifier
873
874 NB: `combiner` might need to operate not on 'predictions' descrete
875 labels but rather on raw 'class' values classifiers
876 estimate (which is pretty much what is stored under
877 `values`
878 """
879 if clfs == None:
880 clfs = []
881
882 BoostedClassifier.__init__(self, clfs, **kwargs)
883
884 self.__combiner = combiner
885 """Functor destined to combine results of multiple classifiers"""
886
887
889 return "<%s(%d classifiers, combiner %s)>" \
890 % (self.__class__.__name__, len(self.clfs), `self.__combiner`)
891
898
905
906
908 """Predict using `CombinedClassifier`
909 """
910 BoostedClassifier._predict(self, data)
911
912
913 predictions = self.__combiner(self.clfs, data)
914 self.predictions = predictions
915
916 if self.states.isEnabled("values"):
917 if self.__combiner.states.isActive("values"):
918
919 self.values = self.__combiner.values
920 else:
921 if __debug__:
922 warning("Boosted classifier %s has 'values' state" % `self` +
923 " enabled, but combiner has it active, thus no" +
924 " values could be provided directly, access .clfs")
925 return predictions
926
927
928 combiner = property(fget=lambda x:x.__combiner,
929 doc="Used combiner to derive a single result")
930
934 """`ProxyClassifier` which maps set of two labels into +1 and -1
935 """
936
def __init__(self, clf, poslabels, neglabels, **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier to use
      poslabels : list
        list of labels which are treated as +1 category
      neglabels : list
        list of labels which are treated as -1 category

    :Raises ValueError:
      if some label is listed among both positive and negative labels
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    self._regressionIsBogus()

    # go through sets to get rid of duplicate labels
    sposlabels = Set(poslabels)
    sneglabels = Set(neglabels)

    # a label must not belong to both categories
    overlap = sposlabels.intersection(sneglabels)
    if len(overlap) > 0:
        # The message needs a %s placeholder: without it the formatting
        # itself failed with TypeError and ValueError was never raised.
        raise ValueError("Sets of positive and negative labels for "
                         "BinaryClassifier must not overlap. "
                         "Got overlap %s" % (overlap,))

    self.__poslabels = list(sposlabels)
    self.__neglabels = list(sneglabels)

    # What gets reported for a +1/-1 prediction: the whole list of labels
    # if there are several of them, otherwise just that single label
    if len(self.__poslabels) > 1:
        self.__predictpos = self.__poslabels
    else:
        self.__predictpos = self.__poslabels[0]

    if len(self.__neglabels) > 1:
        self.__predictneg = self.__neglabels
    else:
        self.__predictneg = self.__neglabels[0]
982
983
985 return "BinaryClassifier +1: %s -1: %s" % (
986 `self.__poslabels`, `self.__neglabels`)
987
988
990 """Train `BinaryClassifier`
991 """
992 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
993 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
994
995
996 idlabels.sort()
997
998 orig_labels = None
999
1000
1001
1002
1003 if len(idlabels) == dataset.nsamples \
1004 and [x[0] for x in idlabels] == range(dataset.nsamples):
1005
1006
1007 datasetselected = dataset
1008 orig_labels = dataset.labels
1009 if __debug__:
1010 debug('CLFBIN',
1011 "Assigned all %d samples for binary " %
1012 (dataset.nsamples) +
1013 " classification among labels %s/+1 and %s/-1" %
1014 (self.__poslabels, self.__neglabels))
1015 else:
1016 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
1017 if __debug__:
1018 debug('CLFBIN',
1019 "Selected %d samples out of %d samples for binary " %
1020 (len(idlabels), dataset.nsamples) +
1021 " classification among labels %s/+1 and %s/-1" %
1022 (self.__poslabels, self.__neglabels) +
1023 ". Selected %s" % datasetselected)
1024
1025
1026 datasetselected.labels = [ x[1] for x in idlabels ]
1027
1028
1029 if __debug__:
1030 assert((datasetselected.uniquelabels == [-1, 1]).all())
1031
1032 self.clf.train(datasetselected)
1033
1034 if not orig_labels is None:
1035 dataset.labels = orig_labels
1036
1038 """Predict the labels for a given `data`
1039
1040 Predicts using binary classifier and spits out list (for each sample)
1041 where with either poslabels or neglabels as the "label" for the sample.
1042 If there was just a single label within pos or neg labels then it would
1043 return not a list but just that single label.
1044 """
1045 binary_predictions = ProxyClassifier._predict(self, data)
1046 self.values = binary_predictions
1047 predictions = [ {-1: self.__predictneg,
1048 +1: self.__predictpos}[x] for x in binary_predictions]
1049 self.predictions = predictions
1050 return predictions
1051
1055 """`CombinedClassifier` to perform multiclass using a list of
1056 `BinaryClassifier`.
1057
1058 such as 1-vs-1 (ie in pairs like libsvm doesn) or 1-vs-all (which
1059 is yet to think about)
1060 """
1061
def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier based on which multiple classifiers are created
        for multiclass
      bclf_type
        "1-vs-1" or "1-vs-all", determines the way to generate binary
        classifiers.  Only "1-vs-1" is accepted at the moment:
        "1-vs-all" raises NotImplementedError, anything else ValueError
    """
    CombinedClassifier.__init__(self, **kwargs)
    self._regressionIsBogus()
    if clf is not None:
        clf._regressionIsBogus()

    # Store sample instance of basic classifier
    self.__clf = clf

    # validate the requested scheme before remembering it
    if not bclf_type == "1-vs-1":
        if bclf_type == "1-vs-all":
            raise NotImplementedError
        raise ValueError(
            "Unknown type of classifier %s for "
            "BoostedMulticlassClassifier" % bclf_type)
    self.__bclf_type = bclf_type
1091
1092
1094 """Train classifier
1095 """
1096
1097 ulabels = dataset.uniquelabels
1098 if self.__bclf_type == "1-vs-1":
1099
1100 biclfs = []
1101 for i in xrange(len(ulabels)):
1102 for j in xrange(i+1, len(ulabels)):
1103 clf = _deepcopyclf(self.__clf)
1104 biclfs.append(
1105 BinaryClassifier(
1106 clf,
1107 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
1108 if __debug__:
1109 debug("CLFMC", "Created %d binary classifiers for %d labels" %
1110 (len(biclfs), len(ulabels)))
1111
1112 self.clfs = biclfs
1113
1114 elif self.__bclf_type == "1-vs-all":
1115 raise NotImplementedError
1116
1117
1118 CombinedClassifier._train(self, dataset)
1119
1123 """`BoostedClassifier` to work on splits of the data
1124
1125 TODO: SplitClassifier and MulticlassClassifier have too much in
1126 common -- need to refactor: just need a splitter which would
1127 split dataset in pairs of class labels. MulticlassClassifier
1128 does just a tiny bit more which might be not necessary at
1129 all: map sets of labels into 2 categories...
1130 """
1131
1132
1133
1134 training_confusions = StateVariable(enabled=False,
1135 doc="Resultant confusion matrices whenever classifier trained " +
1136 "on 1 part and tested on 2nd part of each split")
1137
1139 """Initialize the instance
1140
1141 :Parameters:
1142 clf : Classifier
1143 classifier based on which multiple classifiers are created
1144 for multiclass
1145 splitter : Splitter
1146 `Splitter` to use to split the dataset prior training
1147 """
1148 CombinedClassifier.__init__(self, **kwargs)
1149 self.__clf = clf
1150 """Store sample instance of basic classifier"""
1151 self.__splitter = splitter
1152
1153
1188
1189
1196
1200 """`ProxyClassifier` which uses some mapper prior training/testing.
1201
1202 `MaskMapper` can be used just a subset of features to
1203 train/classify.
1204 Having such classifier we can easily create a set of classifiers
1205 for BoostedClassifier, where each classifier operates on some set
1206 of features, e.g. set of best spheres from SearchLight, set of
1207 ROIs selected elsewhere. It would be different from simply
1208 applying whole mask over the dataset, since here initial decision
1209 is made by each classifier and then later on they vote for the
1210 final decision across the set of classifiers.
1211 """
1212
def __init__(self, clf, mapper, **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier based on which mask classifiers is created
      mapper
        whatever `Mapper` comes handy
      kwargs : dict
        Forwarded to `ProxyClassifier.__init__`
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    # mapper to help us out with prepping data for
    # training/classification
    self.__mapper = mapper
1227
1228
1230 """Train `MappedClassifier`
1231 """
1232
1233
1234 self.__mapper.train(dataset)
1235
1236
1237 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
1238 ProxyClassifier._train(self, wdataset)
1239
1240
1245
1246
1247 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1248
1252 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
1253
1254 `FeatureSelection` is used first to select features for the classifier to
1255 use for prediction. Internally it would rely on MappedClassifier which
1256 would use created MaskMapper.
1257
1258 TODO: think about removing overhead of retraining the same classifier if
1259 feature selection was carried out with the same classifier already. It
1260 has been addressed by adding .trained property to classifier, but now
1261 we should expclitely use isTrained here if we want... need to think more
1262 """
1263
1264 _clf_internals = [ 'does_feature_selection', 'meta' ]
1265
def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier based on which mask classifiers is created
      feature_selection : FeatureSelection
        whatever `FeatureSelection` comes handy
      testdataset : Dataset
        optional dataset which would be given on call to feature_selection
      kwargs : dict
        Forwarded to `ProxyClassifier.__init__`
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    # `FeatureSelection` to select the features prior training
    self.__feature_selection = feature_selection

    # `FeatureSelection` might like to use testdataset
    self.__testdataset = testdataset

    # Should become `MappedClassifier`(mapper=`MaskMapper`) later on.
    self.__maskclf = None
1287
1288
1290 """Untrain `FeatureSelectionClassifier`
1291
1292 Has to untrain any known classifier
1293 """
1294 if not self.trained:
1295 return
1296 if not self.__maskclf is None:
1297 self.__maskclf.untrain()
1298 super(FeatureSelectionClassifier, self).untrain()
1299
1300
1302 """Train `FeatureSelectionClassifier`
1303 """
1304
1305 self.__feature_selection.states._changeTemporarily(
1306 enable_states=["selected_ids"])
1307
1308 if __debug__:
1309 debug("CLFFS", "Performing feature selection using %s" %
1310 self.__feature_selection + " on %s" % dataset)
1311
1312 (wdataset, tdataset) = self.__feature_selection(dataset,
1313 self.__testdataset)
1314 if __debug__:
1315 add_ = ""
1316 if "CLFFS_" in debug.active:
1317 add_ = " Selected features: %s" % \
1318 self.__feature_selection.selected_ids
1319 debug("CLFFS", "{%s} selected %d out of %d features.%s" %
1320 (`self.__feature_selection`, wdataset.nfeatures,
1321 dataset.nfeatures, add_))
1322
1323
1324
1325 mappermask = N.zeros(dataset.nfeatures)
1326 mappermask[self.__feature_selection.selected_ids] = 1
1327 mapper = MaskMapper(mappermask)
1328
1329 self.__feature_selection.states._resetEnabledTemporarily()
1330
1331
1332 self.__maskclf = MappedClassifier(self.clf, mapper)
1333
1334
1335 self.__maskclf.clf.train(wdataset)
1336
1337
1338
1339
1340
1342 """Return used feature ids for `FeatureSelectionClassifier`
1343
1344 """
1345 return self.__feature_selection.selected_ids
1346
1348 """Predict using `FeatureSelectionClassifier`
1349 """
1350 result = self.__maskclf._predict(data)
1351
1352
1353 return result
1354
1356 """Set testing dataset to be used for feature selection
1357 """
1358 self.__testdataset = testdataset
1359
1360
1361
1362
1363 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1364 feature_selection = property(lambda x:x.__feature_selection,
1365 doc="Used `FeatureSelection`")
1366
1367
1368 testdataset = property(fget=lambda x:x.__testdataset,
1369 fset=setTestDataset)
1370