1
2
3
4
5
6
7
8
9 """Classes for meta classifiers -- classifiers which use other classifiers
10
11 Meta Classifiers can be grouped according to their function as
12
13 :group BoostedClassifiers: CombinedClassifier MulticlassClassifier
14 SplitClassifier
15 :group ProxyClassifiers: ProxyClassifier BinaryClassifier MappedClassifier
16 FeatureSelectionClassifier
17 :group PredictionsCombiners for CombinedClassifier: PredictionsCombiner
18 MaximalVote MeanPrediction
19
20 """
21
22 __docformat__ = 'restructuredtext'
23
24 import operator
25 import numpy as N
26
27 from mvpa.misc.args import group_kwargs
28 from mvpa.mappers.mask import MaskMapper
29 from mvpa.datasets.splitters import NFoldSplitter
30 from mvpa.misc.state import StateVariable, ClassWithCollections, Harvestable
31
32 from mvpa.clfs.base import Classifier
33 from mvpa.misc.transformers import FirstAxisMean
34
35 from mvpa.measures.base import \
36 BoostedClassifierSensitivityAnalyzer, ProxyClassifierSensitivityAnalyzer, \
37 MappedClassifierSensitivityAnalyzer, \
38 FeatureSelectionClassifierSensitivityAnalyzer
39
40 from mvpa.base import warning
41
42 if __debug__:
43 from mvpa.base import debug
47 """Classifier containing the farm of other classifiers.
48
49 Should rarely be used directly. Use one of its children instead
50 """
51
52
53
54 raw_predictions = StateVariable(enabled=False,
55 doc="Predictions obtained from each classifier")
56
57 raw_values = StateVariable(enabled=False,
58 doc="Values obtained from each classifier")
59
60
def __init__(self, clfs=None, propagate_states=True,
             harvest_attribs=None, copy_attribs='copy',
             **kwargs):
    """Initialize the instance.

    :Parameters:
      clfs : list
        list of classifier instances to use (slave classifiers)
      propagate_states : bool
        either to propagate enabled states into slave classifiers.
        It is in effect only when slaves get assigned - so if state
        is enabled not during construction, it would not necessarily
        propagate into slaves
      harvest_attribs : list or None
        passed on to `Harvestable.__init__` -- which attributes to
        harvest while looping over trained slave classifiers
      copy_attribs : basestring
        passed on to `Harvestable.__init__` -- how to store harvested
        attributes (default: 'copy')
      kwargs : dict
        dict of keyworded arguments which might get used
        by State or Classifier
    """
    # Idiom fix: compare to the None singleton by identity, not equality
    if clfs is None:
        clfs = []

    Classifier.__init__(self, **kwargs)
    Harvestable.__init__(self, harvest_attribs, copy_attribs)

    # Pylint-friendly predefinition; the real list is assigned below
    # via _setClassifiers
    self.__clfs = None

    # Whether currently enabled states should be enabled in slave
    # classifiers whenever slaves get assigned
    self.__propagate_states = propagate_states

    # Store (and propagate settings into) the list of slave classifiers
    self._setClassifiers(clfs)
92
93
95 if self.__clfs is None or len(self.__clfs)==0:
96
97 prefix_ = []
98 else:
99 prefix_ = ["clfs=[%s,...]" % repr(self.__clfs[0])]
100 return super(BoostedClassifier, self).__repr__(prefix_ + prefixes)
101
102
104 """Train `BoostedClassifier`
105 """
106 for clf in self.__clfs:
107 clf.train(dataset)
108
109
def _posttrain(self, dataset):
    """Finalize training of `BoostedClassifier`.

    Runs the generic `Classifier` post-training hook first, then, if
    the 'harvested' state is enabled, harvests the requested
    attributes from each trained slave classifier.
    """
    Classifier._posttrain(self, dataset)

    # Harvest over the trained slaves if it was asked to do so.
    # NOTE: the loop variable MUST be named `clf` -- _harvest(locals())
    # looks attributes up by their local name.
    if self.states.isEnabled('harvested'):
        for clf in self.__clfs:
            self._harvest(locals())

    # For retrainable classifiers reset the changed-data flag
    if self.params.retrainable:
        self.__changedData_isset = False
121
122
131
132
150
151
153 """Set the classifiers used by the boosted classifier
154
155 We have to allow to set list of classifiers after the object
156 was actually created. It will be used by
157 MulticlassClassifier
158 """
159 self.__clfs = clfs
160 """Classifiers to use"""
161
162 if len(clfs):
163 for flag in ['regression']:
164 values = N.array([clf.params[flag].value for clf in clfs])
165 value = values.any()
166 if __debug__:
167 debug("CLFBST", "Setting %(flag)s=%(value)s for classifiers "
168 "%(clfs)s with %(values)s",
169 msgargs={'flag' : flag, 'value' : value,
170 'clfs' : clfs,
171 'values' : values})
172
173 self.params[flag].value = value
174
175
176 if self.__propagate_states:
177 for clf in self.__clfs:
178 clf.states.enable(self.states.enabled, missingok=True)
179
180
181
182
183 self._clf_internals = [ 'binary', 'multiclass', 'meta' ]
184 if len(clfs)>0:
185 self._clf_internals += self.__clfs[0]._clf_internals
186
197
203
204
205 clfs = property(fget=lambda x:x.__clfs,
206 fset=_setClassifiers,
207 doc="Used classifiers")
208
212 """Classifier which decorates another classifier
213
214 Possible uses:
215
216 - modify data somehow prior training/testing:
217 * normalization
218 * feature selection
219 * modification
220
221 - optimized classifier?
222
223 """
224
243
244
248
250 s = super(ProxyClassifier, self).summary()
251 if self.trained:
252 s += "\n Slave classifier summary:" + \
253 '\n + %s' % \
254 (self.__clf.summary().replace('\n', '\n |'))
255 return s
256
257
258
260 """Train `ProxyClassifier`
261 """
262
263
264 self.__clf.train(dataset)
265
266
267
268
269
270
271
272
273
274
276 """Predict using `ProxyClassifier`
277 """
278 clf = self.__clf
279 if self.states.isEnabled('values'):
280 clf.states.enable(['values'])
281
282 result = clf.predict(data)
283
284 self.states._copy_states_(self.__clf, ['values'], deep=False)
285 return result
286
287
294
295
296 @group_kwargs(prefixes=['slave_'], passthrough=True)
303
304
305 clf = property(lambda x:x.__clf, doc="Used `Classifier`")
306
314 """Base class for combining decisions of multiple classifiers"""
315
def train(self, clfs, dataset):
    """Train the combiner if it needs training.

    The base implementation is deliberately a no-op: simple combiners
    operate directly on the slaves' predictions and need no training
    of their own. Subclasses that learn from slave outputs override
    this method.

    :Parameters:
      clfs : list of Classifier
        List of classifiers to combine. Has to be classifiers (not
        pure predictions), since combiner might use some other
        state variables (value's) instead of pure prediction's
      dataset : Dataset
        training data in this case
    """
    pass
328
329
331 """Call function
332
333 :Parameters:
334 clfs : list of Classifier
335 List of classifiers to combine. Has to be classifiers (not
336 pure predictions), since combiner might use some other
337 state variables (value's) instead of pure prediction's
338 """
339 raise NotImplementedError
340
344 """Provides a decision using maximal vote rule"""
345
346 predictions = StateVariable(enabled=True,
347 doc="Voted predictions")
348 all_label_counts = StateVariable(enabled=False,
349 doc="Counts across classifiers for each label/sample")
350
352 """XXX Might get a parameter to use raw decision values if
353 voting is ambiguous (i.e. two classes have an equal number of
354 votes)
355 """
356 PredictionsCombiner.__init__(self)
357
358
360 """Actual callable - perform voting
361
362 Extended functionality which might not be needed actually:
363 Since `BinaryClassifier` might return a list of possible
364 predictions (not just a single one), we should consider all of those
365
366 MaximalVote doesn't care about dataset itself
367 """
368 if len(clfs)==0:
369 return []
370
371 all_label_counts = None
372 for clf in clfs:
373
374 if not clf.states.isEnabled("predictions"):
375 raise ValueError, "MaximalVote needs classifiers (such as " + \
376 "%s) with state 'predictions' enabled" % clf
377 predictions = clf.predictions
378 if all_label_counts is None:
379 all_label_counts = [ {} for i in xrange(len(predictions)) ]
380
381
382 for i in xrange(len(predictions)):
383 prediction = predictions[i]
384 if not operator.isSequenceType(prediction):
385 prediction = (prediction,)
386 for label in prediction:
387
388
389 if not all_label_counts[i].has_key(label):
390 all_label_counts[i][label] = 0
391 all_label_counts[i][label] += 1
392
393 predictions = []
394
395 for i in xrange(len(all_label_counts)):
396 label_counts = all_label_counts[i]
397
398
399 maxk = []
400 maxv = -1
401 for k, v in label_counts.iteritems():
402 if v > maxv:
403 maxk = [k]
404 maxv = v
405 elif v == maxv:
406 maxk.append(k)
407
408 assert len(maxk) >= 1, \
409 "We should have obtained at least a single key of max label"
410
411 if len(maxk) > 1:
412 warning("We got multiple labels %s which have the " % maxk +
413 "same maximal vote %d. XXX disambiguate" % maxv)
414 predictions.append(maxk[0])
415
416 self.all_label_counts = all_label_counts
417 self.predictions = predictions
418 return predictions
419
423 """Provides a decision by taking mean of the results
424 """
425
426 predictions = StateVariable(enabled=True,
427 doc="Mean predictions")
428
430 """Actual callable - perform averaging of the predictions
431
432 """
433 if len(clfs)==0:
434 return []
435
436 all_predictions = []
437 for clf in clfs:
438
439 if not clf.states.isEnabled("predictions"):
440 raise ValueError, "MeanPrediction needs classifiers (such " \
441 " as %s) with state 'predictions' enabled" % clf
442 all_predictions.append(clf.predictions)
443
444
445 predictions = N.mean(N.asarray(all_predictions), axis=0)
446 self.predictions = predictions
447 return predictions
448
451 """Provides a decision using training a classifier on predictions/values
452
453 TODO: implement
454 """
455
456 predictions = StateVariable(enabled=True,
457 doc="Trained predictions")
458
459
def __init__(self, clf, variables=None):
    """Initialize `ClassifierCombiner`

    :Parameters:
      clf : Classifier
        Classifier to train on the predictions
      variables : list of basestring
        List of state variables stored in 'combined' classifiers, which
        to use as features for training this classifier
    """
    PredictionsCombiner.__init__(self)

    # Classifier to train on `variables` states of provided classifiers
    self.__clf = clf

    # Idiom fix: compare to the None singleton by identity, not equality
    if variables is None:
        variables = ['predictions']
    # What state variables of the slave classifiers to use as features
    self.__variables = variables
479
480
482 """It might be needed to untrain used classifier"""
483 if self.__clf:
484 self.__clf.untrain()
485
487 """
488 """
489 if len(clfs)==0:
490 return []
491
492 raise NotImplementedError
493
497 """`BoostedClassifier` which combines predictions using some
498 `PredictionsCombiner` functor.
499 """
500
def __init__(self, clfs=None, combiner=None, **kwargs):
    """Initialize the instance.

    :Parameters:
      clfs : list of Classifier
        list of classifier instances to use
      combiner : PredictionsCombiner
        callable which takes care about combining multiple
        results into a single one (e.g. maximal vote for
        classification, MeanPrediction for regression)
      kwargs : dict
        dict of keyworded arguments which might get used
        by State or Classifier

    NB: `combiner` might need to operate not on 'predictions' discrete
        labels but rather on raw 'class' values classifiers
        estimate (which is pretty much what is stored under
        `values`)
    """
    # Idiom fix: compare to the None singleton by identity, not equality
    if clfs is None:
        clfs = []

    BoostedClassifier.__init__(self, clfs, **kwargs)

    # Pick a default combiner matching the task: mean for regression,
    # maximal vote for classification
    if combiner is None:
        combiner = (MaximalVote, MeanPrediction)[int(self.regression)]()
    # Functor destined to combine results of multiple classifiers
    self.__combiner = combiner
530
531
533 """Literal representation of `CombinedClassifier`.
534 """
535 return super(CombinedClassifier, self).__repr__(
536 ["combiner=%s" % repr(self.__combiner)] + prefixes)
537
538
540 """Provide summary for the `CombinedClassifier`.
541 """
542 s = super(CombinedClassifier, self).summary()
543 if self.trained:
544 s += "\n Slave classifiers summaries:"
545 for i, clf in enumerate(self.clfs):
546 s += '\n + %d clf: %s' % \
547 (i, clf.summary().replace('\n', '\n |'))
548 return s
549
550
559
566
567
588
589
590 combiner = property(fget=lambda x:x.__combiner,
591 doc="Used combiner to derive a single result")
592
596 """`TreeClassifier` which allows to create hierarchy of classifiers
597
598 Functions by grouping some labels into a single "meta-label" and training
599 classifier first to separate between meta-labels. Then
600 each group further proceeds with classification within each group.
601
602 Possible scenarios::
603
604 TreeClassifier(SVM(),
605 {'animate': ((1,2,3,4),
606 TreeClassifier(SVM(),
607 {'human': (('male', 'female'), SVM()),
608 'animals': (('monkey', 'dog'), SMLR())})),
609 'inanimate': ((5,6,7,8), SMLR())})
610
611 would create classifier which would first do binary classification
612 to separate animate from inanimate, then for animate result it
613 would separate to classify human vs animal and so on::
614
615 SVM
616 / \
617 animate inanimate
618 / \
619 SVM SMLR
620 / \ / | \ \
621 human animal 5 6 7 8
622 | |
623 SVM SVM
624 / \ / \
625 male female monkey dog
626 1 2 3 4
627
628 """
629
630 _DEV__doc = """
631 Questions:
632 * how to collect confusion matrices at a particular layer if such
633 classifier is given to SplitClassifier or CVTE
634
635 * What additional states to add, something like
636 clf_labels -- store remapped labels for the dataset
637 clf_values ...
638
639 * What do we store into values ? just values from the clfs[]
640 for corresponding samples, or top level clf values as well?
641
642 * what should be SensitivityAnalyzer? by default it would just
643 use top slave classifier (i.e. animate/inanimate)
644
645 Problems?
646 * .clf is not actually "proxied" per se, so not sure what things
647 should be taken care of yet...
648
649 TODO:
650 * Allow a group to be just a single category, so no further
651 classifier is needed, it just should stay separate from the
652 other groups
653
654 Possible TODO:
655 * Add ability to provide results of clf.values as features into
656 input of clfs[]. This way we could provide additional 'similarity'
657 information to the "other" branch
658
659 """
660
def __init__(self, clf, groups, **kwargs):
    """Initialize TreeClassifier

    :Parameters:
      clf : Classifier
        Classifier to separate between the groups
      groups : dict of meta-label: tuple of (tuple of labels, classifier)
        Defines the groups of labels and their classifiers.
        See :class:`~mvpa.clfs.meta.TreeClassifier` for example
    """
    ProxyClassifier.__init__(self, clf, **kwargs)
    # A tree of classifiers makes no sense for regression
    self._regressionIsBogus()

    # Keep the raw group specification, plus a stable index -> group
    # name list (predictions of the top-level clf index into it)
    self._groups = groups
    self._index2group = groups.keys()

    # Dictionary of classifiers used by the groups:
    # group name -> its dedicated sub-classifier
    self.clfs = dict([(gk, spec[1]) for gk, spec in groups.iteritems()])
691
692
694 """String representation of TreeClassifier
695 """
696 prefix = "groups=%s" % repr(self._groups)
697 return super(TreeClassifier, self).__repr__([prefix] + prefixes)
698
699
701 """Provide summary for the `TreeClassifier`.
702 """
703 s = super(TreeClassifier, self).summary()
704 if self.trained:
705 s += "\n Node classifiers summaries:"
706 for i, (clfname, clf) in enumerate(self.clfs.iteritems()):
707 s += '\n + %d %s clf: %s' % \
708 (i, clfname, clf.summary().replace('\n', '\n |'))
709 return s
710
711
713 """Train TreeClassifier
714
715 First train .clf on groupped samples, then train each of .clfs
716 on a corresponding subset of samples.
717 """
718
719 clf, clfs, index2group = self.clf, self.clfs, self._index2group
720
721
722 groups = self._groups
723 labels_map = dataset.labels_map
724
725 if labels_map is None: labels_map = {}
726 groups_labels = {}
727 label2index = {}
728 known = set()
729 for gi, gk in enumerate(index2group):
730 ls = groups[gk][0]
731
732 ls_ = [labels_map.get(l, l) for l in ls]
733 known_already = known.intersection(ls_)
734 if len(known_already):
735 raise ValueError, "Grouping of labels is not appropriate. " \
736 "Got labels %s already among known in %s. " \
737 "Used labelsmap %s" % (known_already, known, labels_map)
738 groups_labels[gk] = ls_
739 for l in ls_:
740 label2index[l] = gi
741 known = known.union(ls_)
742
743
744
745
746
747 dsul = set(dataset.uniquelabels)
748 if known.intersection(dsul) != dsul:
749 raise ValueError, \
750 "Dataset %s had some labels not defined in groups: %s. " \
751 "Known are %s" % \
752 (dataset, dsul.difference(known), known)
753
754
755
756
757
758
759
760
761 ds_group = dataset.copy(deep=False)
762
763 ds_group.labels = [label2index[l] for l in dataset.labels]
764
765
766 if __debug__:
767 debug('CLFTREE', "Training primary %(clf)s on %(ds)s",
768 msgargs=dict(clf=clf, ds=ds_group))
769 clf.train(ds_group)
770
771
772
773
774
775
776
777
778
779
780
781 for gk in groups.iterkeys():
782
783 ids = dataset.idsbylabels(groups_labels[gk])
784 ds_group = dataset.selectSamples(ids)
785 if __debug__:
786 debug('CLFTREE', "Training %(clf)s for group %(gk)s on %(ds)s",
787 msgargs=dict(clf=clfs[gk], gk=gk, ds=ds_group))
788
789 clfs[gk].train(ds_group)
790
791
798
799
801 """
802 """
803
804 clfs, index2group = self.clfs, self._index2group
805 clf_predictions = N.asanyarray(ProxyClassifier._predict(self, data))
806
807 clf_predictions = clf_predictions.astype(int)
808
809
810
811 predictions = N.array([N.nan]*len(data))
812 for pred_group in set(clf_predictions):
813 gk = index2group[pred_group]
814 clf_ = clfs[gk]
815 group_indexes = (clf_predictions == pred_group)
816 if __debug__:
817 debug('CLFTREE', 'Predicting for group %s using %s on %d samples' %
818 (gk, clf_, N.sum(group_indexes)))
819 predictions[group_indexes] = clf_.predict(data[group_indexes])
820 return predictions
821
824 """`ProxyClassifier` which maps set of two labels into +1 and -1
825 """
826
def __init__(self, clf, poslabels, neglabels, **kwargs):
    """
    :Parameters:
      clf : Classifier
        classifier to use
      poslabels : list
        list of labels which are treated as +1 category
      neglabels : list
        list of labels which are treated as -1 category

    :Raises:
      ValueError : if poslabels and neglabels share any label
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    # Binary mapping makes no sense for regression
    self._regressionIsBogus()

    # Deduplicate the labels within each category
    sposlabels = set(poslabels)
    sneglabels = set(neglabels)

    # A label must not appear in both categories
    overlap = sposlabels.intersection(sneglabels)
    if len(overlap)>0:
        # BUGFIX: the message previously applied '%' to a string with
        # no conversion specifier, so this raised TypeError instead of
        # the intended ValueError
        raise ValueError("Sets of positive and negative labels for "
                         "BinaryClassifier must not overlap. "
                         "Got overlap %s" % overlap)

    self.__poslabels = list(sposlabels)
    self.__neglabels = list(sneglabels)

    # Precompute what gets returned at prediction time: the single
    # label when a category holds just one, otherwise the full list
    if len(self.__poslabels) > 1:
        self.__predictpos = self.__poslabels
    else:
        self.__predictpos = self.__poslabels[0]

    if len(self.__neglabels) > 1:
        self.__predictneg = self.__neglabels
    else:
        self.__predictneg = self.__neglabels[0]
872
873
875 prefix = "poslabels=%s, neglabels=%s" % (
876 repr(self.__poslabels), repr(self.__neglabels))
877 return super(BinaryClassifier, self).__repr__([prefix] + prefixes)
878
879
881 """Train `BinaryClassifier`
882 """
883 idlabels = [(x, +1) for x in dataset.idsbylabels(self.__poslabels)] + \
884 [(x, -1) for x in dataset.idsbylabels(self.__neglabels)]
885
886
887 idlabels.sort()
888
889 orig_labels = None
890
891
892
893
894 if len(idlabels) == dataset.nsamples \
895 and [x[0] for x in idlabels] == range(dataset.nsamples):
896
897
898 datasetselected = dataset
899 orig_labels = dataset.labels
900 if __debug__:
901 debug('CLFBIN',
902 "Assigned all %d samples for binary " %
903 (dataset.nsamples) +
904 " classification among labels %s/+1 and %s/-1" %
905 (self.__poslabels, self.__neglabels))
906 else:
907 datasetselected = dataset.selectSamples([ x[0] for x in idlabels ])
908 if __debug__:
909 debug('CLFBIN',
910 "Selected %d samples out of %d samples for binary " %
911 (len(idlabels), dataset.nsamples) +
912 " classification among labels %s/+1 and %s/-1" %
913 (self.__poslabels, self.__neglabels) +
914 ". Selected %s" % datasetselected)
915
916
917 datasetselected.labels = [ x[1] for x in idlabels ]
918
919
920 if __debug__:
921 assert((datasetselected.uniquelabels == [-1, 1]).all())
922
923 self.clf.train(datasetselected)
924
925 if not orig_labels is None:
926 dataset.labels = orig_labels
927
929 """Predict the labels for a given `data`
930
931 Predicts using binary classifier and spits out list (for each sample)
932 where with either poslabels or neglabels as the "label" for the sample.
933 If there was just a single label within pos or neg labels then it would
934 return not a list but just that single label.
935 """
936 binary_predictions = ProxyClassifier._predict(self, data)
937 self.values = binary_predictions
938 predictions = [ {-1: self.__predictneg,
939 +1: self.__predictpos}[x] for x in binary_predictions]
940 self.predictions = predictions
941 return predictions
942
946 """`CombinedClassifier` to perform multiclass using a list of
947 `BinaryClassifier`.
948
949 such as 1-vs-1 (i.e. in pairs, as libsvm does) or 1-vs-all (which
950 is yet to be thought through)
951 """
952
953 - def __init__(self, clf, bclf_type="1-vs-1", **kwargs):
954 """Initialize the instance
955
956 :Parameters:
957 clf : Classifier
958 classifier based on which multiple classifiers are created
959 for multiclass
960 bclf_type
961 "1-vs-1" or "1-vs-all", determines the way to generate binary
962 classifiers
963 """
964 CombinedClassifier.__init__(self, **kwargs)
965 self._regressionIsBogus()
966 if not clf is None:
967 clf._regressionIsBogus()
968
969 self.__clf = clf
970 """Store sample instance of basic classifier"""
971
972
973 if bclf_type == "1-vs-1":
974 pass
975 elif bclf_type == "1-vs-all":
976 raise NotImplementedError
977 else:
978 raise ValueError, \
979 "Unknown type of classifier %s for " % bclf_type + \
980 "BoostedMulticlassClassifier"
981 self.__bclf_type = bclf_type
982
983
984
986 prefix = "bclf_type=%s, clf=%s" % (repr(self.__bclf_type),
987 repr(self.__clf))
988 return super(MulticlassClassifier, self).__repr__([prefix] + prefixes)
989
990
992 """Train classifier
993 """
994
995 ulabels = dataset.uniquelabels
996 if self.__bclf_type == "1-vs-1":
997
998 biclfs = []
999 for i in xrange(len(ulabels)):
1000 for j in xrange(i+1, len(ulabels)):
1001 clf = self.__clf.clone()
1002 biclfs.append(
1003 BinaryClassifier(
1004 clf,
1005 poslabels=[ulabels[i]], neglabels=[ulabels[j]]))
1006 if __debug__:
1007 debug("CLFMC", "Created %d binary classifiers for %d labels" %
1008 (len(biclfs), len(ulabels)))
1009
1010 self.clfs = biclfs
1011
1012 elif self.__bclf_type == "1-vs-all":
1013 raise NotImplementedError
1014
1015
1016 CombinedClassifier._train(self, dataset)
1017
1021 """`BoostedClassifier` to work on splits of the data
1022
1023 """
1024
1025 """
1026 TODO: SplitClassifier and MulticlassClassifier have too much in
1027 common -- need to refactor: just need a splitter which would
1028 split dataset in pairs of class labels. MulticlassClassifier
1029 does just a tiny bit more which might be not necessary at
1030 all: map sets of labels into 2 categories...
1031 """
1032
1033
1034
1035 confusion = StateVariable(enabled=False,
1036 doc="Resultant confusion whenever classifier trained " +
1037 "on 1 part and tested on 2nd part of each split")
1038
1039 splits = StateVariable(enabled=False, doc=
1040 """Store the actual splits of the data. Can be memory expensive""")
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1055 """Initialize the instance
1056
1057 :Parameters:
1058 clf : Classifier
1059 classifier based on which multiple classifiers are created
1060 for multiclass
1061 splitter : Splitter
1062 `Splitter` to use to split the dataset prior training
1063 """
1064
1065 CombinedClassifier.__init__(self, regression=clf.regression, **kwargs)
1066 self.__clf = clf
1067 """Store sample instance of basic classifier"""
1068
1069 if isinstance(splitter, type):
1070 raise ValueError, \
1071 "Please provide an instance of a splitter, not a type." \
1072 " Got %s" % splitter
1073
1074 self.__splitter = splitter
1075
1076
1078 """Train `SplitClassifier`
1079 """
1080
1081 bclfs = []
1082
1083
1084 states = self.states
1085
1086 clf_template = self.__clf
1087 if states.isEnabled('confusion'):
1088 states.confusion = clf_template._summaryClass()
1089 if states.isEnabled('training_confusion'):
1090 clf_template.states.enable(['training_confusion'])
1091 states.training_confusion = clf_template._summaryClass()
1092
1093 clf_hastestdataset = hasattr(clf_template, 'testdataset')
1094
1095
1096
1097 for split in self.__splitter.splitcfg(dataset):
1098 if __debug__:
1099 debug("CLFSPL_",
1100 "Deepcopying %(clf)s for %(sclf)s",
1101 msgargs={'clf':clf_template,
1102 'sclf':self})
1103 clf = clf_template.clone()
1104 bclfs.append(clf)
1105 self.clfs = bclfs
1106
1107 self.splits = []
1108
1109 for i, split in enumerate(self.__splitter(dataset)):
1110 if __debug__:
1111 debug("CLFSPL", "Training classifier for split %d" % (i))
1112
1113 if states.isEnabled("splits"):
1114 self.splits.append(split)
1115
1116 clf = self.clfs[i]
1117
1118
1119 if clf_hastestdataset:
1120 clf.testdataset = split[1]
1121
1122 clf.train(split[0])
1123
1124
1125 if clf_hastestdataset:
1126 clf.testdataset = None
1127
1128 if states.isEnabled("confusion"):
1129 predictions = clf.predict(split[1].samples)
1130 self.confusion.add(split[1].labels, predictions,
1131 clf.states.get('values', None))
1132 if __debug__:
1133 dact = debug.active
1134 if 'CLFSPL_' in dact:
1135 debug('CLFSPL_', 'Split %d:\n%s' % (i, self.confusion))
1136 elif 'CLFSPL' in dact:
1137 debug('CLFSPL', 'Split %d error %.2f%%'
1138 % (i, self.confusion.summaries[-1].error))
1139
1140 if states.isEnabled("training_confusion"):
1141 states.training_confusion += \
1142 clf.states.training_confusion
1143
1144 try:
1145 if states.isEnabled("confusion"):
1146 states.confusion.labels_map = dataset.labels_map
1147 if states.isEnabled("training_confusion"):
1148 states.training_confusion.labels_map = dataset.labels_map
1149 except:
1150 pass
1151
1152
1153 @group_kwargs(prefixes=['slave_'], passthrough=True)
1166
1167 splitter = property(fget=lambda x:x.__splitter,
1168 doc="Splitter user by SplitClassifier")
1169
1172 """`ProxyClassifier` which uses some mapper prior training/testing.
1173
1174 `MaskMapper` can be used just a subset of features to
1175 train/classify.
1176 Having such classifier we can easily create a set of classifiers
1177 for BoostedClassifier, where each classifier operates on some set
1178 of features, e.g. set of best spheres from SearchLight, set of
1179 ROIs selected elsewhere. It would be different from simply
1180 applying whole mask over the dataset, since here initial decision
1181 is made by each classifier and then later on they vote for the
1182 final decision across the set of classifiers.
1183 """
1184
def __init__(self, clf, mapper, **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier based on which mask classifiers is created
      mapper
        whatever `Mapper` comes handy
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    # Mapper that preps the data prior to training/classification
    self.__mapper = mapper
1199
1200
1202 """Train `MappedClassifier`
1203 """
1204
1205
1206
1207 self.__mapper.train(dataset)
1208
1209
1210 wdataset = dataset.applyMapper(featuresmapper = self.__mapper)
1211 ProxyClassifier._train(self, wdataset)
1212
1213
1218
1219
1220 @group_kwargs(prefixes=['slave_'], passthrough=True)
1227
1228
1229 mapper = property(lambda x:x.__mapper, doc="Used mapper")
1230
1234 """`ProxyClassifier` which uses some `FeatureSelection` prior training.
1235
1236 `FeatureSelection` is used first to select features for the classifier to
1237 use for prediction. Internally it would rely on MappedClassifier which
1238 would use created MaskMapper.
1239
1240 TODO: think about removing overhead of retraining the same classifier if
1241 feature selection was carried out with the same classifier already. It
1242 has been addressed by adding .trained property to classifier, but now
1243 we should explicitly use isTrained here if we want... need to think more
1244 """
1245
1246 _clf_internals = [ 'does_feature_selection', 'meta' ]
1247
def __init__(self, clf, feature_selection, testdataset=None, **kwargs):
    """Initialize the instance

    :Parameters:
      clf : Classifier
        classifier based on which mask classifiers is created
      feature_selection : FeatureSelection
        whatever `FeatureSelection` comes handy
      testdataset : Dataset
        optional dataset which would be given on call to feature_selection
    """
    ProxyClassifier.__init__(self, clf, **kwargs)

    # Becomes a `MappedClassifier`(mapper=`MaskMapper`) once trained
    self.__maskclf = None

    # `FeatureSelection` used to select the features prior training
    self.__feature_selection = feature_selection

    # Optional dataset the feature selection may use at training time
    self.__testdataset = testdataset
1269
1270
1272 """Untrain `FeatureSelectionClassifier`
1273
1274 Has to untrain any known classifier
1275 """
1276 if self.__feature_selection is not None:
1277 self.__feature_selection.untrain()
1278 if not self.trained:
1279 return
1280 if not self.__maskclf is None:
1281 self.__maskclf.untrain()
1282 super(FeatureSelectionClassifier, self).untrain()
1283
1284
1286 """Train `FeatureSelectionClassifier`
1287 """
1288
1289 self.__feature_selection.states._changeTemporarily(
1290 enable_states=["selected_ids"])
1291
1292 if __debug__:
1293 debug("CLFFS", "Performing feature selection using %s" %
1294 self.__feature_selection + " on %s" % dataset)
1295
1296 (wdataset, tdataset) = self.__feature_selection(dataset,
1297 self.__testdataset)
1298 if __debug__:
1299 add_ = ""
1300 if "CLFFS_" in debug.active:
1301 add_ = " Selected features: %s" % \
1302 self.__feature_selection.selected_ids
1303 debug("CLFFS", "%(fs)s selected %(nfeat)d out of " +
1304 "%(dsnfeat)d features.%(app)s",
1305 msgargs={'fs':self.__feature_selection,
1306 'nfeat':wdataset.nfeatures,
1307 'dsnfeat':dataset.nfeatures,
1308 'app':add_})
1309
1310
1311
1312 mappermask = N.zeros(dataset.nfeatures)
1313 mappermask[self.__feature_selection.selected_ids] = 1
1314 mapper = MaskMapper(mappermask)
1315
1316 self.__feature_selection.states._resetEnabledTemporarily()
1317
1318
1319 self.__maskclf = MappedClassifier(self.clf, mapper)
1320
1321
1322 self.__maskclf.clf.train(wdataset)
1323
1324
1325
1326
1327
1329 """Return used feature ids for `FeatureSelectionClassifier`
1330
1331 """
1332 return self.__feature_selection.selected_ids
1333
1335 """Predict using `FeatureSelectionClassifier`
1336 """
1337 clf = self.__maskclf
1338 if self.states.isEnabled('values'):
1339 clf.states.enable(['values'])
1340
1341 result = clf._predict(data)
1342
1343 self.states._copy_states_(clf, ['values'], deep=False)
1344 return result
1345
1347 """Set testing dataset to be used for feature selection
1348 """
1349 self.__testdataset = testdataset
1350
1351 maskclf = property(lambda x:x.__maskclf, doc="Used `MappedClassifier`")
1352 feature_selection = property(lambda x:x.__feature_selection,
1353 doc="Used `FeatureSelection`")
1354
1355 @group_kwargs(prefixes=['slave_'], passthrough=True)
1365
1366
1367
1368 testdataset = property(fget=lambda x:x.__testdataset,
1369 fset=setTestDataset)
1370