1
2
3
4
5
6
7
8
9 """Recursive feature elimination."""
10
11 __docformat__ = 'restructuredtext'
12
13 from mvpa.clfs.transerror import ClassifierError
14 from mvpa.measures.base import Sensitivity
15 from mvpa.featsel.base import FeatureSelection
16 from mvpa.featsel.helpers import BestDetector, \
17 NBackHistoryStopCrit, \
18 FractionTailSelector
19 from numpy import arange
20 from mvpa.misc.state import StateVariable
21
22 if __debug__:
23 from mvpa.misc import debug
24
25
26
27
28
29
30
31
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on
    some test dataset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.
    """

    # State variables populated by __call__():
    # errors -- transfer error recorded for every tested feature set
    errors = StateVariable()
    # nfeatures -- number of features remaining at each elimination step
    nfeatures = StateVariable()
    # history -- per-feature index of the last step at which the feature
    # was still present in the working dataset
    history = StateVariable()
    # sensitivities -- sensitivity map computed at each step; disabled by
    # default since the accumulated maps can get large
    sensitivities = StateVariable(enabled=False)
53
64
65
66 """Initialize recursive feature elimination
67
68 :Parameters:
69 sensitivity_analyzer : FeaturewiseDatasetMeasure object
70 transfer_error : TransferError object
71 used to compute the transfer error of a classifier based on a
72 certain feature set on the test dataset.
73 NOTE: If sensitivity analyzer is based on the same
74 classifier as transfer_error is using, make sure you
75 initialize transfer_error with train=False, otherwise
76 it would train classifier twice without any necessity.
77 feature_selector : Functor
78 Given a sensitivity map it has to return the ids of those
79 features that should be kept.
80 bestdetector : Functor
81 Given a list of error values it has to return a boolean that
82 signals whether the latest error value is the total minimum.
83 stopping_criterion : Functor
84 Given a list of error values it has to return whether the
85 criterion is fulfilled.
86 train_clf : bool
87 Flag whether the classifier in `transfer_error` should be
88 trained before computing the error. In general this is
89 required, but if the `sensitivity_analyzer` and
90 `transfer_error` share and make use of the same classifier it
91 can be switched off to save CPU cycles. Default `None` checks
92 if sensitivity_analyzer is based on a classifier and doesn't train
93 if so.
94 update_sensitivity : bool
95 If False the sensitivity map is only computed once and reused
96 for each iteration. Otherwise the senstitivities are
97 recomputed at each selection step.
98 """
99
100
101 FeatureSelection.__init__(self, **kargs)
102
103 self.__sensitivity_analyzer = sensitivity_analyzer
104 """Sensitivity analyzer used to call at each step."""
105
106 self.__transfer_error = transfer_error
107 """Compute transfer error for each feature set."""
108
109 self.__feature_selector = feature_selector
110 """Functor which takes care about removing some features."""
111
112 self.__stopping_criterion = stopping_criterion
113
114 self.__bestdetector = bestdetector
115
116 if train_clf is None:
117 self.__train_clf = isinstance(sensitivity_analyzer,
118 Sensitivity)
119 else:
120 self.__train_clf = train_clf
121 """Flag whether training classifier is required."""
122
123 self.__update_sensitivity = update_sensitivity
124 """Flag whether sensitivity map is recomputed for each step."""
125
126
127
128 if not self.__update_sensitivity \
129 and isinstance(self.__transfer_error, ClassifierError) \
130 and not self.__train_clf:
131 if __debug__:
132 debug("RFEC", "Forcing training of classifier since " +
133 "sensitivities aren't updated at each step")
134 self.__train_clf = True
135
136
137 - def __call__(self, dataset, testdataset):
138 """Proceed and select the features recursively eliminating less
139 important ones.
140
141 :Parameters:
142 dataset : Dataset
143 used to compute sensitivity maps and train a classifier
144 to determine the transfer error
145 testdataset : Dataset
146 used to test the trained classifer to determine the
147 transfer error
148
149 Returns a tuple of two new datasets with the feature subset of
150 `dataset` that had the lowest transfer error of all tested
151 sets until the stopping criterion was reached. The first
152 dataset is the feature subset of the training data and the
153 second the selection of the test dataset.
154 """
155 errors = []
156 """Computed error for each tested features set."""
157
158 self.nfeatures = []
159 """Number of features at each step. Since it is not used by the
160 algorithm it is stored directly in the state variable"""
161
162 self.history = arange(dataset.nfeatures)
163 """Store the last step # when the feature was still present
164 """
165
166 self.sensitivities = []
167
168 stop = False
169 """Flag when RFE should be stopped."""
170
171 results = None
172 """Will hold the best feature set ever."""
173
174 wdataset = dataset
175 """Operate on working dataset initially identical."""
176
177 wtestdataset = testdataset
178 """Same feature selection has to be performs on test dataset as well.
179 This will hold the current testdataset."""
180
181 step = 0
182 """Counter how many selection step where done."""
183
184 orig_feature_ids = arange(dataset.nfeatures)
185 """List of feature Ids as per original dataset remaining at any given
186 step"""
187
188 sensitivity = None
189 """Contains the latest sensitivity map."""
190
191 result_selected_ids = orig_feature_ids
192 """Resultant ids of selected features. Since the best is not
193 necessarily is the last - we better keep this one around. By
194 default -- all features are there"""
195 selected_ids = result_selected_ids
196
197 while wdataset.nfeatures > 0:
198
199 if __debug__:
200 debug('RFEC',
201 "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))
202
203
204
205
206 self.history[orig_feature_ids] = step
207
208
209 if self.__update_sensitivity or sensitivity == None:
210 sensitivity = self.__sensitivity_analyzer(wdataset)
211
212 if self.states.isEnabled("sensitivities"):
213 self.sensitivities.append(sensitivity)
214
215
216 if self.__train_clf:
217 error = self.__transfer_error(wtestdataset, wdataset)
218 else:
219 error = self.__transfer_error(wtestdataset, None)
220
221
222 errors.append(error)
223
224
225
226 stop = self.__stopping_criterion(errors)
227 isthebest = self.__bestdetector(errors)
228
229 nfeatures = wdataset.nfeatures
230
231 if self.states.isEnabled("nfeatures"):
232 self.nfeatures.append(wdataset.nfeatures)
233
234
235 if isthebest:
236 results = (wdataset, wtestdataset)
237 result_selected_ids = orig_feature_ids
238
239
240 if nfeatures == 1 or stop:
241 break
242
243
244 selected_ids = self.__feature_selector(sensitivity)
245
246 if __debug__:
247 debug('RFEC',
248 "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d "
249 "nfeatures_selected=%d" %
250 (step, nfeatures, error, isthebest, stop,
251 len(selected_ids)))
252 debug('RFEC_',
253 "Sensitivity: %s, selected_ids: %s" %
254 (sensitivity, selected_ids))
255
256
257
258 wdataset = wdataset.selectFeatures(selected_ids)
259
260
261
262 if not self.__update_sensitivity:
263 sensitivity = sensitivity[selected_ids]
264
265
266
267
268
269
270
271
272 if not testdataset is None:
273 wtestdataset = wtestdataset.selectFeatures(selected_ids)
274
275 step += 1
276
277
278 selected_ids.sort()
279 if self.states.isEnabled("history") or self.states.isEnabled('selected_ids'):
280 orig_feature_ids = orig_feature_ids[selected_ids]
281
282
283 if hasattr(self.__transfer_error, "clf"):
284 self.__transfer_error.clf.untrain()
285
286 self.errors = errors
287 self.selected_ids = result_selected_ids
288
289
290 return results
291