Package mvpa :: Package featsel :: Module rfe
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.featsel.rfe

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Recursive feature elimination.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  from mvpa.clfs.transerror import ClassifierError 
 14  from mvpa.measures.base import Sensitivity 
 15  from mvpa.featsel.base import FeatureSelection 
 16  from mvpa.featsel.helpers import BestDetector, \ 
 17                                   NBackHistoryStopCrit, \ 
 18                                   FractionTailSelector 
 19  from numpy import arange 
 20  from mvpa.misc.state import StateVariable 
 21   
 22  if __debug__: 
 23      from mvpa.misc import debug 
 24   
 25  # TODO: Abs value of sensitivity should be able to rule RFE 
  26  # Often it is the abs value of the sensitivity that matters. 
 27  # So we should either provide a simple decorator around arbitrary 
 28  # FeatureSelector to convert sensitivities to abs values before calling 
 29  # actual selector, or a decorator around SensitivityEstimators 
 30   
 31   
class RFE(FeatureSelection):
    """Recursive feature elimination.

    A `FeaturewiseDatasetMeasure` is used to compute sensitivity maps given a
    certain dataset. These sensitivity maps are in turn used to discard
    unimportant features. For each feature selection the transfer error on some
    testdataset is computed. This procedure is repeated until a given
    `StoppingCriterion` is reached.
    """

    # TODO: remove
    # doesn't work nicely -- if FeatureSelection defines its states via
    # _register_states, they would simply be ignored
    #_register_states = {'errors':True,
    #                    'nfeatures':True,
    #                    'history':True}

    errors = StateVariable()
    nfeatures = StateVariable()
    history = StateVariable()
    sensitivities = StateVariable(enabled=False)

    def __init__(self,
                 sensitivity_analyzer,
                 transfer_error,
                 feature_selector=None,
                 bestdetector=None,
                 stopping_criterion=None,
                 train_clf=None,
                 update_sensitivity=True,
                 **kargs
                 ):
        # XXX Allow for multiple stopping criterions, e.g. error not decreasing
        # anymore OR number of features less than threshold
        """Initialize recursive feature elimination

        :Parameters:
            sensitivity_analyzer : FeaturewiseDatasetMeasure object
            transfer_error : TransferError object
                used to compute the transfer error of a classifier based on a
                certain feature set on the test dataset.
                NOTE: If sensitivity analyzer is based on the same
                classifier as transfer_error is using, make sure you
                initialize transfer_error with train=False, otherwise
                it would train classifier twice without any necessity.
            feature_selector : Functor
                Given a sensitivity map it has to return the ids of those
                features that should be kept. Default `None` constructs a
                fresh `FractionTailSelector(0.05)` per instance (avoids
                sharing one stateful functor between all RFE objects).
            bestdetector : Functor
                Given a list of error values it has to return a boolean that
                signals whether the latest error value is the total minimum.
                Default `None` constructs a fresh `BestDetector()`.
            stopping_criterion : Functor
                Given a list of error values it has to return whether the
                criterion is fulfilled. Default `None` constructs a fresh
                `NBackHistoryStopCrit(BestDetector())`.
            train_clf : bool
                Flag whether the classifier in `transfer_error` should be
                trained before computing the error. In general this is
                required, but if the `sensitivity_analyzer` and
                `transfer_error` share and make use of the same classifier it
                can be switched off to save CPU cycles. Default `None` checks
                if sensitivity_analyzer is based on a classifier and doesn't
                train if so.
            update_sensitivity : bool
                If False the sensitivity map is only computed once and reused
                for each iteration. Otherwise the sensitivities are
                recomputed at each selection step.
        """

        # base init first
        FeatureSelection.__init__(self, **kargs)

        # Construct default functors per instance instead of relying on
        # mutable default arguments: defaults in the signature would be
        # evaluated once and shared (with their internal state) across
        # every RFE instance.
        if feature_selector is None:
            feature_selector = FractionTailSelector(0.05)
        if bestdetector is None:
            bestdetector = BestDetector()
        if stopping_criterion is None:
            stopping_criterion = NBackHistoryStopCrit(BestDetector())

        self.__sensitivity_analyzer = sensitivity_analyzer
        """Sensitivity analyzer used to call at each step."""

        self.__transfer_error = transfer_error
        """Compute transfer error for each feature set."""

        self.__feature_selector = feature_selector
        """Functor which takes care about removing some features."""

        self.__stopping_criterion = stopping_criterion

        self.__bestdetector = bestdetector

        if train_clf is None:
            # auto-detect: if the sensitivity analyzer is classifier-based it
            # already trains the (shared) classifier, so skip extra training
            self.__train_clf = isinstance(sensitivity_analyzer,
                                          Sensitivity)
        else:
            self.__train_clf = train_clf
            """Flag whether training classifier is required."""

        self.__update_sensitivity = update_sensitivity
        """Flag whether sensitivity map is recomputed for each step."""

        # force clf training when sensitivities are not updated as otherwise
        # shared classifiers are not retrained
        if not self.__update_sensitivity \
               and isinstance(self.__transfer_error, ClassifierError) \
               and not self.__train_clf:
            if __debug__:
                debug("RFEC", "Forcing training of classifier since " +
                      "sensitivities aren't updated at each step")
            self.__train_clf = True


    def __call__(self, dataset, testdataset):
        """Proceed and select the features recursively eliminating less
        important ones.

        :Parameters:
            dataset : Dataset
                used to compute sensitivity maps and train a classifier
                to determine the transfer error
            testdataset : Dataset
                used to test the trained classifier to determine the
                transfer error

        Returns a tuple of two new datasets with the feature subset of
        `dataset` that had the lowest transfer error of all tested
        sets until the stopping criterion was reached. The first
        dataset is the feature subset of the training data and the
        second the selection of the test dataset.
        """
        errors = []
        """Computed error for each tested features set."""

        self.nfeatures = []
        """Number of features at each step. Since it is not used by the
        algorithm it is stored directly in the state variable"""

        self.history = arange(dataset.nfeatures)
        """Store the last step # when the feature was still present
        """

        self.sensitivities = []

        stop = False
        """Flag when RFE should be stopped."""

        results = None
        """Will hold the best feature set ever."""

        wdataset = dataset
        """Operate on working dataset initially identical."""

        wtestdataset = testdataset
        """Same feature selection has to be performed on test dataset as
        well. This will hold the current testdataset."""

        step = 0
        """Counter how many selection steps were done."""

        orig_feature_ids = arange(dataset.nfeatures)
        """List of feature Ids as per original dataset remaining at any given
        step"""

        sensitivity = None
        """Contains the latest sensitivity map."""

        result_selected_ids = orig_feature_ids
        """Resultant ids of selected features. Since the best is not
        necessarily the last -- we better keep this one around. By
        default -- all features are there"""
        selected_ids = result_selected_ids

        while wdataset.nfeatures > 0:

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d" % (step, wdataset.nfeatures))

            # mark the features which are present at this step
            # if it brings any mentionable computational burden in the future,
            # only mark on removed features at each step
            self.history[orig_feature_ids] = step

            # Compute sensitivity map.
            # BUGFIX: must be 'is None' -- '== None' on a numpy array
            # performs an elementwise comparison whose truth value is
            # ambiguous and would raise once a map has been computed.
            if self.__update_sensitivity or sensitivity is None:
                sensitivity = self.__sensitivity_analyzer(wdataset)

            if self.states.isEnabled("sensitivities"):
                self.sensitivities.append(sensitivity)

            # do not retrain clf if not necessary
            if self.__train_clf:
                error = self.__transfer_error(wtestdataset, wdataset)
            else:
                error = self.__transfer_error(wtestdataset, None)

            # Record the error
            errors.append(error)

            # Check if it is time to stop and if we got
            # the best result
            stop = self.__stopping_criterion(errors)
            isthebest = self.__bestdetector(errors)

            nfeatures = wdataset.nfeatures

            if self.states.isEnabled("nfeatures"):
                self.nfeatures.append(wdataset.nfeatures)

            # store result
            if isthebest:
                results = (wdataset, wtestdataset)
                result_selected_ids = orig_feature_ids

            # stop if it is time to finish
            if nfeatures == 1 or stop:
                break

            # Select features to preserve
            selected_ids = self.__feature_selector(sensitivity)

            if __debug__:
                debug('RFEC',
                      "Step %d: nfeatures=%d error=%.4f best/stop=%d/%d "
                      "nfeatures_selected=%d" %
                      (step, nfeatures, error, isthebest, stop,
                       len(selected_ids)))
                debug('RFEC_',
                      "Sensitivity: %s, selected_ids: %s" %
                      (sensitivity, selected_ids))


            # Create a dataset only with selected features
            wdataset = wdataset.selectFeatures(selected_ids)

            # select corresponding sensitivity values if they are not
            # recomputed
            if not self.__update_sensitivity:
                sensitivity = sensitivity[selected_ids]

            # need to update the test dataset as well
            # XXX why should it ever become None?
            # yoh: because we can have __transfer_error computed
            #      using wdataset. See xia-generalization estimate
            #      in lightsvm. Or for god's sake leave-one-out
            #      on a wdataset
            # TODO: document these cases in this class
            if testdataset is not None:
                wtestdataset = wtestdataset.selectFeatures(selected_ids)

            step += 1

            # WARNING: THIS MUST BE THE LAST THING TO DO ON selected_ids
            selected_ids.sort()
            if self.states.isEnabled("history") \
                   or self.states.isEnabled('selected_ids'):
                orig_feature_ids = orig_feature_ids[selected_ids]


        if hasattr(self.__transfer_error, "clf"):
            self.__transfer_error.clf.untrain()
        # charge state variables
        self.errors = errors
        self.selected_ids = result_selected_ids

        # best dataset ever is returned
        return results
291