Source Code for Module mvpa.clfs.warehouse

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Collection of classifiers to ease the exploration.
"""

__docformat__ = 'restructuredtext'

from sets import Set
import operator

# Data
from mvpa.datasets.splitter import OddEvenSplitter

# Define sets of classifiers
from mvpa.clfs.base import FeatureSelectionClassifier, SplitClassifier, \
     MulticlassClassifier
from mvpa.clfs.smlr import SMLR
from mvpa.clfs.knn import kNN
from mvpa.clfs.kernel import KernelLinear, KernelSquaredExponential

# Helpers
from mvpa.clfs.transerror import TransferError
from mvpa.base import externals, cfg
from mvpa.measures.anova import OneWayAnova
from mvpa.misc.transformers import Absolute
from mvpa.featsel.rfe import RFE
from mvpa.clfs.smlr import SMLRWeights
from mvpa.featsel.helpers import FractionTailSelector, \
     FixedNElementTailSelector, RangeElementSelector, \
     FixedErrorThresholdStopCrit
from mvpa.clfs.transerror import ConfusionBasedError
from mvpa.featsel.base import SensitivityBasedFeatureSelection

_KNOWN_INTERNALS = ['knn', 'binary', 'svm', 'linear',
        'smlr', 'does_feature_selection', 'has_sensitivity',
        'multiclass', 'non-linear', 'kernel-based', 'lars',
        'regression', 'libsvm', 'sg', 'meta', 'retrainable', 'gpr',
        'notrain2predict', 'ridge', 'blr', 'gnpp']

class Warehouse(object):
    """Class to keep known instantiated classifiers

    Should provide easy ways to select classifiers of a needed kind:
    clfs['linear', 'svm'] should return all linear SVMs;
    clfs['linear', 'multiclass'] should return all linear classifiers
    capable of doing multiclass classification.
    """
    def __init__(self, known_tags=None, matches={}):
        """Initialize warehouse

        :Parameters:
          known_tags : list of basestring
            List of known tags
          matches : dict
            Optional dictionary of additional matches. E.g. since any
            regression can be used as a binary classifier,
            matches={'binary': ['regression']} would also return
            regressions whenever 'binary' is requested
        """
        self._known_tags = Set(known_tags)
        self.__items = []
        self.__keys = Set()
        self.__matches = matches

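    # A minimal sketch of the `matches` mechanism documented above
    # (illustration only, not an instance registered below): given
    #
    #     w = Warehouse(known_tags=['binary', 'regression'],
    #                   matches={'binary': ['regression']})
    #
    # a query w['binary'] would also return items tagged only
    # 'regression', since __getitem__ extends each requested tag with
    # its matches before testing membership.
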
    def __getitem__(self, *args):
        if isinstance(args[0], tuple):
            args = args[0]

        # so we explicitly handle [:]
        if args == (slice(None),):
            args = []

        # let's remove optional modifier '!'
        dargs = Set([x.lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(dargs) > 0:
            raise ValueError("Unknown internals %s requested. Known are %s" %
                             (list(dargs), list(self._known_tags)))

        # dummy implementation for now
        result = []
        # check every known item
        for item in self.__items:
            good = True
            # by default each one counts
            for arg in args:
                # check for rejection first
                if arg.startswith('!'):
                    if (arg[1:] in item._clf_internals):
                        good = False
                        break
                    else:
                        continue
                # check for inclusion, also considering optional matches
                found = False
                for arg in [arg] + self.__matches.get(arg, []):
                    if (arg in item._clf_internals):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        if operator.isSequenceType(item):
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '_clf_internals'):
                raise ValueError("Cannot register %s " % item +
                                 "which has no _clf_internals defined")
            if len(item._clf_internals) == 0:
                raise ValueError("Cannot register %s " % item +
                                 "which has empty _clf_internals")
            clf_internals = Set(item._clf_internals)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError('Unknown clf internal(s) %s' %
                                 clf_internals.difference(self._known_tags))
        return self

    @property
    def internals(self):
        return self.__keys

    def listing(self):
        return [(x.descr, x._clf_internals) for x in self.__items]

    @property
    def items(self):
        return self.__items
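
# Selection semantics in brief -- a sketch for illustration only;
# 'DummyClf' is a hypothetical stand-in, not a PyMVPA classifier:
#
#     class DummyClf(object):
#         _clf_internals = ['linear', 'svm']
#         descr = 'dummy linear SVM'
#
#     w = Warehouse(known_tags=['linear', 'svm', 'non-linear'])
#     w += DummyClf()
#     w['linear', 'svm']   # all items tagged both 'linear' and 'svm'
#     w['!non-linear']     # '!' rejects items carrying that tag
#     w[:]                 # all registered items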

clfs = Warehouse(known_tags=_KNOWN_INTERNALS) # classifiers
regrs = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions

# NB:
#  - Nu-classifiers are turned off since for the Haxby dataset the
#    default nu is an infeasible one
#  - Python's SMLR is turned off for the duration of development
#    since it is slow and its results should be the same as the C version's
#
clfs += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
          SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
          #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"),
          #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"),
          #SMLR(implementation="Python", descr="SMLR(Python)")
          ]

clfs += \
    [ MulticlassClassifier(clfs['smlr'][0],
                           descr='Pairs+maxvote multiclass on ' + \
                           clfs['smlr'][0].descr) ]

if externals.exists('libsvm'):
    from mvpa.clfs import libsvmc as libsvm
    clfs._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    clfs += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1),
             libsvm.SVM(
                 C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1),
             libsvm.SVM(
                 C=1.0, descr="libsvm.LinSVM(C=1)", probability=1),
             libsvm.SVM(svm_impl='NU_SVC',
                        descr="libsvm.LinNuSVM(nu=def)", probability=1)
             ]
    clfs += [libsvm.SVM(kernel_type='RBF', descr="libsvm.RbfSVM()"),
             libsvm.SVM(kernel_type='RBF', svm_impl='NU_SVC',
                        descr="libsvm.RbfNuSVM(nu=def)"),
             libsvm.SVM(kernel_type='poly',
                        descr='libsvm.PolySVM()', probability=1),
             #libsvm.svm.SVM(kernel_type='sigmoid',
             #               svm_impl='C_SVC',
             #               descr='libsvm.SigmoidSVM()'),
             ]

    # regressions
    regrs._known_tags.union_update(['EPSILON_SVR', 'NU_SVR'])
    regrs += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR',
                         regression=True),
              libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR',
                         regression=True)]

if externals.exists('shogun'):
    from mvpa.clfs import sg
    clfs._known_tags.union_update(sg.SVM._KNOWN_IMPLEMENTATIONS)

    # Some classifiers are not yet ready to be used out-of-the-box in
    # PyMVPA, thus we don't populate the warehouse with their instances
    bad_classifiers = [
        'mpd',      # was segfaulting; now fails to train on testcases and
                    # XOR, and was described as "for educational purposes",
                    # thus shouldn't be used for real data ;-)
                    # Should be a drop-in replacement for lightsvm
        'gpbt',     # fails to train for testAnalyzerWithSplitClassifier;
                    # also 'retraining' doesn't work -- fails to generalize
        'gmnp',     # would fail with 'assertion Cache_Size > 2' if
                    # shogun < 0.6.3; also refuses to train
        'svrlight', # fails to 'generalize' as a binary classifier
                    # after 'binning'
        'krr',      # fails to generalize
        ]
    if not externals.exists('sg_fixedcachesize'):
        # would fail with 'assertion Cache_Size > 2' if shogun < 0.6.3
        bad_classifiers.append('gnpp')

    for impl in sg.SVM._KNOWN_IMPLEMENTATIONS:
        # skip the implementations known to be broken (listed above)
        if impl in bad_classifiers:
            continue
        clfs += [
            sg.SVM(
                descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl),
            ]
        clfs += [
            sg.SVM(kernel_type='RBF', descr="sg.RbfSVM()/%s" % impl,
                   svm_impl=impl),
            #sg.SVM(kernel_type='RBF', descr="sg.RbfSVM(gamma=0.1)/%s" % impl,
            #       svm_impl=impl, gamma=0.1),
            #sg.SVM(descr="sg.SigmoidSVM()/%s" % impl, svm_impl=impl,
            #       kernel_type="sigmoid"),
            ]

    # XXX svrlight sucks in SG -- don't have time to figure it out, thus
    # the following alternative stays commented out:
    #for impl in ['libsvr', 'krr'] \
    #        + ([], ['svrlight'])['svrlight' in sg.SVM._KNOWN_IMPLEMENTATIONS]:
    for impl in ['libsvr', 'krr']:
        regrs._known_tags.union_update([impl])
        regrs += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl,
                          regression=True),
                   #sg.SVM(svm_impl=impl, kernel_type='RBF',
                   #       descr='sg.RBFSVMR()/%s' % impl,
                   #       regression=True),
                   ]

if len(clfs['svm', 'linear']) > 0:
    # if any SVM implementation is known, import default ones
    from mvpa.clfs.svm import *

# LARS from R via RPy
if externals.exists('lars'):
    import mvpa.clfs.lars as lars
    from mvpa.clfs.lars import LARS
    for model in lars.known_models:
        # XXX create proper repository of classifiers!
        lars = LARS(descr="LARS(%s)" % model, model_type=model)
        clfs += lars
        # clfs += MulticlassClassifier(lars,
        #                              descr='Multiclass %s' % lars.descr)

# kNN
clfs += kNN(k=5, descr="kNN(k=5)")

clfs += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            SMLRWeights(SMLR(lm=1.0, implementation="C")),
            RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfs += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfs += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")


# GPR
if externals.exists('scipy'):
    from mvpa.clfs.gpr import GPR

    clfs += GPR(kernel=KernelLinear(), descr="GPR(kernel='linear')")
    clfs += GPR(kernel=KernelSquaredExponential(), descr="GPR(kernel='sqexp')")

# BLR
from mvpa.clfs.blr import BLR
clfs += BLR(descr="BLR()")


# SVM stuff

if len(clfs['linear', 'svm']) > 0:

    linearSVMC = clfs['linear', 'svm',
                      cfg.get('svm', 'backend', default='libsvm').lower()
                      ][0]

    # "Interesting" classifiers
    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=0.1, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=0.1) non-0")


    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1.0, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=1) non-0")


    # "Interesting" classifiers
    clfs += \
        FeatureSelectionClassifier(
            RbfCSVMC(),
            SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1.0, implementation="C")),
                RangeElementSelector(mode='select')),
            descr="RbfSVM on SMLR(lm=1) non-0")

    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(ANOVA)")

    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(ANOVA)")

    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
                FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(SVM)")

    clfs += \
        FeatureSelectionClassifier(
            linearSVMC,
            SensitivityBasedFeatureSelection(
                linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
                FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")


    # SVM with unbiased RFE -- transfer error to other splits, or in
    # other terms leave-one-out error on the same dataset.
    # Has to be bound outside of the RFE definition since both the analyzer
    # and the error should use the same instance.
    rfesvm_split = SplitClassifier(linearSVMC) #clfs['LinearSVMC'][0])

    # "Almost" classical RFE.  If this works, it would differ only in that
    # our transfer_error is based on internal splitting, the classifier used
    # within RFE is a split classifier, and its sensitivities get averaged
    # per split
    #

    #clfs += \
    #    FeatureSelectionClassifier(
    #        clf = LinearCSVMC(), #clfs['LinearSVMC'][0], # we train LinearSVM
    #        feature_selection = RFE(            # on features selected via RFE
    #            # based on sensitivity of a clf doing splitting internally
    #            sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #            transfer_error=ConfusionBasedError(
    #                rfesvm_split,
    #                confusion_state="confusion"),
    #                # and whose internal error we use
    #            feature_selector=FractionTailSelector(
    #                0.2, mode='discard', tail='lower'),
    #                # remove 20% of features at each step
    #            update_sensitivity=True),
    #            # update sensitivity at each step
    #        descr='LinSVM+RFE(splits_avg)' )
    #
    #clfs += \
    #    FeatureSelectionClassifier(
    #        clf = LinearCSVMC(), #clfs['LinearSVMC'][0], # we train LinearSVM
    #        feature_selection = RFE(            # on features selected via RFE
    #            # based on sensitivity of a clf doing splitting internally
    #            sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #            transfer_error=ConfusionBasedError(
    #                rfesvm_split,
    #                confusion_state="confusion"),
    #                # and whose internal error we use
    #            feature_selector=FractionTailSelector(
    #                0.2, mode='discard', tail='lower'),
    #                # remove 20% of features at each step
    #            update_sensitivity=False),
    #            # do not update sensitivity at each step
    #        descr='LinSVM+RFE(splits_avg,static)' )

    rfesvm = LinearCSVMC()

    # This classifier will do RFE while taking the transfer error to the
    # testing set of that split.  The resulting classifier votes over all
    # splits; let's see what that would do ;-)
    #clfs += \
    #    SplitClassifier(                     # which does splitting internally
    #        FeatureSelectionClassifier(
    #            clf = LinearCSVMC(),
    #            feature_selection = RFE(     # on features selected via RFE
    #                sensitivity_analyzer=\
    #                    rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #                transfer_error=TransferError(rfesvm),
    #                stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #                feature_selector=FractionTailSelector(
    #                    0.2, mode='discard', tail='lower'),
    #                    # remove 20% of features at each step
    #                update_sensitivity=True)),
    #                # update sensitivity at each step
    #        descr='LinSVM+RFE(N-Fold)')
    #
    #
    #clfs += \
    #    SplitClassifier(                     # which does splitting internally
    #        FeatureSelectionClassifier(
    #            clf = LinearCSVMC(),
    #            feature_selection = RFE(     # on features selected via RFE
    #                sensitivity_analyzer=\
    #                    rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #                transfer_error=TransferError(rfesvm),
    #                stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #                feature_selector=FractionTailSelector(
    #                    0.2, mode='discard', tail='lower'),
    #                    # remove 20% of features at each step
    #                update_sensitivity=True)),
    #                # update sensitivity at each step
    #        splitter = OddEvenSplitter(),
    #        descr='LinSVM+RFE(OddEven)')
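
# A minimal consumption sketch (illustration only; the loop below reflects
# an assumed typical use of the populated warehouses, e.g. by the unittests,
# and is not code from this module):
#
#     from mvpa.clfs.warehouse import clfs, regrs
#     for clf in clfs['linear', 'svm', '!meta']:
#         print clf.descr              # human-readable description
#     print list(clfs.internals)       # all tags registered so far
#     for descr, tags in clfs.listing():
#         print descr, tags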