Package mvpa :: Package clfs :: Module warehouse
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.clfs.warehouse

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Collection of classifiers to ease the exploration. 
 10  """ 
 11   
 12  __docformat__ = 'restructuredtext' 
 13   
 14  from sets import Set 
 15  import operator 
 16   
 17  # Define sets of classifiers 
 18  from mvpa.clfs.meta import FeatureSelectionClassifier, SplitClassifier, \ 
 19       MulticlassClassifier 
 20  from mvpa.clfs.smlr import SMLR 
 21  from mvpa.clfs.knn import kNN 
 22  from mvpa.clfs.gnb import GNB 
 23  from mvpa.clfs.kernel import KernelLinear, KernelSquaredExponential 
 24   
 25  # Helpers 
 26  from mvpa.base import externals, cfg 
 27  from mvpa.measures.anova import OneWayAnova 
 28  from mvpa.misc.transformers import Absolute 
 29  from mvpa.clfs.smlr import SMLRWeights 
 30  from mvpa.featsel.helpers import FractionTailSelector, \ 
 31      FixedNElementTailSelector, RangeElementSelector 
 32   
 33  from mvpa.featsel.base import SensitivityBasedFeatureSelection 
 34   
# Baseline set of known classifier 'internals' tags used to seed both
# warehouses below.  Backends extend this set at import time (e.g. libsvm
# and shogun add their implementation names via
# clfswh._known_tags.union_update(...)).
_KNOWN_INTERNALS = [ 'knn', 'binary', 'svm', 'linear',
        'smlr', 'does_feature_selection', 'has_sensitivity',
        'multiclass', 'non-linear', 'kernel-based', 'lars',
        'regression', 'libsvm', 'sg', 'meta', 'retrainable', 'gpr',
        'notrain2predict', 'ridge', 'blr', 'gnpp', 'enet', 'glmnet',
        'gnb']
class Warehouse(object):
    """Class to keep known instantiated classifiers.

    Should provide easy ways to select classifiers of needed kind:
    clfswh['linear', 'svm'] should return all linear SVMs
    clfswh['linear', 'multiclass'] should return all linear classifiers
    capable of doing multiclass classification.

    A tag prefixed with '!' rejects items carrying that tag, e.g.
    clfswh['svm', '!linear'] selects the non-linear SVMs; ``[:]``
    returns every registered item.
    """
    # NOTE(review): sets.Set and operator.isSequenceType are deprecated
    # (removed in Python 3).  _known_tags is mutated by module-level code
    # via Set.union_update(), which builtin set lacks, so switching to
    # builtin set here would silently break those callers -- left as is.

    def __init__(self, known_tags=None, matches=None):
        """Initialize warehouse

        :Parameters:
          known_tags : list of basestring
            List of known tags
          matches : dict
            Optional dictionary of additional matches. E.g. since any
            regression can be used as a binary classifier,
            matches={'binary':['regression']}, would allow to provide
            regressions also if 'binary' was requested
        """
        self._known_tags = Set(known_tags)
        self.__items = []       # registered classifier instances
        self.__keys = Set()     # union of tags of all registered items
        if matches is None:
            matches = {}
        self.__matches = matches

    def __getitem__(self, *args):
        """Return all registered items matching every requested tag.

        Accepts both ``wh['a', 'b']`` (single tuple key) and direct
        multi-argument calls.  Unknown tags raise ValueError.
        """
        if isinstance(args[0], tuple):
            args = args[0]

        # so we explicitely handle [:] -- i.e. no constraints at all
        if args == (slice(None),):
            args = []

        # strip the optional rejection modifier '!' before validating tags
        unknown = Set([str(x).lstrip('!') for x in args]).difference(
            self._known_tags)

        if len(unknown) > 0:
            raise ValueError(
                "Unknown internals %s requested. Known are %s" %
                (list(unknown), list(self._known_tags)))

        result = []
        # check every known item against all requested tags
        for item in self.__items:
            good = True
            for arg in args:
                # rejection first: '!tag' means the item must NOT carry it
                if arg.startswith('!'):
                    if (arg[1:] in item._clf_internals):
                        good = False
                        break
                    else:
                        continue
                # inclusion: the tag itself, or any of its configured
                # matches, must be present.  A distinct loop variable is
                # used so the outer 'arg' is not clobbered.
                found = False
                for candidate in [arg] + self.__matches.get(arg, []):
                    if (candidate in item._clf_internals):
                        found = True
                        break
                good = found
                if not good:
                    break
            if good:
                result.append(item)
        return result

    def __iadd__(self, item):
        """Register a single item or (recursively) a sequence of items.

        Every item must expose a non-empty ``_clf_internals`` list whose
        tags are all known to this warehouse; otherwise ValueError.
        """
        if operator.isSequenceType(item):
            for item_ in item:
                self.__iadd__(item_)
        else:
            if not hasattr(item, '_clf_internals'):
                raise ValueError("Cannot register %s " % item +
                                 "which has no _clf_internals defined")
            if len(item._clf_internals) == 0:
                raise ValueError("Cannot register %s " % item +
                                 "which has empty _clf_internals")
            clf_internals = Set(item._clf_internals)
            if clf_internals.issubset(self._known_tags):
                self.__items.append(item)
                self.__keys |= clf_internals
            else:
                raise ValueError('Unknown clf internal(s) %s' %
                                 clf_internals.difference(self._known_tags))
        return self

    @property
    def internals(self):
        """Known internal tags of the classifiers
        """
        return self.__keys

    def listing(self):
        """Listing (description + internals) of registered items
        """
        return [(x.descr, x._clf_internals) for x in self.__items]

    @property
    def items(self):
        """Registered items
        """
        return self.__items
# Module-level warehouses: importing this module populates them with every
# classifier / regression the installed externals support.
clfswh = Warehouse(known_tags=_KNOWN_INTERNALS) # classifiers
regrswh = Warehouse(known_tags=_KNOWN_INTERNALS) # regressions

# NB:
#  - Nu-classifiers are turned off since for haxby DS default nu
#    is an 'infeasible' one
#  - Python's SMLR is turned off for the duration of development
#    since it is slow and results should be the same as of C version
#
clfswh += [ SMLR(lm=0.1, implementation="C", descr="SMLR(lm=0.1)"),
            SMLR(lm=1.0, implementation="C", descr="SMLR(lm=1.0)"),
            #SMLR(lm=10.0, implementation="C", descr="SMLR(lm=10.0)"),
            #SMLR(lm=100.0, implementation="C", descr="SMLR(lm=100.0)"),
            #SMLR(implementation="Python", descr="SMLR(Python)")
            ]

clfswh += \
     [ MulticlassClassifier(clfswh['smlr'][0],
                            descr='Pairs+maxvote multiclass on ' + \
                            clfswh['smlr'][0].descr) ]

if externals.exists('libsvm'):
    from mvpa.clfs import libsvmc as libsvm
    # libsvm brings its own implementation tags; register them as known
    clfswh._known_tags.union_update(libsvm.SVM._KNOWN_IMPLEMENTATIONS.keys())
    # NOTE(review): negative C presumably means 'scale the default C by
    # |C|' (the descr says 10*def) -- confirm in the SVM backend.
    clfswh += [libsvm.SVM(descr="libsvm.LinSVM(C=def)", probability=1),
               libsvm.SVM(
                   C=-10.0, descr="libsvm.LinSVM(C=10*def)", probability=1),
               libsvm.SVM(
                   C=1.0, descr="libsvm.LinSVM(C=1)", probability=1),
               libsvm.SVM(svm_impl='NU_SVC',
                          descr="libsvm.LinNuSVM(nu=def)", probability=1)
               ]
    clfswh += [libsvm.SVM(kernel_type='RBF', descr="libsvm.RbfSVM()"),
               libsvm.SVM(kernel_type='RBF', svm_impl='NU_SVC',
                          descr="libsvm.RbfNuSVM(nu=def)"),
               libsvm.SVM(kernel_type='poly',
                          descr='libsvm.PolySVM()', probability=1),
               #libsvm.svm.SVM(kernel_type='sigmoid',
               #               svm_impl='C_SVC',
               #               descr='libsvm.SigmoidSVM()'),
               ]

    # regressions
    regrswh._known_tags.union_update(['EPSILON_SVR', 'NU_SVR'])
    regrswh += [libsvm.SVM(svm_impl='EPSILON_SVR', descr='libsvm epsilon-SVR',
                           regression=True),
                libsvm.SVM(svm_impl='NU_SVR', descr='libsvm nu-SVR',
                           regression=True)]

if externals.exists('shogun'):
    from mvpa.clfs import sg
    clfswh._known_tags.union_update(sg.SVM._KNOWN_IMPLEMENTATIONS)

    # some classifiers are not yet ready to be used out-of-the-box in
    # PyMVPA, thus we don't populate warehouse with their instances
    bad_classifiers = [
        'mpd',  # was segfault, now non-training on testcases, and XOR.
                # and was described as "for educational purposes", thus
                # shouldn't be used for real data ;-)
                # Should be a drop-in replacement for lightsvm
        'gpbt', # fails to train for testAnalyzerWithSplitClassifier
                # also 'retraining' doesn't work -- fails to generalize
        'gmnp', # would fail with 'assertion Cache_Size > 2'
                # if shogun < 0.6.3, also refuses to train
        'svrlight', # fails to 'generalize' as a binary classifier
                    # after 'binning'
        'krr', # fails to generalize
        ]
    if not externals.exists('sg_fixedcachesize'):
        # would fail with 'assertion Cache_Size > 2' if shogun < 0.6.3
        bad_classifiers.append('gnpp')

    for impl in sg.SVM._KNOWN_IMPLEMENTATIONS:
        # Uncomment the ones to disable
        if impl in bad_classifiers:
            continue
        clfswh += [
            sg.SVM(
                descr="sg.LinSVM(C=def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=-10.0, descr="sg.LinSVM(C=10*def)/%s" % impl, svm_impl=impl),
            sg.SVM(
                C=1.0, descr="sg.LinSVM(C=1)/%s" % impl, svm_impl=impl),
            ]
        clfswh += [
            sg.SVM(kernel_type='RBF',
                   descr="sg.RbfSVM()/%s" % impl, svm_impl=impl),
#            sg.SVM(kernel_type='RBF',
#                   descr="sg.RbfSVM(gamma=0.1)/%s"
#                   % impl, svm_impl=impl, gamma=0.1),
#            sg.SVM(descr="sg.SigmoidSVM()/%s"
#                   % impl, svm_impl=impl, kernel_type="sigmoid"),
            ]

    _optional_regressions = []
    if externals.exists('shogun.krr'):
        _optional_regressions += ['krr']
    for impl in ['libsvr'] + _optional_regressions:# \
        # XXX svrlight sucks in SG -- don't have time to figure it out
        #+ ([], ['svrlight'])['svrlight' in sg.SVM._KNOWN_IMPLEMENTATIONS]:
        regrswh._known_tags.union_update([impl])
        regrswh += [ sg.SVM(svm_impl=impl, descr='sg.LinSVMR()/%s' % impl,
                            regression=True),
                     #sg.SVM(svm_impl=impl, kernel_type='RBF',
                     #       descr='sg.RBFSVMR()/%s' % impl,
                     #       regression=True),
                     ]

if len(clfswh['svm', 'linear']) > 0:
    # if any SVM implementation is known, import default ones
    # (this star import is what provides LinearCSVMC / RbfCSVMC used below)
    from mvpa.clfs.svm import *

# lars from R via RPy
if externals.exists('lars'):
    import mvpa.clfs.lars as lars
    from mvpa.clfs.lars import LARS
    for model in lars.known_models:
        # XXX create proper repository of classifiers!
        lars_clf = LARS(descr="LARS(%s)" % model, model_type=model)
        clfswh += lars_clf

        # is a regression, too
        lars_regr = LARS(descr="_LARS(%s, regression=True)" % model,
                         regression=True, model_type=model)
        regrswh += lars_regr
        # clfswh += MulticlassClassifier(lars,
        #             descr='Multiclass %s' % lars.descr)

## PBS: enet has some weird issue that causes it to fail.  GLMNET is
## better anyway, so just use that instead
## # enet from R via RPy
## if externals.exists('elasticnet'):
##     from mvpa.clfs.enet import ENET
##     clfswh += ENET(descr="ENET()")
##     regrswh += ENET(descr="ENET(regression=True)", regression=True)

# glmnet from R via RPy
if externals.exists('glmnet'):
    from mvpa.clfs.glmnet import GLMNET_C, GLMNET_R
    clfswh += GLMNET_C(descr="GLMNET_C()")
    regrswh += GLMNET_R(descr="GLMNET_R()")

# kNN
clfswh += kNN(k=5, descr="kNN(k=5)")
clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           SMLRWeights(SMLR(lm=1.0, implementation="C")),
           RangeElementSelector(mode='select')),
        descr="kNN on SMLR(lm=1) non-0")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="kNN on 5%(ANOVA)")

clfswh += \
    FeatureSelectionClassifier(
        kNN(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FixedNElementTailSelector(50, mode='select', tail='upper')),
        descr="kNN on 50(ANOVA)")


# GNB
clfswh += GNB(descr="GNB()")
clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)")
clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')")
clfswh += \
    FeatureSelectionClassifier(
        GNB(),
        SensitivityBasedFeatureSelection(
           OneWayAnova(),
           FractionTailSelector(0.05, mode='select', tail='upper')),
        descr="GNB on 5%(ANOVA)")


# GPR
if externals.exists('scipy'):
    from mvpa.clfs.gpr import GPR

    clfswh += GPR(kernel=KernelLinear(), descr="GPR(kernel='linear')")
    clfswh += GPR(kernel=KernelSquaredExponential(),
                  descr="GPR(kernel='sqexp')")

    # BLR (imported here since, like GPR, it is guarded by the scipy check)
    from mvpa.clfs.blr import BLR
    clfswh += BLR(descr="BLR()")


# SVM stuff

if len(clfswh['linear', 'svm']) > 0:

    # default linear SVM backend is configurable; falls back to libsvm
    linearSVMC = clfswh['linear', 'svm',
                         cfg.get('svm', 'backend', default='libsvm').lower()
                         ][0]

    # "Interesting" classifiers
    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=0.1, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=0.1) non-0")


    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="LinSVM on SMLR(lm=1) non-0")


    # "Interesting" classifiers
    clfswh += \
        FeatureSelectionClassifier(
            RbfCSVMC(),
            SensitivityBasedFeatureSelection(
               SMLRWeights(SMLR(lm=1.0, implementation="C")),
               RangeElementSelector(mode='select')),
            descr="RbfSVM on SMLR(lm=1) non-0")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               OneWayAnova(),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(ANOVA)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
               FractionTailSelector(0.05, mode='select', tail='upper')),
            descr="LinSVM on 5%(SVM)")

    clfswh += \
        FeatureSelectionClassifier(
            linearSVMC.clone(),
            SensitivityBasedFeatureSelection(
               linearSVMC.getSensitivityAnalyzer(transformer=Absolute),
               FixedNElementTailSelector(50, mode='select', tail='upper')),
            descr="LinSVM on 50(SVM)")


    ### Imports which are specific to RFEs
    # from mvpa.datasets.splitters import OddEvenSplitter
    # from mvpa.clfs.transerror import TransferError
    # from mvpa.featsel.rfe import RFE
    # from mvpa.featsel.helpers import FixedErrorThresholdStopCrit
    # from mvpa.clfs.transerror import ConfusionBasedError

    # SVM with unbiased RFE -- transfer-error to another splits, or in
    # other terms leave-1-out error on the same dataset
    # Has to be bound outside of the RFE definition since both analyzer and
    # error should use the same instance.
    rfesvm_split = SplitClassifier(linearSVMC)#clfswh['LinearSVMC'][0])

    # "Almost" classical RFE. If this works it would differ only that
    # our transfer_error is based on internal splitting and classifier used
    # within RFE is a split classifier and its sensitivities per split will get
    # averaged
    #

    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(), #clfswh['LinearSVMC'][0],  # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #        transfer_error=ConfusionBasedError(
    #           rfesvm_split,
    #           confusion_state="confusion"),
    #           # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(splits_avg)' )
    #
    #clfswh += \
    #  FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),                 # we train LinearSVM
    #    feature_selection = RFE(             # on features selected via RFE
    #        # based on sensitivity of a clf which does splitting internally
    #        sensitivity_analyzer=rfesvm_split.getSensitivityAnalyzer(),
    #        transfer_error=ConfusionBasedError(
    #           rfesvm_split,
    #           confusion_state="confusion"),
    #           # and whose internal error we use
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=False),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(splits_avg,static)' )

    rfesvm = LinearCSVMC()

    # This classifier will do RFE while taking transfer error to testing
    # set of that split. Resultant classifier is voted classifier on top
    # of all splits, let see what that would do ;-)
    #clfswh += \
    #  SplitClassifier(                      # which does splitting internally
    #   FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),
    #    feature_selection = RFE(             # on features selected via RFE
    #        sensitivity_analyzer=\
    #            rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #        transfer_error=TransferError(rfesvm),
    #        stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True)),
    #        # update sensitivity at each step
    #    descr='LinSVM+RFE(N-Fold)')
    #
    #
    #clfswh += \
    #  SplitClassifier(                      # which does splitting internally
    #   FeatureSelectionClassifier(
    #    clf = LinearCSVMC(),
    #    feature_selection = RFE(             # on features selected via RFE
    #        sensitivity_analyzer=\
    #            rfesvm.getSensitivityAnalyzer(transformer=Absolute),
    #        transfer_error=TransferError(rfesvm),
    #        stopping_criterion=FixedErrorThresholdStopCrit(0.05),
    #        feature_selector=FractionTailSelector(
    #                           0.2, mode='discard', tail='lower'),
    #                           # remove 20% of features at each step
    #        update_sensitivity=True)),
    #        # update sensitivity at each step
    #    splitter = OddEvenSplitter(),
    #    descr='LinSVM+RFE(OddEven)')