Package mvpa :: Package datasets :: Module splitter
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.datasets.splitter

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Collection of dataset splitters.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import operator 
 14   
 15  import numpy as N 
 16   
 17  import mvpa.misc.support as support 
 18  from mvpa.base.dochelpers import enhancedDocString 
 19   
 20  if __debug__: 
 21      from mvpa.base import debug 
 22   
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters. The
    final splitting is done running the splitter object on a certain Dataset
    via __call__(). This method has to be implemented like a generator, i.e. it
    has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the splitted dataset may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """

    # strategies known to _setStrategy()
    _STRATEGIES = ('first', 'random', 'equidistant')

    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 count=None,
                 strategy='equidistant',
                 attr='chunks'):
        """Initialize splitter base.

        :Parameters:
          nperlabel : int or str (or list of them)
            Number of dataset samples per label to be included in each
            split. Two special strings are recognized: 'all' uses all available
            samples (default) and 'equal' uses the maximum number of samples
            that can be provided by all of the classes. This value might be
            provided as a sequence whose length matches the number of datasets
            per split and indicates the configuration for the respective
            dataset in each split.
          nrunspersplit: int
            Number of times samples for each split are chosen. This
            is mostly useful if a subset of the available samples
            is used in each split and the subset is randomly
            selected for each run (see the `nperlabel` argument).
          permute : bool
            If set to `True`, the labels of each generated dataset
            will be permuted on a per-chunk basis.
          count : None or int
            Desired number of splits to be output. It is limited by the
            number of splits possible for a given splitter
            (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
            all splits are output (default).
          strategy : str
            If `count` is not None, the following strategies are available:
              first
                First `count` splits are chosen
              random
                Random (without replacement) `count` splits are chosen
              equidistant
                Splits which are equidistant from each other
          attr : str
            Sample attribute used to determine splits.
        """
        # pylint happyness block
        self.__nperlabel = None
        self.__runspersplit = nrunspersplit
        self.__permute = permute
        self.__splitattr = attr

        # we don't check it, thus no reason to make it private.
        # someone might find it useful to change post creation
        # TODO utilize such (or similar) policy through out the code
        self.count = count
        """Number (max) of splits to output on call"""

        self._setStrategy(strategy)

        # pattern sampling status vars
        self.setNPerLabel(nperlabel)


    __doc__ = enhancedDocString('Splitter', locals())


    def _setStrategy(self, strategy):
        """Set the strategy used to pick splits when `count` limits them.

        :Parameters:
          strategy : str
            One of `Splitter._STRATEGIES` (matched case-insensitively).

        :Raises: ValueError for an unknown strategy.
        """
        strategy = strategy.lower()
        if strategy not in self._STRATEGIES:
            raise ValueError("strategy is not known. Known are %s"
                             % str(self._STRATEGIES))
        self.__strategy = strategy


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        self.__nperlabel = value


    def _getSplitConfig(self, uniqueattr):
        """Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of lists
        containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError


    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator and yields one sequence of
        datasets per split.
        """

        # local bindings to methods to gain some speedup
        ds_class = dataset.__class__
        DS_permuteLabels = ds_class.permuteLabels
        try:
            DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
        except AttributeError:
            # Some "not-real" datasets e.g. MetaDataset, might not
            # have it
            pass
        DS_getRandomSamples = ds_class.getRandomSamples

        # for each split
        cfgs = self.splitcfg(dataset)

        # Select just some splits if desired
        count, Ncfgs = self.count, len(cfgs)
        # further makes sense only iff count < Ncfgs,
        # otherwise all strategies are equivalent
        if count is not None and count < Ncfgs:
            if count < 1:
                # we can only wish a good luck
                return
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out what step is needed to
                    # acommodate the `count` number
                    step = float(Ncfgs) / count
                    assert(step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
                elif strategy == 'random':
                    indexes = N.random.permutation(range(Ncfgs))[:count]
                    # doesn't matter much but lets keep them in the original
                    # order at least
                    indexes.sort()
                else:
                    # who said that I am paranoid?
                    raise RuntimeError("Really should not happen")
                if __debug__:
                    debug("SPL", "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, Ncfgs))
                cfgs = [cfgs[i] for i in indexes]

        for split in cfgs:
            # determine sample sizes
            if not operator.isSequenceType(self.__nperlabel) \
                   or isinstance(self.__nperlabel, str):
                nperlabel = [self.__nperlabel] * len(split)
            else:
                nperlabel = self.__nperlabel

            # get splitted datasets
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in xrange(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for i, ds in enumerate(split_ds):
                    # permute the labels
                    if self.__permute:
                        DS_permuteLabels(ds, True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel[i] == 'all':
                        finalized_datasets.append(ds)
                    else:
                        # just pass through if no real dataset;
                        # identity check -- '==' on a Dataset can be
                        # elementwise and is unreliable against None
                        if ds is None:
                            finalized_datasets.append(None)
                        else:
                            # go for maximum possible number of samples
                            # provided by each label in this dataset
                            if nperlabel[i] == 'equal':
                                # determine the number of samples per class
                                npl = N.array(DS_getNSamplesPerLabel(
                                    ds, attrib='labels').values()).min()
                            else:
                                npl = nperlabel[i]

                            # finally select the patterns
                            finalized_datasets.append(
                                DS_getRandomSamples(ds, npl))

                yield finalized_datasets


    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            This is the source dataset.
          specs : sequence of sequences
            Contains ids of a sample attribute that shall be split into
            another dataset. At most one element may be `None` -- it
            receives all samples not matched by any other spec.

        :Returns: List of splitted datasets (`None` entries for empty splits).
        """
        # collect the sample ids for each resulting dataset
        filters = []
        none_specs = 0
        cum_filter = None

        # plain attribute access instead of eval() -- same result,
        # no string evaluation involved
        splitattr_data = getattr(dataset, self.__splitattr)
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([ i in spec \
                                    for i in splitattr_data])
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # accumulate the *union* of all explicitly requested
                    # samples so that a `None` spec gets the complement.
                    # logical_and would intersect (usually disjoint ->
                    # all-False) and hand the `None` split nearly all
                    # samples, including already assigned ones.
                    cum_filter = N.logical_or(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError("Splitter cannot handle more than one `None` "
                             "split definition.")

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

        # split data: return None if no samples are left
        # XXX: Maybe it should simply return an empty dataset instead, but
        # keeping it this way for now, to maintain current behavior
        split_datasets = []

        # local bindings
        dataset_selectSamples = dataset.selectSamples
        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset_selectSamples(filter_))

        return split_datasets


    def __str__(self):
        """String summary over the object
        """
        return \
          "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
          % (self.__nperlabel, self.__runspersplit, self.__permute)


    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset"""
        # getattr instead of eval('dataset.unique' + attr)
        return self._getSplitConfig(getattr(dataset,
                                            'unique' + self.__splitattr))


    strategy = property(fget=lambda self:self.__strategy,
                        fset=_setStrategy)
297 298
class NoneSplitter(Splitter):
    """Degenerate splitter that performs **no** splitting at all.

    Every call yields exactly one 2-tuple in which one element is `None`
    and the other is the complete, unmodified input dataset. By default
    the dataset occupies the second position (see `mode`).
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **kwargs)

        if mode not in self._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        # an empty id list marks the slot that receives all samples
        if self.__mode != 'second':
            return [(None, [])]
        return [([], None)]


    def __str__(self):
        """String summary over the object
        """
        return "NoneSplitter / " + Splitter.__str__(self)
341 342 343
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    Yields two splits: first (odd, even) and then (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues: Boolean
            If True the parity of the attribute *values* decides odd and
            even samples. If False the parity of the *position* of each
            unique attribute value is used, i.e. the first unique
            attribute is odd and the second even, even when the values
            themselves suggest the opposite (e.g. in case of [2,3]).
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        YOH: LOL XXX
        """
        # parity source: either the values themselves or their positions
        if self.__usevalues:
            parity = uniqueattrs % 2
        else:
            parity = N.arange(len(uniqueattrs)) % 2
        return [(None, uniqueattrs[parity == 1]),
                (None, uniqueattrs[parity == 0])]


    def __str__(self):
        """String summary over the object
        """
        return "OddEvenSplitter / " + Splitter.__str__(self)
386 387 388
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    Yields two splits: first (1st half, 2nd half) and then
    (2nd half, 1st half).
    """
    def __init__(self, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)


    __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        """
        # midpoint of the unique attribute sequence (floor for odd counts)
        mid = len(uniqueattrs) // 2
        return [(None, uniqueattrs[:mid]),
                (None, uniqueattrs[mid:])]


    def __str__(self):
        """String summary over the object
        """
        return "HalfSplitter / " + Splitter.__str__(self)
416 417 418
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Each combination of `cvtype` unique attribute values becomes the
    second dataset of one split, the remaining values form the first.
    """
    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameter:
          cvtype: Int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        # pylint happyness block
        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return "N-%d-FoldSplitter / " % self.__cvtype \
               + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        combinations = support.getUniqueLengthNCombinations(uniqueattrs,
                                                            self.__cvtype)
        return [(None, combo) for combo in combinations]
457 458 459
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence
    of split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence of sample id sequences, one for each dataset that shall be
    generated in the split.

    Example:

    * Generate two splits. In the first split the *second* dataset
      contains all samples with sample attributes corresponding to
      either 0, 1 or 2. The *first* dataset of the first split contains
      all samples which are not split into the second dataset.

      The second split yields three datasets. The first with all samples
      corresponding to sample attributes 1 and 2, the second dataset
      contains only samples with attribute 3 and the last dataset
      contains the samples with attribute 5 and 6.

      CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)

        # the rule is replayed verbatim by _getSplitConfig()
        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        """
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
505