Package mvpa :: Package datasets :: Module splitter
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.datasets.splitter

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Collection of dataset splitters.""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import operator 
 14   
 15  import numpy as N 
 16   
 17  import mvpa.misc.support as support 
 18  from mvpa.base.dochelpers import enhancedDocString 
 19   
 20   
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters. The
    final splitting is done running the splitter object on a certain Dataset
    via __call__(). This method has to be implemented like a generator, i.e.
    it has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the splitted dataset may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """
    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 attr='chunks'):
        """Initialize splitter base.

        :Parameters:
          nperlabel : int or str (or list of them)
            Number of dataset samples per label to be included in each
            split. Two special strings are recognized: 'all' uses all
            available samples (default) and 'equal' uses the maximum number
            of samples that can be provided by all of the classes. This value
            might be provided as a sequence whose length matches the number
            of datasets per split and indicates the configuration for the
            respective dataset in each split.
          nrunspersplit : int
            Number of times samples for each split are chosen. This
            is mostly useful if a subset of the available samples
            is used in each split and the subset is randomly
            selected for each run (see the `nperlabel` argument).
          permute : bool
            If set to `True`, the labels of each generated dataset
            will be permuted on a per-chunk basis.
          attr : str
            Sample attribute used to determine splits.
        """
        # pylint happyness block
        self.__nperlabel = None
        self.__runspersplit = nrunspersplit
        self.__permute = permute
        self.__splitattr = attr

        # pattern sampling status vars
        self.setNPerLabel(nperlabel)


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        self.__nperlabel = value


    def _getSplitConfig(self, uniqueattr):
        """Each subclass has to implement this method. It gets a sequence
        with the unique attribute ids of a dataset and has to return a list
        of lists containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError


    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator.
        """
        # for each split
        for split in self.splitcfg(dataset):

            # determine sample sizes: a scalar (or a string like 'all')
            # applies to every dataset of the split, a sequence is taken
            # as per-dataset configuration
            # NOTE: operator.isSequenceType() was removed in Python 3 -- an
            # explicit string/length check serves the same purpose here
            if isinstance(self.__nperlabel, str) \
               or not hasattr(self.__nperlabel, '__len__'):
                nperlabel = [self.__nperlabel] * len(split)
            else:
                nperlabel = self.__nperlabel

            # get splitted datasets
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for run in range(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for i, ds in enumerate(split_ds):
                    # permute the labels (guard against empty split
                    # datasets, which are represented as None)
                    if self.__permute and ds is not None:
                        ds.permuteLabels(True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel[i] == 'all':
                        finalized_datasets.append(ds)
                    elif ds is None:
                        # just pass through if no real dataset
                        finalized_datasets.append(None)
                    else:
                        # go for maximum possible number of samples provided
                        # by each label in this dataset
                        if nperlabel[i] == 'equal':
                            # determine the number of samples per class
                            npl = N.array(
                                list(ds.samplesperlabel.values())).min()
                        else:
                            npl = nperlabel[i]

                        # finally select the patterns
                        finalized_datasets.append(ds.getRandomSamples(npl))

                yield finalized_datasets


    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            This is the source dataset.
          specs : sequence of sequences
            Contains ids of a sample attribute that shall be split into
            another dataset. A single `None` entry receives all samples not
            matched by any other spec.

        :Returns: Tuple of splitted datasets (`None` for empty selections).
        """
        # per-sample values of the configured split attribute
        # (getattr() instead of eval(): same result, no code execution)
        attr_data = getattr(dataset, self.__splitattr)

        # collect the sample ids for each resulting dataset
        filters = []
        none_specs = 0
        cum_filter = None
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([ i in spec for i in attr_data ])
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # BUGFIX: must be logical_or -- the complement computed
                    # for a `None` spec has to exclude the samples of *all*
                    # explicit specs; logical_and emptied the accumulated
                    # filter as soon as two disjoint specs were given, so
                    # the `None` dataset wrongly received every sample
                    cum_filter = N.logical_or(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError("Splitter cannot handle more than one `None` "
                             "split definition.")

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

        # split data: return None if no samples are left
        # XXX: Maybe it should simply return an empty dataset instead, but
        # keeping it this way for now, to maintain current behavior
        split_datasets = []
        for filter_ in filters:
            if not filter_.any():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset.selectSamples(filter_))

        return split_datasets


    def __str__(self):
        """String summary over the object
        """
        return \
          "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
          % (self.__nperlabel, self.__runspersplit, self.__permute)


    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset"""
        return self._getSplitConfig(
            getattr(dataset, 'unique' + self.__splitattr))
class NoneSplitter(Splitter):
    """A dataset splitter that does **not** split: it simply yields the
    full dataset it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **kwargs)

        if mode not in self._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode


    def _getSplitConfig(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        # an empty id list matches nothing, so the complementary `None`
        # entry of the spec receives all samples
        if self.__mode == 'second':
            return [([], None)]
        return [(None, [])]


    def __str__(self):
        """String summary over the object
        """
        return "NoneSplitter / " + Splitter.__str__(self)
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues: Boolean
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. first
            unique attribute is odd, second is even, despite the
            corresponding values might indicate the opposite (e.g. in case
            of [2,3].
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        """
        # boolean mask of the "odd" attributes -- either by value or by
        # position in the sequence of unique attributes
        if self.__usevalues:
            odd = (uniqueattrs % 2) == 1
        else:
            odd = (N.arange(len(uniqueattrs)) % 2) == 1
        return [(None, uniqueattrs[odd]),
                (None, uniqueattrs[N.logical_not(odd)])]


    def __str__(self):
        """String summary over the object
        """
        return "OddEvenSplitter / " + Splitter.__str__(self)
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """
    def __init__(self, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        """
        # BUGFIX: floor division keeps the index an int -- plain '/' yields
        # a float under true division (Python 3, or
        # `from __future__ import division`), which is not a valid slice
        # index; '//' behaves identically on Python 2 ints
        half = len(uniqueattrs) // 2
        return [(None, uniqueattrs[:half]),
                (None, uniqueattrs[half:])]


    def __str__(self):
        """String summary over the object
        """
        return "HalfSplitter / " + Splitter.__str__(self)
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    XXX: This docstring is a shame for such an important class!
    """
    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameter:
          cvtype: Int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        # pylint happyness block
        self.__cvtype = cvtype


    # keep at this position: enhancedDocString() inspects the class-body
    # locals() gathered so far
    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return "N-%d-FoldSplitter / " % self.__cvtype \
               + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        combinations = support.getUniqueLengthNCombinations(uniqueattrs,
                                                            self.__cvtype)
        return [(None, spec) for spec in combinations]
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule
    (`splitrule`) to its constructor. Such a rule is basically a sequence of
    split definitions. Every single element in this sequence results in
    exactly one split generated by the Splitter. Each element is another
    sequence of sequences of sample ids for each dataset that shall be
    generated in the split.

    Example:

      * Generate two splits. In the first split the *second* dataset
        contains all samples with sample attributes corresponding to
        either 0, 1 or 2. The *first* dataset of the first split contains
        all samples which are not split into the second dataset.

        The second split yields three datasets. The first with all samples
        corresponding to sample attributes 1 and 2, the second dataset
        contains only samples with attribute 3 and the last dataset
        contains the samples with attribute 5 and 6.

        CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **kwargs)

        # the preconfigured sequence of split definitions returned verbatim
        # by _getSplitConfig()
        self.__splitrule = splitrule


    def _getSplitConfig(self, uniqueattrs):
        """Huka chaka!
        """
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)