9 """Collection of dataset splitters."""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14
15 import numpy as N
16
17 import mvpa.misc.support as support
18 from mvpa.base.dochelpers import enhancedDocString
19
20 if __debug__:
21 from mvpa.base import debug
22
24 """Base class of dataset splitters.
25
26 Each splitter should be initialized with all its necessary parameters. The
27 final splitting is done running the splitter object on a certain Dataset
28 via __call__(). This method has to be implemented like a generator, i.e. it
29 has to return every possible split with a yield() call.
30
31 Each split has to be returned as a sequence of Datasets. The properties
32 of the splitted dataset may vary between implementations. It is possible
33 to declare a sequence element as 'None'.
34
35 Please note, that even if there is only one Dataset returned it has to be
36 an element in a sequence and not just the Dataset object!
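
    Example: a minimal usage sketch (``ds`` is assumed to be some Dataset with
    a 'chunks' sample attribute; any concrete splitter subclass follows the
    same protocol)::

      splitter = OddEvenSplitter()
      for split in splitter(ds):
          # `split` is a sequence of datasets; elements may be None
          working, validation = split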
37 """
38
39 _STRATEGIES = ('first', 'random', 'equidistant')
40
    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 count=None,
                 strategy='equidistant',
                 attr='chunks'):
48 """Initialize splitter base.
49
50 :Parameters:
51 nperlabel : int or str (or list of them)
52 Number of dataset samples per label to be included in each
53 split. Two special strings are recognized: 'all' uses all available
54 samples (default) and 'equal' uses the maximum number of samples
55 the can be provided by all of the classes. This value might be
56 provided as a sequence whos length matches the number of datasets
57 per split and indicates the configuration for the respective dataset
58 in each split.
59 nrunspersplit: int
60 Number of times samples for each split are chosen. This
61 is mostly useful if a subset of the available samples
62 is used in each split and the subset is randomly
63 selected for each run (see the `nperlabel` argument).
64 permute : bool
65 If set to `True`, the labels of each generated dataset
66 will be permuted on a per-chunk basis.
67 count : None or int
68 Desired number of splits to be output. It is limited by the
69 number of splits possible for a given splitter
70 (e.g. `OddEvenSplitter` can have only up to 2 splits). If None,
71 all splits are output (default).
72 strategy : str
73 If `count` is not None, possible strategies are possible:
74 first
75 First `count` splits are chosen
76 random
77 Random (without replacement) `count` splits are chosen
78 equidistant
79 Splits which are equidistant from each other
80 attr : str
81 Sample attribute used to determine splits.
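
        Example: a hypothetical configuration sketch (not part of the original
        documentation) drawing an equal number of samples per label and
        outputting only 4 equidistantly chosen splits::

          splitter = NFoldSplitter(cvtype=1, nperlabel='equal',
                                   count=4, strategy='equidistant')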
82 """
83
84 self.__nperlabel = None
85 self.__runspersplit = nrunspersplit
86 self.__permute = permute
87 self.__splitattr = attr
88
89
90
91
92 self.count = count
93 """Number (max) of splits to output on call"""
94
95 self._setStrategy(strategy)
96
97
98 self.setNPerLabel(nperlabel)
99
100
    __doc__ = enhancedDocString('Splitter', locals())


    def _setStrategy(self, strategy):
        """Set the strategy used to select splits when `count` limits the
        output (see `_STRATEGIES`).
        """
        # sanity check against the known strategies
        if not strategy in self._STRATEGIES:
            raise ValueError, "Unknown strategy '%s'. Known are %s" \
                  % (strategy, str(self._STRATEGIES))
        self.__strategy = strategy


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        self.__nperlabel = value

121 """Each subclass has to implement this method. It gets a sequence with
122 the unique attribte ids of a dataset and has to return a list of lists
123 containing attribute ids to split into the second dataset.
124 """
125 raise NotImplementedError
126
127
129 """Splits the dataset.
130
131 This method behaves like a generator.
132 """
133
134
135 ds_class = dataset.__class__
136 DS_permuteLabels = ds_class.permuteLabels
137 try:
138 DS_getNSamplesPerLabel = ds_class._getNSamplesPerAttr
139 except AttributeError:
140
141
142 pass
143 DS_getRandomSamples = ds_class.getRandomSamples
144
145
        # get the list of split configurations for this dataset
        cfgs = self.splitcfg(dataset)

        count, Ncfgs = self.count, len(cfgs)

        # limit the number of splits if requested
        if count is not None and count < Ncfgs:
            if count < 1:
                # nothing to output
                return
            strategy = self.strategy
            if strategy == 'first':
                cfgs = cfgs[:count]
            elif strategy in ['equidistant', 'random']:
                if strategy == 'equidistant':
                    # figure out the step needed to accommodate `count`
                    # entries (e.g. Ncfgs=10, count=3 gives step=3.33 and
                    # indexes [0, 3, 7])
                    step = float(Ncfgs) / count
                    assert(step >= 1.0)
                    indexes = [int(round(step * i)) for i in xrange(count)]
                elif strategy == 'random':
                    indexes = N.random.permutation(range(Ncfgs))[:count]
                    # keep the selected splits in their original order
                    indexes.sort()
                else:
                    # strategy was already validated in _setStrategy
                    raise RuntimeError, "Really should not happen"
                if __debug__:
                    debug("SPL", "For %s strategy selected %s splits "
                          "from %d total" % (strategy, indexes, Ncfgs))
                cfgs = [cfgs[i] for i in indexes]

        # generate the splits
        for split in cfgs:

            # determine sample sizes per label for each dataset in the split
            if not operator.isSequenceType(self.__nperlabel) \
                   or isinstance(self.__nperlabel, str):
                nperlabel = [self.__nperlabel] * len(split)
            else:
                nperlabel = self.__nperlabel

            # split the dataset
            split_ds = self.splitDataset(dataset, split)

            # do multiple runs with freshly drawn samples if requested
            for run in xrange(self.__runspersplit):

                # post-process all datasets of this split
                finalized_datasets = []

                for i, ds in enumerate(split_ds):
                    # permute the labels if requested (empty datasets are None)
                    if self.__permute and ds is not None:
                        DS_permuteLabels(ds, True, perchunk=True)

                    # select a subset of samples if requested
                    if nperlabel[i] == 'all':
                        finalized_datasets.append(ds)
                    else:
                        # empty split datasets are carried over as None
                        if ds is None:
                            finalized_datasets.append(None)
                        else:
                            # determine the number of samples per label
                            if nperlabel[i] == 'equal':
                                # maximum number of samples that every label
                                # can provide
                                npl = N.array(DS_getNSamplesPerLabel(
                                    ds, attrib='labels').values()).min()
                            else:
                                npl = nperlabel[i]

                            finalized_datasets.append(
                                DS_getRandomSamples(ds, npl))

                yield finalized_datasets

226 """Split a dataset by separating the samples where the configured
227 sample attribute matches an element of `specs`.
228
229 :Parameters:
230 dataset : Dataset
231 This is this source dataset.
232 specs : sequence of sequences
233 Contains ids of a sample attribute that shall be split into the
234 another dataset.
235
236 :Returns: Tuple of splitted datasets.
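
        Example: a rough sketch with hypothetical values for the default
        'chunks' split attribute (``spl`` is any splitter instance)::

          # first dataset: all samples with chunks 1 or 2,
          # second dataset: everything else
          chunks12, rest = spl.splitDataset(ds, ([1, 2], None))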
237 """
238
        # collect the sample filters (boolean masks) for all datasets
        filters = []
        none_specs = 0
        cum_filter = None

        splitattr_data = getattr(dataset, self.__splitattr)
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([ i in spec \
                                    for i in splitattr_data])
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # accumulate the union of all explicitly requested
                    # samples, so the `None` spec gets the complement
                    cum_filter = N.logical_or(cum_filter, filter_)

        # only a single remainder dataset can be defined
        if none_specs > 1:
            raise ValueError, "Splitter cannot handle more than one `None` " \
                              "split definition."

        # turn the `None` spec into the complement of all other specs
        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

        # split the data: empty selections become None
        split_datasets = []

        # local binding
        dataset_selectSamples = dataset.selectSamples
        for filter_ in filters:
            if (filter_ == False).all():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset_selectSamples(filter_))

        return split_datasets

283 """String summary over the object
284 """
285 return \
286 "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
287 % (self.__nperlabel, self.__runspersplit, self.__permute)
288
289
291 """Return splitcfg for a given dataset"""
292 return self._getSplitConfig(eval('dataset.unique' + self.__splitattr))
293
294
    strategy = property(fget=lambda self: self.__strategy,
                        fset=_setStrategy)


300 """This is a dataset splitter that does **not** split. It simply returns
301 the full dataset that it is called with.
302
303 The passed dataset is returned as the second element of the 2-tuple.
304 The first element of that tuple will always be 'None'.
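
    Example: a minimal sketch (``ds`` is assumed to be some Dataset)::

      for nothing, everything in NoneSplitter()(ds):
          # `nothing` is None, `everything` is the full dataset
          pass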
305 """
306
307 _known_modes = ['first', 'second']
308
    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples
        """
        Splitter.__init__(self, **(kwargs))

        if not mode in NoneSplitter._known_modes:
            raise ValueError, "Unknown mode %s for NoneSplitter" % mode
        self.__mode = mode


    __doc__ = enhancedDocString('NoneSplitter', locals(), Splitter)


328 """Return just one full split: no first or second dataset.
329 """
330 if self.__mode == 'second':
331 return [([], None)]
332 else:
333 return [(None, [])]
334
335
337 """String summary over the object
338 """
339 return \
340 "NoneSplitter / " + Splitter.__str__(self)
341
342
343
345 """Split a dataset into odd and even values of the sample attribute.
346
347 The splitter yields to splits: first (odd, even) and second (even, odd).
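
    Example: a small sketch (``ds`` assumed to have chunks 0..3)::

      for first, second in OddEvenSplitter()(ds):
          # 1st split: `first` has chunks [0, 2], `second` has chunks [1, 3]
          # 2nd split: the other way around
          pass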
348 """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. the
            first unique attribute is odd, the second is even, even though the
            corresponding values might indicate the opposite (e.g. in the case
            of [2, 3]).
        """
        Splitter.__init__(self, **(kwargs))

        self.__usevalues = usevalues


    __doc__ = enhancedDocString('OddEvenSplitter', locals(), Splitter)


370 """Huka chaka!
371 YOH: LOL XXX
372 """
373 if self.__usevalues:
374 return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
375 (None, uniqueattrs[(uniqueattrs % 2) == False])]
376 else:
377 return [(None, uniqueattrs[N.arange(len(uniqueattrs)) %2 == True]),
378 (None, uniqueattrs[N.arange(len(uniqueattrs)) %2 == False])]
379
380
382 """String summary over the object
383 """
384 return \
385 "OddEvenSplitter / " + Splitter.__str__(self)
386
387
388
390 """Split a dataset into two halves of the sample attribute.
391
392 The splitter yields to splits: first (1st half, 2nd half) and second
393 (2nd half, 1st half).
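
    Example: a small sketch (``ds`` assumed to have chunks 0..3)::

      for first, second in HalfSplitter()(ds):
          # 1st split: `second` has chunks [0, 1], `first` has chunks [2, 3]
          # 2nd split: the other way around
          pass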
394 """
399
400
401 __doc__ = enhancedDocString('HalfSplitter', locals(), Splitter)
402
403
405 """Huka chaka!
406 """
407 return [(None, uniqueattrs[:len(uniqueattrs)/2]),
408 (None, uniqueattrs[len(uniqueattrs)/2:])]
409
410
412 """String summary over the object
413 """
414 return \
415 "HalfSplitter / " + Splitter.__str__(self)
416
417
418
420 """Generic N-fold data splitter.
421
422 XXX: This docstring is a shame for such an important class!
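
    Example: a minimal sketch (``ds`` assumed to have a 'chunks' sample
    attribute)::

      splitter = NFoldSplitter(cvtype=1)
      for training_ds, validation_ds in splitter(ds):
          # `validation_ds` holds exactly one chunk per split,
          # `training_ds` holds all others
          pass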
423 """
    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype), i.e. the number of unique
            attribute values (e.g. chunks) left out in each split.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **(kwargs))

        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


445 """String summary over the object
446 """
447 return \
448 "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)
449
450
452 """Returns proper split configuration for N-M fold split.
453 """
454 return [(None, i) for i in \
455 support.getUniqueLengthNCombinations(uniqueattrs,
456 self.__cvtype)]
457
458
459
461 """Split a dataset using an arbitrary custom rule.
462
463 The splitter is configured by passing a custom spitting rule (`splitrule`)
464 to its constructor. Such a rule is basically a sequence of split
465 definitions. Every single element in this sequence results in excatly one
466 split generated by the Splitter. Each element is another sequence for
467 sequences of sample ids for each dataset that shall be generated in the
468 split.
469
470 Example:
471
472 * Generate two splits. In the first split the *second* dataset
473 contains all samples with sample attributes corresponding to
474 either 0, 1 or 2. The *first* dataset of the first split contains
475 all samples which are not split into the second dataset.
476
477 The second split yields three datasets. The first with all samples
478 corresponding to sample attributes 1 and 2, the second dataset
479 contains only samples with attrbiute 3 and the last dataset
480 contains the samples with attribute 5 and 6.
481
482 CustomSplitter([(None, [0, 1, 2]), ([1,2], [3], [5, 6])])
483 """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.
        """
        Splitter.__init__(self, **(kwargs))

        self.__splitrule = splitrule


    __doc__ = enhancedDocString('CustomSplitter', locals(), Splitter)


496 """Huka chaka!
497 """
498 return self.__splitrule
499
500
502 """String summary over the object
503 """
504 return "CustomSplitter / " + Splitter.__str__(self)
505