1
2
3
4
5
6
7
8
9 """Collection of dataset splitters."""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14
15 import numpy as N
16
17 import mvpa.misc.support as support
18 from mvpa.base.dochelpers import enhancedDocString
19
20
class Splitter(object):
    """Base class of dataset splitters.

    Each splitter should be initialized with all its necessary parameters. The
    final splitting is done running the splitter object on a certain Dataset
    via __call__(). This method has to be implemented like a generator, i.e. it
    has to return every possible split with a yield() call.

    Each split has to be returned as a sequence of Datasets. The properties
    of the splitted dataset may vary between implementations. It is possible
    to declare a sequence element as 'None'.

    Please note, that even if there is only one Dataset returned it has to be
    an element in a sequence and not just the Dataset object!
    """
    def __init__(self,
                 nperlabel='all',
                 nrunspersplit=1,
                 permute=False,
                 attr='chunks'):
        """Initialize splitter base.

        :Parameters:
          nperlabel : int or str (or list of them)
            Number of dataset samples per label to be included in each
            split. Two special strings are recognized: 'all' uses all available
            samples (default) and 'equal' uses the maximum number of samples
            that can be provided by all of the classes. This value might be
            provided as a sequence whose length matches the number of datasets
            per split and indicates the configuration for the respective
            dataset in each split.
          nrunspersplit : int
            Number of times samples for each split are chosen. This
            is mostly useful if a subset of the available samples
            is used in each split and the subset is randomly
            selected for each run (see the `nperlabel` argument).
          permute : bool
            If set to `True`, the labels of each generated dataset
            will be permuted on a per-chunk basis.
          attr : str
            Sample attribute used to determine splits.
        """
        # pattern sampling status vars; real value assigned via setNPerLabel()
        self.__nperlabel = None
        self.__runspersplit = nrunspersplit
        self.__permute = permute
        self.__splitattr = attr

        self.setNPerLabel(nperlabel)


    def setNPerLabel(self, value):
        """Set the number of samples per label in the split datasets.

        'equal' sets sample size to highest possible number of samples that
        can be provided by each class. 'all' uses all available samples
        (default).
        """
        self.__nperlabel = value


    def _getSplitConfig(self, uniqueattrs):
        """Each subclass has to implement this method. It gets a sequence with
        the unique attribute ids of a dataset and has to return a list of lists
        containing attribute ids to split into the second dataset.
        """
        raise NotImplementedError


    def __call__(self, dataset):
        """Splits the dataset.

        This method behaves like a generator: it yields one sequence of
        (possibly None) datasets per split configuration and sampling run.
        """
        # local alias to avoid repeated private-attr lookups
        for split in self.splitcfg(dataset):

            # replicate a scalar/string nperlabel config for every dataset
            # in the split; a sequence config is used as-is
            if isinstance(self.__nperlabel, str) \
               or not hasattr(self.__nperlabel, '__len__'):
                nperlabel = [self.__nperlabel] * len(split)
            else:
                nperlabel = self.__nperlabel

            # get the splitted datasets
            split_ds = self.splitDataset(dataset, split)

            # do multiple post-processing runs for this split
            for _ in range(self.__runspersplit):

                # post-process all datasets
                finalized_datasets = []

                for i, ds in enumerate(split_ds):
                    # permute the labels (guard against empty/None splits,
                    # which have no labels to permute)
                    if self.__permute and ds is not None:
                        ds.permuteLabels(True, perchunk=True)

                    # select subset of samples if requested
                    if nperlabel[i] == 'all':
                        finalized_datasets.append(ds)
                    else:
                        # just pass through if there is no real dataset
                        if ds is None:
                            finalized_datasets.append(None)
                        else:
                            # go for maximum possible number of samples
                            # provided by each label in this dataset
                            if nperlabel[i] == 'equal':
                                npl = min(ds.samplesperlabel.values())
                            else:
                                npl = nperlabel[i]

                            # finally, select the samples
                            finalized_datasets.append(
                                ds.getRandomSamples(npl))

                yield finalized_datasets


    def splitDataset(self, dataset, specs):
        """Split a dataset by separating the samples where the configured
        sample attribute matches an element of `specs`.

        :Parameters:
          dataset : Dataset
            This is the source dataset.
          specs : sequence of sequences
            Contains ids of a sample attribute that shall be split into
            another dataset. At most one element may be `None`, meaning
            "all samples not claimed by any other element".

        :Returns: List of splitted datasets (`None` for empty selections).
        """
        # compute a boolean sample mask for every spec
        filters = []
        none_specs = 0
        cum_filter = None
        for spec in specs:
            if spec is None:
                filters.append(None)
                none_specs += 1
            else:
                filter_ = N.array([i in spec
                                   for i in getattr(dataset,
                                                    self.__splitattr)])
                filters.append(filter_)
                if cum_filter is None:
                    cum_filter = filter_
                else:
                    # NOTE(review): masks are accumulated with logical AND,
                    # so the `None` spec becomes "not in the intersection of
                    # all other specs" — confirm this is intended when more
                    # than one non-None spec is given
                    cum_filter = N.logical_and(cum_filter, filter_)

        # need to turn possible Nones into proper ids sequences
        if none_specs > 1:
            raise ValueError("Splitter cannot handle more than one `None` "
                             "split definition.")

        for i, filter_ in enumerate(filters):
            if filter_ is None:
                filters[i] = N.logical_not(cum_filter)

        # split the data; return None instead of an empty dataset
        split_datasets = []
        for filter_ in filters:
            if not filter_.any():
                split_datasets.append(None)
            else:
                split_datasets.append(dataset.selectSamples(filter_))

        return split_datasets


    def __str__(self):
        """String summary over the object
        """
        return \
          "SplitterConfig: nperlabel:%s runs-per-split:%d permute:%s" \
          % (self.__nperlabel, self.__runspersplit, self.__permute)


    def splitcfg(self, dataset):
        """Return splitcfg for a given dataset"""
        return self._getSplitConfig(getattr(dataset,
                                            'unique' + self.__splitattr))
207
208
209
class NoneSplitter(Splitter):
    """This is a dataset splitter that does **not** split. It simply returns
    the full dataset that it is called with.

    The passed dataset is returned as the second element of the 2-tuple.
    The first element of that tuple will always be 'None'.
    """

    # output datasets a sample-containing dataset may be routed to
    _known_modes = ['first', 'second']

    def __init__(self, mode='second', **kwargs):
        """Cheap init -- nothing special

        :Parameters:
          mode
            Either 'first' or 'second' (default) -- which output dataset
            would actually contain the samples

        :Raises: ValueError for any other `mode` value.
        """
        Splitter.__init__(self, **kwargs)

        if mode not in NoneSplitter._known_modes:
            raise ValueError("Unknown mode %s for NoneSplitter" % mode)
        self.__mode = mode


    def _getSplitConfig(self, uniqueattrs):
        """Return just one full split: no first or second dataset.
        """
        # the empty list selects no ids, hence `None` marks the slot that
        # receives everything else (i.e. the full dataset)
        if self.__mode == 'second':
            return [([], None)]
        else:
            return [(None, [])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "NoneSplitter / " + Splitter.__str__(self)
249
250
251
class OddEvenSplitter(Splitter):
    """Split a dataset into odd and even values of the sample attribute.

    The splitter yields two splits: first (odd, even) and second (even, odd).
    """
    def __init__(self, usevalues=False, **kwargs):
        """Cheap init.

        :Parameters:
          usevalues : bool
            If True the values of the attribute used for splitting will be
            used to determine odd and even samples. If False odd and even
            chunks are defined by the order of attribute values, i.e. first
            unique attribute is odd, second is even, despite the
            corresponding values might indicate the opposite (e.g. in case
            of [2, 3]).
        """
        Splitter.__init__(self, **kwargs)

        self.__usevalues = usevalues


    def _getSplitConfig(self, uniqueattrs):
        """Return two splits: (odd values, even values) selected either by
        the attribute values themselves or by their positional index.
        """
        if self.__usevalues:
            return [(None, uniqueattrs[(uniqueattrs % 2) == True]),
                    (None, uniqueattrs[(uniqueattrs % 2) == False])]
        else:
            # parity of the *position* in the unique-value sequence
            return [(None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == True]),
                    (None, uniqueattrs[N.arange(len(uniqueattrs)) % 2 == False])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "OddEvenSplitter / " + Splitter.__str__(self)
290
291
292
class HalfSplitter(Splitter):
    """Split a dataset into two halves of the sample attribute.

    The splitter yields two splits: first (1st half, 2nd half) and second
    (2nd half, 1st half).
    """
    def __init__(self, **kwargs):
        """Cheap init -- all configuration is handled by the base class.
        """
        Splitter.__init__(self, **kwargs)


    def _getSplitConfig(self, uniqueattrs):
        """Return two splits separating the first half of the unique
        attribute values from the second half.
        """
        # floor division keeps the slice index an int (true division would
        # produce a float and break slicing on Python 3)
        return [(None, uniqueattrs[:len(uniqueattrs) // 2]),
                (None, uniqueattrs[len(uniqueattrs) // 2:])]


    def __str__(self):
        """String summary over the object
        """
        return \
          "HalfSplitter / " + Splitter.__str__(self)
317
318
319
class NFoldSplitter(Splitter):
    """Generic N-fold data splitter.

    Performs leave-`cvtype`-out splitting: each split places one unique
    combination of `cvtype` attribute values into the second dataset and
    everything else into the first.
    """
    def __init__(self,
                 cvtype=1,
                 **kwargs):
        """Initialize the N-fold splitter.

        :Parameters:
          cvtype : int
            Type of cross-validation: N-(cvtype)
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        # pattern sampling status vars
        self.__cvtype = cvtype


    __doc__ = enhancedDocString('NFoldSplitter', locals(), Splitter)


    def __str__(self):
        """String summary over the object
        """
        return \
          "N-%d-FoldSplitter / " % self.__cvtype + Splitter.__str__(self)


    def _getSplitConfig(self, uniqueattrs):
        """Returns proper split configuration for N-M fold split.
        """
        # one split per unique combination of `cvtype` attribute values
        return [(None, i) for i in
                support.getUniqueLengthNCombinations(uniqueattrs,
                                                     self.__cvtype)]
358
359
360
class CustomSplitter(Splitter):
    """Split a dataset using an arbitrary custom rule.

    The splitter is configured by passing a custom splitting rule (`splitrule`)
    to its constructor. Such a rule is basically a sequence of split
    definitions. Every single element in this sequence results in exactly one
    split generated by the Splitter. Each element is another sequence of
    sequences of sample ids for each dataset that shall be generated in the
    split.

    Example:

      * Generate two splits. In the first split the *second* dataset
        contains all samples with sample attributes corresponding to
        either 0, 1 or 2. The *first* dataset of the first split contains
        all samples which are not split into the second dataset.

        The second split yields three datasets. The first with all samples
        corresponding to sample attributes 1 and 2, the second dataset
        contains only samples with attribute 3 and the last dataset
        contains the samples with attribute 5 and 6.

        CustomSplitter([(None, [0, 1, 2]), ([1, 2], [3], [5, 6])])
    """
    def __init__(self, splitrule, **kwargs):
        """Cheap init.

        :Parameters:
          splitrule : sequence of sequences
            Custom split configuration, returned verbatim by
            `_getSplitConfig()` regardless of the dataset's attributes.
          kwargs
            Additional parameters are passed to the `Splitter` base class.
        """
        Splitter.__init__(self, **kwargs)

        self.__splitrule = splitrule


    def _getSplitConfig(self, uniqueattrs):
        """Return the configured splitrule; `uniqueattrs` is ignored.
        """
        return self.__splitrule


    def __str__(self):
        """String summary over the object
        """
        return "CustomSplitter / " + Splitter.__str__(self)
403