9 """Dataset container"""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14 import random
15 import copy
16
17 import numpy as N
18
19 from mvpa.misc.exceptions import DatasetError
20 from mvpa.misc.support import idhash as idhash_
21
22 if __debug__:
23 from mvpa.misc import debug, warning
26 """This class provides a container to store all necessary data to perform
27 MVPA analyses. These are the data samples, as well as the labels
28 associated with these patterns. Additionally samples can be grouped into
29 chunks.
30
31 :Groups:
32 - `Creators`: `__init__`, `selectFeatures`, `selectSamples`, `applyMapper`
33 - `Mutators`: `permuteLabels`
34
35 Important: labels assumed to be immutable, ie noone should modify
36 them externally by accessing indexed items, ie something like
37 ``dataset.labels[1] += "_bad"`` should not be used. If a label has
38 to be modified, full copy of labels should be obtained, operated
39 on, and assigned back to the dataset, otherwise
40 dataset.uniquelabels would not work. The same applies to any
41 other attribute which has corresponding unique* access property.
42 """

    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in `_data`)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__(), or otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    def __init__(self, data=None, dsattr=None, dtype=None,
                 samples=None, labels=None, chunks=None, check_data=True,
                 copy_samples=False, copy_data=True, copy_dsattr=True):
        """Initialize dataset instance

        :Parameters:
          data : dict
            Dictionary with an arbitrary number of entries. The value for
            each key in the dict has to be an ndarray with the
            same length as the number of rows in the samples array.
            A special entry in this dictionary is 'samples', a 2d array
            (samples x features). A shallow copy is stored in the object.
          dsattr : dict
            Dictionary of dataset attributes. An arbitrary number of
            arbitrarily named and typed objects can be stored here. A
            shallow copy of the dictionary is stored in the object.
          dtype
            If None -- do not change data type if samples
            is an ndarray. Otherwise convert samples to dtype.

        :Keywords:
          samples : ndarray
            a 2d array (samples x features)
          labels
            array or scalar value defining labels for each sample
          chunks
            array or scalar value defining chunks for each sample

        Each of the keyword arguments overwrites what is/might be
        already in the `data` container.
        """

        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        if copy_data:
            # deep copy the data dict: copy each value individually, but
            # leave 'samples' alone unless a copy was explicitly requested
            lcl_data = data.copy()
            for k, v in data.iteritems():
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy of the dict is sufficient
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr) > 0:
            # deep copy the dataset attributes
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)
        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # conversions
        if samples is not None:
            if __debug__:
                if self._data.has_key('samples'):
                    debug('DS',
                          "`data` dict has `samples` (%s) but there is also"
                          " an __init__ parameter `samples` which overrides"
                          " the one stored in `data`"
                          % (`self._data['samples'].shape`))
            self._data['samples'] = self._shapeSamples(samples, dtype,
                                                       copy_samples)

        if labels is not None:
            if __debug__:
                if self._data.has_key('labels'):
                    debug('DS',
                          "`data` dict has `labels` (%s) but there is also"
                          " an __init__ parameter `labels` which overrides"
                          " the one stored in `data`"
                          % (`self._data['labels']`))
            if self._data.has_key('samples'):
                self._data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not self._data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        if chunks is not None:
            self._data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not self._data.has_key('chunks'):
            # if no chunk information is given, assume that every sample
            # is its own chunk
            self._data['chunks'] = N.arange(self.nsamples)

        # initialize all remaining registered attributes with zeros
        for attr in self._registeredattributes:
            if not self._data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                self._data[attr] = N.zeros(self.nsamples)

        if check_data:
            self._checkData()

        # unique* attributes only need to be recomputed if labels or chunks
        # were provided explicitly
        if labels is not None or chunks is not None:
            self._dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)

    @property
    def idhash(self):
        """Verify whether the dataset is in the same state as when something
        else was done with it earlier, e.g. whether a classifier was trained
        on this very dataset."""
        res = id(self._data)
        for val in self._data.values():
            res += idhash_(val)
        return res
213 """Set to None all unique* attributes of corresponding dictionary
214 """
215
216 if not force and self._dsattr['__uniquereseted']:
217 return
218
219
220 for k in self._uniqueattributes:
221 if __debug__:
222 debug("DS_", "Reset attribute %s" % k)
223 self._dsattr[k] = None
224 self._dsattr['__uniquereseted'] = True
225
226
228 """Provide common facility to return unique attributes
229
230 XXX `dict_` can be simply replaced now with self._dsattr
231 """
232 if not self._dsattr.has_key(attrib) or self._dsattr[attrib] is None:
233 if __debug__:
234 debug("DS_", "Recomputing unique set for attrib %s within %s" %
235 (attrib, self.summary(uniq=False)))
236
237
238 self._dsattr[attrib] = N.unique( dict_[attrib[6:]] )
239 assert(not self._dsattr[attrib] is None)
240 self._dsattr['__uniquereseted'] = False
241
242 return self._dsattr[attrib]
243
244
246 """Provide common facility to set attributes
247
248 """
249 if len(value) != self.nsamples:
250 raise ValueError, \
251 "Provided %s have %d entries while there is %d samples" % \
252 (attrib, len(value), self.nsamples)
253 self._data[attrib] = N.asarray(value)
254 uniqueattr = "unique" + attrib
255
256 if self._dsattr.has_key(uniqueattr):
257 self._dsattr[uniqueattr] = None
258
259
261 """Returns the number of samples per unique label.
262 """
263
264 uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
265 dict_=self._data)
266
267
268 result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr)))
269 for l in self._data[attrib]:
270 result[l] += 1
271
272
273
274
275 return result
276
277
278
279
281 """Return indecies of samples given a list of attributes
282 """
283
284 if not operator.isSequenceType(values):
285 values = [ values ]
286
287
288
289 sel = N.array([], dtype=N.int16)
290 for value in values:
291 sel = N.concatenate((
292 sel, N.where(self._data[attrib]==value)[0]))
293
294
295 sel.sort()
296
297 return sel
298
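
    # Illustrative sketch (not part of the original source): this helper
    # backs the auto-generated `idsby*` methods created by
    # _registerAttribute() below, e.g.:
    #
    #   >>> ds.idsbylabels([1, 2])   # sorted ids of samples labeled 1 or 2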

    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples.

        Handle all possible input values for 'samples' and transform
        them into a 2d (samples x features) representation.
        """
        # put samples array into correct shape:
        # 1d arrays or simple sequences are assumed to be a single pattern
        if not isinstance(samples, N.ndarray):
            # dtype=None lets numpy choose an appropriate dtype automatically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (dtype is not None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d sample " \
                  + "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples

    def _checkData(self):
        """Check all members of `_data` for the same number of samples.
        """
        for k, v in self._data.iteritems():
            if not len(v) == self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), self.nsamples)

    def _expandSampleAttribute(self, attr, attr_name):
        """If a sample attribute is given as a scalar, expand/repeat it to a
        length matching the number of samples in the dataset.
        """
        try:
            if len(attr) != self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%d]" \
                      % (attr_name, len(attr)) \
                      + " has to match the number of samples" \
                      + " [%d]." % self.nsamples
            # store the sequence as array
            return N.array(attr)

        except TypeError:
            # scalars are simply repeated for all samples
            return N.repeat(attr, self.nsamples)

    @classmethod
    def _registerAttribute(cls, key, dictname="_data", hasunique=False,
                           default_setter=True):
        """Register an attribute for any Dataset class.

        Creates a property, assigning getters/setters depending on the
        availability of corresponding _get, _set functions.
        """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)

            # use a custom _get<key> if defined, otherwise read straight
            # from the dictionary
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # use a custom _set<key> if defined, otherwise fall back on the
            # generic _setdataattr for '_data' attributes
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif default_setter and dictname == "_data":
                setter = "lambda self, x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey,
                                                            dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)

                # create samplesper<key> property (e.g. samplesperlabel)
                sampleskey = "samplesper%s" % key[:-1]
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                # create idsby<key> method (e.g. idsbylabels)
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x, attrib='%s')"
                                     % key)

                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to re-register attribute `%s`. For now ' % key +
                    'such capability is not present')
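
    # Illustrative note (not part of the original source): a registration
    # such as Dataset._registerAttribute("labels", "_data", hasunique=True)
    # ends up providing `labels` (get/set property), `uniquelabels`,
    # `samplesperlabel` and the `idsbylabels` method on the class.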

    def __str__(self):
        """String representation of the dataset: its summary."""
        return self.summary(uniq=True, stats=True, idhash=False)

    def __repr__(self):
        return "<%s>" % str(self)

    def summary(self, uniq=True, stats=True, idhash=False):
        """String summary of the object.

        :Parameters:
          uniq : bool
            include summary over data attributes which have unique values
          idhash : bool
            include idhash value for dataset and samples
          stats : bool
            include some basic statistics (mean, std, var) over dataset
            samples
        """
        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(self.samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, self.samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        if uniq:
            s += " uniq:"
            for uattr in self._dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=self._data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if stats:
            s += " stats: mean=%g std=%g var=%g" % \
                 (N.mean(self.samples), N.std(self.samples),
                  N.var(self.samples))
        return s

    def __iadd__(self, other):
        """Merge the samples of one Dataset object into another (in-place).

        No dataset attributes will be merged!
        """
        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                  "features does not match."

        # concatenate all sample attributes
        for k, v in self._data.iteritems():
            self._data[k] = N.concatenate((v, other._data[k]), axis=0)

        # all unique* attributes have to be recomputed
        self._resetallunique()

        return self

    def __add__(self, other):
        """Merge the samples of two Dataset objects.

        All data of both datasets is copied, concatenated and a new Dataset
        is returned.

        NOTE: This can be a costly operation (both memory and time). If
        performance is important consider the '+=' operator.
        """
        # create a new object of the same type it is now (and not only
        # Dataset)
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make this work all Dataset constructors have to
        # accept Class(data=dict, dsattr=dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        out += other

        return out
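
    # Illustrative sketch (not part of the original source), assuming ds1
    # and ds2 are Dataset instances with matching features:
    #
    #   >>> merged = ds1 + ds2    # copies everything, returns a new dataset
    #   >>> ds1 += ds2            # cheaper in-place merge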

    def selectFeatures(self, ids, sort=True):
        """Select a number of features from the current set.

        :Parameters:
          ids
            iterable container to select ids
          sort : bool
            whether to sort ids. Order matters and `selectFeatures` assumes
            incremental order. If that does not hold, in non-optimized code
            selectFeatures would verify the order and sort

        Returns a new Dataset object with a view of the original
        samples array (no copying is performed).

        WARNING: The order of ids determines the order of features in
        the returned dataset. This might be useful sometimes, but can
        also cause major headaches! The order is verified when
        running in non-optimized code (if __debug__).
        """
        if sort:
            ids.sort()

        # shallow-copy the data dict; only the samples array gets replaced
        new_data = self._data.copy()

        # assign the selected features to the samples array
        new_data['samples'] = self._data['samples'][:, ids]

        # create a new object of the same type it is now (and not only
        # Dataset)
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make this work all Dataset constructors have to
        # accept Class(data=dict, dsattr=dict)
        dataset.__init__(data=new_data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset
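
    # Illustrative sketch (not part of the original source):
    #
    #   >>> ds_small = ds.selectFeatures([0, 5, 7])   # view, not a copy
    #   >>> ds_small.nfeatures                        # -> 3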

    def applyMapper(self, featuresmapper=None, samplesmapper=None):
        """Obtain a new dataset by applying mappers over features and/or
        samples.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples

        WARNING: At the moment, handling of samplesmapper is not yet
        implemented since there was no real use case.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
        """
        # shallow-copy the data dict; only the samples array gets replaced
        new_data = self._data.copy()

        # apply mappers
        if samplesmapper:
            raise NotImplementedError

        if featuresmapper:
            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

        # create a new object of the same type it is now (and not only
        # Dataset)
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make this work all Dataset constructors have to
        # accept Class(data=dict, dsattr=dict)
        dataset.__init__(data=new_data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset

    def selectSamples(self, mask):
        """Choose a subset of samples.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
        list of ids and is not ordered. Clarify with Michael what is
        our intent here!
        """
        # without a sequence as index the masked samples array would
        # lose its 2d layout
        if not operator.isSequenceType(mask):
            mask = [mask]

        # apply the mask to all registered sample attributes
        data = {}
        for k, v in self._data.iteritems():
            data[k] = v[mask, ]

        # create a new object of the same type it is now (and not only
        # Dataset)
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make this work all Dataset constructors have to
        # accept Class(data=dict, dsattr=dict)
        dataset.__init__(data=data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False)

        dataset._resetallunique(force=True)
        return dataset
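
    # Illustrative sketch (not part of the original source), assuming a
    # chunk value 0 exists in `ds`:
    #
    #   >>> ds0 = ds.selectSamples(ds.idsbychunks([0]))  # samples of chunk 0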

    def permuteLabels(self, status, perchunk=True):
        """Permute the labels.

        TODO: rename `status` into something closer in semantics.

        Calling this method with `status` set to True permutes the labels
        among all samples.

        If `perchunk` is True the permutation is limited to samples sharing
        the same chunk value. Therefore only the association of a certain
        sample with a label is permuted while keeping the absolute number of
        occurrences of each label value within a certain chunk constant.

        If `status` is False the original labels are restored.
        """
        if not status:
            # restore originals
            if self._data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                      'permuteLabels() has never been ' \
                      'called with status == True.'
            self.labels = self._data['origlabels']
            self._data['origlabels'] = None
        else:
            # store the original labels, but only if not yet done, otherwise
            # multiple calls with status == True would destroy them
            if not self._data.has_key('origlabels') \
                   or self._data['origlabels'] is None:
                # bind the old labels to origlabels
                self._data['origlabels'] = self._data['labels']
                # copy labels to not alter the original values
                self._data['labels'] = self._data['labels'].copy()

            # now scramble
            if perchunk:
                for o in self.uniquechunks:
                    self._data['labels'][self.chunks == o] = \
                        N.random.permutation(self.labels[self.chunks == o])
                # reassign via the property to reset uniquelabels
                self.labels = self._data['labels']
            else:
                self.labels = N.random.permutation(self._data['labels'])
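
    # Illustrative sketch (not part of the original source), e.g. for
    # building a null distribution:
    #
    #   >>> ds.permuteLabels(True, perchunk=True)   # scramble within chunks
    #   >>> ds.permuteLabels(False)                 # restore original labels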

    def getRandomSamples(self, nperlabel):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of samples
        is randomly chosen from the group of samples sharing a unique label
        value (total number of selected samples: nperlabel x
        len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the number
        of samples that shall be selected from the samples with the
        corresponding label.

        The method returns a Dataset object containing the selected
        samples.
        """
        # an integer value means the same number for all labels
        if isinstance(nperlabel, int):
            nperlabel = [nperlabel for i in self.uniquelabels]

        sample = []
        # draw random sample ids for each unique label
        for i, r in enumerate(self.uniquelabels):
            sample += random.sample((self.labels == r).nonzero()[0],
                                    nperlabel[i])

        return self.selectSamples(sample)
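
    # Illustrative sketch (not part of the original source): a balanced
    # random subset, assuming `ds` carries two unique labels:
    #
    #   >>> balanced = ds.getRandomSamples(10)   # 10 per label, 20 in total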

    def getNSamples(self):
        """Currently available number of samples.
        """
        return self._data['samples'].shape[0]

    def getNFeatures(self):
        """Number of features per sample.
        """
        return self._data['samples'].shape[1]

    def setSamplesDType(self, dtype):
        """Set the data type of the samples array.
        """
        # do nothing if the dtype is already as requested
        if self._data['samples'].dtype != dtype:
            self._data['samples'] = self._data['samples'].astype(dtype)

    def convertFeatureIds2FeatureMask(self, ids):
        """Returns a boolean mask with all features in `ids` selected.

        :Parameters:
          ids : list or 1d array
            To be selected features ids.

        :Returns:
          ndarray : dtype='bool'
            All selected features are set to True; False otherwise.
        """
        fmask = N.repeat(False, self.nfeatures)
        fmask[ids] = True

        return fmask

    def convertFeatureMask2FeatureIds(self, mask):
        """Returns feature ids corresponding to non-zero elements in the
        mask.

        :Parameters:
          mask : 1d ndarray
            Feature mask.

        :Returns:
          ndarray : integer
            Ids of non-zero (non-False) mask elements.
        """
        return mask.nonzero()[0]

    # shortcut properties
    nsamples = property(fget=getNSamples)
    nfeatures = property(fget=getNFeatures)


# register the common sample attributes as class properties
Dataset._registerAttribute("samples", "_data", hasunique=False)
Dataset._registerAttribute("labels", "_data", hasunique=True)
Dataset._registerAttribute("chunks", "_data", hasunique=True)
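

# Minimal usage sketch (not part of the original module). It exercises only
# the API defined above and assumes this module is importable as part of
# mvpa; all data below is made up for illustration.
if __name__ == '__main__':
    samples = N.random.normal(size=(12, 4))   # 12 samples x 4 features
    labels = [0, 1] * 6                       # alternating labels
    # a scalar chunk value gets expanded to one value per sample
    ds = Dataset(samples=samples, labels=labels, chunks=0)

    print ds.summary()            # dtype, shape and unique-attribute counts
    print ds.uniquelabels         # -> [0 1]
    print ds.samplesperlabel      # -> {0: 6, 1: 6}

    # select only the samples carrying label 0
    subset = ds.selectSamples(ds.idsbylabels([0]))
    assert subset.nsamples == 6

    # scramble the labels (within chunks) and restore them again
    ds.permuteLabels(True)
    ds.permuteLabels(False)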