9 """Dataset container"""
10
11 __docformat__ = 'restructuredtext'
12
13 import operator
14 import random
15 import mvpa.misc.copy as copy
16 import numpy as N
17
18 from sets import Set
19
20
21
22
23
24
25 from mvpa.misc.exceptions import DatasetError
26 from mvpa.misc.support import idhash as idhash_
27 from mvpa.base.dochelpers import enhancedDocString, table2string
28
29 if __debug__:
30 from mvpa.base import debug, warning


def _validate_indexes_uniq_sorted(seq, fname, item):
    """Check that seq contains only unique, sorted indexes"""
    if operator.isSequenceType(seq):
        seq_unique = N.unique(seq)
        if len(seq) != len(seq_unique):
            warning("%s() operates only with indexes for %s without"
                    " repetitions. Repetitions were removed."
                    % (fname, item))
        if N.any(N.sort(seq) != seq_unique):
            warning("%s() does not guarantee the original order"
                    " of selected %ss. Use selectSamples() and "
                    " selectFeatures(sort=False) instead" % (fname, item))
47 """*The* Dataset.
48
49 This class provides a container to store all necessary data to
50 perform MVPA analyses. These are the data samples, as well as the
51 labels associated with the samples. Additionally, samples can be
52 grouped into chunks.
53
54 :Groups:
55 - `Creators`: `__init__`, `selectFeatures`, `selectSamples`,
56 `applyMapper`
57 - `Mutators`: `permuteLabels`
58
59 Important: labels assumed to be immutable, i.e. noone should modify
60 them externally by accessing indexed items, ie something like
61 ``dataset.labels[1] += "_bad"`` should not be used. If a label has
62 to be modified, full copy of labels should be obtained, operated on,
63 and assigned back to the dataset, otherwise dataset.uniquelabels
64 would not work. The same applies to any other attribute which has
65 corresponding unique* access property.
66
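
    A minimal construction example (a sketch; the values are arbitrary)::

      >>> import numpy as N
      >>> ds = Dataset(samples=N.random.normal(size=(4, 2)),
      ...              labels=[1, 1, 2, 2],
      ...              chunks=[1, 2, 1, 2])
      >>> ds.nsamples, ds.nfeatures
      (4, 2)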
67 """

    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in _data)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__, or otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    def __init__(self,
                 # copy constructor arguments
                 data=None,
                 dsattr=None,
                 # automatic dtype conversion
                 dtype=None,
                 # new dataset attributes
                 samples=None,
                 labels=None,
                 labels_map=None,
                 chunks=None,
                 origids=None,
                 # flags
                 check_data=True,
                 copy_samples=False,
                 copy_data=True,
                 copy_dsattr=True):
118 """Initialize dataset instance
119
120 There are basically two different way to create a dataset:
121
122 1. Create a new dataset from samples and sample attributes. In
123 this mode a two-dimensional `ndarray` has to be passed to the
124 `samples` keyword argument and the corresponding samples
125 attributes are provided via the `labels` and `chunks`
126 arguments.
127
128 2. Copy contructor mode
129 The second way is used internally to perform quick coyping
130 of datasets, e.g. when performing feature selection. In this
131 mode and the two dictionaries (`data` and `dsattr`) are
132 required. For performance reasons this mode bypasses most of
133 the sanity check performed by the previous mode, as for
134 internal operations data integrity is assumed.
135
136
137 :Parameters:
138 data : dict
139 Dictionary with an arbitrary number of entries. The value for
140 each key in the dict has to be an ndarray with the
141 same length as the number of rows in the samples array.
142 A special entry in this dictionary is 'samples', a 2d array
143 (samples x features). A shallow copy is stored in the object.
144 dsattr : dict
145 Dictionary of dataset attributes. An arbitrary number of
146 arbitrarily named and typed objects can be stored here. A
147 shallow copy of the dictionary is stored in the object.
148 dtype: type | None
149 If None -- do not change data type if samples
150 is an ndarray. Otherwise convert samples to dtype.
151
152
153 :Keywords:
154 samples : ndarray
155 2d array (samples x features)
156 labels
157 An array or scalar value defining labels for each samples
158 labels_map : None or bool or dict
159 Map from labels into literal names. If is None or True,
160 the mapping is computed, from labels which must be literal.
161 If is False, no mapping is computed. If dict -- mapping is
162 verified and taken, labels get remapped. Dict must map
163 literal -> number
164 chunks
165 An array or scalar value defining chunks for each sample
166
167 Each of the Keywords arguments overwrites what is/might be
168 already in the `data` container.
169
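
        A hedged example of the literal-labels mode (assuming `numpy`
        is imported as `N`)::

          >>> ds = Dataset(samples=N.zeros((3, 2)),
          ...              labels=['rest', 'face', 'rest'],
          ...              labels_map=True)
          >>> sorted(ds.labels_map.items())
          [('face', 0), ('rest', 1)]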
170 """

        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        if copy_data:
            # deep copy all data entries, but only copy samples if
            # explicitly requested
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if not requested
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy of the dict, values are shared
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr) > 0:
            # deep copy the attributes
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)
        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # store samples (and possibly transform/reshape/retype them)
        if samples is not None:
            if __debug__:
                if lcl_data.has_key('samples'):
                    debug('DS',
                          "`Data` dict has `samples` (%s) but there is also"
                          " __init__ parameter `samples` which overrides"
                          " the one stored in `data`"
                          % (`lcl_data['samples'].shape`))
            lcl_data['samples'] = self._shapeSamples(samples, dtype,
                                                     copy_samples)

        # assign labels, overriding any stored in `data`
        if labels is not None:
            if __debug__:
                if lcl_data.has_key('labels'):
                    debug('DS',
                          "`Data` dict has `labels` (%s) but there is also"
                          " __init__ parameter `labels` which overrides"
                          " the one stored in `data`" % (`lcl_data['labels']`))
            if lcl_data.has_key('samples'):
                lcl_data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not lcl_data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        nsamples = self.nsamples

        # assign chunks, generating defaults if necessary
        if chunks is not None:
            lcl_data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not lcl_data.has_key('chunks'):
            # if no chunk information is given assume that every
            # sample is its own chunk
            lcl_data['chunks'] = N.arange(nsamples)

        # initialize origids
        if origids is not None:
            # simply assign if provided
            lcl_data['origids'] = origids
        elif not lcl_data.has_key('origids'):
            # otherwise generate unique ones
            lcl_data['origids'] = N.arange(len(lcl_data['labels']))
        else:
            # origids are already present (e.g. copy constructor mode)
            # -- keep them as they are
            pass

        # initialize any remaining registered attribute
        for attr in self._registeredattributes:
            if not lcl_data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                lcl_data[attr] = N.zeros(nsamples)

        # labels_map handling
        labels_ = N.asarray(lcl_data['labels'])
        labels_map_known = lcl_dsattr.has_key('labels_map')
        if labels_map is True:
            # need to compute the mapping
            if labels_.dtype.char == 'S' or not labels_map_known:
                # create the mapping
                ulabels = list(Set(labels_))
                ulabels.sort()
                labels_map = dict([ (x[1], x[0]) for x in enumerate(ulabels) ])
                if __debug__:
                    debug('DS', 'Mapping for the labels computed to be %s'
                          % labels_map)
            else:
                if __debug__:
                    debug('DS', 'Mapping of labels was requested but labels '
                          'are not strings. Skipped')
                labels_map = None
        elif labels_map is False:
            labels_map = None

        if isinstance(labels_map, dict):
            if labels_map_known:
                if __debug__:
                    debug('DS',
                          "`dsattr` dict has `labels_map` (%s) but there is"
                          " also __init__ parameter `labels_map` (%s) which"
                          " overrides the one stored in `dsattr`"
                          % (lcl_dsattr['labels_map'], labels_map))

            lcl_dsattr['labels_map'] = labels_map

            # map labels if they are literal or no mapping was known before
            if labels_.dtype.char == 'S' or not labels_map_known:
                if __debug__:
                    debug('DS_', "Remapping labels using mapping %s" % labels_map)
                try:
                    lcl_data['labels'] = N.array(
                        [labels_map[x] for x in lcl_data['labels']])
                except KeyError, e:
                    raise ValueError, "Provided labels_map %s is insufficient " \
                          "to map all the labels. Mapping for label %s is " \
                          "missing" % (labels_map, e)

        elif not lcl_dsattr.has_key('labels_map'):
            lcl_dsattr['labels_map'] = labels_map
        elif __debug__:
            debug('DS_', 'Not overriding labels_map in dsattr since it has one')
332
333 if check_data:
334 self._checkData()
335
336
337
338
339
340
341
342 if not labels is None or not chunks is None:
343
344
345 lcl_dsattr['__uniquereseted'] = False
346 self._resetallunique(force=True)
347
348
349 __doc__ = enhancedDocString('Dataset', locals())
350
351


    @property
    def idhash(self):
        """To verify if dataset is in the same state as when something else
        was done

        Like if classifier was trained on the same dataset as in question
        """
        _data = self._data
        res = idhash_(_data)

        # idhash is order-dependent, so process the keys in a
        # deterministic (sorted) order
        keys = _data.keys()
        keys.sort()
        for k in keys:
            res += idhash_(_data[k])
        return res


    def _resetallunique(self, force=False):
        """Set to None all unique* attributes of corresponding dictionary
        """
        _dsattr = self._dsattr

        if not force and _dsattr['__uniquereseted']:
            return

        _uniqueattributes = self._uniqueattributes

        if __debug__ and "DS_" in debug.active:
            debug("DS_", "Resetting all attributes %s for dataset %s"
                  % (_uniqueattributes,
                     self.summary(uniq=False, idhash=False,
                                  stats=False, lstats=False)))

        for k in _uniqueattributes:
            _dsattr[k] = None
        _dsattr['__uniquereseted'] = True


    def _getuniqueattr(self, attrib, dict_):
        """Provide common facility to return unique attributes

        XXX `dict_` can be simply replaced now with self._dsattr
        """
        _dsattr = self._dsattr

        if not _dsattr.has_key(attrib) or _dsattr[attrib] is None:
            if __debug__ and 'DS_' in debug.active:
                debug("DS_", "Recomputing unique set for attrib %s within %s" %
                      (attrib, self.summary(uniq=False,
                                            stats=False, lstats=False)))
            # recompute: strip the "unique" prefix to access the
            # corresponding data attribute
            _dsattr[attrib] = N.unique( N.asanyarray(dict_[attrib[6:]]) )
            assert(_dsattr[attrib] is not None)
            _dsattr['__uniquereseted'] = False

        return _dsattr[attrib]


    def _setdataattr(self, attrib, value):
        """Provide common facility to set attributes
        """
        if len(value) != self.nsamples:
            raise ValueError, \
                  "Provided %s has %d entries while there are %d samples" % \
                  (attrib, len(value), self.nsamples)
        self._data[attrib] = N.asarray(value)
        uniqueattr = "unique" + attrib

        # invalidate the cached unique values if they exist
        _dsattr = self._dsattr
        if _dsattr.has_key(uniqueattr):
            _dsattr[uniqueattr] = None


    def _getNSamplesPerAttr(self, attrib='labels'):
        """Returns the number of samples per unique value of a sample
        attribute (e.g. per label).
434 """
435
436 _data = self._data
437
438
439 uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
440 dict_=_data)
441
442
443 result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr)))
444 for l in _data[attrib]:
445 result[l] += 1
446
447
448
449
450 return result
451


    def _getSampleIdsByAttr(self, values, attrib="labels", sort=True):
        """Return indices of samples given a list of attribute values
        """
        if not operator.isSequenceType(values) \
               or isinstance(values, basestring):
            values = [ values ]

        # gather the ids for every value
        sel = N.array([], dtype=N.int16)
        _data = self._data
        for value in values:
            sel = N.concatenate((
                sel, N.where(_data[attrib]==value)[0]))

        if sort:
            # place samples in the right order
            sel.sort()

        return sel


    def idsonboundaries(self, prior=0, post=0,
                        attributes_to_track=['labels', 'chunks'],
                        affected_labels=None,
                        revert=False):
        """Find samples which are on the boundaries of the blocks

        Such samples might need to be removed. By default (with
        prior=0, post=0) ids of the first samples in a 'block' are
        reported

        :Parameters:
          prior : int
            how many samples prior to the transition sample to include
          post : int
            how many samples post the transition sample to include
          attributes_to_track : list of basestring
            which attributes to track to decide on the boundary condition
          affected_labels : list of basestring
            for which labels to perform the selection. If None -- for all
          revert : bool
            whether to revert the meaning and provide ids of samples which
            are found NOT to be boundary samples
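
        Example (a sketch; samples 0, 2, 3 and 4 sit on label/chunk
        transitions here)::

          >>> ds = Dataset(samples=N.zeros((6, 1)),
          ...              labels=[1, 1, 2, 2, 1, 1],
          ...              chunks=[1, 1, 1, 2, 2, 2])
          >>> ds.idsonboundaries()
          [0, 2, 3, 4]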
499 """
500
501 _data = self._data
502 labels = self.labels
503 nsamples = self.nsamples
504
505 lastseen = [None for attr in attributes_to_track]
506 transitions = []
507
508 for i in xrange(nsamples):
509 current = [_data[attr][i] for attr in attributes_to_track]
510 if lastseen != current:
511
512 new_transitions = range(max(0, i-prior),
513 min(nsamples-1, i+post)+1)
514 if affected_labels is not None:
515 new_transitions = filter(lambda i: labels[i] in affected_labels,
516 new_transitions)
517 transitions += new_transitions
518 lastseen = current
519
520 transitions = Set(transitions)
521 if revert:
522 transitions = Set(range(nsamples)).difference(transitions)
523
524
525 transitions = N.array(list(transitions))
526 transitions.sort()
527 return list(transitions)


    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples

        Handle all possible input value for 'samples' and transform
        them into a 2d (samples x feature) representation.
        """
        # put samples array into correct shape: 1d arrays or simple
        # sequences are assumed to be a single sample
        if (not isinstance(samples, N.ndarray)):
            # it is safe to provide a dtype which defaults to None,
            # in which case N chooses an appropriate dtype automatically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (dtype is not None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d sample " \
                  + "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples


    def _checkData(self):
        """Checks `_data` members to have the same # of samples.
        """
        nsamples = self.nsamples
        _data = self._data

        for k, v in _data.iteritems():
            if not len(v) == nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), nsamples)

        # check for unique origids: compare the sorted ids against the
        # sorted unique ids. A length guard is needed since comparing
        # arrays of different lengths yields a plain bool
        uniques = N.unique(_data['origids'])
        uniques.sort()
        # need to copy to prevent sorting the original array
        sorted_ids = _data['origids'].copy()
        sorted_ids.sort()

        if len(uniques) != len(sorted_ids) \
           or not (uniques == sorted_ids).all():
            raise DatasetError, "Samples IDs are not unique."


    def _expandSampleAttribute(self, attr, attr_name):
        """If a sample attribute is given as a scalar expand/repeat it to a
        length matching the number of samples in the dataset.
        """
        try:
            # if we are initializing with a single string -- we should
            # treat it as a single label
            if isinstance(attr, basestring):
                raise TypeError
            if len(attr) != self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%d]" \
                      % (attr_name, len(attr)) \
                      + " has to match the number of samples" \
                      + " [%d]." % self.nsamples
            # store the sequence as array
            return N.array(attr)

        except TypeError:
            # make sequence of identical value matching the number of
            # samples
            return N.repeat(attr, self.nsamples)


    @classmethod
    def _registerAttribute(cls, key, dictname="_data", abbr=None,
                           hasunique=False):
        """Register an attribute for any Dataset class.

        Creates property assigning getters/setters depending on the
        availability of corresponding _get, _set functions.
        """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)

            # define get function and use corresponding _get<key> if defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use corresponding _set<key> if defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif dictname=="_data":
                setter = "lambda self,x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if abbr is not None:
                exec "%s.%s = property(fget=%s, fset=%s)" % \
                     (cls.__name__, abbr, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)
                if abbr is not None:
                    exec "%s.U%s = property(fget=%s)" % \
                         (cls.__name__, abbr, getter)

                # create samplesper<key> property (strip the trailing 's')
                sampleskey = "samplesper%s" % key[:-1]
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<key> method
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x,attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to reregister attribute `%s`. For now ' % key +
                    'such capability is not present')


    def __str__(self):
        """String summary over the object"""
        return self.summary(uniq=True, idhash=False,
                            stats=False, lstats=False)


    def __repr__(self):
        return "<%s>" % str(self)


    def summary(self, uniq=True, stats=True, idhash=False, lstats=True,
                maxc=30, maxl=20):
        """String summary over the object

        :Parameters:
          uniq : bool
            Include summary over data attributes which have unique values
          idhash : bool
            Include idhash value for dataset and samples
          stats : bool
            Include some basic statistics (mean, std, var) over dataset samples
          lstats : bool
            Include statistics on chunks/labels
          maxc : int
            Maximal number of chunks when providing details on labels/chunks
          maxl : int
            Maximal number of labels when providing details on labels/chunks
        """
        # local bindings
        samples = self.samples
        _data = self._data
        _dsattr = self._dsattr

        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        ssep = (' ', '\n')[lstats]
        if uniq:
            s += "%suniq:" % ssep
            for uattr in _dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=_data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if isinstance(self.labels_map, dict):
            s += ' labels_mapped'

        if stats:
            s += "%sstats: mean=%g std=%g var=%g min=%g max=%g\n" % \
                 (ssep, N.mean(samples), N.std(samples),
                  N.var(samples), N.min(samples), N.max(samples))

        if lstats:
            s += self.summary_labels(maxc=maxc, maxl=maxl)

        return s


    def summary_labels(self, maxc=30, maxl=20):
        """Provide summary statistics over the labels and chunks

        :Parameters:
          maxc : int
            Maximal number of chunks when providing details
          maxl : int
            Maximal number of labels when providing details
        """
        # imported locally to avoid circular module dependencies
        from mvpa.datasets.miscfx import getSamplesPerChunkLabel
        spcl = getSamplesPerChunkLabel(self)

        ul = self.uniquelabels.tolist()
        uc = self.uniquechunks.tolist()
        s = ""
        if len(ul) < maxl and len(uc) < maxc:
            s += "\nCounts of labels in each chunk:"
            # labels are columns, chunks are rows
            table = [[' chunks\labels'] + ul]
            table += [[''] + ['---'] * len(ul)]
            for c, counts in zip(uc, spcl):
                table.append([ str(c) ] + counts.tolist())
            s += '\n' + table2string(table)
        else:
            s += "No details due to large number of labels or chunks. " \
                 "Increase maxc and maxl if desired"

        labels_map = self.labels_map
        if isinstance(labels_map, dict):
            s += "\nOriginal labels were mapped using following mapping:"
            s += '\n\t'+'\n\t'.join([':\t'.join(map(str, x))
                                     for x in labels_map.items()]) + '\n'

        def cl_stats(axis, u, name1, name2):
            """Compute statistics per label (axis=0) or chunk (axis=1)"""
            stats = {'min': N.min(spcl, axis=axis),
                     'max': N.max(spcl, axis=axis),
                     'mean': N.mean(spcl, axis=axis),
                     'std': N.std(spcl, axis=axis),
                     '#%ss' % name2: N.sum(spcl>0, axis=axis)}
            entries = [' ' + name1, 'mean', 'std', 'min', 'max', '#%ss' % name2]
            table = [ entries ]
            for i, l in enumerate(u):
                d = {' ' + name1 : l}
                d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
                table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                                % d[e] for e in entries] )
            return '\nSummary per %s across %ss\n' % (name1, name2) \
                   + table2string(table)

        if len(ul) < maxl:
            s += cl_stats(0, ul, 'label', 'chunk')
        if len(uc) < maxc:
            s += cl_stats(1, uc, 'chunk', 'label')
        return s


    def __iadd__(self, other):
        """Merge the samples of one Dataset object to another (in-place).

        No dataset attributes, besides labels_map, will be merged!
        Additionally, a new set of unique `origids` will be generated.
        """
        # local bindings
        _data = self._data
        other_data = other._data

        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                  "features does not match."

        # take care about labels_map and labels
        slm = self.labels_map
        olm = other.labels_map
        if N.logical_xor(slm is None, olm is None):
            raise ValueError, "Cannot add datasets where only one of them " \
                  "has labels map assigned. If needed -- implement it"

        # concatenate all sample attributes
        for k, v in _data.iteritems():
            if k == 'origids':
                # origids are special: simply regenerate unique ones.
                # Checking whether the concatenation would stay unique is
                # possible, but costly performance-wise
                _data[k] = N.arange(len(v) + len(other_data[k]))

            elif k == 'labels' and slm is not None:
                # labels need special care if a mapping is in effect:
                # merge the two maps and relabel the other dataset
                nlm = slm.copy()
                # figure out the next free numerical label
                nextid = N.sort(nlm.values())[-1] + 1
                olabels = other.labels
                olabels_remap = {}
                for ol, olnum in olm.iteritems():
                    if not nlm.has_key(ol):
                        # check if we can preserve the old numeric label --
                        # if not, assign some other unique one
                        if olnum in nlm.values():
                            nextid = N.sort(nlm.values() + olm.values())[-1] + 1
                        else:
                            nextid = olnum
                        olabels_remap[olnum] = nextid
                        nlm[ol] = nextid
                        nextid += 1
                    else:
                        olabels_remap[olnum] = nlm[ol]
                olabels = [olabels_remap[x] for x in olabels]
                # finally compose new labels and store the merged mapping
                _data['labels'] = N.concatenate((v, olabels), axis=0)
                self._dsattr['labels_map'] = nlm

                if __debug__:
                    # check that we are not dealing with colliding
                    # mappings, since that is problematic and leads to
                    # various complications
                    if (len(Set(slm.keys())) != len(Set(slm.values()))) or \
                       (len(Set(olm.keys())) != len(Set(olm.values()))):
                        warning("Adding datasets where multiple labels "
                                "mapped to the same ID is not recommended. "
                                "Please check the outcome. Original mappings "
                                "were %s and %s. Resultant is %s"
                                % (slm, olm, nlm))

            else:
                _data[k] = N.concatenate((v, other_data[k]), axis=0)

        # invalidate cached unique attributes
        self._resetallunique()

        return self


    def __add__(self, other):
        """Merge the samples of two Dataset objects.

        All data of both datasets is copied, concatenated and a new Dataset is
        returned.

        NOTE: This can be a costly operation (both memory and time). If
        performance is important consider the '+=' operator.
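
        A hedged sketch::

          >>> ds1 = Dataset(samples=N.zeros((2, 3)), labels=1)
          >>> ds2 = Dataset(samples=N.ones((2, 3)), labels=2)
          >>> (ds1 + ds2).nsamples
          4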
929 """
930
931 out = super(Dataset, self).__new__(self.__class__)
932
933
934
935 out.__init__(data=self._data,
936 dsattr=self._dsattr,
937 copy_samples=True,
938 copy_data=True,
939 copy_dsattr=True)
940
941 out += other
942
943 return out


    def copy(self):
        """Create a copy (clone) of the dataset, by fully copying current one
        """
        # create a new object of the same type it is now
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to
        # accept Class(data=Dict, dsattr=Dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        return out


    def selectFeatures(self, ids=None, sort=True, groups=None):
        """Select a number of features from the current set.

        :Parameters:
          ids
            iterable container to select ids
          sort : bool
            whether to sort the ids. Order matters: `selectFeatures`
            assumes incremental order, and if that cannot be guaranteed,
            non-optimized code would verify the order and sort
          groups
            iterable container of feature group ids to select (requires
            feature grouping information, see `defineFeatureGroups`)

        Returns a new Dataset object with a view of the original
        samples array (no copying is performed).

        WARNING: The order of ids determines the order of features in
        the returned dataset. This might be useful sometimes, but can
        also cause major headaches! Order is verified when
        running in non-optimized code (if __debug__)
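
        Example (a sketch)::

          >>> ds = Dataset(samples=N.arange(8).reshape((2, 4)), labels=1)
          >>> ds.selectFeatures([0, 2]).samples
          array([[0, 2],
                 [4, 6]])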
982 """
983 if ids is None and groups is None:
984 raise ValueError, "No feature selection specified."
985
986
987 if ids is None:
988 ids = []
989
990 if not groups is None:
991 if not self._dsattr.has_key('featuregroups'):
992 raise RuntimeError, \
993 "Dataset has no feature grouping information."
994
995 for g in groups:
996 ids += (self._dsattr['featuregroups'] == g).nonzero()[0].tolist()
997
998
999
1000 if sort:
1001 ids.sort()
1002 elif __debug__ and 'CHECK_DS_SORTED' in debug.active:
1003 from mvpa.misc.support import isSorted
1004 if not isSorted(ids):
1005 warning("IDs for selectFeatures must be provided " +
1006 "in sorted order, otherwise major headache might occur")
1007
1008
1009 new_data = self._data.copy()
1010
1011
1012
1013 new_data['samples'] = self._data['samples'][:, ids]
1014
1015
1016 if self._dsattr.has_key('featuregroups'):
1017 new_dsattr = self._dsattr.copy()
1018 new_dsattr['featuregroups'] = self._dsattr['featuregroups'][ids]
1019 else:
1020 new_dsattr = self._dsattr
1021
1022
1023 dataset = super(Dataset, self).__new__(self.__class__)
1024
1025
1026
1027 dataset.__init__(data=new_data,
1028 dsattr=new_dsattr,
1029 check_data=False,
1030 copy_samples=False,
1031 copy_data=False,
1032 copy_dsattr=False
1033 )
1034
1035 return dataset


    def applyMapper(self, featuresmapper=None, samplesmapper=None,
                    train=True):
        """Obtain new dataset by applying mappers over features and/or samples.

        While featuresmappers leave the sample attributes information
        unchanged, as the number of samples in the dataset is invariant,
        samplesmappers are also applied to the samples attributes themselves!

        Applying a featuresmapper will destroy any feature grouping information.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples
          train : bool
            Flag whether to train the mapper with this dataset before applying
            it.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
        """
        # shallow-copy all stuff from current data dict
        new_data = self._data.copy()

        # apply mappers
        if samplesmapper:
            if __debug__:
                debug("DS", "Training samplesmapper %s" % `samplesmapper`)
            samplesmapper.train(self)

            if __debug__:
                debug("DS", "Applying samplesmapper %s" % `samplesmapper` +
                      " to samples of dataset `%s`" % `self`)

            # remove origids, since they are not valid anymore and a
            # samplesmapper could even change the total number of samples
            if new_data.has_key('origids'):
                del(new_data['origids'])

            # apply the mapper to all sample-wise data in the dataset
            for k in new_data.keys():
                new_data[k] = samplesmapper.forward(self._data[k])

        # feature mapping might affect dataset attributes
        # XXX: might be obsolete when proper feature attributes are
        #      implemented
        new_dsattr = self._dsattr

        if featuresmapper:
            if __debug__:
                debug("DS", "Training featuresmapper %s" % `featuresmapper`)
            featuresmapper.train(self)

            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

            # remove feature grouping, who knows what the mapper did to
            # the features
            if self._dsattr.has_key('featuregroups'):
                new_dsattr = self._dsattr.copy()
                del(new_dsattr['featuregroups'])
            else:
                new_dsattr = self._dsattr

        # create a new object of the same type it is now
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to
        # accept Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        # sample attributes might have changed after applying samplesmapper
        if samplesmapper:
            dataset._resetallunique(force=True)

        return dataset


    def selectSamples(self, ids):
        """Choose a subset of samples defined by samples IDs.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
              list of ids and is not ordered. Clarify with Michael what is
              our intent here!
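
        Example (a sketch)::

          >>> ds = Dataset(samples=N.arange(6).reshape((3, 2)),
          ...              labels=[1, 2, 3])
          >>> ds.selectSamples([0, 2]).labels
          array([1, 3])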
1135 """
1136
1137
1138 if not operator.isSequenceType( ids ):
1139 ids = [ids]
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158 data = {}
1159 for k, v in self._data.iteritems():
1160 data[k] = v[ids, ]
1161
1162
1163 dataset = super(Dataset, self).__new__(self.__class__)
1164
1165
1166
1167 dataset.__init__(data=data,
1168 dsattr=self._dsattr,
1169 check_data=False,
1170 copy_samples=False,
1171 copy_data=False,
1172 copy_dsattr=False)
1173
1174 dataset._resetallunique(force=True)
1175 return dataset


    def index(self, *args, **kwargs):
        """Universal indexer to obtain indexes of interesting samples/features.
        See .select() for more information

        :Return: tuple of (samples indexes, features indexes). Each
          item could also be None, if no selection on samples or
          features was requested (to discriminate between no selected
          items and no selections)
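
        Example (a sketch)::

          >>> ds = Dataset(samples=N.zeros((4, 3)), labels=[1, 1, 2, 2])
          >>> ds.index(labels=[2])
          ([2, 3], None)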
1187 """
1188 s_indx = []
1189 f_indx = []
1190 return_dataset = kwargs.pop('return_dataset', False)
1191 largs = len(args)
1192
1193 args = list(args)
1194
1195 largs_nonstring = 0
1196
1197 for i in xrange(largs):
1198 l = args[i]
1199 if isinstance(l, basestring):
1200 if l.lower() == 'all':
1201
1202 args[i] = slice(None)
1203 else:
1204 break
1205 largs_nonstring += 1
1206
1207 if largs_nonstring >= 1:
1208 s_indx.append(args[0])
1209 if __debug__ and 'CHECK_DS_SELECT' in debug.active:
1210 _validate_indexes_uniq_sorted(args[0], 'select', 'samples')
1211 if largs_nonstring == 2:
1212 f_indx.append(args[1])
1213 if __debug__ and 'CHECK_DS_SELECT' in debug.active:
1214 _validate_indexes_uniq_sorted(args[1], 'select', 'features')
1215 elif largs_nonstring > 2:
1216 raise ValueError, "Only two positional arguments are allowed" \
1217 ". 1st for samples, 2nd for features"
1218
1219
1220
1221
1222 if (largs - largs_nonstring) % 2 != 0:
1223 raise ValueError, "Positional selections must come in pairs:" \
1224 " e.g. ('labels', [1,2,3])"
1225
1226 for i in xrange(largs_nonstring, largs, 2):
1227 k, v = args[i:i+2]
1228 kwargs[k] = v
1229
1230
1231 data_ = self._data
1232 for k, v in kwargs.iteritems():
1233 if k == 'samples':
1234 s_indx.append(v)
1235 elif k == 'features':
1236 f_indx.append(v)
1237 elif data_.has_key(k):
1238
1239
1240 if __debug__:
1241 if not N.any([isinstance(v, cls) for cls in
1242 [list, tuple, slice, int]]):
1243 raise ValueError, "Trying to specify selection for %s " \
1244 "based on unsupported '%s'" % (k, v)
1245 s_indx.append(self._getSampleIdsByAttr(v, attrib=k, sort=False))
1246 else:
1247 raise ValueError, 'Keyword "%s" is not known, thus' \
1248 'select() failed' % k
1249
1250 def combine_indexes(indx, nelements):
1251 """Helper function: intersect selections given in indx
1252
1253 :Parameters:
1254 indxs : list of lists or slices
1255 selections of elements
1256 nelements : int
1257 number of elements total for deriving indexes from slices
1258 """
1259 indx_sel = None
1260 for s in indx:
1261 if isinstance(s, slice) or \
1262 isinstance(s, N.ndarray) and s.dtype==bool:
1263
1264
1265
1266 all_indexes = N.arange(nelements)
1267 s = all_indexes[s]
1268 elif not operator.isSequenceType(s):
1269 s = [ s ]
1270
1271 if indx_sel is None:
1272 indx_sel = Set(s)
1273 else:
1274
1275
1276
1277 indx_sel = indx_sel.intersection(s)
1278
1279
1280 if isinstance(indx_sel, Set):
1281 indx_sel = list(indx_sel)
1282
1283
1284 indx_sel.sort()
1285
1286 return indx_sel
1287
1288
1289 if len(s_indx) == 1 and isinstance(s_indx[0], slice) \
1290 and s_indx[0] == slice(None):
1291
1292 s_indx = s_indx[0]
1293 else:
1294
1295 if len(s_indx) == 0:
1296 s_indx = None
1297 else:
1298 s_indx = combine_indexes(s_indx, self.nsamples)
1299
1300
1301 if len(f_indx):
1302 f_indx = combine_indexes(f_indx, self.nfeatures)
1303 else:
1304 f_indx = None
1305
1306 return s_indx, f_indx


    def select(self, *args, **kwargs):
        """Universal selector

        WARNING: if you need to select duplicate samples
        (e.g. samples=[5,5]), or the order of the selected samples or
        features matters and must stay unsorted (e.g. samples=[3,2,1]),
        please use the selectFeatures or selectSamples functions directly

        Examples:
          Mimic plain selectSamples::

            dataset.select([1,2,3])
            dataset[[1,2,3]]

          Mimic plain selectFeatures::

            dataset.select(slice(None), [1,2,3])
            dataset.select('all', [1,2,3])
            dataset[:, [1,2,3]]

          Mixed (select features and samples)::

            dataset.select([1,2,3], [1, 2])
            dataset[[1,2,3], [1, 2]]

          Select samples matching some attributes::

            dataset.select(labels=[1,2], chunks=[2,4])
            dataset.select('labels', [1,2], 'chunks', [2,4])
            dataset['labels', [1,2], 'chunks', [2,4]]

          Mixed -- out of first 100 samples, select only those with
          labels 1 or 2 and belonging to chunks 2 or 4, and select
          features 2 and 3::

            dataset.select(slice(0,100), [2,3], labels=[1,2], chunks=[2,4])
            dataset[:100, [2,3], 'labels', [1,2], 'chunks', [2,4]]

        """
        s_indx, f_indx = self.index(*args, **kwargs)

        # Select samples
        if s_indx == slice(None):
            # so no actual selection -- a full slice
            if __debug__:
                debug('DS', 'in select() not selecting samples')
            ds = self
        else:
            # else we need to call selectSamples
            if __debug__:
                debug('DS', 'in select() selecting samples given selections'
                      + str(s_indx))
            ds = self.selectSamples(s_indx)

        # Select features
        if f_indx is not None:
            if __debug__:
                debug('DS', 'in select() selecting features given selections'
                      + str(f_indx))
            ds = ds.selectFeatures(f_indx)

        return ds


    def where(self, *args, **kwargs):
        """Obtain indexes of interesting samples/features. See select()
        for more information

        XXX somewhat obsoletes idsby...
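
        Example (a sketch)::

          >>> ds = Dataset(samples=N.zeros((4, 2)), labels=[1, 1, 2, 3])
          >>> ds.where(labels=[1, 3])
          [0, 1, 3]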
1379 """
1380 s_indx, f_indx = self.index(*args, **kwargs)
1381 if s_indx is not None and f_indx is not None:
1382 return s_indx, f_indx
1383 elif s_indx is not None:
1384 return s_indx
1385 else:
1386 return f_indx


    def __getitem__(self, *args):
        """Convenience dataset parts selection

        See select for more information
        """
        # unpack the tuple of selections, e.g. for ds[:, [1,2]]
        if len(args) == 1 and isinstance(args[0], tuple):
            args = args[0]

        # turn ['chunks':[2,3]] slice-style selections into
        # ('chunks', [2,3]) pairs understood by select()
        args_, args = args, ()
        for a in args_:
            if isinstance(a, slice) and \
               isinstance(a.start, basestring):
                if a.stop is None or a.step is not None:
                    raise ValueError, \
                          "Selection must look like ['chunks':[2,3]]"
                args += (a.start, a.stop)
            else:
                args += (a,)
        return self.select(*args)


    def permuteLabels(self, status, perchunk=True, assure_permute=False):
        """Permute the labels.

        TODO: rename status into something closer in semantics.

        :Parameters:
          status : bool
            Calling this method with 'status' set to True permutes the
            labels among all samples. If 'status' is False the original
            labels are restored.
          perchunk : bool
            If True permutation is limited to samples sharing the same
            chunk value. Therefore only the association of a certain
            sample with a label is permuted while keeping the absolute
            number of occurrences of each label value within a certain
            chunk constant.
          assure_permute : bool
            If True, assures that labels are permuted, i.e. any one is
            different from the original one
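
        Typical round-trip (a sketch; the permutation itself is random)::

          >>> ds = Dataset(samples=N.zeros((4, 1)),
          ...              labels=[1, 2, 1, 2], chunks=1)
          >>> ds.permuteLabels(True)    # scramble (originals are stored)
          >>> ds.permuteLabels(False)   # restore the original labels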
1431 """
1432
1433 _data = self._data
1434
1435 if len(self.uniquelabels)<2:
1436 raise RuntimeError, \
1437 "Call to permuteLabels is bogus since there is insuficient" \
1438 " number of labels: %s" % self.uniquelabels
1439
1440 if not status:
1441
1442 if _data.get('origlabels', None) is None:
1443 raise RuntimeError, 'Cannot restore labels. ' \
1444 'permuteLabels() has never been ' \
1445 'called with status == True.'
1446 self.labels = _data['origlabels']
1447 _data.pop('origlabels')
1448 else:
1449
1450
1451 if not _data.has_key('origlabels') \
1452 or _data['origlabels'] == None:
1453
1454 _data['origlabels'] = _data['labels']
1455
1456 _data['labels'] = copy.copy(_data['labels'])
1457
1458 labels = _data['labels']
1459
1460 if perchunk:
1461 for o in self.uniquechunks:
1462 labels[self.chunks == o] = \
1463 N.random.permutation(labels[self.chunks == o])
1464 else:
1465 labels = N.random.permutation(labels)
1466
1467 self.labels = labels
1468
1469 if assure_permute:
1470 if not (_data['labels'] != _data['origlabels']).any():
1471 if not (assure_permute is True):
1472 if assure_permute == 1:
1473 raise RuntimeError, \
1474 "Cannot assure permutation of labels %s for " \
1475 "some reason with chunks %s and while " \
1476 "perchunk=%s . Should not happen" % \
1477 (self.labels, self.chunks, perchunk)
1478 else:
1479 assure_permute = 11
1480 if __debug__:
1481 debug("DS", "Recalling permute to assure different labels")
1482 self.permuteLabels(status, perchunk=perchunk,
1483 assure_permute=assure_permute-1)


    def getRandomSamples(self, nperlabel):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of samples is
        randomly chosen from the group of samples sharing a unique label
        value (total number of selected samples: nperlabel x len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the number of
        samples that shall be selected from the samples with the corresponding
        label.

        The method returns a Dataset object containing the selected
        samples.
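
        Example (a sketch; which samples get picked is random)::

          >>> ds = Dataset(samples=N.zeros((6, 2)),
          ...              labels=[1, 1, 1, 2, 2, 2])
          >>> ds.getRandomSamples(2).nsamples
          4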
1500 """
1501
1502 if isinstance(nperlabel, int):
1503 nperlabel = [ nperlabel for i in self.uniquelabels ]
1504
1505 sample = []
1506
1507 for i, r in enumerate(self.uniquelabels):
1508
1509 sample += random.sample( (self.labels == r).nonzero()[0],
1510 nperlabel[i] )
1511
1512 return self.selectSamples( sample )


    def getNSamples( self ):
        """Currently available number of patterns.
        """
        return self._data['samples'].shape[0]


    def getNFeatures( self ):
        """Number of features per pattern.
        """
        return self._data['samples'].shape[1]


    def getLabelsMap(self):
        """Stored labels map (if any)
        """
        return self._dsattr.get('labels_map', None)


    def setLabelsMap(self, lm):
        """Set labels map.

        Checks for the validity of the mapping -- values should cover
        all existing labels in the dataset
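
        A hedged sketch (the dataset's numerical labels are 0 and 1 here,
        so the new map must cover them)::

          >>> ds = Dataset(samples=N.zeros((2, 2)),
          ...              labels=['a', 'b'], labels_map=True)
          >>> ds.labels_map = {'A': 0, 'B': 1}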
1545 """
1546 values = Set(lm.values())
1547 labels = Set(self.uniquelabels)
1548 if not values.issuperset(labels):
1549 raise ValueError, \
1550 "Provided mapping %s has some existing labels (out of %s) " \
1551 "missing from mapping" % (list(values), list(labels))
1552 self._dsattr['labels_map'] = lm


    def setSamplesDType(self, dtype):
        """Set the data type of the samples array.
        """
        # local bindings
        _data = self._data

        if _data['samples'].dtype != dtype:
            _data['samples'] = _data['samples'].astype(dtype)


    def defineFeatureGroups(self, definition):
        """Assign a feature group definition, one group id per feature.
        """
        if not len(definition) == self.nfeatures:
            raise ValueError, \
                  "Length of feature group definition %i " \
                  "does not match the number of features %i " \
                  % (len(definition), self.nfeatures)

        self._dsattr['featuregroups'] = N.array(definition)


    def convertFeatureIds2FeatureMask(self, ids):
        """Returns a boolean mask with all features in `ids` selected.

        :Parameters:
          ids : list or 1d array
            To be selected features ids.

        :Returns:
          ndarray : dtype='bool'
            All selected features are set to True; False otherwise.
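
        Example (a sketch)::

          >>> ds = Dataset(samples=N.zeros((2, 4)), labels=1)
          >>> ds.convertFeatureIds2FeatureMask([1, 3])
          array([False,  True, False,  True], dtype=bool)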
1587 """
1588 fmask = N.repeat(False, self.nfeatures)
1589 fmask[ids] = True
1590
1591 return fmask


    def convertFeatureMask2FeatureIds(self, mask):
        """Returns feature ids corresponding to non-zero elements in the mask.

        :Parameters:
          mask : 1d ndarray
            Feature mask.

        :Returns:
          ndarray : integer
            Ids of non-zero (non-False) mask elements.
        """
        return mask.nonzero()[0]


    @staticmethod
    def _checkCopyConstructorArgs(**kwargs):
        """Common sanity check for Dataset copy constructor calls."""
        # check if we have samples (somewhere)
        samples = None
        if kwargs.has_key('samples'):
            samples = kwargs['samples']
        if samples is None and kwargs.has_key('data') \
           and kwargs['data'].has_key('samples'):
            samples = kwargs['data']['samples']
        if samples is None:
            raise DatasetError, \
                  "`samples` must be provided to copy constructor call."

        if not len(samples.shape) == 2:
            raise DatasetError, \
                  "samples must be in 2D shape in copy constructor call."


    # class properties
    nsamples = property( fget=getNSamples )
    nfeatures = property( fget=getNFeatures )
    labels_map = property( fget=getLabelsMap, fset=setLabelsMap )


def datasetmethod(func):
    """Decorator to easily bind functions to a Dataset class
    """
    if __debug__:
        debug("DS_", "Binding function %s to Dataset class" % func.func_name)

    # bind the function
    setattr(Dataset, func.func_name, func)

    # return the original function
    return func
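
# A hedged usage sketch of the decorator: after a definition like the
# one below, every Dataset instance would expose .nsamples_squared()
# (the function name is hypothetical, for illustration only):
#
#     @datasetmethod
#     def nsamples_squared(dataset):
#         return dataset.nsamples ** 2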


# Every Dataset is required to expose the following sample attributes
Dataset._registerAttribute("samples", "_data", abbr='S', hasunique=False)
Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
Dataset._registerAttribute("chunks", "_data", abbr='C', hasunique=True)
# samples ids (already unique by definition)
Dataset._registerAttribute("origids", "_data", abbr='I', hasunique=False)
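
# A hedged sketch of what the registrations above generate on Dataset:
#   ds.labels / ds.L          -- the labels array (settable)
#   ds.uniquelabels / ds.UL   -- unique label values
#   ds.samplesperlabel        -- dict: {label: number of samples}
#   ds.idsbylabels(values)    -- sample ids matching the given values
# (and analogously for `chunks`: ds.C, ds.UC, ds.samplesperchunk, ...)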