
Source Code for Module mvpa.datasets.base

# emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Dataset container"""

__docformat__ = 'restructuredtext'

import operator
import random
import mvpa.misc.copy as copy
import numpy as N

from sets import Set

# Sooner or later Dataset will become ClassWithCollections as well, but for
# now it is just an object -- thus commenting out tentative changes
#
#XXX from mvpa.misc.state import ClassWithCollections, SampleAttribute

from mvpa.misc.exceptions import DatasetError
from mvpa.misc.support import idhash as idhash_
from mvpa.base.dochelpers import enhancedDocString, table2string

if __debug__:
    from mvpa.base import debug, warning

def _validate_indexes_uniq_sorted(seq, fname, item):
    """Helper to warn if the given indexes are non-unique or unsorted"""
    if operator.isSequenceType(seq):
        seq_unique = N.unique(seq)
        if len(seq) != len(seq_unique):
            warning("%s() operates only with indexes for %s without"
                    " repetitions. Repetitions were removed."
                    % (fname, item))
        if N.any(N.sort(seq) != seq_unique):
            warning("%s() does not guarantee the original order"
                    " of selected %ss. Use selectSamples() and"
                    " selectFeatures(sort=False) instead" % (fname, item))


#XXX class Dataset(ClassWithCollections):
class Dataset(object):
    """*The* Dataset.

    This class provides a container to store all necessary data to
    perform MVPA analyses. These are the data samples, as well as the
    labels associated with the samples. Additionally, samples can be
    grouped into chunks.

    :Groups:
      - `Creators`: `__init__`, `selectFeatures`, `selectSamples`,
        `applyMapper`
      - `Mutators`: `permuteLabels`

    Important: labels are assumed to be immutable, i.e. no one should
    modify them externally by accessing indexed items, i.e. something
    like ``dataset.labels[1] += "_bad"`` should not be used. If a label
    has to be modified, a full copy of the labels should be obtained,
    operated on, and assigned back to the dataset; otherwise
    dataset.uniquelabels would not work. The same applies to any other
    attribute which has a corresponding unique* access property.

    """
    # XXX Notes about migration to use Collections to store data and
    #     attributes for samples, features, and dataset itself:

    # changes:
    #   _data   -> s_attr collection (samples attributes)
    #   _dsattr -> ds_attr collection
    #              f_attr collection (features attributes)

    # static definition to track which unique attributes
    # have to be reset/recomputed whenever anything relevant
    # changes

    # unique{labels,chunks} become a part of dsattr
    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in _data)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__, or otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    #XXX _ATTRIBUTE_COLLECTIONS = [ 's_attr', 'f_attr', 'ds_attr' ]
    #XXX """Assure those 3 collections to be present in all datasets"""
    #XXX
    #XXX samples__ = SampleAttribute(doc="Samples data. 0th index is time", hasunique=False) # XXX
    #XXX labels__ = SampleAttribute(doc="Labels for the samples", hasunique=True)
    #XXX chunks__ = SampleAttribute(doc="Chunk identities for the samples", hasunique=True)
    #XXX # samples ids (already unique by definition)
    #XXX origids__ = SampleAttribute(doc="Origin ids for the samples", hasunique=False)
    def __init__(self,
                 # for copy constructor
                 data=None,
                 dsattr=None,
                 # automatic dtype conversion
                 dtype=None,
                 # new instances
                 samples=None,
                 labels=None,
                 labels_map=None,
                 chunks=None,
                 origids=None,
                 # flags
                 check_data=True,
                 copy_samples=False,
                 copy_data=True,
                 copy_dsattr=True):
        """Initialize dataset instance

        There are basically two different ways to create a dataset:

        1. Create a new dataset from samples and sample attributes.  In
           this mode a two-dimensional `ndarray` has to be passed to the
           `samples` keyword argument and the corresponding samples
           attributes are provided via the `labels` and `chunks`
           arguments.

        2. Copy constructor mode
           The second way is used internally to perform quick copying
           of datasets, e.g. when performing feature selection.  In this
           mode the two dictionaries (`data` and `dsattr`) are required.
           For performance reasons this mode bypasses most of the sanity
           checks performed by the previous mode, as data integrity is
           assumed for internal operations.


        :Parameters:
          data : dict
            Dictionary with an arbitrary number of entries. The value for
            each key in the dict has to be an ndarray with the
            same length as the number of rows in the samples array.
            A special entry in this dictionary is 'samples', a 2d array
            (samples x features). A shallow copy is stored in the object.
          dsattr : dict
            Dictionary of dataset attributes. An arbitrary number of
            arbitrarily named and typed objects can be stored here. A
            shallow copy of the dictionary is stored in the object.
          dtype : type | None
            If None -- do not change data type if samples
            is an ndarray. Otherwise convert samples to dtype.


        :Keywords:
          samples : ndarray
            2d array (samples x features)
          labels
            An array or scalar value defining labels for each sample
          labels_map : None or bool or dict
            Map from labels into literal names. If None or True, the
            mapping is computed from the labels, which must be literal.
            If False, no mapping is computed. If a dict is given, the
            mapping is verified and applied, i.e. the labels get
            remapped. The dict must map literal -> number.
          chunks
            An array or scalar value defining chunks for each sample

        Each of the keyword arguments overwrites what is/might be
        already in the `data` container.

        """

        #XXX ClassWithCollections.__init__(self)

        # see if data and dsattr are None; if so, make them empty dicts
        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        # initialize containers; default values are empty dicts
        # always make a shallow copy of what comes in, otherwise total chaos
        # is likely to happen soon
        if copy_data:
            # deep copy (cannot use copy.deepcopy, because samples is an
            # exception), but shallow copy first to get a shared version
            # of the data in any case
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if requested
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy
            # XXX? yoh: it might be better speed-wise to just assign the
            #      dictionary without any shallow .copy
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr) > 0:
            # deep copy
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)
        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        # has to be not private since otherwise derived methods
        # would have problem accessing it and _registerAttribute
        # would fail on lambda getters
        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # store samples (and possibly transform/reshape/retype them)
        if samples is not None:
            if __debug__:
                if lcl_data.has_key('samples'):
                    debug('DS',
                          "`data` dict has `samples` (%s) but there is also"
                          " the __init__ parameter `samples` which overrides"
                          " the one stored in `data`"
                          % (`lcl_data['samples'].shape`))
            lcl_data['samples'] = self._shapeSamples(samples, dtype,
                                                     copy_samples)

        # TODO? we might want to have the same logic for chunks and labels
        #       ie if no labels present -- assign arange
        # MH: don't think this is necessary -- or is there a use case?
        # labels
        if labels is not None:
            if __debug__:
                if lcl_data.has_key('labels'):
                    debug('DS',
                          "`data` dict has `labels` (%s) but there is also"
                          " the __init__ parameter `labels` which overrides"
                          " the one stored in `data`" % (`lcl_data['labels']`))
            if lcl_data.has_key('samples'):
                lcl_data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not lcl_data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        nsamples = self.nsamples

        # chunks
        if chunks is not None:
            lcl_data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not lcl_data.has_key('chunks'):
            # if no chunk information is given assume that every pattern
            # is its own chunk
            lcl_data['chunks'] = N.arange(nsamples)

        # samples origids
        if origids is not None:
            # simply assign if provided
            lcl_data['origids'] = origids
        elif not lcl_data.has_key('origids'):
            # otherwise construct unique ones
            lcl_data['origids'] = N.arange(len(lcl_data['labels']))
        else:
            # assume origids have been specified already (copy constructor
            # mode) leave them as they are, e.g. to make origids survive
            # selectSamples()
            pass

        # Initialize attributes which are registered but were not setup
        for attr in self._registeredattributes:
            if not lcl_data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                lcl_data[attr] = N.zeros(nsamples)

        # labels_map
        labels_ = N.asarray(lcl_data['labels'])
        labels_map_known = lcl_dsattr.has_key('labels_map')
        if labels_map is True:
            # need to compose labels_map
            if labels_.dtype.char == 'S' or not labels_map_known:
                # Create mapping
                ulabels = list(Set(labels_))
                ulabels.sort()
                labels_map = dict([(x[1], x[0]) for x in enumerate(ulabels)])
                if __debug__:
                    debug('DS', 'Mapping for the labels computed to be %s'
                          % labels_map)
            else:
                if __debug__:
                    debug('DS', 'Mapping of labels was requested but labels '
                          'are not strings. Skipped')
                labels_map = None
        elif labels_map is False:
            labels_map = None

        if isinstance(labels_map, dict):
            if labels_map_known:
                if __debug__:
                    debug('DS',
                          "`dsattr` dict has `labels_map` (%s) but there is"
                          " also the __init__ parameter `labels_map` (%s)"
                          " which overrides the one stored in `dsattr`"
                          % (lcl_dsattr['labels_map'], labels_map))

            lcl_dsattr['labels_map'] = labels_map
            # map labels if needed (if strings or was explicitly requested)
            if labels_.dtype.char == 'S' or not labels_map_known:
                if __debug__:
                    debug('DS_', "Remapping labels using mapping %s"
                          % labels_map)
                # need to remap
                # !!! N.array is important here
                try:
                    lcl_data['labels'] = N.array(
                        [labels_map[x] for x in lcl_data['labels']])
                except KeyError, e:
                    raise ValueError, "Provided labels_map %s is insufficient " \
                          "to map all the labels. Mapping for label %s is " \
                          "missing" % (labels_map, e)

        elif not lcl_dsattr.has_key('labels_map'):
            lcl_dsattr['labels_map'] = labels_map
        elif __debug__:
            debug('DS_', 'Not overriding labels_map in dsattr since it has one')

        if check_data:
            self._checkData()

        # lazy computation of unique members
        #self._resetallunique('_dsattr', self._dsattr)

        # Michael: we cannot do this conditionally here. When selectSamples()
        # removes a whole data chunk the uniquechunks values will be invalid.
        # Same applies to labels of course.
        if labels is not None or chunks is not None:
            # speed-up: don't go through all unique attributes
            # when there is no need
            lcl_dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)
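
    # Example (a minimal sketch; the values are illustrative only):
    # creating a dataset from samples and attributes, and the quick
    # copy-constructor mode used internally:
    #
    #   >>> ds = Dataset(samples=N.random.normal(size=(4, 3)),
    #   ...              labels=[1, 1, 2, 2],
    #   ...              chunks=[0, 0, 1, 1])
    #   >>> ds.nsamples, ds.nfeatures
    #   (4, 3)
    #   >>> ds2 = Dataset(data=ds._data, dsattr=ds._dsattr,
    #   ...               check_data=False)   # copy-constructor path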


    __doc__ = enhancedDocString('Dataset', locals())


    @property
    def idhash(self):
        """Verify if dataset is in the same state as when smth else was done

        E.g. to check whether a classifier was trained on the same dataset
        as the one in question."""

        _data = self._data
        res = idhash_(_data)

        # we cannot count on the order the values in the dict will show up
        # with `self._data.values()` and since idhash will be order-dependent
        # we have to make it deterministic
        keys = _data.keys()
        keys.sort()
        for k in keys:
            res += idhash_(_data[k])
        return res


    def _resetallunique(self, force=False):
        """Set all unique* attributes of the corresponding dictionary to None
        """
        _dsattr = self._dsattr

        if not force and _dsattr['__uniquereseted']:
            return

        _uniqueattributes = self._uniqueattributes

        if __debug__ and "DS_" in debug.active:
            debug("DS_", "Resetting all attributes %s for dataset %s"
                  % (_uniqueattributes,
                     self.summary(uniq=False, idhash=False,
                                  stats=False, lstats=False)))

        # I guess we had better check if dictname is known but...
        for k in _uniqueattributes:
            _dsattr[k] = None
        _dsattr['__uniquereseted'] = True


    def _getuniqueattr(self, attrib, dict_):
        """Provide common facility to return unique attributes

        XXX `dict_` can be simply replaced now with self._dsattr
        """

        # local bindings
        _dsattr = self._dsattr

        if not _dsattr.has_key(attrib) or _dsattr[attrib] is None:
            if __debug__ and 'DS_' in debug.active:
                debug("DS_", "Recomputing unique set for attrib %s within %s"
                      % (attrib, self.summary(uniq=False,
                                              stats=False, lstats=False)))
            # uff... might come up with better strategy to keep relevant
            # attribute name
            _dsattr[attrib] = N.unique( N.asanyarray(dict_[attrib[6:]]) )
            assert(not _dsattr[attrib] is None)
            _dsattr['__uniquereseted'] = False

        return _dsattr[attrib]


    def _setdataattr(self, attrib, value):
        """Provide common facility to set attributes

        """
        if len(value) != self.nsamples:
            raise ValueError, \
                  "Provided %s have %d entries while there are %d samples" % \
                  (attrib, len(value), self.nsamples)
        self._data[attrib] = N.asarray(value)
        uniqueattr = "unique" + attrib

        _dsattr = self._dsattr
        if _dsattr.has_key(uniqueattr):
            _dsattr[uniqueattr] = None


    def _getNSamplesPerAttr( self, attrib='labels' ):
        """Returns the number of samples per unique value of the attribute.
        """
        # local bindings
        _data = self._data

        # XXX hardcoded dict_=self._data.... might be in self._dsattr
        uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
                                         dict_=_data)

        # use dictionary to cope with arbitrary labels
        result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr)))
        for l in _data[attrib]:
            result[l] += 1

        # XXX only return values to mimic the old interface but we might want
        #     to return the full dict instead
        # return result
        return result


    def _getSampleIdsByAttr(self, values, attrib="labels",
                            sort=True):
        """Return indices of samples given a list of attribute values
        """

        if not operator.isSequenceType(values) \
               or isinstance(values, basestring):
            values = [ values ]

        # TODO: compare to plain for loop through the labels
        #       on a real data example
        sel = N.array([], dtype=N.int16)
        _data = self._data
        for value in values:
            sel = N.concatenate((
                sel, N.where(_data[attrib]==value)[0]))

        if sort:
            # place samples in the right order
            sel.sort()

        return sel


    def idsonboundaries(self, prior=0, post=0,
                        attributes_to_track=['labels', 'chunks'],
                        affected_labels=None,
                        revert=False):
        """Find samples which are on the boundaries of the blocks

        Such samples might need to be removed. By default (with
        prior=0, post=0) ids of the first samples in a 'block' are
        reported.

        :Parameters:
          prior : int
            how many samples prior to the transition sample to include
          post : int
            how many samples after the transition sample to include
          attributes_to_track : list of basestring
            which attributes to track to decide on the boundary condition
          affected_labels : list of basestring
            for which labels to perform the selection. If None -- for all
          revert : bool
            whether to revert the meaning and return ids of samples which
            are found *not* to be boundary samples
        """
        # local bindings
        _data = self._data
        labels = self.labels
        nsamples = self.nsamples

        lastseen = [None for attr in attributes_to_track]
        transitions = []

        for i in xrange(nsamples):
            current = [_data[attr][i] for attr in attributes_to_track]
            if lastseen != current:
                # transition point
                new_transitions = range(max(0, i-prior),
                                        min(nsamples-1, i+post)+1)
                if affected_labels is not None:
                    new_transitions = filter(
                        lambda i: labels[i] in affected_labels,
                        new_transitions)
                transitions += new_transitions
                lastseen = current

        transitions = Set(transitions)
        if revert:
            transitions = Set(range(nsamples)).difference(transitions)

        # postprocess
        transitions = N.array(list(transitions))
        transitions.sort()
        return list(transitions)
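
    # Example (a minimal sketch; illustrative values): with chunks
    # [0, 0, 1, 1] and labels [1, 2, 1, 2] every sample starts a new
    # (label, chunk) block, so all ids would be reported; tracking only
    # chunks yields just the first sample of each chunk:
    #
    #   >>> ds = Dataset(samples=N.zeros((4, 2)),
    #   ...              labels=[1, 2, 1, 2], chunks=[0, 0, 1, 1])
    #   >>> ds.idsonboundaries(attributes_to_track=['chunks'])
    #   [0, 2]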


    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples

        Handle all possible input values for 'samples' and transform
        them into a 2d (samples x features) representation.
        """
        # put samples array into correct shape
        # 1d arrays or simple sequences are assumed to be a single pattern
        if (not isinstance(samples, N.ndarray)):
            # it is safe to provide dtype which defaults to None,
            # when N would choose appropriate dtype automagically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (not dtype is None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d sample " \
                  + "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples


    def _checkData(self):
        """Checks `_data` members to have the same # of samples.
        """
        #
        # XXX: Maybe just run this under __debug__ and remove the
        #      `check_data` flag from the constructor, which is too
        #      complicated anyway?
        #

        # local bindings
        nsamples = self.nsamples
        _data = self._data

        for k, v in _data.iteritems():
            if not len(v) == nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), nsamples)

        # check for unique origids
        uniques = N.unique(_data['origids'])
        uniques.sort()
        # need to copy to prevent sorting the original array
        sorted_ids = _data['origids'].copy()
        sorted_ids.sort()

        if not (uniques == sorted_ids).all():
            raise DatasetError, "Samples IDs are not unique."


    def _expandSampleAttribute(self, attr, attr_name):
        """If a sample attribute is given as a scalar expand/repeat it to a
        length matching the number of samples in the dataset.
        """
        try:
            # if we are initializing with a single string -- we should
            # treat it as a single label
            if isinstance(attr, basestring):
                raise TypeError
            if len(attr) != self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%d]" \
                      % (attr_name, len(attr)) \
                      + " has to match the number of samples" \
                      + " [%d]." % self.nsamples
            # store the sequence as array
            return N.array(attr)

        except TypeError:
            # make sequence of identical value matching the number of
            # samples
            return N.repeat(attr, self.nsamples)


    @classmethod
    def _registerAttribute(cls, key, dictname="_data", abbr=None,
                           hasunique=False):
        """Register an attribute for any Dataset class.

        Creates a property with getters/setters depending on the
        availability of corresponding _get, _set functions.
        """
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)
            # define get function and use corresponding
            # _getATTR if such is defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use corresponding
            # _setATTR if such is defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif dictname == "_data":
                setter = "lambda self,x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if abbr is not None:
                exec "%s.%s = property(fget=%s, fset=%s)" % \
                     (cls.__name__, abbr, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)
                if abbr is not None:
                    exec "%s.U%s = property(fget=%s)" % \
                         (cls.__name__, abbr, getter)

                # create samplesper<ATTR> properties
                sampleskey = "samplesper%s" % key[:-1] # remove ending 's' XXX
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<ATTR> properties
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x,attrib='%s')"
                                     % key)

                cls._uniqueattributes.append(uniquekey)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to re-register attribute `%s`. For now ' % key +
                    'such capability is not present')


    def __str__(self):
        """String summary of the object
        """
        return self.summary(uniq=True,
                            idhash=__debug__ and ('DS_ID' in debug.active),
                            stats=__debug__ and ('DS_STATS' in debug.active),
                            lstats=__debug__ and ('DS_STATS' in debug.active),
                            )


    def __repr__(self):
        return "<%s>" % str(self)


    def summary(self, uniq=True, stats=True, idhash=False, lstats=True,
                maxc=30, maxl=20):
        """String summary of the object

        :Parameters:
          uniq : bool
            Include summary over data attributes which have unique values
          idhash : bool
            Include idhash value for dataset and samples
          stats : bool
            Include some basic statistics (mean, std, var) over dataset
            samples
          lstats : bool
            Include statistics on chunks/labels
          maxc : int
            Maximal number of chunks when providing details on labels/chunks
          maxl : int
            Maximal number of labels when providing details on labels/chunks
        """
        # local bindings
        samples = self.samples
        _data = self._data
        _dsattr = self._dsattr

        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        ssep = (' ', '\n')[lstats]
        if uniq:
            s += "%suniq:" % ssep
            for uattr in _dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=_data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if isinstance(self.labels_map, dict):
            s += ' labels_mapped'

        if stats:
            # TODO -- avg per chunk?
            # XXX We might like to use scipy.stats.describe to get
            #     quick summary statistics (mean/range/skewness/kurtosis)
            s += "%sstats: mean=%g std=%g var=%g min=%g max=%g\n" % \
                 (ssep, N.mean(samples), N.std(samples),
                  N.var(samples), N.min(samples), N.max(samples))

        if lstats:
            s += self.summary_labels(maxc=maxc, maxl=maxl)

        return s
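
    # Example (a minimal sketch): str(ds) routes through summary(); a
    # condensed one-line variant can be requested directly (the exact
    # ordering of the `uniq` entries may vary with dict key order):
    #
    #   >>> ds = Dataset(samples=N.ones((4, 2)),
    #   ...              labels=[1, 1, 2, 2], chunks=[0, 0, 1, 1])
    #   >>> ds.summary(stats=False, lstats=False)
    #   'Dataset / float64 4 x 2 uniq: 2 labels 2 chunks'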


    def summary_labels(self, maxc=30, maxl=20):
        """Provide summary statistics over the labels and chunks

        :Parameters:
          maxc : int
            Maximal number of chunks when providing details
          maxl : int
            Maximal number of labels when providing details
        """
        # We had better avoid a bound function, since if people only
        # imported Dataset without miscfx it would fail
        from mvpa.datasets.miscfx import getSamplesPerChunkLabel
        spcl = getSamplesPerChunkLabel(self)
        # XXX couldn't they be unordered?
        ul = self.uniquelabels.tolist()
        uc = self.uniquechunks.tolist()
        s = ""
        if len(ul) < maxl and len(uc) < maxc:
            s += "\nCounts of labels in each chunk:"
            # only do printing in a reasonable case
            table = [[' chunks\labels'] + ul]
            table += [[''] + ['---'] * len(ul)]
            for c, counts in zip(uc, spcl):
                table.append([ str(c) ] + counts.tolist())
            s += '\n' + table2string(table)
        else:
            s += "No details due to large number of labels or chunks. " \
                 "Increase maxc and maxl if desired"

        labels_map = self.labels_map
        if isinstance(labels_map, dict):
            s += "\nOriginal labels were mapped using the following mapping:"
            s += '\n\t'+'\n\t'.join([':\t'.join(map(str, x))
                                     for x in labels_map.items()]) + '\n'

        def cl_stats(axis, u, name1, name2):
            # Compute statistics per label
            stats = {'min': N.min(spcl, axis=axis),
                     'max': N.max(spcl, axis=axis),
                     'mean': N.mean(spcl, axis=axis),
                     'std': N.std(spcl, axis=axis),
                     '#%ss' % name2: N.sum(spcl>0, axis=axis)}
            entries = [' ' + name1, 'mean', 'std', 'min', 'max',
                       '#%ss' % name2]
            table = [ entries ]
            for i, l in enumerate(u):
                d = {' ' + name1 : l}
                d.update(dict([ (k, stats[k][i]) for k in stats.keys()]))
                table.append( [ ('%.3g', '%s')[isinstance(d[e], basestring)]
                                % d[e] for e in entries] )
            return '\nSummary per %s across %ss\n' % (name1, name2) \
                   + table2string(table)

        if len(ul) < maxl:
            s += cl_stats(0, ul, 'label', 'chunk')
        if len(uc) < maxc:
            s += cl_stats(1, uc, 'chunk', 'label')
        return s


    def __iadd__(self, other):
        """Merge the samples of one Dataset object into another (in-place).

        No dataset attributes, besides labels_map, will be merged!
        Additionally, a new set of unique `origids` will be generated.
        """
        # local bindings
        _data = self._data
        other_data = other._data

        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                                "features does not match."

        # take care about labels_map and labels
        slm = self.labels_map
        olm = other.labels_map
        if N.logical_xor(slm is None, olm is None):
            raise ValueError, "Cannot add datasets where only one of them " \
                  "has labels map assigned. If needed -- implement it"

        # concatenate all sample attributes
        for k, v in _data.iteritems():
            if k == 'origids':
                # special case samples origids: for now just regenerate unique
                # ones. Could also check if concatenation is unique, but it
                # would be costly performance-wise
                _data[k] = N.arange(len(v) + len(other_data[k]))

            elif k == 'labels' and slm is not None:
                # special care about labels if mapping was in effect,
                # we need to append 2nd map to the first one and
                # relabel 2nd dataset
                nlm = slm.copy()
                # figure out maximal numerical label used now
                nextid = N.sort(nlm.values())[-1] + 1
                olabels = other.labels
                olabels_remap = {}
                for ol, olnum in olm.iteritems():
                    if not nlm.has_key(ol):
                        # check if we can preserve old numeric label
                        # if not -- assign some new one not yet present
                        # in any dataset
                        if olnum in nlm.values():
                            nextid = N.sort(nlm.values() + olm.values())[-1] + 1
                        else:
                            nextid = olnum
                        olabels_remap[olnum] = nextid
                        nlm[ol] = nextid
                        nextid += 1
                    else:
                        olabels_remap[olnum] = nlm[ol]
                olabels = [olabels_remap[x] for x in olabels]
                # finally compose new labels
                _data['labels'] = N.concatenate((v, olabels), axis=0)
                # and reassign new mapping
                self._dsattr['labels_map'] = nlm

                if __debug__:
                    # check if we are not dealing with colliding
                    # mapping, since it is problematic and might lead
                    # to various complications
                    if (len(Set(slm.keys())) != len(Set(slm.values()))) or \
                       (len(Set(olm.keys())) != len(Set(olm.values()))):
                        warning("Adding datasets where multiple labels "
                                "mapped to the same ID is not recommended. "
                                "Please check the outcome. Original mappings "
                                "were %s and %s. Resultant is %s"
                                % (slm, olm, nlm))

            else:
                _data[k] = N.concatenate((v, other_data[k]), axis=0)

        # might be more sophisticated but for now just reset -- it is safer ;)
        self._resetallunique()

        return self
921 - def __add__( self, other ):
922 """Merge the samples two Dataset objects. 923 924 All data of both datasets is copied, concatenated and a new Dataset is 925 returned. 926 927 NOTE: This can be a costly operation (both memory and time). If 928 performance is important consider the '+=' operator. 929 """ 930 # create a new object of the same type it is now and NOT only Dataset 931 out = super(Dataset, self).__new__(self.__class__) 932 933 # now init it: to make it work all Dataset contructors have to accept 934 # Class(data=Dict, dsattr=Dict) 935 out.__init__(data=self._data, 936 dsattr=self._dsattr, 937 copy_samples=True, 938 copy_data=True, 939 copy_dsattr=True) 940 941 out += other 942 943 return out
944 945
946 - def copy(self):
947 """Create a copy (clone) of the dataset, by fully copying current one 948 949 """ 950 # create a new object of the same type it is now and NOT only Dataset 951 out = super(Dataset, self).__new__(self.__class__) 952 953 # now init it: to make it work all Dataset contructors have to accept 954 # Class(data=Dict, dsattr=Dict) 955 out.__init__(data=self._data, 956 dsattr=self._dsattr, 957 copy_samples=True, 958 copy_data=True, 959 copy_dsattr=True) 960 961 return out
962 963
964 - def selectFeatures(self, ids=None, sort=True, groups=None):
965 """Select a number of features from the current set. 966 967 :Parameters: 968 ids 969 iterable container to select ids 970 sort : bool 971 if to sort Ids. Order matters and `selectFeatures` assumes 972 incremental order. If not such, in non-optimized code 973 selectFeatures would verify the order and sort 974 975 Returns a new Dataset object with a view of the original 976 samples array (no copying is performed). 977 978 WARNING: The order of ids determines the order of features in 979 the returned dataset. This might be useful sometimes, but can 980 also cause major headaches! Order would is verified when 981 running in non-optimized code (if __debug__) 982 """ 983 if ids is None and groups is None: 984 raise ValueError, "No feature selection specified." 985 986 # start with empty list if no ids where specified (so just groups) 987 if ids is None: 988 ids = [] 989 990 if not groups is None: 991 if not self._dsattr.has_key('featuregroups'): 992 raise RuntimeError, \ 993 "Dataset has no feature grouping information." 994 995 for g in groups: 996 ids += (self._dsattr['featuregroups'] == g).nonzero()[0].tolist() 997 998 # XXX set sort default to True, now sorting has to be explicitely 999 # disabled and warning is not necessary anymore 1000 if sort: 1001 ids.sort() 1002 elif __debug__ and 'CHECK_DS_SORTED' in debug.active: 1003 from mvpa.misc.support import isSorted 1004 if not isSorted(ids): 1005 warning("IDs for selectFeatures must be provided " + 1006 "in sorted order, otherwise major headache might occur") 1007 1008 # shallow-copy all stuff from current data dict 1009 new_data = self._data.copy() 1010 1011 # assign the selected features -- data is still shared with 1012 # current dataset 1013 new_data['samples'] = self._data['samples'][:, ids] 1014 1015 # apply selection to feature groups as well 1016 if self._dsattr.has_key('featuregroups'): 1017 new_dsattr = self._dsattr.copy() 1018 new_dsattr['featuregroups'] = self._dsattr['featuregroups'][ids] 1019 else: 1020 new_dsattr = self._dsattr 1021 1022 # create a new object of the same type it is now and NOT onyl Dataset 1023 dataset = super(Dataset, self).__new__(self.__class__) 1024 1025 # now init it: to make it work all Dataset contructors have to accept 1026 # Class(data=Dict, dsattr=Dict) 1027 dataset.__init__(data=new_data, 1028 dsattr=new_dsattr, 1029 check_data=False, 1030 copy_samples=False, 1031 copy_data=False, 1032 copy_dsattr=False 1033 ) 1034 1035 return dataset
1036 1037
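
    # Example (a minimal sketch): keep only features 0 and 2 of a
    # 4x3 dataset; sample attributes are carried over unchanged:
    #
    #   >>> ds = Dataset(samples=N.arange(12).reshape(4, 3),
    #   ...              labels=[1, 1, 2, 2])
    #   >>> ds_sel = ds.selectFeatures([0, 2])
    #   >>> ds_sel.nfeatures
    #   2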


    def applyMapper(self, featuresmapper=None, samplesmapper=None,
                    train=True):
        """Obtain new dataset by applying mappers over features and/or
        samples.

        While featuresmappers leave the sample attributes information
        unchanged, as the number of samples in the dataset is invariant,
        samplesmappers are also applied to the samples attributes
        themselves!

        Applying a featuresmapper will destroy any feature grouping
        information.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples
          train : bool
            Flag whether to train the mapper with this dataset before
            applying it.

        TODO: selectFeatures is pretty much
              applyMapper(featuresmapper=MaskMapper(...))
        """

        # shallow-copy all stuff from current data dict
        new_data = self._data.copy()

        # apply mappers

        if samplesmapper:
            if __debug__:
                debug("DS", "Training samplesmapper %s" % `samplesmapper`)
            samplesmapper.train(self)

            if __debug__:
                debug("DS", "Applying samplesmapper %s" % `samplesmapper` +
                      " to samples of dataset `%s`" % `self`)

            # get rid of existing 'origids' as they are not valid anymore and
            # applying a mapper to them is not really meaningful
            if new_data.has_key('origids'):
                del(new_data['origids'])

            # apply mapper to all sample-wise data in dataset
            for k in new_data.keys():
                new_data[k] = samplesmapper.forward(self._data[k])

        # feature mapping might affect dataset attributes
        # XXX: might be obsolete when proper feature attributes are
        #      implemented
        new_dsattr = self._dsattr

        if featuresmapper:
            if __debug__:
                debug("DS", "Training featuresmapper %s" % `featuresmapper`)
            featuresmapper.train(self)

            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

            # remove feature grouping, who knows what the mapper did to the
            # features
            if self._dsattr.has_key('featuregroups'):
                new_dsattr = self._dsattr.copy()
                del(new_dsattr['featuregroups'])
            else:
                new_dsattr = self._dsattr

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=new_dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        # samples attributes might have changed after applying samplesmapper
        if samplesmapper:
            dataset._resetallunique(force=True)

        return dataset


    def selectSamples(self, ids):
        """Choose a subset of samples defined by sample IDs.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
              list of ids and is not ordered. Clarify with Michael what is
              our intent here!
        """
        # without having a sequence as index the masked sample array would
        # lose its 2d layout
        if not operator.isSequenceType( ids ):
            ids = [ids]
        # TODO: Reconsider crafting a slice if it can be done to avoid
        #       copying the data
        #try:
        #    minmask = min(mask)
        #    maxmask = max(mask)
        #except:
        #    minmask = min(map(int,mask))
        #    maxmask = max(map(int,mask))
        # lets see if we could get it done with cheap view/slice
        #if len(mask) > 2 and \
        #       (minmask, maxmask) != (0, 1) and \
        #       N.array([N.arange(minmask, maxmask+1) == N.array(mask)]).all():
        #    slice_ = slice(minmask, maxmask+1)
        #    if __debug__:
        #        debug("DS", "We can and do convert mask %s into splice %s" %
        #              (mask, slice_))
        #    mask = slice_

        # mask all sample attributes
        data = {}
        for k, v in self._data.iteritems():
            data[k] = v[ids, ]

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False)

        dataset._resetallunique(force=True)
        return dataset
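
    # Example (a minimal sketch): pick samples 0 and 2; unique
    # attributes are recomputed for the subset:
    #
    #   >>> ds = Dataset(samples=N.arange(8).reshape(4, 2),
    #   ...              labels=[1, 1, 2, 2])
    #   >>> ds_sub = ds.selectSamples([0, 2])
    #   >>> ds_sub.nsamples, list(ds_sub.uniquelabels)
    #   (2, [1, 2])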


    def index(self, *args, **kwargs):
        """Universal indexer to obtain indexes of interesting
        samples/features.  See .select() for more information.

        :Return: tuple of (samples indexes, features indexes). Each
          item could also be None, if no selection on samples or
          features was requested (to discriminate between no selected
          items, and no selections)
        """
        s_indx = []                     # selections for samples
        f_indx = []                     # selections for features
        return_dataset = kwargs.pop('return_dataset', False)
        largs = len(args)

        args = list(args)               # so we could override
        # Figure out number of positional arguments
        largs_nonstring = 0
        # need to go with index since we might need to override internally
        for i in xrange(largs):
            l = args[i]
            if isinstance(l, basestring):
                if l.lower() == 'all':
                    # override with a slice
                    args[i] = slice(None)
                else:
                    break
            largs_nonstring += 1

        if largs_nonstring >= 1:
            s_indx.append(args[0])
            if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                _validate_indexes_uniq_sorted(args[0], 'select', 'samples')
            if largs_nonstring == 2:
                f_indx.append(args[1])
                if __debug__ and 'CHECK_DS_SELECT' in debug.active:
                    _validate_indexes_uniq_sorted(args[1], 'select',
                                                  'features')
            elif largs_nonstring > 2:
                raise ValueError, "Only two positional arguments are " \
                      "allowed: 1st for samples, 2nd for features"

        # process remaining positional arguments which must encode
        # selections like ('labels', [1,2,3])

        if (largs - largs_nonstring) % 2 != 0:
            raise ValueError, "Positional selections must come in pairs:" \
                  " e.g. ('labels', [1,2,3])"

        for i in xrange(largs_nonstring, largs, 2):
            k, v = args[i:i+2]
            kwargs[k] = v

        # process keyword parameters
        data_ = self._data
        for k, v in kwargs.iteritems():
            if k == 'samples':
                s_indx.append(v)
            elif k == 'features':
                f_indx.append(v)
            elif data_.has_key(k):
                # so it is an attribute for samples
                # XXX may be do it not only if __debug__
                if __debug__: # and 'CHECK_DS_SELECT' in debug.active:
                    if not N.any([isinstance(v, cls) for cls in
                                  [list, tuple, slice, int]]):
                        raise ValueError, "Trying to specify selection for " \
                              "%s based on unsupported '%s'" % (k, v)
                s_indx.append(self._getSampleIdsByAttr(v, attrib=k,
                                                       sort=False))
            else:
                raise ValueError, 'Keyword "%s" is not known, thus ' \
                      'select() failed' % k

        def combine_indexes(indx, nelements):
            """Helper function: intersect selections given in indx

            :Parameters:
              indx : list of lists or slices
                selections of elements
              nelements : int
                number of elements total for deriving indexes from slices
            """
            indx_sel = None             # pure list of ids for selection
            for s in indx:
                if isinstance(s, slice) or \
                       isinstance(s, N.ndarray) and s.dtype==bool:
                    # XXX there might be a better way than reconstructing
                    #     the full index list. Also we are losing the
                    #     ability to do simple slicing, i.e. w/o making
                    #     a copy of the selected data
                    all_indexes = N.arange(nelements)
                    s = all_indexes[s]
                elif not operator.isSequenceType(s):
                    s = [ s ]

                if indx_sel is None:
                    indx_sel = Set(s)
                else:
                    # To be consistent
                    #if not isinstance(indx_sel, Set):
                    #    indx_sel = Set(indx_sel)
                    indx_sel = indx_sel.intersection(s)

            # if we got a Set -- convert
            if isinstance(indx_sel, Set):
                indx_sel = list(indx_sel)

            # sort for the sake of sanity
            indx_sel.sort()

            return indx_sel

        # Select samples
        if len(s_indx) == 1 and isinstance(s_indx[0], slice) \
               and s_indx[0] == slice(None):
            # so no actual selection -- full slice
            s_indx = s_indx[0]
        else:
            # else - get indexes
            if len(s_indx) == 0:
                s_indx = None
            else:
                s_indx = combine_indexes(s_indx, self.nsamples)

        # Select features
        if len(f_indx):
            f_indx = combine_indexes(f_indx, self.nfeatures)
        else:
            f_indx = None

        return s_indx, f_indx


    def select(self, *args, **kwargs):
        """Universal selector

        WARNING: if you need to select duplicate samples
        (e.g. samples=[5,5]), or if the order of the selected samples or
        features matters and must not be sorted (e.g. samples=[3,2,1]),
        please use selectFeatures or selectSamples directly.

        Examples:
          Mimic plain selectSamples::

            dataset.select([1,2,3])
            dataset[[1,2,3]]

          Mimic plain selectFeatures::

            dataset.select(slice(None), [1,2,3])
            dataset.select('all', [1,2,3])
            dataset[:, [1,2,3]]

          Mixed (select features and samples)::

            dataset.select([1,2,3], [1, 2])
            dataset[[1,2,3], [1, 2]]

          Select samples matching some attributes::

            dataset.select(labels=[1,2], chunks=[2,4])
            dataset.select('labels', [1,2], 'chunks', [2,4])
            dataset['labels', [1,2], 'chunks', [2,4]]

          Mixed -- out of the first 100 samples, select only those with
          labels 1 or 2 and belonging to chunks 2 or 4, and select
          features 2 and 3::

            dataset.select(slice(0,100), [2,3], labels=[1,2], chunks=[2,4])
            dataset[:100, [2,3], 'labels', [1,2], 'chunks', [2,4]]

        """
        s_indx, f_indx = self.index(*args, **kwargs)

        # Select samples
        if s_indx == slice(None):
            # so no actual selection was requested among samples.
            # thus proceed with original dataset
            if __debug__:
                debug('DS', 'in select() not selecting samples')
            ds = self
        else:
            # else do selection
            if __debug__:
                debug('DS', 'in select() selecting samples given selections'
                      + str(s_indx))
            ds = self.selectSamples(s_indx)

        # Select features
        if f_indx is not None:
            if __debug__:
                debug('DS', 'in select() selecting features given selections'
                      + str(f_indx))
            ds = ds.selectFeatures(f_indx)

        return ds


    def where(self, *args, **kwargs):
        """Obtain indexes of interesting samples/features. See select() for
        more information.

        XXX somewhat obsoletes idsby...
        """
        s_indx, f_indx = self.index(*args, **kwargs)
        if s_indx is not None and f_indx is not None:
            return s_indx, f_indx
        elif s_indx is not None:
            return s_indx
        else:
            return f_indx


    def __getitem__(self, *args):
        """Convenience selection of dataset parts

        See select() for more information.
        """
        # for cases like ['labels', 1]
        if len(args) == 1 and isinstance(args[0], tuple):
            args = args[0]

        args_, args = args, ()
        for a in args_:
            if isinstance(a, slice) and \
                   isinstance(a.start, basestring):
                # for constructs like ['labels':[1,2]]
                if a.stop is None or a.step is not None:
                    raise ValueError, \
                          "Selection must look like ['chunks':[2,3]]"
                args += (a.start, a.stop)
            else:
                args += (a,)
        return self.select(*args)
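
    # Example (a minimal sketch): item access routes through select(),
    # so attribute-based selections work directly:
    #
    #   >>> ds = Dataset(samples=N.arange(8).reshape(4, 2),
    #   ...              labels=[1, 1, 2, 2])
    #   >>> ds['labels', [2]].nsamples     # same as ds.select(labels=[2])
    #   2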


    def permuteLabels(self, status, perchunk=True, assure_permute=False):
        """Permute the labels.

        TODO: rename status into something closer in semantics.

        :Parameters:
          status : bool
            Calling this method with status set to True permutes the
            labels among all samples. If 'status' is False the
            original labels are restored.
          perchunk : bool
            If True permutation is limited to samples sharing the same
            chunk value. Therefore only the association of a certain
            sample with a label is permuted while keeping the absolute
            number of occurrences of each label value within a certain
            chunk constant.
          assure_permute : bool
            If True, assures that labels are permuted, i.e. any one is
            different from the original one
        """
        # local bindings
        _data = self._data

        if len(self.uniquelabels) < 2:
            raise RuntimeError, \
                  "Call to permuteLabels is bogus since there is an " \
                  "insufficient number of labels: %s" % self.uniquelabels

        if not status:
            # restore originals
            if _data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                      'permuteLabels() has never been ' \
                      'called with status == True.'
            self.labels = _data['origlabels']
            _data.pop('origlabels')
        else:
            # store orig labels, but only if not yet done, otherwise multiple
            # calls with status == True will destroy the original labels
            if not _data.has_key('origlabels') \
                   or _data['origlabels'] is None:
                # bind old labels to origlabels
                _data['origlabels'] = _data['labels']
                # copy labels
                _data['labels'] = copy.copy(_data['labels'])

            labels = _data['labels']
            # now scramble
            if perchunk:
                for o in self.uniquechunks:
                    labels[self.chunks == o] = \
                        N.random.permutation(labels[self.chunks == o])
            else:
                labels = N.random.permutation(labels)

            self.labels = labels

        if assure_permute:
            if not (_data['labels'] != _data['origlabels']).any():
                if not (assure_permute is True):
                    if assure_permute == 1:
                        raise RuntimeError, \
                              "Cannot assure permutation of labels %s for " \
                              "some reason with chunks %s and while " \
                              "perchunk=%s . Should not happen" % \
                              (self.labels, self.chunks, perchunk)
                else:
                    assure_permute = 11 # make 10 attempts
                if __debug__:
                    debug("DS", "Recalling permute to assure different labels")
                self.permuteLabels(status, perchunk=perchunk,
                                   assure_permute=assure_permute-1)
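
    # Example (a minimal sketch): scramble labels within chunks for a
    # permutation test, then restore the originals:
    #
    #   >>> ds = Dataset(samples=N.zeros((4, 2)),
    #   ...              labels=[1, 2, 1, 2], chunks=[0, 0, 1, 1])
    #   >>> ds.permuteLabels(True)     # permute (per chunk by default)
    #   >>> sorted(ds.uniquelabels) == [1, 2]   # same label pool
    #   True
    #   >>> ds.permuteLabels(False)    # restore original labels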


    def getRandomSamples(self, nperlabel):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of
        samples is randomly chosen from the group of samples sharing a
        unique label value (total number of selected samples:
        nperlabel x len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the
        number of samples that shall be selected from the samples with
        the corresponding label.

        The method returns a Dataset object containing the selected
        samples.
        """
        # if an integer is given take this value for all classes
        if isinstance(nperlabel, int):
            nperlabel = [ nperlabel for i in self.uniquelabels ]

        sample = []
        # for each available class
        for i, r in enumerate(self.uniquelabels):
            # get the list of pattern ids for this class
            sample += random.sample( (self.labels == r).nonzero()[0],
                                     nperlabel[i] )

        return self.selectSamples( sample )
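
    # Example (a minimal sketch): draw one random sample per label:
    #
    #   >>> ds = Dataset(samples=N.zeros((4, 2)), labels=[1, 1, 2, 2])
    #   >>> sub = ds.getRandomSamples(1)
    #   >>> sub.nsamples, sorted(sub.uniquelabels)
    #   (2, [1, 2])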


    # def _setchunks(self, chunks):
    #     """Sets chunks and recomputes uniquechunks
    #     """
    #     self._data['chunks'] = N.array(chunks)
    #     self._dsattr['uniquechunks'] = None # None! since we might not need them


    def getNSamples( self ):
        """Currently available number of patterns.
        """
        return self._data['samples'].shape[0]


    def getNFeatures( self ):
        """Number of features per pattern.
        """
        return self._data['samples'].shape[1]


    def getLabelsMap(self):
        """Stored labels map (if any)
        """
        return self._dsattr.get('labels_map', None)


    def setLabelsMap(self, lm):
        """Set labels map.

        Checks the validity of the mapping -- its values should cover
        all existing labels in the dataset
        """
        values = Set(lm.values())
        labels = Set(self.uniquelabels)
        if not values.issuperset(labels):
            raise ValueError, \
                  "Provided mapping %s has some of the existing labels " \
                  "(out of %s) missing from the mapping" \
                  % (list(values), list(labels))
        self._dsattr['labels_map'] = lm
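
    # Example (a minimal sketch): attach literal names for numeric labels;
    # the mapping's values must cover all labels present in the dataset:
    #
    #   >>> ds = Dataset(samples=N.zeros((2, 2)), labels=[1, 2])
    #   >>> ds.labels_map = {'rest': 1, 'task': 2}
    #   >>> ds.labels_map['task']
    #   2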


    def setSamplesDType(self, dtype):
        """Set the data type of the samples array.
        """
        # local bindings
        _data = self._data

        if _data['samples'].dtype != dtype:
            _data['samples'] = _data['samples'].astype(dtype)


    def defineFeatureGroups(self, definition):
        """Assign a group label to each feature.

        `definition` must be a sequence with one group label per feature.
        """
        if not len(definition) == self.nfeatures:
            raise ValueError, \
                  "Length of feature group definition %i " \
                  "does not match the number of features %i " \
                  % (len(definition), self.nfeatures)

        self._dsattr['featuregroups'] = N.array(definition)


    def convertFeatureIds2FeatureMask(self, ids):
        """Returns a boolean mask with all features in `ids` selected.

        :Parameters:
          ids : list or 1d array
            Ids of the features to be selected.

        :Returns:
          ndarray, dtype='bool'
            All selected features are set to True; False otherwise.
        """
        fmask = N.repeat(False, self.nfeatures)
        fmask[ids] = True

        return fmask


    def convertFeatureMask2FeatureIds(self, mask):
        """Returns feature ids corresponding to non-zero elements in the
        mask.

        :Parameters:
          mask : 1d ndarray
            Feature mask.

        :Returns:
          ndarray, dtype=int
            Ids of non-zero (non-False) mask elements.
        """
        return mask.nonzero()[0]
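
    # Example (a minimal sketch): the two conversions are inverses of
    # each other for a dataset with 4 features:
    #
    #   >>> ds = Dataset(samples=N.zeros((2, 4)), labels=[1, 2])
    #   >>> mask = ds.convertFeatureIds2FeatureMask([1, 3])
    #   >>> mask.tolist()
    #   [False, True, False, True]
    #   >>> ds.convertFeatureMask2FeatureIds(mask).tolist()
    #   [1, 3]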


    @staticmethod
    def _checkCopyConstructorArgs(**kwargs):
        """Common sanity check for Dataset copy constructor calls."""
        # check if we have samples (somewhere)
        samples = None
        if kwargs.has_key('samples'):
            samples = kwargs['samples']
        if samples is None and kwargs.has_key('data') \
           and kwargs['data'].has_key('samples'):
            samples = kwargs['data']['samples']
        if samples is None:
            raise DatasetError, \
                  "`samples` must be provided to copy constructor call."

        if not len(samples.shape) == 2:
            raise DatasetError, \
                  "samples must be in 2D shape in copy constructor call."


    # class properties (read-only, except labels_map)
    nsamples    = property( fget=getNSamples )
    nfeatures   = property( fget=getNFeatures )
    labels_map  = property( fget=getLabelsMap, fset=setLabelsMap )


def datasetmethod(func):
    """Decorator to easily bind functions to a Dataset class
    """
    if __debug__:
        debug("DS_", "Binding function %s to Dataset class" % func.func_name)

    # Bind the function
    setattr(Dataset, func.func_name, func)

    # return the original one
    return func
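
# Example (a minimal sketch; `nbytes` is a hypothetical helper, not part
# of the package): any function whose first argument is a dataset can be
# attached as a method:
#
#   >>> @datasetmethod
#   ... def nbytes(ds):
#   ...     return ds.samples.nbytes
#   >>> ds = Dataset(samples=N.zeros((2, 2)), labels=[1, 2])
#   >>> ds.nbytes() > 0
#   True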


# The following attributes are inherent to the basic dataset
Dataset._registerAttribute("samples", "_data", abbr='S', hasunique=False)
Dataset._registerAttribute("labels", "_data", abbr='L', hasunique=True)
Dataset._registerAttribute("chunks", "_data", abbr='C', hasunique=True)
# samples ids (already unique by definition)
Dataset._registerAttribute("origids", "_data", abbr='I', hasunique=False)