
Source Code for Module mvpa.datasets.base

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Dataset container"""

__docformat__ = 'restructuredtext'

import operator
import random
import copy

import numpy as N

from mvpa.misc.exceptions import DatasetError
from mvpa.misc.support import idhash as idhash_

if __debug__:
    from mvpa.misc import debug, warning

class Dataset(object):
    """This class provides a container to store all necessary data to perform
    MVPA analyses. These are the data samples, as well as the labels
    associated with these patterns. Additionally samples can be grouped into
    chunks.

    :Groups:
      - `Creators`: `__init__`, `selectFeatures`, `selectSamples`, `applyMapper`
      - `Mutators`: `permuteLabels`

    Important: labels are assumed to be immutable, i.e. no one should modify
    them externally by accessing indexed items, i.e. something like
    ``dataset.labels[1] += "_bad"`` should not be used. If a label has to be
    modified, a full copy of the labels should be obtained, operated on, and
    assigned back to the dataset; otherwise dataset.uniquelabels would not
    work. The same applies to any other attribute which has a corresponding
    unique* access property.
    """

    # static definition to track which unique attributes
    # have to be reset/recomputed whenever anything relevant
    # changes

    # unique{labels,chunks} become a part of dsattr
    _uniqueattributes = []
    """Unique attributes associated with the data"""

    _registeredattributes = []
    """Registered attributes (stored in _data)"""

    _requiredattributes = ['samples', 'labels']
    """Attributes which have to be provided to __init__, otherwise
    no default values would be assumed and construction of the
    instance would fail"""

    def __init__(self, data=None, dsattr=None, dtype=None,
                 samples=None, labels=None, chunks=None, check_data=True,
                 copy_samples=False, copy_data=True, copy_dsattr=True):
        """Initialize dataset instance

        :Parameters:
          data : dict
            Dictionary with an arbitrary number of entries. The value for
            each key in the dict has to be an ndarray with the same length
            as the number of rows in the samples array. A special entry in
            this dictionary is 'samples', a 2d array (samples x features).
            A shallow copy is stored in the object.
          dsattr : dict
            Dictionary of dataset attributes. An arbitrary number of
            arbitrarily named and typed objects can be stored here. A
            shallow copy of the dictionary is stored in the object.
          dtype
            If None -- do not change data type if samples is an ndarray.
            Otherwise convert samples to dtype.

        :Keywords:
          samples : ndarray
            a 2d array (samples x features)
          labels
            array or scalar value defining labels for each sample
          chunks
            array or scalar value defining chunks for each sample

        Each of the keyword arguments overwrites what is/might be already
        in the `data` container.

        """
        # see if data and dsattr are None; if so, make them empty dicts
        if data is None:
            data = {}
        if dsattr is None:
            dsattr = {}

        # initialize containers; default values are empty dicts
        # always make a shallow copy of what comes in, otherwise total chaos
        # is likely to happen soon
        if copy_data:
            # deep copy (cannot use copy.deepcopy, because samples is an
            # exception), but shallow copy first to get a shared version
            # of the data in any case
            lcl_data = data.copy()
            for k, v in data.iteritems():
                # skip copying samples if requested
                if k == 'samples' and not copy_samples:
                    continue
                lcl_data[k] = v.copy()
        else:
            # shallow copy
            # XXX? yoh: it might be better speed-wise to just assign the
            # dictionary without any shallow .copy
            lcl_data = data.copy()

        if copy_dsattr and len(dsattr) > 0:
            # deep copy
            if __debug__:
                debug('DS', "Deep copying dsattr %s" % `dsattr`)
            lcl_dsattr = copy.deepcopy(dsattr)

        else:
            # shallow copy
            lcl_dsattr = copy.copy(dsattr)

        # has to be non-private since otherwise derived methods
        # would have problems accessing it and _registerAttribute
        # would fail on lambda getters
        self._data = lcl_data
        """What makes a dataset."""

        self._dsattr = lcl_dsattr
        """Dataset attributes."""

        # store samples (and possibly transform/reshape/retype them)
        if samples is not None:
            if __debug__:
                if self._data.has_key('samples'):
                    debug('DS',
                          ("`data` dict has `samples` (%s) but there is also"
                           " an __init__ parameter `samples` which overrides"
                           " the one stored in `data`")
                          % (`self._data['samples'].shape`))
            self._data['samples'] = self._shapeSamples(samples, dtype,
                                                       copy_samples)

        # TODO? we might want to have the same logic for chunks and labels
        # i.e. if no labels are present -- assign arange
        # labels
        if labels is not None:
            if __debug__:
                if self._data.has_key('labels'):
                    debug('DS',
                          ("`data` dict has `labels` (%s) but there is also"
                           " an __init__ parameter `labels` which overrides"
                           " the one stored in `data`")
                          % (`self._data['labels']`))
            if self._data.has_key('samples'):
                self._data['labels'] = \
                    self._expandSampleAttribute(labels, 'labels')

        # check if we got all required attributes
        for attr in self._requiredattributes:
            if not self._data.has_key(attr):
                raise DatasetError, \
                      "Attribute %s is required to initialize dataset" % \
                      attr

        # chunks
        if chunks is not None:
            self._data['chunks'] = \
                self._expandSampleAttribute(chunks, 'chunks')
        elif not self._data.has_key('chunks'):
            # if no chunk information is given assume that every pattern
            # is its own chunk
            self._data['chunks'] = N.arange(self.nsamples)

        # Initialize attributes which are registered but were not set up
        for attr in self._registeredattributes:
            if not self._data.has_key(attr):
                if __debug__:
                    debug("DS", "Initializing attribute %s" % attr)
                self._data[attr] = N.zeros(self.nsamples)

        if check_data:
            self._checkData()

        # lazy computation of unique members
        #self._resetallunique('_dsattr', self._dsattr)

        # Michael: we cannot do this conditionally here. When selectSamples()
        # removes a whole data chunk the uniquechunks values will be invalid.
        # Same applies to labels of course.
        if labels is not None or chunks is not None:
            # a speed-up: don't go through all uniqueattributes
            # when there is no need
            self._dsattr['__uniquereseted'] = False
            self._resetallunique(force=True)
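
    # Example (a minimal sketch with made-up toy values): constructing a
    # Dataset from a 4x3 samples array; the scalar `chunks` is expanded to
    # one value per sample by _expandSampleAttribute:
    #
    #   ds = Dataset(samples=N.zeros((4, 3)),
    #                labels=[1, 1, 2, 2],
    #                chunks=1)
    #   ds.nsamples     # -> 4
    #   ds.nfeatures    # -> 3
    #   ds.chunks       # -> array([1, 1, 1, 1])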

    @property
    def idhash(self):
        """To verify whether the dataset is in the same state as when
        something else was done

        E.g. whether a classifier was trained on the same dataset as the
        one in question"""

        res = id(self._data)
        for val in self._data.values():
            res += idhash_(val)
        return res

    def _resetallunique(self, force=False):
        """Reset all unique* attributes of the corresponding dictionary
        to None
        """

        if not force and self._dsattr['__uniquereseted']:
            return

        # I guess we'd better check if dictname is known, but...
        for k in self._uniqueattributes:
            if __debug__:
                debug("DS_", "Reset attribute %s" % k)
            self._dsattr[k] = None
        self._dsattr['__uniquereseted'] = True

    def _getuniqueattr(self, attrib, dict_):
        """Provide a common facility to return unique attributes

        XXX `dict_` could now simply be replaced with self._dsattr
        """
        if not self._dsattr.has_key(attrib) or self._dsattr[attrib] is None:
            if __debug__:
                debug("DS_", "Recomputing unique set for attrib %s within %s" %
                      (attrib, self.summary(uniq=False)))
            # uff... might come up with a better strategy to keep the
            # relevant attribute name
            self._dsattr[attrib] = N.unique( dict_[attrib[6:]] )
            assert(not self._dsattr[attrib] is None)
            self._dsattr['__uniquereseted'] = False

        return self._dsattr[attrib]

    def _setdataattr(self, attrib, value):
        """Provide a common facility to set attributes

        """
        if len(value) != self.nsamples:
            raise ValueError, \
                  "Provided %s has %d entries while there are %d samples" % \
                  (attrib, len(value), self.nsamples)
        self._data[attrib] = N.asarray(value)
        uniqueattr = "unique" + attrib

        if self._dsattr.has_key(uniqueattr):
            self._dsattr[uniqueattr] = None

    def _getNSamplesPerAttr( self, attrib='labels' ):
        """Returns the number of samples per unique value of a sample
        attribute.
        """
        # XXX hardcoded dict_=self._data... might be in self._dsattr
        uniqueattr = self._getuniqueattr(attrib="unique" + attrib,
                                         dict_=self._data)

        # use a dictionary to cope with arbitrary labels
        result = dict(zip(uniqueattr, [ 0 ] * len(uniqueattr)))
        for l in self._data[attrib]:
            result[l] += 1

        # XXX we used to return just the values to mimic the old interface,
        # but now the full dict is returned
        return result

    def _getSampleIdsByAttr(self, values, attrib="labels"):
        """Return indices of samples given a list of attribute values
        """

        if not operator.isSequenceType(values):
            values = [ values ]

        # TODO: compare to a plain for-loop through the labels
        # on a real data example
        sel = N.array([], dtype=N.int16)
        for value in values:
            sel = N.concatenate((
                sel, N.where(self._data[attrib]==value)[0]))

        # place samples in the right order
        sel.sort()

        return sel

    def _shapeSamples(self, samples, dtype, copy):
        """Adapt different kinds of samples

        Handle all possible input values for 'samples' and transform
        them into a 2d (samples x features) representation.
        """
        # put samples array into correct shape
        # 1d arrays or simple sequences are assumed to be a single pattern
        if (not isinstance(samples, N.ndarray)):
            # it is safe to provide dtype which defaults to None,
            # when N would choose an appropriate dtype automagically
            samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
        else:
            if samples.ndim < 2 \
                   or (not dtype is None and dtype != samples.dtype):
                if dtype is None:
                    dtype = samples.dtype
                samples = N.array(samples, ndmin=2, dtype=dtype, copy=copy)
            elif copy:
                samples = samples.copy()

        # only samples x features matrices are supported
        if len(samples.shape) > 2:
            raise DatasetError, "Only (samples x features) -> 2d samples " \
                  + "are supported (got %s shape of samples)." \
                  % (`samples.shape`) \
                  + " Consider MappedDataset if applicable."

        return samples

    def _checkData(self):
        """Checks `_data` members to have the same # of samples.
        """
        for k, v in self._data.iteritems():
            if not len(v) == self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%i] does not " \
                      "match the number of samples in the dataset [%i]." \
                      % (k, len(v), self.nsamples)

    def _expandSampleAttribute(self, attr, attr_name):
        """If a sample attribute is given as a scalar, expand/repeat it to a
        length matching the number of samples in the dataset.
        """
        try:
            if len(attr) != self.nsamples:
                raise DatasetError, \
                      "Length of sample attribute '%s' [%d]" \
                      % (attr_name, len(attr)) \
                      + " has to match the number of samples" \
                      + " [%d]." % self.nsamples
            # store the sequence as array
            return N.array(attr)

        except TypeError:
            # make a sequence of identical values matching the number of
            # samples
            return N.repeat(attr, self.nsamples)

    @classmethod
    def _registerAttribute(cls, key, dictname="_data", hasunique=False,
                           default_setter=True):
        """Register an attribute for any Dataset class.

        Creates a property, assigning getters/setters depending on the
        availability of corresponding _get, _set functions.
        """
        #import pydb
        #pydb.debugger()
        classdict = cls.__dict__
        if not classdict.has_key(key):
            if __debug__:
                debug("DS", "Registering new attribute %s" % key)
            # define get function and use the corresponding
            # _getATTR if such is defined
            getter = '_get%s' % key
            if classdict.has_key(getter):
                getter = '%s.%s' % (cls.__name__, getter)
            else:
                getter = "lambda x: x.%s['%s']" % (dictname, key)

            # define set function and use the corresponding
            # _setATTR if such is defined
            setter = '_set%s' % key
            if classdict.has_key(setter):
                setter = '%s.%s' % (cls.__name__, setter)
            elif default_setter and dictname=="_data":
                setter = "lambda self,x: self._setdataattr" + \
                         "(attrib='%s', value=x)" % (key)
            else:
                setter = None

            if __debug__:
                debug("DS", "Registering new property %s.%s" %
                      (cls.__name__, key))
            exec "%s.%s = property(fget=%s, fset=%s)" % \
                 (cls.__name__, key, getter, setter)

            if hasunique:
                uniquekey = "unique%s" % key
                getter = '_get%s' % uniquekey
                if classdict.has_key(getter):
                    getter = '%s.%s' % (cls.__name__, getter)
                else:
                    getter = "lambda x: x._getuniqueattr" + \
                             "(attrib='%s', dict_=x.%s)" % (uniquekey, dictname)

                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, uniquekey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, uniquekey, getter)

                # create samplesper<ATTR> properties
                sampleskey = "samplesper%s" % key[:-1] # remove trailing 's' XXX
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = property(fget=%s)" % \
                     (cls.__name__, sampleskey,
                      "lambda x: x._getNSamplesPerAttr(attrib='%s')" % key)

                cls._uniqueattributes.append(uniquekey)

                # create idsby<ATTR> accessors
                sampleskey = "idsby%s" % key
                if __debug__:
                    debug("DS", "Registering new property %s.%s" %
                          (cls.__name__, sampleskey))

                exec "%s.%s = %s" % (cls.__name__, sampleskey,
                                     "lambda self, x: " +
                                     "self._getSampleIdsByAttr(x,attrib='%s')" % key)

            cls._registeredattributes.append(key)
        elif __debug__:
            warning('Trying to re-register attribute `%s`. For now ' % key +
                    'such a capability is not present')

    def __str__(self):
        """String summary of the object
        """
        return self.summary(uniq=True,
                            idhash=__debug__ and ('DS_ID' in debug.active),
                            stats=__debug__ and ('DS_STATS' in debug.active),
                            )

    def __repr__(self):
        return "<%s>" % str(self)

    def summary(self, uniq=True, stats=True, idhash=False):
        """String summary of the object

        :Parameters:
          uniq : bool
            include a summary of the sample attributes which have
            corresponding unique* properties
          idhash : bool
            include idhash values for the dataset and samples
          stats : bool
            include some basic statistics (mean, std, var) over dataset
            samples
        """
        if idhash:
            idhash_ds = "{%s}" % self.idhash
            idhash_samples = "{%s}" % idhash_(self.samples)
        else:
            idhash_ds = ""
            idhash_samples = ""

        s = """Dataset %s/ %s %d%s x %d""" % \
            (idhash_ds, self.samples.dtype,
             self.nsamples, idhash_samples, self.nfeatures)

        if uniq:
            s += " uniq:"
            for uattr in self._dsattr.keys():
                if not uattr.startswith("unique"):
                    continue
                attr = uattr[6:]
                try:
                    value = self._getuniqueattr(attrib=uattr,
                                                dict_=self._data)
                    s += " %d %s" % (len(value), attr)
                except:
                    pass

        if stats:
            # TODO -- avg per chunk?
            s += " stats: mean=%g std=%g var=%g" % \
                 (N.mean(self.samples), N.std(self.samples),
                  N.var(self.samples))
        return s

    def __iadd__( self, other ):
        """Merge the samples of one Dataset object into another (in-place).

        No dataset attributes will be merged!
        """
        if not self.nfeatures == other.nfeatures:
            raise DatasetError, "Cannot add Dataset, because the number of " \
                                "features does not match."

        # concatenate all sample attributes
        for k, v in self._data.iteritems():
            self._data[k] = N.concatenate((v, other._data[k]), axis=0)

        # might be more sophisticated but for now just reset -- it is safer ;)
        self._resetallunique()

        return self

    def __add__( self, other ):
        """Merge the samples of two Dataset objects.

        All data of both datasets is copied, concatenated and a new Dataset
        is returned.

        NOTE: This can be a costly operation (both memory and time). If
        performance is important consider the '+=' operator.
        """
        # create a new object of the same type it is now and NOT only Dataset
        out = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        out.__init__(data=self._data,
                     dsattr=self._dsattr,
                     copy_samples=True,
                     copy_data=True,
                     copy_dsattr=True)

        out += other

        return out

    def selectFeatures(self, ids, sort=True):
        """Select a number of features from the current set.

        :Parameters:
          ids
            iterable container of feature ids to select
          sort : bool
            whether to sort the ids. Order matters and `selectFeatures`
            assumes incremental order. If that is not guaranteed,
            non-optimized code would verify the order and sort.

        Returns a new Dataset object with a view of the original
        samples array (no copying is performed).

        WARNING: The order of ids determines the order of features in
        the returned dataset. This might be useful sometimes, but can
        also cause major headaches! The order is verified when
        running in non-optimized code (if __debug__).
        """
        # XXX sort now defaults to True, so sorting has to be explicitly
        # disabled, and the warning is not necessary anymore
        if sort:
            ids.sort()
        # elif __debug__:
        #     from mvpa.misc.support import isSorted
        #     if not isSorted(ids):
        #         warning("IDs for selectFeatures must be provided " +
        #                 "in sorted order, otherwise major headache might occur")

        # shallow-copy all stuff from the current data dict
        new_data = self._data.copy()

        # assign the selected features -- data is still shared with
        # the current dataset
        new_data['samples'] = self._data['samples'][:, ids]

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset

    def applyMapper(self, featuresmapper=None, samplesmapper=None):
        """Obtain a new dataset by applying mappers over features and/or
        samples.

        :Parameters:
          featuresmapper : Mapper
            `Mapper` to somehow transform each sample's features
          samplesmapper : Mapper
            `Mapper` to transform each feature across samples

        WARNING: At the moment, handling of samplesmapper is not yet
        implemented since there was no real use case.

        TODO: selectFeatures is pretty much
        applyMapper(featuresmapper=MaskMapper(...))
        """

        # shallow-copy all stuff from the current data dict
        new_data = self._data.copy()

        # apply mappers

        if samplesmapper:
            raise NotImplementedError

        if featuresmapper:
            if __debug__:
                debug("DS", "Applying featuresmapper %s" % `featuresmapper` +
                      " to samples of dataset `%s`" % `self`)
            new_data['samples'] = featuresmapper.forward(self._data['samples'])

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=new_data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False
                         )

        return dataset

    def selectSamples(self, mask):
        """Choose a subset of samples.

        Returns a new dataset object containing the selected sample
        subset.

        TODO: yoh, we might need to sort the mask if the mask is a
        list of ids and is not ordered. Clarify with Michael what our
        intent here is!
        """
        # without having a sequence as index the masked sample array would
        # lose its 2d layout
        if not operator.isSequenceType( mask ):
            mask = [mask]
        # TODO: Reconsider crafting a slice if it can be done, to avoid
        # copying the data
        #try:
        #    minmask = min(mask)
        #    maxmask = max(mask)
        #except:
        #    minmask = min(map(int,mask))
        #    maxmask = max(map(int,mask))
        # lets see if we could get it done with a cheap view/slice
        #(minmask, maxmask) != (0, 1) and \
        #if len(mask) > 2 and \
        #       N.array([N.arange(minmask, maxmask+1) == N.array(mask)]).all():
        #    slice_ = slice(minmask, maxmask+1)
        #    if __debug__:
        #        debug("DS", "We can and do convert mask %s into slice %s" %
        #              (mask, slice_))
        #    mask = slice_
        # mask all sample attributes
        data = {}
        for k, v in self._data.iteritems():
            data[k] = v[mask, ]

        # create a new object of the same type it is now and NOT only Dataset
        dataset = super(Dataset, self).__new__(self.__class__)

        # now init it: to make it work all Dataset constructors have to accept
        # Class(data=Dict, dsattr=Dict)
        dataset.__init__(data=data,
                         dsattr=self._dsattr,
                         check_data=False,
                         copy_samples=False,
                         copy_data=False,
                         copy_dsattr=False)

        dataset._resetallunique(force=True)
        return dataset

    def permuteLabels(self, status, perchunk = True):
        """Permute the labels.

        TODO: rename `status` into something closer in semantics.

        Calling this method with 'status' set to True permutes the labels
        among all samples.

        If 'perchunk' is True, permutation is limited to samples sharing the
        same chunk value. Therefore only the association of a certain sample
        with a label is permuted while keeping the absolute number of
        occurrences of each label value within a certain chunk constant.

        If 'status' is False the original labels are restored.
        """
        if not status:
            # restore originals
            if self._data.get('origlabels', None) is None:
                raise RuntimeError, 'Cannot restore labels. ' \
                                    'permuteLabels() has never been ' \
                                    'called with status == True.'
            self.labels = self._data['origlabels']
            self._data['origlabels'] = None
        else:
            # store orig labels, but only if not yet done, otherwise multiple
            # calls with status == True would destroy the original labels
            if not self._data.has_key('origlabels') \
                   or self._data['origlabels'] is None:
                # rebind old labels to origlabels
                self._data['origlabels'] = self._data['labels']
                # assign a copy so modifications do not impact original data
                self._data['labels'] = self._data['labels'].copy()

            # now scramble the rest
            if perchunk:
                for o in self.uniquechunks:
                    self._data['labels'][self.chunks == o] = \
                        N.random.permutation( self.labels[ self.chunks == o ] )
                # to recompute uniquelabels
                self.labels = self._data['labels']
            else:
                self.labels = N.random.permutation(self._data['labels'])

    def getRandomSamples( self, nperlabel ):
        """Select a random set of samples.

        If 'nperlabel' is an integer value, the specified number of samples
        is randomly chosen from the group of samples sharing a unique label
        value (total number of selected samples:
        nperlabel x len(uniquelabels)).

        If 'nperlabel' is a list, its length has to match the number of
        unique label values. In this case 'nperlabel' specifies the number
        of samples that shall be selected from the samples with the
        corresponding label.

        The method returns a Dataset object containing the selected
        samples.
        """
        # if an integer is given take this value for all classes
        if isinstance(nperlabel, int):
            nperlabel = [ nperlabel for i in self.uniquelabels ]

        sample = []
        # for each available class
        for i, r in enumerate(self.uniquelabels):
            # get the list of pattern ids for this class
            sample += random.sample( (self.labels == r).nonzero()[0],
                                     nperlabel[i] )

        return self.selectSamples( sample )

    # def _setchunks(self, chunks):
    #     """Sets chunks and recomputes uniquechunks
    #     """
    #     self._data['chunks'] = N.array(chunks)
    #     self._dsattr['uniquechunks'] = None # None! since we might not need them

    def getNSamples( self ):
        """Currently available number of patterns.
        """
        return self._data['samples'].shape[0]

    def getNFeatures( self ):
        """Number of features per pattern.
        """
        return self._data['samples'].shape[1]

    def setSamplesDType(self, dtype):
        """Set the data type of the samples array.
        """
        if self._data['samples'].dtype != dtype:
            self._data['samples'] = self._data['samples'].astype(dtype)

    def convertFeatureIds2FeatureMask(self, ids):
        """Returns a boolean mask with all features in `ids` selected.

        :Parameters:
          ids : list or 1d array
            Ids of the features to be selected.

        :Returns:
          ndarray, dtype='bool'
            All selected features are set to True; False otherwise.
        """
        fmask = N.repeat(False, self.nfeatures)
        fmask[ids] = True

        return fmask

    def convertFeatureMask2FeatureIds(self, mask):
        """Returns feature ids corresponding to non-zero elements in the mask.

        :Parameters:
          mask : 1d ndarray
            Feature mask.

        :Returns:
          ndarray, dtype=int
            Ids of non-zero (non-False) mask elements.
        """
        return mask.nonzero()[0]

    # read-only class properties
    nsamples  = property( fget=getNSamples )
    nfeatures = property( fget=getNFeatures )


# Register the attributes belonging to the basic dataset
Dataset._registerAttribute("samples", "_data", hasunique=False)
Dataset._registerAttribute("labels",  "_data", hasunique=True)
Dataset._registerAttribute("chunks",  "_data", hasunique=True)