Package mvpa :: Package misc :: Package io :: Module base
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.misc.io.base

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Some little helper for reading (and writing) common formats from and to 
 10  disk.""" 
 11   
 12  __docformat__ = 'restructuredtext' 
 13   
 14  import numpy as N 
 15  import mvpa.misc.copy as copy 
 16  from mvpa.base.dochelpers import enhancedDocString 
 17  from sets import Set 
 18  from re import sub as re_sub 
 19  from mvpa.base import warning 
 20   
 21  if __debug__: 
 22      from mvpa.base import debug 
 23   
 24   
class DataReader(object):
    """Base class for data readers.

    Subclasses are expected to populate two attributes:

    `self._data`: ndarray
      The data array, with the samples-separating dimension along the
      first axis.
    `self._props`: dict
      Any other meaningful information about the data.

    Read-only access to both is provided through the `data` and `props`
    properties (and their corresponding getter methods).
    """
    def __init__(self):
        """Cheap init -- no data, empty property dictionary."""
        self._data = None
        self._props = {}


    def getData(self):
        """Return the data array.
        """
        return self._data


    def getPropsAsDict(self):
        """Return the dictionary with the data properties.
        """
        return self._props


    data = property(fget=getData, doc="Data array")
    props = property(fget=getPropsAsDict, doc="Property dict")
class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """
    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        :Parameters:
          source : basestring or dict
            If values is given as a string all data is read from the
            file and additional keyword arguments can be used to
            customize the read procedure. If a dictionary is passed
            a deepcopy is performed.
          header : bool or list of basestring
            Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique
            column names will be generated (see class docs). If
            `header` is a python list, it's content is used as column
            header names and its length has to match the number of
            columns in the file.
          sep : basestring or None
            Separator string. The actual meaning depends on the output
            format (see class docs).
          headersep : basestring or None
            Separator string used in the header. The actual meaning
            depends on the output format (see class docs).
          dtype : type or list(types)
            Desired datatype(s). Datatype per column can be specified by
            passing a list of types.
          skiplines : int
            Number of lines to skip at the beginning of the file.
        """
        # init base class
        dict.__init__(self)

        # intialize with default: no recorded column order unless a header
        # line is actually read from a file
        self._header_order = None

        if isinstance(source, str):
            self._fromFile(source, header=header, sep=sep, headersep=headersep,
                           dtype=dtype, skiplines=skiplines)

        elif isinstance(source, dict):
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unkown source for ColumnData [%s]' \
                              % `type(source)`

        # generate missing properties for each item in the header
        # NOTE(review): this injects properties into the *class*, so they are
        # shared by all instances of the class -- an instance with different
        # columns will still see properties registered by earlier instances.
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._getAttrib('%s')" % (k)
                # Sanitarize the key, substitute ' []' with '_'
                k_ = re_sub('[[\] ]', '_', k)
                # replace multipe _s
                k_ = re_sub('__+', '_', k_)
                # remove quotes
                k_ = re_sub('["\']', '', k_)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData key %s"
                          % (k_, k))
                # make sure to import class directly into local namespace
                # otherwise following does not work for classes defined
                # elsewhere
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k_, getter)
                # TODO!!! Check if it is safe actually here to rely on value of
                #         k in lambda. May be it is treated as continuation and
                #         some local space would override it????
                #setattr(self.__class__,
                #        k,
                #        property(fget=lambda x: x._getAttrib("%s" % k)))
                # it seems to be error-prone due to continuation...


    __doc__ = enhancedDocString('ColumnData', locals())


    def _getAttrib(self, key):
        """Return corresponding value if given key is known to current instance

        Is used for automatically added properties to the class.

        :raises: ValueError, if `key` is not known to given instance

        :return: value if `key` is known
        """
        if self.has_key(key):
            return self[key]
        else:
            raise ValueError, "Instance %s has no data about %s" \
                % (`self`, `key`)


    def __str__(self):
        # short human-readable summary: class name, dimensions and column keys
        s = self.__class__.__name__
        if len(self.keys())>0:
            s += " %d rows, %d columns [" % \
                 (self.getNRows(), self.getNColumns())
            s += reduce(lambda x, y: x+" %s" % y, self.keys())
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.

        :raises: ValueError, if stored columns do not all have equal length
        """
        length = None
        for k in self.keys():
            if length == None:
                length = len(self[k])
            else:
                if not len(self[k]) == length:
                    raise ValueError, "Data integrity lost. Columns do not " \
                                      "have equal length."


    def _fromFile(self, filename, header, sep, headersep,
                  dtype, skiplines):
        """Loads column data from file -- clears object first.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None

        [ file_.readline() for x in range(skiplines) ]
        """Simply skip some lines"""
        # make column names, either take header or generate
        if header == True:
            # read first line and split by 'sep'
            hdr = file_.readline().split(headersep)
            # remove bogus empty header titles
            hdr = filter(lambda x:len(x.strip()), hdr)
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            # no header in the file: generate one key per column, based on a
            # peek at the first data line
            hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
            # reset file to not miss the first line
            file_.seek(0)
            [ file_.readline() for x in range(skiplines) ]


        # string in lists: one per column
        tbl = [ [] for i in xrange(len(hdr)) ]

        # do per column dtypes
        if not isinstance(dtype, list):
            dtype = [dtype] * len(hdr)

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                if not dtype[i] is None:
                    try:
                        v = dtype[i](v)
                    except ValueError:
                        # best-effort conversion: keep the raw string value
                        warning("Can't convert %s to desired datatype %s." %
                                (`v`, `dtype`) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]


    def __iadd__(self, other):
        """Merge column data.

        Appends the columns of `other` to the matching columns of this
        instance. All keys of `other` must already be present here.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%s].' % `k`
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%s].' \
                                  % `type(v)`
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self


    def selectSamples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data


    def getNColumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())


    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        :Parameter:
          filename: Think about it!
          header: If `True` a column header is written, using the column
                  keys. If `False` no header is written.
          header_order: If it is a list of strings, they will be used instead
                        of simply asking for the dictionary keys. However
                        these strings must match the dictionary keys in number
                        and identity. This argument type can be used to
                        determine the order of the columns in the output file.
                        The default value is `None`. In this case the columns
                        will be in an arbitrary order.
          sep: String that is written as a separator between to data columns.
        """
        # XXX do the try: except: dance
        file_ = open(filename, 'w')

        # write header
        if header_order == None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use stored order + newly added keys at the last columns
                col_hdr = self._header_order + \
                          list(Set(self.keys()).difference(
                                Set(self._header_order)))
        else:
            if not len(header_order) == self.getNColumns():
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%s]' % `k`
            col_hdr = header_order

        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.getNRows()):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()


    def getNRows(self):
        """Returns the number of rows.
        """
        # no data no rows (after Bob Marley)
        if not len(self.keys()):
            return 0
        # otherwise first key is as good as any other
        else:
            return len(self[self.keys()[0]])

    ncolumns = property(fget=getNColumns)
    nrows = property(fget=getNRows)
365 366 367
class SampleAttributes(ColumnData):
    """Read and write PyMVPA sample attribute definitions from and to text
    files.
    """
    def __init__(self, source, literallabels=False):
        """Read PyMVPA sample attributes from disk.

        :Parameter:
          source: filename of an atrribute file
        """
        # default: both columns numeric; literal labels force a string
        # conversion for the first (labels) column
        dtypes = float
        if literallabels:
            dtypes = [str, float]

        ColumnData.__init__(self, source, header=['labels', 'chunks'],
                            sep=None, dtype=dtypes)


    def tofile(self, filename):
        """Write sample attributes to a text file.
        """
        ColumnData.tofile(self, filename, header=False,
                          header_order=['labels', 'chunks'], sep=' ')


    def getNSamples(self):
        """Returns the number of samples in the file.
        """
        return self.getNRows()


    nsamples = property(fget=getNSamples)
404 405
class SensorLocations(ColumnData):
    """Base class for sensor location readers.

    Each subclass should provide x, y, z coordinates via the `pos_x`, `pos_y`,
    and `pos_z` attrbibutes.

    Axes should follow the following convention:

      x-axis: left -> right
      y-axis: anterior -> posterior
      z-axis: superior -> inferior
    """
    def __init__(self, *args, **kwargs):
        """Pass arguments to ColumnData.
        """
        ColumnData.__init__(self, *args, **kwargs)


    def locations(self):
        """Get the sensor locations as an array.

        :Returns:
          (nchannels x 3) array with coordinates in (x, y, z)
        """
        coords = (self.pos_x, self.pos_y, self.pos_z)
        return N.array(coords).T
class XAVRSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    File layout is assumed to be 5 columns:

      1. sensor name
      2. some useless integer
      3. position on x-axis
      4. position on y-axis
      5. position on z-axis
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        # fixed five-column layout: name, ignored integer, then coordinates
        column_names = ['names', 'some_number', 'pos_x', 'pos_y', 'pos_z']
        column_types = [str, int, float, float, float]
        SensorLocations.__init__(self, source, header=column_names,
                                 sep=None, dtype=column_types)
class TuebingenMEGSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    File layout is assumed to be 7 columns:

      1:   sensor name
      2:   position on y-axis
      3:   position on x-axis
      4:   position on z-axis
      5-7: same as 2-4, but for some outer surface thingie.

    Note that x and y seem to be swapped, ie. y as defined by SensorLocations
    conventions seems to be first axis and followed by x.

    Only inner surface coordinates are reported by `locations()`.
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        # seven columns: name, inner-surface (y, x, z), outer-surface (y, x, z)
        column_names = ['names',
                        'pos_y', 'pos_x', 'pos_z',
                        'pos_y2', 'pos_x2', 'pos_z2']
        column_types = [str] + [float] * 6
        SensorLocations.__init__(self, source, header=column_names,
                                 sep=None, dtype=column_types)
def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert design matrix into a list of labels

    Given a design, assign a single label to any given sample.

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each known will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign for samples where none of EVs was given a value
      func : functor
        Function which decides whether a value should be considered

    :raises: ValueError, if multiple EVs are active for any single row

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label
    """
    # doing it simple naive way but it should be of better control if
    # we decide to process columndata with non-numeric entries etc
    #
    # Fix: use a list comprehension instead of len(filter(...)) and the
    # raise-with-comma statement -- both break under Python 3 while the
    # replacements behave identically under Python 2 as well.
    keys = list(columndata.keys())
    labels = []
    for row in range(columndata.nrows):
        entries = [ columndata[key][row] for key in keys ]
        # names of all EVs meeting the criterion in this row
        selected = [ key for key, value in zip(keys, entries)
                     if func(value) ]

        if len(selected) > 1:
            # if there is more than a single one -- we are in problem
            raise ValueError("Row #%i with items %s has multiple entries " \
                  "meeting the criterion. Cannot decide on the label" % \
                  (row, entries))
        elif len(selected) == 1:
            label = selected[0]
        else:
            label = baseline_label
        labels.append(label)
    return labels
# Registry of known chunking methods for `labels2chunks`:
# codename -> human-readable description (used in error reporting).
__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }
def labels2chunks(labels, method="alllabels", ignore_labels=None):
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base chunking on
      method : basestring
        codename for method to use. Known methods are listed in
        `__known_chunking_methods` (currently only ``alllabels``)
      ignore_labels : list of basestring
        depends on the method. If method ``alllabels``, then don't
        seek for such labels in chunks. E.g. some 'reject' samples

    :rtype: list

    :raises: ValueError, if an unknown `method` is requested
    """
    # Fixes: use the builtin `set` instead of the deprecated `sets.Set`
    # (removed in Python 3; union_update() -> add()), drop the def-time
    # %-interpolation of the docstring which coupled the function to the
    # module global at definition time, and use items()/raise-call forms
    # that work under both Python 2 and 3. Behavior is unchanged.
    chunks = []
    if ignore_labels is None:
        ignore_labels = []
    # complete set of labels every finished chunk must have seen
    alllabels = set(labels).difference(set(ignore_labels))
    if method == 'alllabels':
        seenlabels = set()
        lastlabel = None
        chunk = 0
        for label in labels:
            if label != lastlabel:
                # a new block of identical labels starts; if the current
                # chunk already saw every label, begin the next chunk
                if seenlabels == alllabels:
                    chunk += 1
                    seenlabels = set()
                lastlabel = label
                if not label in ignore_labels:
                    seenlabels.add(label)
            chunks.append(chunk)
        chunks = N.array(chunks)
        # fix up a bit the trailer: merge a trailing incomplete chunk
        # into the previous one
        if seenlabels != alllabels:
            chunks[chunks == chunk] = chunk-1
        chunks = list(chunks)
    else:
        errmsg = "Unknown method to derive chunks is requested. Known are:\n"
        for method_, descr in __known_chunking_methods.items():
            errmsg += " %s : %s\n" % (method_, descr)
        raise ValueError(errmsg)
    return chunks
578