
Source Code for Module mvpa.misc.iohelpers

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Some little helpers for reading (and writing) common formats from and to
disk."""

__docformat__ = 'restructuredtext'

import copy
from sets import Set

from mvpa.misc import warning

if __debug__:
    from mvpa.misc import debug

class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists, an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary, no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """
    def __init__(self, source, header=True, sep=None, dtype=float):
        """Read data from file into a dictionary.

        Parameters
        ----------
        - `source`: Can be a filename or a dictionary. In the first case all
            data is read from that file and additional keyword arguments can
            be used to customize the read procedure. If a dictionary is
            passed, a deepcopy is performed.
        - `header`: Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique column names
            will be generated (see class docs). If `header` is a python list,
            its content is used as column header names and its length has to
            match the number of columns in the file.
        - `sep`: Separator string. The actual meaning depends on the output
            format (see class docs).
        - `dtype`: Desired datatype.
        """
        # init base class
        dict.__init__(self)

        # initialize with default
        self._header_order = None

        if isinstance(source, str):
            self._fromFile(source, header=header, sep=sep, dtype=dtype)

        elif isinstance(source, dict):
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unknown source for ColumnData [%s]' \
                              % `type(source)`

        # generate missing properties for each item in the header
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._getAttrib('%s')" % (k)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData" % `k`)
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k, getter)
                # TODO!!! Check if it is safe actually here to rely on value
                #         of k in lambda. Maybe it is treated as continuation
                #         and some local space would override it????
                #setattr(self.__class__,
                #        k,
                #        property(fget=lambda x: x._getAttrib("%s" % k)))
                # it seems to be error-prone due to continuation...

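    # Editorial note (not part of the original module): the late-binding
    # concern in the TODO above can be sidestepped without exec by freezing
    # the current value of `k` in a default argument, e.g.
    #
    #   setattr(self.__class__, k,
    #           property(fget=lambda self, key=k: self._getAttrib(key)))
    #
    # because default argument values are evaluated once, when the lambda is
    # created, rather than looked up when the property is later accessed.
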
    def _getAttrib(self, key):
        """Return the corresponding value if the given key is known to the
        current instance.

        Is used for the automatically added properties of this class.

        :raises: ValueError, if `key` is not known to the given instance

        :return: value if `key` is known
        """
        if self.has_key(key):
            return self[key]
        else:
            raise ValueError, "Instance %s has no data about %s" \
                              % (`self`, `key`)

    def __str__(self):
        s = self.__class__.__name__
        if len(self.keys()) > 0:
            s += " %d rows, %d columns [" % \
                 (self.getNRows(), self.getNColumns())
            s += reduce(lambda x, y: x + " %s" % y, self.keys())
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.
        """
        length = None
        for k in self.keys():
            if length is None:
                length = len(self[k])
            else:
                if not len(self[k]) == length:
                    raise ValueError, "Data integrity lost. Columns do not " \
                                      "have equal length."

    def _fromFile(self, filename, header, sep, dtype):
        """Loads column data from file -- clears object first.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None
        # make column names, either take header or generate
        if header == True:
            # read first line and split by 'sep'
            hdr = file_.readline().split(sep)
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
            # reset file to not miss the first line
            file_.seek(0)

        # strings in lists: one per column
        tbl = [ [] for i in xrange(len(hdr)) ]

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                if dtype is not None:
                    try:
                        v = dtype(v)
                    except ValueError:
                        warning("Can't convert %s to desired datatype %s." %
                                (`v`, `dtype`) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]

    def __iadd__(self, other):
        """Merge column data.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%s].' % `k`
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%s].' \
                                  % `type(v)`
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self

    def selectSamples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data

    def getNColumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())

    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        Parameters
        ----------

        - `filename`: Name of the output file.
        - `header`: If `True` a column header is written, using the column
            keys. If `False` no header is written.
        - `header_order`: If it is a list of strings, they will be used
            instead of simply asking for the dictionary keys. However these
            strings must match the dictionary keys in number and identity.
            This argument type can be used to determine the order of the
            columns in the output file. The default value is `None`. In this
            case the columns will be in an arbitrary order.
        - `sep`: String that is written as a separator between two data
            columns.
        """
        # XXX do the try: except: dance
        file_ = open(filename, 'w')

        # write header
        if header_order is None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use stored order + newly added keys as the last columns
                col_hdr = self._header_order + \
                          list(Set(self.keys()).difference(
                                   Set(self._header_order)))
        else:
            if not len(header_order) == self.getNColumns():
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%s]' % `k`
            col_hdr = header_order

        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.getNRows()):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()

    def getNRows(self):
        """Returns the number of rows.
        """
        # no data no rows (after Bob Marley)
        if not len(self.keys()):
            return 0
        # otherwise first key is as good as any other
        else:
            return len(self[self.keys()[0]])

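# Usage sketch (editorial addition, not part of the original module):
# 'data.txt' is a hypothetical whitespace-separated text file whose first
# line contains the column names, among them 'rt'.
#
#   d = ColumnData('data.txt', header=True, sep=None, dtype=float)
#   print d.getNRows(), d.getNColumns()
#   print d['rt']                        # dict-style column access
#   print d.rt                           # equivalent auto-generated property
#   sub = d.selectSamples([0, 1])        # new ColumnData with rows 0 and 1
#   d += sub                             # append rows column-wise (keys must match)
#   d.tofile('data_out.txt', header=True, sep=' ')

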
class FslEV3(ColumnData):
    """IO helper to read FSL's EV3 files.

    This is a three-column textfile format that is used to specify stimulation
    protocols for fMRI data analysis in FSL's FEAT module.

    Data is always read as `float`.
    """
    def __init__(self, source):
        """Read and write FSL EV3 files.

        Parameter
        ---------

        - `source`: filename of an EV3 file
        """
        # init data from known format
        ColumnData.__init__(self, source,
                            header=['onsets', 'durations', 'intensities'],
                            sep=None, dtype=float)

    def getNEVs(self):
        """Returns the number of EVs in the file.
        """
        return self.getNRows()

    def getEV(self, evid):
        """Returns a tuple of (onset time, stimulus duration, intensity) for a
        certain EV.
        """
        return (self['onsets'][evid],
                self['durations'][evid],
                self['intensities'][evid])

    def tofile(self, filename):
        """Write data to a FSL EV3 file.
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=['onsets', 'durations', 'intensities'],
                          sep=' ')


    onsets = property(fget=lambda self: self['onsets'])
    durations = property(fget=lambda self: self['durations'])
    intensities = property(fget=lambda self: self['intensities'])
    nevs = property(fget=getNEVs)

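# Usage sketch (editorial addition, not part of the original module):
# 'design.ev3' is a hypothetical three-column EV3 file as used by FSL's FEAT
# (onset, duration, intensity per line, whitespace-separated).
#
#   ev = FslEV3('design.ev3')
#   print ev.nevs                     # number of EVs (rows)
#   print ev.onsets, ev.durations, ev.intensities
#   print ev.getEV(0)                 # (onset, duration, intensity) of first EV
#   ev.tofile('design_copy.ev3')      # write back in EV3 layout

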
class SampleAttributes(ColumnData):
    """Read and write PyMVPA sample attribute definitions from and to text
    files.
    """
    def __init__(self, source):
        """Read PyMVPA sample attributes from disk.

        Parameter
        ---------

        - `source`: filename of an attribute file
        """
        ColumnData.__init__(self, source,
                            header=['labels', 'chunks'],
                            sep=None, dtype=float)

    def tofile(self, filename):
        """Write sample attributes to a text file.
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=['labels', 'chunks'],
                          sep=' ')

    def getNSamples(self):
        """Returns the number of samples in the file.
        """
        return self.getNRows()


    nsamples = property(fget=getNSamples)

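# Usage sketch (editorial addition, not part of the original module):
# 'attributes.txt' is a hypothetical two-column text file with one label and
# one chunk value per sample (both read as float).
#
#   attr = SampleAttributes('attributes.txt')
#   print attr.nsamples
#   print attr.labels, attr.chunks
#   attr.tofile('attributes_copy.txt')

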
class McFlirtParams(ColumnData):
    """Read and write McFlirt's motion estimation parameters from and to text
    files.
    """
    header_def = ['rot1', 'rot2', 'rot3', 'x', 'y', 'z']

    def __init__(self, source):
        """
        :Parameter:

          source: str
            Filename of a parameter file.
        """
        ColumnData.__init__(self, source,
                            header=McFlirtParams.header_def,
                            sep=None, dtype=float)

    def tofile(self, filename):
        """Write motion parameters to file.
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=McFlirtParams.header_def,
                          sep=' ')
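

# Usage sketch (editorial addition, not part of the original module):
# 'bold_mcf.par' is a hypothetical parameter file as written by FSL's MCFLIRT,
# with six whitespace-separated columns per volume (three rotations, three
# translations).
#
#   mc = McFlirtParams('bold_mcf.par')
#   print mc.getNRows()               # number of volumes
#   print mc['x'], mc.rot1            # translation and rotation time courses
#   mc.tofile('bold_mcf_copy.par')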