1
2
3
4
5
6
7
8
9 """Some little helper for reading (and writing) common formats from and to
10 disk."""
11
12 __docformat__ = 'restructuredtext'
13
14 import numpy as N
15 import mvpa.misc.copy as copy
16 from mvpa.base.dochelpers import enhancedDocString
17 from sets import Set
18 from re import sub as re_sub
19 from mvpa.base import warning
20
21 if __debug__:
22 from mvpa.base import debug
23
24
26 """Base class for data readers.
27
28 Every subclass has to put all information into to variable:
29
30 `self._data`: ndarray
31 The data array has to have the samples separating dimension along the
32 first axis.
33 `self._props`: dict
34 All other meaningful information has to be stored in a dictionary.
35
36 This class provides two methods (and associated properties) to retrieve
37 this information.
38 """
40 """Cheap init.
41 """
42 self._props = {}
43 self._data = None
44
45
47 """Return the dictionary with the data properties.
48 """
49 return self._props
50
51
53 """Return the data array.
54 """
55 return self._data
56
57
58 data = property(fget=getData, doc="Data array")
59 props = property(fget=getPropsAsDict, doc="Property dict")
60
61
62
64 """Read data that is stored in columns of text files.
65
66 All read data is available via a dictionary-like interface. If
67 column headers are available, the column names serve as dictionary keys.
68 If no header exists an articfical key is generated: str(number_of_column).
69
70 Splitting of text file lines is performed by the standard split() function
71 (which gets passed the `sep` argument as separator string) and each
72 element is converted into the desired datatype.
73
74 Because data is read into a dictionary no two columns can have the same
75 name in the header! Each column is stored as a list in the dictionary.
76 """
    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        :Parameters:
          source : basestring or dict
            If values is given as a string all data is read from the
            file and additional keyword arguments can be used to
            customize the read procedure. If a dictionary is passed
            a deepcopy is performed.
          header : bool or list of basestring
            Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique
            column names will be generated (see class docs). If
            `header` is a python list, it's content is used as column
            header names and its length has to match the number of
            columns in the file.
          sep : basestring or None
            Separator string. The actual meaning depends on the output
            format (see class docs).
          headersep : basestring or None
            Separator string used in the header. The actual meaning
            depends on the output format (see class docs).
          dtype : type or list(types)
            Desired datatype(s). Datatype per column can be specified by
            passing a list of types.
          skiplines : int
            Number of lines to skip at the beginning of the file.
        """
        # start out as an empty dict; columns are filled in below
        dict.__init__(self)

        # order of column names as found in a file header;
        # stays None unless _fromFile() reads a header line
        self._header_order = None

        if isinstance(source, str):
            # a string source is interpreted as a filename
            self._fromFile(source, header=header, sep=sep, headersep=headersep,
                           dtype=dtype, skiplines=skiplines)

        elif isinstance(source, dict):
            # NOTE(review): the docstring promises a deepcopy, but only the
            # column references are copied here -- confirm intended behavior
            for k, v in source.iteritems():
                self[k] = v
            # ensure all columns have equal length
            self._check()

        else:
            raise ValueError, 'Unkown source for ColumnData [%s]' \
                  % `type(source)`

        # Dynamically register a read-only property on the *class* for every
        # column key that does not clash with an existing class attribute, so
        # columns become accessible as attributes as well.  Note that this
        # mutates the class and therefore affects all instances.
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._getAttrib('%s')" % (k)
                # sanitize the key into a valid attribute name:
                # brackets and spaces become underscores ...
                k_ = re_sub('[[\] ]', '_', k)
                # ... runs of underscores are collapsed ...
                k_ = re_sub('__+', '_', k_)
                # ... and quote characters are stripped
                k_ = re_sub('["\']', '', k_)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData key %s"
                          % (k_, k))
                # HACK: pull the defining class into this scope so the
                # following exec can attach the property to it
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k_, getter)
156 __doc__ = enhancedDocString('ColumnData', locals())
157
158
160 """Return corresponding value if given key is known to current instance
161
162 Is used for automatically added properties to the class.
163
164 :raises: ValueError, if `key` is not known to given instance
165
166 :return: value if `key` is known
167 """
168 if self.has_key(key):
169 return self[key]
170 else:
171 raise ValueError, "Instance %s has no data about %s" \
172 % (`self`, `key`)
173
174
176 s = self.__class__.__name__
177 if len(self.keys())>0:
178 s += " %d rows, %d columns [" % \
179 (self.getNRows(), self.getNColumns())
180 s += reduce(lambda x, y: x+" %s" % y, self.keys())
181 s += "]"
182 return s
183
185 """Performs some checks for data integrity.
186 """
187 length = None
188 for k in self.keys():
189 if length == None:
190 length = len(self[k])
191 else:
192 if not len(self[k]) == length:
193 raise ValueError, "Data integrity lost. Columns do not " \
194 "have equal length."
195
196
197 - def _fromFile(self, filename, header, sep, headersep,
198 dtype, skiplines):
199 """Loads column data from file -- clears object first.
200 """
201
202 self.clear()
203
204 file_ = open(filename, 'r')
205
206 self._header_order = None
207
208 [ file_.readline() for x in range(skiplines) ]
209 """Simply skip some lines"""
210
211 if header == True:
212
213 hdr = file_.readline().split(headersep)
214
215 hdr = filter(lambda x:len(x.strip()), hdr)
216 self._header_order = hdr
217 elif isinstance(header, list):
218 hdr = header
219 else:
220 hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
221
222 file_.seek(0)
223 [ file_.readline() for x in range(skiplines) ]
224
225
226
227 tbl = [ [] for i in xrange(len(hdr)) ]
228
229
230 if not isinstance(dtype, list):
231 dtype = [dtype] * len(hdr)
232
233
234 for line in file_:
235
236 line = line.strip()
237
238 if not line or line.startswith('#'):
239 continue
240 l = line.split(sep)
241
242 if not len(l) == len(hdr):
243 raise RuntimeError, \
244 "Number of entries in line [%i] does not match number " \
245 "of columns in header [%i]." % (len(l), len(hdr))
246
247 for i, v in enumerate(l):
248 if not dtype[i] is None:
249 try:
250 v = dtype[i](v)
251 except ValueError:
252 warning("Can't convert %s to desired datatype %s." %
253 (`v`, `dtype`) + " Leaving original type")
254 tbl[i].append(v)
255
256
257 if not len(tbl) == len(hdr):
258 raise RuntimeError, "Number of columns read from file does not " \
259 "match the number of header entries."
260
261
262 for i, v in enumerate(hdr):
263 self[v] = tbl[i]
264
265
267 """Merge column data.
268 """
269
270 for k, v in other.iteritems():
271 if not self.has_key(k):
272 raise ValueError, 'Unknown key [%s].' % `k`
273 if not isinstance(v, list):
274 raise ValueError, 'Can only merge list data, but got [%s].' \
275 % `type(v)`
276
277
278 self[k] += v
279
280
281 self._check()
282
283 return self
284
285
287 """Return new ColumnData with selected samples"""
288
289 data = copy.deepcopy(self)
290 for k, v in data.iteritems():
291 data[k] = [v[x] for x in selection]
292
293 data._check()
294 return data
295
296
298 """Returns the number of columns.
299 """
300 return len(self.keys())
301
302
303 - def tofile(self, filename, header=True, header_order=None, sep=' '):
304 """Write column data to a text file.
305
306 :Parameter:
307 filename: Think about it!
308 header: If `True` a column header is written, using the column
309 keys. If `False` no header is written.
310 header_order: If it is a list of strings, they will be used instead
311 of simply asking for the dictionary keys. However
312 these strings must match the dictionary keys in number
313 and identity. This argument type can be used to
314 determine the order of the columns in the output file.
315 The default value is `None`. In this case the columns
316 will be in an arbitrary order.
317 sep: String that is written as a separator between to data columns.
318 """
319
320 file_ = open(filename, 'w')
321
322
323 if header_order == None:
324 if self._header_order is None:
325 col_hdr = self.keys()
326 else:
327
328 col_hdr = self._header_order + \
329 list(Set(self.keys()).difference(
330 Set(self._header_order)))
331 else:
332 if not len(header_order) == self.getNColumns():
333 raise ValueError, 'Header list does not match number of ' \
334 'columns.'
335 for k in header_order:
336 if not self.has_key(k):
337 raise ValueError, 'Unknown key [%s]' % `k`
338 col_hdr = header_order
339
340 if header == True:
341 file_.write(sep.join(col_hdr) + '\n')
342
343
344 for r in xrange(self.getNRows()):
345
346 l = [str(self[k][r]) for k in col_hdr]
347
348 file_.write(sep.join(l) + '\n')
349
350 file_.close()
351
352
354 """Returns the number of rows.
355 """
356
357 if not len(self.keys()):
358 return 0
359
360 else:
361 return len(self[self.keys()[0]])
362
363 ncolumns = property(fget=getNColumns)
364 nrows = property(fget=getNRows)
365
366
367
369 """Read and write PyMVPA sample attribute definitions from and to text
370 files.
371 """
372 - def __init__(self, source, literallabels=False):
373 """Read PyMVPA sample attributes from disk.
374
375 :Parameter:
376 source: filename of an atrribute file
377 """
378 if literallabels:
379 dtypes = [str, float]
380 else:
381 dtypes = float
382
383 ColumnData.__init__(self, source,
384 header=['labels', 'chunks'],
385 sep=None, dtype=dtypes)
386
387
389 """Write sample attributes to a text file.
390 """
391 ColumnData.tofile(self, filename,
392 header=False,
393 header_order=['labels', 'chunks'],
394 sep=' ')
395
396
398 """Returns the number of samples in the file.
399 """
400 return self.getNRows()
401
402
403 nsamples = property(fget=getNSamples)
404
405
407 """Base class for sensor location readers.
408
409 Each subclass should provide x, y, z coordinates via the `pos_x`, `pos_y`,
410 and `pos_z` attrbibutes.
411
412 Axes should follow the following convention:
413
414 x-axis: left -> right
415 y-axis: anterior -> posterior
416 z-axis: superior -> inferior
417 """
422
423
425 """Get the sensor locations as an array.
426
427 :Returns:
428 (nchannels x 3) array with coordinates in (x, y, z)
429 """
430 return N.array((self.pos_x, self.pos_y, self.pos_z)).T
431
432
433
435 """Read sensor location definitions from a specific text file format.
436
437 File layout is assumed to be 5 columns:
438
439 1. sensor name
440 2. some useless integer
441 3. position on x-axis
442 4. position on y-axis
443 5. position on z-axis
444 """
446 """Read sensor locations from file.
447
448 :Parameter:
449 source : filename of an attribute file
450 """
451 SensorLocations.__init__(
452 self, source,
453 header=['names', 'some_number', 'pos_x', 'pos_y', 'pos_z'],
454 sep=None, dtype=[str, int, float, float, float])
455
456
458 """Read sensor location definitions from a specific text file format.
459
460 File layout is assumed to be 7 columns:
461
462 1: sensor name
463 2: position on y-axis
464 3: position on x-axis
465 4: position on z-axis
466 5-7: same as 2-4, but for some outer surface thingie.
467
468 Note that x and y seem to be swapped, ie. y as defined by SensorLocations
469 conventions seems to be first axis and followed by x.
470
471 Only inner surface coordinates are reported by `locations()`.
472 """
474 """Read sensor locations from file.
475
476 :Parameter:
477 source : filename of an attribute file
478 """
479 SensorLocations.__init__(
480 self, source,
481 header=['names', 'pos_y', 'pos_x', 'pos_z',
482 'pos_y2', 'pos_x2', 'pos_z2'],
483 sep=None, dtype=[str, float, float, float, float, float, float])
484
485
def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert design matrix into a list of labels

    Given a design, assign a single label to any given sample.

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each known will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign for samples where none of EVs was given a value
      func : functor
        Function which decides whether a value should be considered

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label

    :raises: ValueError, if more than one EV meets the criterion in a row
    """
    keys = columndata.keys()
    labels = []
    for row in range(columndata.nrows):
        entries = [ columndata[key][row] for key in keys ]
        # keep only EVs whose value in this row meets the criterion
        # (a list comprehension so len() works on Python 2 and 3 alike)
        selected = [ (key, entry) for key, entry in zip(keys, entries)
                     if func(entry) ]
        nselected = len(selected)

        if nselected > 1:
            # ambiguous -- more than one EV claims this sample
            raise ValueError(
                "Row #%i with items %s has multiple entries "
                "meeting the criterion. Cannot decide on the label"
                % (row, entries))
        elif nselected == 1:
            label = selected[0][0]
        else:
            label = baseline_label
        labels.append(label)
    return labels
529
530
# Codename -> description mapping of the chunking heuristics understood by
# `labels2chunks` below; also used to compose its unknown-method error message.
__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }
534
def labels2chunks(labels, method="alllabels", ignore_labels=None):
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base chunking on
      method : basestring
        codename for method to use. Known are the keys of
        `__known_chunking_methods` (currently only ``alllabels``)
      ignore_labels : list of basestring
        depends on the method. If method ``alllabels``, then don't
        seek for such labels in chunks. E.g. some 'reject' samples

    :rtype: list
    """
    # NOTE: the original first statement formatted the docstring with '%'
    # at call time and discarded the result, leaving the function without a
    # docstring at all; replaced with a static docstring above.
    chunks = []
    if ignore_labels is None:
        ignore_labels = []
    alllabels = set(labels).difference(ignore_labels)
    if method == 'alllabels':
        seenlabels = set()
        lastlabel = None
        chunk = 0
        for label in labels:
            if label != lastlabel:
                # a new chunk starts as soon as every (non-ignored) label
                # was seen at least once in the current one
                if seenlabels == alllabels:
                    chunk += 1
                    seenlabels = set()
                lastlabel = label
            if label not in ignore_labels:
                seenlabels.add(label)
            chunks.append(chunk)
        chunks = N.array(chunks)
        # merge an incomplete trailing chunk into the previous one
        if seenlabels != alllabels:
            chunks[chunks == chunk] = chunk - 1
            chunks = list(chunks)
    else:
        errmsg = "Unknown method to derive chunks is requested. Known are:\n"
        for known, descr in __known_chunking_methods.items():
            errmsg += "  %s : %s\n" % (known, descr)
        raise ValueError(errmsg)
    return chunks
578