Source Code for Module mvpa.misc.data_generators

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Miscellaneous data generators for unittests and demos"""

__docformat__ = 'restructuredtext'

import numpy as N

from sets import Set

from mvpa.datasets import Dataset

if __debug__:
    from mvpa.misc import debug

def dumbFeatureDataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return Dataset(samples=data, labels=regs)
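
A minimal usage sketch (illustrative, not part of the module source): the 24
samples above fall into three labels of 8 samples each, so, assuming the
standard PyMVPA 0.x `Dataset` attributes `nsamples`, `nfeatures` and
`uniquelabels`, one would expect:

    >>> from mvpa.misc.data_generators import dumbFeatureDataset
    >>> ds = dumbFeatureDataset()
    >>> ds.nsamples, ds.nfeatures
    (24, 2)
    >>> list(ds.uniquelabels)
    [1, 2, 3]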

def dumbFeatureBinaryDataset():
    """Very simple binary (2 labels) dataset
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([0] * 12) + ([1] * 12)

    return Dataset(samples=data, labels=regs)
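
The same sketch for the binary variant (again illustrative, under the same
assumed `Dataset` attributes): the first 12 samples carry label 0, the last
12 label 1.

    >>> from mvpa.misc.data_generators import dumbFeatureBinaryDataset
    >>> ds = dumbFeatureBinaryDataset()
    >>> ds.nsamples, list(ds.uniquelabels)
    (24, [0, 1])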

def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is some normally
    distributed beastie around a specified mean (0 if None).

    `snr` assumes that the signal has std 1.0, so we just divide the
    noise by `snr`.

    Probably it is a generalization of `pureMultivariateSignal` where
    means=[[0, 1], [1, 0]].

    Specify either `means` or `nonbogus_features`, so means get
    assigned accordingly.
    """

    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (nonbogus_features is not None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't simply assign a feature to a " \
                  "class: more nonbogus_features than labels"
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if means is not None:
        # add mean
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring it 'under 1', since otherwise some classifiers have difficulties
    # during optimization
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat(i, perlabel) for i in range(nlabels)])
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks)
    ds.nonbogus_features = nonbogus_features
    return ds
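
For instance (an illustrative sketch, not from the original source), a
two-class dataset where feature 0 carries the signal for the first label and
feature 3 for the second, with the remaining features being pure noise; note
that `perlabel` should be divisible by `nchunks` for the chunk assignment
above to come out even:

    >>> from mvpa.misc.data_generators import normalFeatureDataset
    >>> ds = normalFeatureDataset(perlabel=10, nlabels=2, nfeatures=4,
    ...                           nchunks=5, nonbogus_features=[0, 3],
    ...                           snr=3.0)
    >>> ds.nsamples, ds.nfeatures
    (20, 4)
    >>> ds.nonbogus_features
    [0, 3]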

def pureMultivariateSignal(patterns, signal2noise=1.5, chunks=None):
    """Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    %%%%%%%%%
    % O % X %
    %%%%%%%%%
    % X % O %
    %%%%%%%%%
    """

    # start with noise
    data = N.random.normal(size=(4*patterns, 2))

    # add signal
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # two conditions
    regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))

    return Dataset(samples=data, labels=regs, chunks=chunks)
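
As the checkerboard in the docstring suggests, each class occupies two
diagonally opposite quadrants of the 2d feature space, so neither feature
alone separates the classes. A quick illustrative check (again assuming the
PyMVPA 0.x `Dataset` attributes):

    >>> from mvpa.misc.data_generators import pureMultivariateSignal
    >>> ds = pureMultivariateSignal(10, signal2noise=1.5, chunks=1)
    >>> ds.nsamples, ds.nfeatures
    (40, 2)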

def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
                           perlabel=50, activation_probability_steps=1,
                           randomseed=None, randomvoxels=False):
    """ NOT FINISHED """
    raise NotImplementedError

    if dataset is None and labels is None:
        raise ValueError, \
              "Provide at least labels or a background dataset"

    if dataset is None:
        nlabels = len(labels)
    else:
        nchunks = len(dataset.uniquechunks)

    N.random.seed(randomseed)

    # Create a sequence of indexes from which to select voxels to be used
    # for features
    if randomvoxels:
        indexes = N.random.permutation(dataset.nfeatures)
    else:
        indexes = N.arange(dataset.nfeatures)

    allind, maps = [], []
    if __debug__:
        debug('DG', "Ugly creation of the copy of background")

    dtype = dataset.samples.dtype
    if not N.issubdtype(dtype, N.float):
        dtype = N.float
    totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)

    for l in xrange(len(labels)):
        label = labels[l]
        if __debug__:
            debug('DG', "Simulating independent labels for %s" % label)

        # What sample ids belong to this label
        labelids = dataset.idsbylabels(label)

        # What features belong here and what is left over
        nfeatures = perlabel * activation_probability_steps
        ind, indexes = indexes[0:nfeatures], \
                       indexes[nfeatures:]
        allind += list(ind)             # store what indexes we used

        # Create a dataset only for 'selected' features. NB there is a
        # side effect that selectFeatures will sort those ind provided
        ds = dataset.selectFeatures(ind)
        ds.samples[:] = 0.0             # zero them out

        # assign data
        prob = [1.0 - x*1.0/activation_probability_steps
                for x in xrange(activation_probability_steps)]

        # repeat so each feature gets its own
        probabilities = N.repeat(prob, perlabel)
        if __debug__:
            debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))

        for chunk in ds.uniquechunks:
            chunkids = ds.idsbychunks(chunk) # samples in this chunk
            ids = list(Set(chunkids).intersection(Set(labelids)))
            chunkvalue = N.random.uniform() # random number to decide whether
                                            # to 'activate' the voxel
            for id_ in ids:
                ds.samples[id_, :] = \
                    (chunkvalue <= probabilities).astype('float')

        maps.append(N.array(probabilities, copy=True))

        signal = ds.map2Nifti(ds.samples)
        totalsignal[:, ind] += ds.samples

    # figure out average variance across all 'working' features
    wfeatures = dataset.samples[:, allind]
    meanstd = N.mean(N.std(wfeatures, 1))
    if __debug__:
        debug('DG', "Mean deviation is %f" % meanstd)

    # XXX `options` is undefined here -- one more reason this function is
    # marked NOT FINISHED above
    totalsignal *= meanstd * options.snr
    # add signal on top of background
    dataset.samples += totalsignal

    return dataset

def getMVPattern(s2n):
    """Simple multivariate dataset"""

    run1 = pureMultivariateSignal(5, s2n, 1)
    run2 = pureMultivariateSignal(5, s2n, 2)
    run3 = pureMultivariateSignal(5, s2n, 3)
    run4 = pureMultivariateSignal(5, s2n, 4)
    run5 = pureMultivariateSignal(5, s2n, 5)
    run6 = pureMultivariateSignal(5, s2n, 6)

    data = run1 + run2 + run3 + run4 + run5 + run6

    return data
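
Each `pureMultivariateSignal(5, ...)` call yields 4*5 = 20 samples, and the
six runs are concatenated with distinct chunk ids 1-6, so the combined
dataset should hold 120 samples across 6 chunks -- suitable, e.g., for
cross-validation over runs. Illustrative check (assuming `Dataset` supports
`+` for concatenation, as used above):

    >>> from mvpa.misc.data_generators import getMVPattern
    >>> ds = getMVPattern(3.0)
    >>> ds.nsamples, len(ds.uniquechunks)
    (120, 6)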

def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD. For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1, x_2) = r_1 cos(x_1) + r_2 cos(x_1 + x_2) + N(0, 0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
    r = N.array([2.0, 1.3])
    x = N.random.rand(size, 2)
    x *= N.array(intervals[:, 1] - intervals[:, 0])
    x += N.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', 'Column %d Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # f(x_1, x_2) as given in the docstring above
    y = r[0]*N.cos(x[:, 0]) + r[1]*N.cos(x.sum(1)) + \
        N.random.randn(size)*N.sqrt(0.0025)
    y -= y.mean()
    x34 = x + N.random.randn(size, 2)*0.02
    x56 = N.random.randn(size, 2)
    x = N.hstack([x, x34, x56])
    return Dataset(samples=x, labels=y)
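
The generated regression dataset thus has 200 samples (for the default
`size`) and 6 features: x_1 and x_2 are informative, x_3 and x_4 are their
jittered copies, and x_5, x_6 are pure noise, which is what makes it useful
for testing ARD kernels. An illustrative check (assuming `samples` and
`labels` expose numpy arrays, as the module's own code relies on):

    >>> from mvpa.misc.data_generators import wr1996
    >>> ds = wr1996()
    >>> ds.samples.shape, ds.labels.shape
    ((200, 6), (200,))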