Package mvpa :: Package misc :: Module data_generators
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.misc.data_generators

  1  #emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  #ex: set sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Miscelaneous data generators for unittests and demos""" 
 10   
 11  __docformat__ = 'restructuredtext' 
 12   
 13  import numpy as N 
 14   
 15  from sets import Set 
 16   
 17  from mvpa.datasets import Dataset 
 18   
 19  if __debug__: 
 20      from mvpa.base import debug 
 21   
def multipleChunks(func, n_chunks, *args, **kwargs):
    """Replicate datasets multiple times raising different chunks

    Given some randomized (noisy) generator of a dataset with a single
    chunk, call the generator `n_chunks` times and place each result
    into a distinct chunk (chunk ids start at 1).

    :Parameters:
      func : callable
        Dataset generator, invoked as ``func(*args, **kwargs)``; must
        return an object whose ``.chunks`` supports slice assignment
        and which supports in-place addition (``+=``).
      n_chunks : int
        Number of times to invoke the generator; must be positive.

    :Returns: the accumulated dataset.

    :Raises: ValueError if `n_chunks` is not positive (previously this
      surfaced as an UnboundLocalError on the final ``return``).
    """
    if n_chunks < 1:
        raise ValueError("n_chunks must be a positive integer")
    dataset = None
    # `range` instead of py2-only `xrange`: identical behavior in py2,
    # keeps the function usable under py3 as well
    for chunk in range(n_chunks):
        dataset_ = func(*args, **kwargs)
        dataset_.chunks[:] = chunk + 1      # chunk ids are 1-based
        if dataset is None:
            dataset = dataset_
        else:
            dataset += dataset_

    return dataset
38 39
def dumbFeatureDataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    # samples enumerate all (value, bit) pairs for values 1..12,
    # in ascending order: [1,0], [1,1], [2,0], ..., [12,1]
    data = [[value, bit] for value in range(1, 13) for bit in (0, 1)]
    # eight consecutive samples per label: 1, 2, 3
    regs = [1] * 8 + [2] * 8 + [3] * 8

    return Dataset(samples=data, labels=regs)
50 51
def dumbFeatureBinaryDataset():
    """Very simple binary (2 labels) dataset
    """
    # samples enumerate all (value, bit) pairs for values 1..12,
    # in ascending order: [1,0], [1,1], [2,0], ..., [12,1]
    data = [[value, bit] for value in range(1, 13) for bit in (0, 1)]
    # first half labeled 0, second half labeled 1
    regs = [0] * 12 + [1] * 12

    return Dataset(samples=data, labels=regs)
62 63 64
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is some normally
    distributed beastie around specified mean (0 if None).

    snr is assuming that signal has std 1.0 so we just divide noise by snr

    Probably it is a generalization of pureMultivariateSignal where
    means=[ [0,1], [1,0] ]

    Specify either means or nonbogus_features so means get assigned
    accordingly

    :Parameters:
      perlabel : int
        Number of samples per label; should be divisible by `nchunks`
        (a remainder silently shortens the chunks array)
      nlabels : int
        Number of distinct labels
      nfeatures : int
        Total number of features per sample
      nchunks : int
        Number of chunks the samples of each label are spread over
      means : sequence or None
        Per-label mean vectors; alternative to `nonbogus_features`
      nonbogus_features : sequence or None
        For each label, index of the single feature carrying its signal
      snr : float
        Signal-to-noise ratio; noise std is ``1/sqrt(snr)``
    """

    # noise background scaled down by the requested SNR
    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (nonbogus_features is not None):
        if len(nonbogus_features) > nlabels:
            # call-form raise: valid in both py2 and py3 (the previous
            # `raise ValueError, msg` form is a py3 syntax error)
            raise ValueError("Can't assign simply a feature to a " +
                             "class: more nonbogus_features than labels")
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i, nonbogus in enumerate(nonbogus_features):
            means[i, nonbogus] = 1.0
    if means is not None:
        # add per-label mean, replicated for all samples of that label
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring it 'under 1', since otherwise some classifiers have difficulties
    # during optimization
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat('L%d' % i, perlabel)
                            for i in range(nlabels)])
    # explicit floor division -- the original relied on py2 implicit
    # integer division of perlabel/nchunks
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel//nchunks)
                            for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
    ds.nonbogus_features = nonbogus_features
    return ds
def pureMultivariateSignal(patterns, signal2noise = 1.5, chunks=None):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # gaussian noise background for all samples
    samples = N.random.normal(size=(4*patterns, 2))

    # the four consecutive groups of `patterns` samples
    quarter = [slice(i*patterns, (i + 1)*patterns) for i in range(4)]

    # feature 1 separates the first half from the second half ...
    samples[quarter[0], 1] += signal2noise
    samples[quarter[1], 1] += signal2noise
    samples[quarter[2], 1] -= signal2noise
    samples[quarter[3], 1] -= signal2noise
    # ... while feature 0 alternates within each half, yielding the
    # XOR-like layout sketched in the docstring
    samples[quarter[0], 0] -= signal2noise
    samples[quarter[1], 0] += signal2noise
    samples[quarter[2], 0] -= signal2noise
    samples[quarter[3], 0] += signal2noise

    # outer quarters are one condition, inner half the other
    labels = N.array([0] * patterns + [1] * 2 * patterns + [0] * patterns)

    return Dataset(samples=samples, labels=labels, chunks=chunks)
131 132
def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
                           perlabel=50, activation_probability_steps=1,
                           randomseed=None, randomvoxels=False):
    """ NOT FINISHED

    Draft generator meant to paint probabilistic per-label
    'activations' on top of a background dataset.  The immediate
    ``raise`` below makes everything after it unreachable; the draft
    code is kept verbatim for reference.
    """
    raise NotImplementedError

    # ---- everything below is unreachable draft code ----
    if dataset is None and labels is None:
        # NOTE(review): py2-only comma-raise syntax -- harmless while dead
        raise ValueError, \
              "Provide at least labels or a background dataset"

    if dataset is None:
        nlabels = len(labels)
    else:
        nchunks = len(dataset.uniquechunks)

    # make the generator reproducible when a seed is given
    N.random.seed(randomseed)

    # Create a sequence of indexes from which to select voxels to be used
    # for features
    if randomvoxels:
        indexes = N.random.permutation(dataset.nfeatures)
    else:
        indexes = N.arange(dataset.nfeatures)

    allind, maps = [], []
    if __debug__:
        debug('DG', "Ugly creation of the copy of background")

    # promote integer backgrounds to float so signal can be added
    dtype = dataset.samples.dtype
    if not N.issubdtype(dtype, N.float):
        dtype = N.float
    totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)

    for l in xrange(len(labels)):
        label = labels[l]
        if __debug__:
            debug('DG', "Simulating independent labels for %s" % label)

        # What sample ids belong to this label
        labelids = dataset.idsbylabels(label)


        # What features belong here and what is left over
        # NOTE(review): slicing [nfeatures+1:] drops one extra index --
        # looks like an off-by-one (probably meant [nfeatures:])
        nfeatures = perlabel * activation_probability_steps
        ind, indexes = indexes[0:nfeatures], \
                       indexes[nfeatures+1:]
        allind += list(ind) # store what indexes we used

        # Create a dataset only for 'selected' features NB there is
        # sideeffect that selectFeatures will sort those ind provided
        ds = dataset.selectFeatures(ind)
        ds.samples[:] = 0.0 # zero them out

        # assign data
        prob = [1.0 - x*1.0/activation_probability_steps
                for x in xrange(activation_probability_steps)]

        # repeat so each feature gets its own
        probabilities = N.repeat(prob, perlabel)
        if __debug__:
            debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))

        for chunk in ds.uniquechunks:
            chunkids = ds.idsbychunks(chunk) # samples in this chunk
            ids = list(Set(chunkids).intersection(Set(labelids)))
            chunkvalue = N.random.uniform() # random number to decide either
                                            # to 'activate' the voxel
            for id_ in ids:
                ds.samples[id_, :] = \
                    (chunkvalue <= probabilities).astype('float')

        maps.append(N.array(probabilities, copy=True))

        # NOTE(review): `signal` is computed but never used afterwards
        signal = ds.map2Nifti(ds.samples)
        totalsignal[:, ind] += ds.samples

    # figure out average variance across all 'working' features
    wfeatures = dataset.samples[:, allind]
    meanstd = N.mean(N.std(wfeatures, 1))
    if __debug__:
        debug('DG', "Mean deviation is %f" % meanstd)

    # NOTE(review): `options` is not defined anywhere in this module --
    # would NameError if this draft were ever revived as-is
    totalsignal *= meanstd * options.snr
    # add signal on top of background
    dataset.samples += totalsignal

    return dataset
220
def getMVPattern(s2n):
    """Simple multivariate dataset

    Six chunks of the pureMultivariateSignal XOR problem, 5 patterns
    per quadrant, at the given signal-to-noise ratio `s2n`.
    """
    dataset = multipleChunks(pureMultivariateSignal, 6, 5, s2n, 1)
    return dataset
225 226
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD. For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    # sampling intervals for the two informative inputs x_1, x_2
    intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
    r = N.array([2.0, 1.3])  # arm segment lengths r_1, r_2
    # draw x uniformly within the given intervals
    x = N.random.rand(size, 2)
    x *= N.array(intervals[:, 1]-intervals[:, 0])
    x += N.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d columnt Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # NOTE(review): the docstring says f = r_1 cos(x_1) + r_2 cos(x_1+x_2),
    # but the expression below computes r_1 * cos(x_1 + r_2*cos(x_1+x_2)) --
    # the parentheses look misplaced.  The 'expected relevances' above were
    # presumably obtained with this exact code, so confirm against them
    # before changing anything.
    y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \
        N.random.randn(size)*N.sqrt(0.0025)
    y -= y.mean()
    # x_3, x_4: slightly noisy copies of x_1, x_2;
    # x_5, x_6: pure noise (bogus features)
    x34 = x + N.random.randn(size, 2)*0.02
    x56 = N.random.randn(size, 2)
    x = N.hstack([x, x34, x56])
    return Dataset(samples=x, labels=y)
267 268
def sinModulated(n_instances, n_features,
                 flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. In the data label is a sin of a x^2 +
    uniform noise
    """
    if not flat:
        # random points in [0, pi) for every feature
        data = N.random.rand(n_instances, n_features)*N.pi
    else:
        # evenly spaced points in [0, pi); ndarray.resize pads the
        # additional feature columns with zeros
        data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi)
        data.resize(n_instances, n_features)
    # rounded sin of the summed squares, blurred by uniform noise
    labels = N.sin((data**2).sum(1)).round()
    labels += N.random.rand(labels.size)*noise
    return Dataset(samples=data, labels=labels)
284
def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2,
                data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels
    """
    # chirp: sin(10*pi*x^2) over x in [0, 1]
    x = N.linspace(0, 1, n_instances)
    y = N.sin(10 * N.pi * x ** 2)

    # pure-noise background; the chirp is added onto the first
    # n_nonbogus_features columns in one broadcast step
    data = N.random.normal(size=(n_instances, n_features)) * data_noise
    data[:, :n_nonbogus_features] += y[:, N.newaxis]

    # labels: the chirp itself with its own (typically smaller) noise
    labels = y + N.random.normal(size=(n_instances,)) * noise

    return Dataset(samples=data, labels=labels)
303 304
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """
    Generate a dataset from a linear function with Added White
    Gaussian Noise (AWGN). It can be multidimensional if 'slope' is a
    vector. If flat is True (in 1 dimension) generate equally spaced
    samples instead of random ones. This is useful for the test phase.
    """
    # number of input dimensions follows the slope vector (1 if scalar)
    dimensions = 1
    if isinstance(slope, N.ndarray):
        dimensions = slope.size

    if flat and dimensions == 1:
        # deterministic, equally spaced samples in [0, 1]
        x = N.linspace(0, 1, size)[:, N.newaxis]
    else:
        x = N.random.rand(size, dimensions)

    # NOTE(review): with an ndarray slope, N.dot(x, slope) is 1d and the
    # [:, N.newaxis] yields y of shape (size, 1); with the scalar default
    # the dot result is already (size, 1), so y becomes (size, 1, 1) --
    # looks unintended.  Confirm how Dataset normalizes such labels
    # before changing the expression.
    y = N.dot(x, slope)[:, N.newaxis] \
        + (N.random.randn(*(x.shape[0], 1)) * noise_std) + intercept

    return Dataset(samples=x, labels=y)
325 326
def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
    """Generate a noisy 2d dataset from a family of functions.

    For every function in `sfx`, draw `size_per_fx` normally
    distributed x values, compute y as the function value plus gaussian
    noise (std `noise_std`), and label each point by whether its noisy
    y falls below the decision function `dfx` at that x.  All samples
    are finally shifted by `center`.
    """
    xs, ys, all_labels = [], [], []
    for fx in sfx:
        abscissae = N.random.normal(size=size_per_fx)
        ordinates = fx(abscissae) + N.random.normal(size=abscissae.shape,
                                                    scale=noise_std)
        xs.append(abscissae)
        ys.append(ordinates)

        # label is 1 whenever the noisy value lies below dfx(x)
        all_labels.append(N.array(ordinates < dfx(abscissae), dtype='int'))

    samples = N.array((N.hstack(xs), N.hstack(ys))).squeeze().T
    labels = N.hstack(all_labels).squeeze().T

    samples += N.array(center)

    return Dataset(samples=samples, labels=labels)
348