
Source Code for Module mvpa.misc.data_generators

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
  9  """Miscelaneous data generators for unittests and demos""" 

__docformat__ = 'restructuredtext'

import numpy as N

from sets import Set

from mvpa.datasets import Dataset

if __debug__:
    from mvpa.base import debug

def multipleChunks(func, n_chunks, *args, **kwargs):
    """Replicate a dataset multiple times, assigning a distinct chunk to
    each replica.

    Given some randomized (noisy) generator of a dataset with a single
    chunk, call the generator multiple times and place the results into
    distinct chunks.
    """
    for chunk in xrange(n_chunks):
        dataset_ = func(*args, **kwargs)
        dataset_.chunks[:] = chunk + 1
        if chunk == 0:
            dataset = dataset_
        else:
            dataset += dataset_

    return dataset


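# A minimal usage sketch (added for illustration; assumes, as the code
# above does, that Dataset supports ``+=`` concatenation)::
#
#     >>> ds = multipleChunks(pureMultivariateSignal, 6, 5, 1.5, 1)
#     >>> ds.uniquechunks              # -> array([1, 2, 3, 4, 5, 6])
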
def dumbFeatureDataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return Dataset(samples=data, labels=regs)


def dumbFeatureBinaryDataset():
    """Very simple binary (2 labels) dataset
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([0] * 12) + ([1] * 12)

    return Dataset(samples=data, labels=regs)


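# Illustration (added): both dumb datasets share the same 24 samples and
# differ only in their labelling::
#
#     >>> ds = dumbFeatureBinaryDataset()
#     >>> ds.samples.shape             # -> (24, 2)
#     >>> ds.uniquelabels              # -> array([0, 1])
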
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=3.0):
    """Generate a univariate dataset with normal noise and specified means.

    :Keywords:
      perlabel : int
        Number of samples per each label
      nlabels : int
        Number of labels in the dataset
      nfeatures : int
        Total number of features (including bogus features which carry
        no label-related signal)
      nchunks : int
        Number of chunks (perlabel should be a multiple of nchunks)
      means : None or list of float or ndarray
        Specified means for each of the features among nfeatures.
      nonbogus_features : None or list of int
        Indexes of non-bogus features (1 per label)
      snr : float
        Signal-to-noise ratio; assuming that the signal has std 1.0, the
        random normal noise simply gets divided by sqrt(snr)

    This is probably a generalization of pureMultivariateSignal with
    means=[[0, 1], [1, 0]].

    Specify either means or nonbogus_features, so that means get assigned
    accordingly.
    """

    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't simply assign a feature to a " \
                  "class: more nonbogus_features than labels"
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add the means
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring it 'under 1', since otherwise some classifiers have difficulties
    # during optimization
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat('L%d' % i, perlabel)
                            for i in range(nlabels)])
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel/nchunks) for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
    ds.nonbogus_features = nonbogus_features
    return ds

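# Usage sketch (added; the parameter values are arbitrary)::
#
#     >>> ds = normalFeatureDataset(perlabel=10, nlabels=2, nfeatures=4,
#     ...                           nchunks=2, nonbogus_features=[0, 3],
#     ...                           snr=5.0)
#     >>> ds.samples.shape             # -> (20, 4); features 0 and 3 carry signal
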
def pureMultivariateSignal(patterns, signal2noise=1.5, chunks=None):
    """Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # start with noise
    data = N.random.normal(size=(4*patterns, 2))

    # add signal
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # two conditions
    regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))

    return Dataset(samples=data, labels=regs, chunks=chunks)


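# Illustration (added): the layout above is an XOR-like pattern, so
# neither feature alone separates the two classes::
#
#     >>> ds = pureMultivariateSignal(patterns=20, signal2noise=1.5)
#     >>> ds.samples.shape             # -> (80, 2), i.e. 4 quadrants x 20
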
def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
                           perlabel=50, activation_probability_steps=1,
                           randomseed=None, randomvoxels=False):
    """ NOT FINISHED """
    raise NotImplementedError
    # NB: everything below is unreachable draft code; e.g. 'options' is
    # not defined in this module

    if dataset is None and labels is None:
        raise ValueError, \
              "Provide at least labels or a background dataset"

    if dataset is None:
        nlabels = len(labels)
    else:
        nchunks = len(dataset.uniquechunks)

    N.random.seed(randomseed)

    # Create a sequence of indexes from which to select voxels to be used
    # for features
    if randomvoxels:
        indexes = N.random.permutation(dataset.nfeatures)
    else:
        indexes = N.arange(dataset.nfeatures)

    allind, maps = [], []
    if __debug__:
        debug('DG', "Ugly creation of the copy of background")

    dtype = dataset.samples.dtype
    if not N.issubdtype(dtype, N.float):
        dtype = N.float
    totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)

    for l in xrange(len(labels)):
        label = labels[l]
        if __debug__:
            debug('DG', "Simulating independent labels for %s" % label)

        # What sample ids belong to this label
        labelids = dataset.idsbylabels(label)

        # What features belong here and what is left over
        nfeatures = perlabel * activation_probability_steps
        ind, indexes = indexes[0:nfeatures], \
                       indexes[nfeatures+1:]
        allind += list(ind)             # store what indexes we used

        # Create a dataset only for 'selected' features.  NB there is a
        # side effect that selectFeatures will sort the provided ind
        ds = dataset.selectFeatures(ind)
        ds.samples[:] = 0.0             # zero them out

        # assign data
        prob = [1.0 - x*1.0/activation_probability_steps
                for x in xrange(activation_probability_steps)]

        # repeat so that each feature gets its own
        probabilities = N.repeat(prob, perlabel)
        if __debug__:
            debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))

        for chunk in ds.uniquechunks:
            chunkids = ds.idsbychunks(chunk)    # samples in this chunk
            ids = list(Set(chunkids).intersection(Set(labelids)))
            # random number to decide whether to 'activate' the voxel
            chunkvalue = N.random.uniform()
            for id_ in ids:
                ds.samples[id_, :] = \
                    (chunkvalue <= probabilities).astype('float')

        maps.append(N.array(probabilities, copy=True))

        signal = ds.map2Nifti(ds.samples)
        totalsignal[:, ind] += ds.samples

    # figure out average variance across all 'working' features
    wfeatures = dataset.samples[:, allind]
    meanstd = N.mean(N.std(wfeatures, 1))
    if __debug__:
        debug('DG', "Mean deviation is %f" % meanstd)

    totalsignal *= meanstd * options.snr
    # add signal on top of background
    dataset.samples += totalsignal

    return dataset

def getMVPattern(s2n):
    """Simple multivariate dataset"""
    return multipleChunks(pureMultivariateSignal, 6,
                          5, s2n, 1)


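# For reference (added): the call above expands to six chunks of
# pureMultivariateSignal(5, s2n, 1)::
#
#     >>> ds = getMVPattern(3.0)
#     >>> ds.samples.shape, len(ds.uniquechunks)   # -> ((120, 2), 6)
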
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD.  For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos(x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
    r = N.array([2.0, 1.3])
    x = N.random.rand(size, 2)
    x *= N.array(intervals[:, 1]-intervals[:, 0])
    x += N.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d column Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # f(x_1,x_2) = r_1 cos(x_1) + r_2 cos(x_1 + x_2) + noise, per the
    # docstring and the reference above
    y = r[0]*N.cos(x[:, 0]) + r[1]*N.cos(x.sum(1)) + \
        N.random.randn(size)*N.sqrt(0.0025)
    y -= y.mean()
    x34 = x + N.random.randn(size, 2)*0.02
    x56 = N.random.randn(size, 2)
    x = N.hstack([x, x34, x56])
    return Dataset(samples=x, labels=y)


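# Usage sketch (added): only the first two of the six columns carry the
# signal; columns 3-4 are noisy copies of them and 5-6 are pure noise::
#
#     >>> ds = wr1996(size=200)
#     >>> ds.samples.shape             # -> (200, 6)
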
def sinModulated(n_instances, n_features,
                 flat=False, noise=0.4):
    """Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing.  The label is the (rounded) sin of the
    sum of squared features, plus uniform noise.
    """
    if flat:
        data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi)
        data.resize(n_instances, n_features)
    else:
        data = N.random.rand(n_instances, n_features)*N.pi
    label = N.sin((data**2).sum(1)).round()
    label += N.random.rand(label.size)*noise
    return Dataset(samples=data, labels=label)

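# Example call (added)::
#
#     >>> ds = sinModulated(100, 3, noise=0.4)
#     >>> ds.samples.shape             # -> (100, 3)
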
def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2,
                data_noise=0.4, noise=0.1):
    """Generate a simple dataset for linear regressions

    Generates a chirp signal, populates n_nonbogus_features out of
    n_features with it at different noise levels, and then provides the
    signal itself, with additional noise, as the labels.
    """
    x = N.linspace(0, 1, n_instances)
    y = N.sin((10 * N.pi * x ** 2))

    data = N.random.normal(size=(n_instances, n_features))*data_noise
    for i in xrange(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + N.random.normal(size=(n_instances,))*noise

    return Dataset(samples=data, labels=labels)


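# Sketch (added)::
#
#     >>> ds = chirpLinear(50, n_features=4, n_nonbogus_features=2)
#     >>> ds.samples.shape             # -> (50, 4); features 0-1 carry the chirp
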
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """Generate a dataset from a linear function with AWGN
    (Additive White Gaussian Noise).

    It can be multidimensional if 'slope' is a vector.  If flat is True
    (in 1 dimension), generate equally spaced samples instead of random
    ones.  This is useful for the test phase.
    """
    dimensions = 1
    if isinstance(slope, N.ndarray):
        dimensions = slope.size

    if flat and dimensions == 1:
        x = N.linspace(0, 1, size)[:, N.newaxis]
    else:
        x = N.random.rand(size, dimensions)

    y = N.dot(x, slope)[:, N.newaxis] \
        + (N.random.randn(*(x.shape[0], 1)) * noise_std) + intercept

    return Dataset(samples=x, labels=y)


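# Sketch (added): a 2d case with 'slope' given as a vector::
#
#     >>> ds = linear_awgn(size=20, slope=N.array([0.4, -0.2]))
#     >>> ds.samples.shape             # -> (20, 2)
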
def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
    """Generate a 2d dataset from the functions in sfx, labelling each
    sample by whether its (noisy) value falls below the decision
    function dfx.
    """
    x = []
    y = []
    labels = []
    for fx in sfx:
        nx = N.random.normal(size=size_per_fx)
        ny = fx(nx) + N.random.normal(size=nx.shape, scale=noise_std)
        x.append(nx)
        y.append(ny)

        # label is 1 whenever the noisy value falls below the decision
        # function dfx
        labels.append(N.array(ny < dfx(nx), dtype='int'))

    samples = N.array((N.hstack(x), N.hstack(y))).squeeze().T
    labels = N.hstack(labels).squeeze().T

    samples += N.array(center)

    return Dataset(samples=samples, labels=labels)
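# Sketch (added): two noisy clouds around (5, 5), labeled by a linear
# decision function; the lambdas here are arbitrary examples::
#
#     >>> ds = noisy_2d_fx(100, lambda x: x,
#     ...                  [lambda x: x + 1, lambda x: x - 1],
#     ...                  center=(5, 5), noise_std=0.5)
#     >>> ds.samples.shape             # -> (200, 2)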