1
2
3
4
5
6
7
8
"""Miscellaneous data generators for unittests and demos"""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from sets import Set
16
17 from mvpa.datasets import Dataset
18
19 if __debug__:
20 from mvpa.base import debug
21
def multipleChunks(func, n_chunks, *args, **kwargs):
    """Replicate datasets multiple times raising different chunks

    Given some randomized (noisy) generator of a dataset with a single
    chunk, call the generator multiple times and place the results into
    distinct chunks.

    :Parameters:
      func : callable
        Dataset generator; invoked as ``func(*args, **kwargs)``.  Must
        return an object whose ``chunks`` attribute supports scalar
        slice-assignment and which supports ``+=`` concatenation.
      n_chunks : int
        Number of replicas to generate (chunk ids run 1..n_chunks).

    :Returns: the accumulated dataset, or None if ``n_chunks`` is 0.
    """
    dataset = None
    for chunk in range(n_chunks):
        dataset_ = func(*args, **kwargs)
        # tag every sample of this replica with a distinct chunk id
        dataset_.chunks[:] = chunk + 1
        if chunk == 0:
            dataset = dataset_
        else:
            # append this replica to the accumulated dataset
            dataset += dataset_

    return dataset
38
39
def dumbFeatureDataset():
    """Create a very simple dataset with 2 features and 3 labels

    24 fixed samples; feature 0 runs 1..12 (each value twice), feature 1
    alternates 0/1.  Labels assign 8 consecutive samples to each of the
    classes 1, 2 and 3.

    :Returns: `Dataset` with the fixed samples and labels.
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return Dataset(samples=data, labels=regs)
50
51
def dumbFeatureBinaryDataset():
    """Very simple binary (2 labels) dataset

    Same 24 fixed samples as `dumbFeatureDataset`, but the first 12
    samples get label 0 and the remaining 12 get label 1.

    :Returns: `Dataset` with the fixed samples and binary labels.
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([0] * 12) + ([1] * 12)

    return Dataset(samples=data, labels=regs)
62
63
64
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is some normally
    distributed beastie around specified mean (0 if None).

    snr is assuming that signal has std 1.0 so we just divide noise by snr

    Probably it is a generalization of pureMultivariateSignal where
    means=[ [0,1], [1,0] ]

    Specify either means or nonbogus_features so means get assigned
    accordingly

    :Parameters:
      perlabel : int
        Number of samples per label (must be divisible by `nchunks`
        for chunks to cover all samples).
      nlabels : int
        Number of labels/classes.
      nfeatures : int
        Total number of features per sample.
      nchunks : int
        Number of chunks each label's samples are spread over.
      means : array-like or None
        Per-label mean vectors (one row per label).
      nonbogus_features : sequence or None
        Feature index to carry the signal for each label; used to build
        `means` when `means` is None.  Must not exceed `nlabels` entries.
      snr : float
        Signal-to-noise ratio; noise std is 1/sqrt(snr).
    """
    # noise, scaled down by SNR (signal is assumed to have unit std)
    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (nonbogus_features is not None):
        if len(nonbogus_features) > nlabels:
            raise ValueError("Can't assign simply a feature to a "
                             "class: more nonbogus_features than labels")
        means = N.zeros((len(nonbogus_features), nfeatures))
        # pin a unit mean onto the designated feature of each label
        for i in range(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if means is not None:
        # shift each label's block of samples by its mean vector
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)

    # rescale so the largest absolute value is 1
    data = 1.0 / (N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat('L%d' % i, perlabel)
                            for i in range(nlabels)])
    # '//' keeps the repeat count integral under both Python 2 and 3
    chunks = N.concatenate([N.repeat(range(nchunks),
                                     perlabel // nchunks)
                            for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
    ds.nonbogus_features = nonbogus_features
    return ds
101
def pureMultivariateSignal(patterns, signal2noise=1.5, chunks=None):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%

    :Parameters:
      patterns : int
        Number of samples per quadrant (4*patterns samples in total).
      signal2noise : float
        Magnitude of the mean shift applied to each feature.
      chunks : sequence or None
        Passed through to the resulting `Dataset`.
    """
    # start from pure standard-normal noise, 2 features
    data = N.random.normal(size=(4*patterns, 2))

    # feature 1: first half shifted up, second half shifted down
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    # feature 0: alternate shifts within each half, so each quadrant of
    # the sign-grid above gets one block of samples
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # label 1 iff both feature shifts have the same sign -- an XOR-style
    # layout with no univariate (single-feature) separation
    regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))

    return Dataset(samples=data, labels=regs, chunks=chunks)
131
132
def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
                           perlabel=50, activation_probability_steps=1,
                           randomseed=None, randomvoxels=False):
    """ NOT FINISHED """
    raise NotImplementedError
    # NOTE(review): everything below is an unreachable draft kept for
    # reference -- it references names (`options`, `Set`, `debug`,
    # `dataset.map2Nifti`) whose availability is unverified here.

    if dataset is None and labels is None:
        raise ValueError("Provide at least labels or a background dataset")

    if dataset is None:
        nlabels = len(labels)
    else:
        nchunks = len(dataset.uniquechunks)

    N.random.seed(randomseed)

    # decide which features carry the simulated signal
    if randomvoxels:
        indexes = N.random.permutation(dataset.nfeatures)
    else:
        indexes = N.arange(dataset.nfeatures)

    allind, maps = [], []
    if __debug__:
        debug('DG', "Ugly creation of the copy of background")

    dtype = dataset.samples.dtype
    if not N.issubdtype(dtype, N.float):
        dtype = N.float
    totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)

    for l in range(len(labels)):
        label = labels[l]
        if __debug__:
            debug('DG', "Simulating independent labels for %s" % label)

        labelids = dataset.idsbylabels(label)

        # claim the next batch of feature indexes for this label
        nfeatures = perlabel * activation_probability_steps
        ind, indexes = indexes[0:nfeatures], \
                       indexes[nfeatures+1:]
        allind += list(ind)

        ds = dataset.selectFeatures(ind)
        ds.samples[:] = 0.0

        # decreasing activation probability per step
        prob = [1.0 - x*1.0/activation_probability_steps
                for x in range(activation_probability_steps)]

        probabilities = N.repeat(prob, perlabel)
        if __debug__:
            debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))

        for chunk in ds.uniquechunks:
            chunkids = ds.idsbychunks(chunk)
            ids = list(Set(chunkids).intersection(Set(labelids)))
            chunkvalue = N.random.uniform()

            for id_ in ids:
                ds.samples[id_, :] = \
                    (chunkvalue <= probabilities).astype('float')

        maps.append(N.array(probabilities, copy=True))

        signal = ds.map2Nifti(ds.samples)
        totalsignal[:, ind] += ds.samples

    # scale signal relative to mean deviation of the touched features
    wfeatures = dataset.samples[:, allind]
    meanstd = N.mean(N.std(wfeatures, 1))
    if __debug__:
        debug('DG', "Mean deviation is %f" % meanstd)

    # NOTE(review): `options` is not defined anywhere in this module
    totalsignal *= meanstd * options.snr

    dataset.samples += totalsignal

    return dataset
220
225
226
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD. For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
    r = N.array([2.0, 1.3])
    # draw x_1, x_2 uniformly within their intervals
    x = N.random.rand(size, 2)
    x *= N.array(intervals[:, 1]-intervals[:, 0])
    x += N.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d column Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # NOTE(review): this computes r1*cos(x1 + r2*cos(x1+x2)), while the
    # docstring formula reads r1*cos(x1) + r2*cos(x1+x2) -- kept as-is
    # since the expected relevances above were presumably derived from
    # this code; confirm against the reference before changing.
    y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \
        N.random.randn(size)*N.sqrt(0.0025)
    y -= y.mean()
    # x_3, x_4: noisy copies of x_1, x_2; x_5, x_6: pure noise
    x34 = x + N.random.randn(size, 2)*0.02
    x56 = N.random.randn(size, 2)
    x = N.hstack([x, x34, x56])
    return Dataset(samples=x, labels=y)
267
268
def sinModulated(n_instances, n_features,
                 flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. In the data label is a sin of a x^2 +
    uniform noise

    :Parameters:
      n_instances : int
        Number of samples.
      n_features : int
        Number of features per sample.
      flat : bool
        If True, feature 0 is an evenly spaced ramp over [0, pi) and the
        remaining features are zero-filled; otherwise all features are
        uniform random in [0, pi).
      noise : float
        Magnitude of the uniform noise added to the labels.
    """
    if flat:
        data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi)
        # in-place resize pads the newly added entries with zeros
        data.resize(n_instances, n_features)
    else:
        data = N.random.rand(n_instances, n_features)*N.pi
    # label: rounded sin of the squared-feature sum, plus uniform noise
    label = N.sin((data**2).sum(1)).round()
    label += N.random.rand(label.size)*noise
    return Dataset(samples=data, labels=label)
284
def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2,
                data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels

    :Parameters:
      n_instances : int
        Number of samples.
      n_features : int
        Total number of features; those beyond `n_nonbogus_features`
        are pure noise.
      n_nonbogus_features : int
        How many leading features carry the chirp signal.
      data_noise : float
        Noise scale applied to the features.
      noise : float
        Noise scale applied to the labels.
    """
    x = N.linspace(0, 1, n_instances)
    # chirp: instantaneous frequency grows with x
    y = N.sin((10 * N.pi * x ** 2))

    data = N.random.normal(size=(n_instances, n_features)) * data_noise
    # inject the signal into the first n_nonbogus_features features
    for i in range(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + N.random.normal(size=(n_instances,)) * noise

    return Dataset(samples=data, labels=labels)
303
304
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """
    Generate a dataset from a linear function with Added White
    Gaussian Noise (AWGN). It can be multidimensional if 'slope' is a
    vector. If flat is True (in 1 dimesion) generate equally spaces
    samples instead of random ones. This is useful for the test phase.

    :Parameters:
      size : int
        Number of samples.
      intercept : float
        Constant offset of the linear function.
      slope : float or ndarray
        Slope (scalar) or slope vector; its size determines the number
        of input dimensions.
      noise_std : float
        Standard deviation of the additive gaussian noise.
      flat : bool
        Equally spaced samples on [0, 1] (1-d only) instead of random.
    """
    # Normalize slope to a 1-d vector so N.dot(x, slope) always yields a
    # (size,) vector.  Previously a scalar slope (the default) left the
    # dot product shaped (size, 1); the extra N.newaxis then made it
    # (size, 1, 1) and broadcasting against the (size, 1) noise produced
    # misshaped (size, size, 1) labels.
    slope = N.atleast_1d(slope)
    dimensions = slope.size

    if flat and dimensions == 1:
        x = N.linspace(0, 1, size)[:, N.newaxis]
    else:
        x = N.random.rand(size, dimensions)

    # noisy linear response, one label column per sample
    y = N.dot(x, slope)[:, N.newaxis] \
        + (N.random.randn(x.shape[0], 1) * noise_std) + intercept

    return Dataset(samples=x, labels=y)
325
326
def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
    """Generate a noisy 2d dataset from a set of 1d functions.

    For each function in `sfx`, draws `size_per_fx` standard-normal x
    values, computes y = fx(x) plus gaussian noise, and labels each
    sample by which side of the decision function `dfx` it fell on.

    :Parameters:
      size_per_fx : int
        Number of samples generated per function in `sfx`.
      dfx : callable
        Decision function; a sample gets label 1 where y < dfx(x).
      sfx : sequence of callables
        Sample-generating functions, each mapping x to y.
      center : 2-sequence
        Offset added to all (x, y) samples.
      noise_std : float
        Standard deviation of the gaussian noise added to y.
    """
    x = []
    y = []
    labels = []
    for fx in sfx:
        nx = N.random.normal(size=size_per_fx)
        ny = fx(nx) + N.random.normal(size=nx.shape, scale=noise_std)
        x.append(nx)
        y.append(ny)

        # label by which side of the decision function the sample is on
        labels.append(N.array(ny < dfx(nx), dtype='int'))

    # stack all functions' samples into (n, 2) [x, y] rows
    samples = N.array((N.hstack(x), N.hstack(y))).squeeze().T
    labels = N.hstack(labels).squeeze().T

    samples += N.array(center)

    return Dataset(samples=samples, labels=labels)
348