1
2
3
4
5
6
7
8
9 """Miscelaneous data generators for unittests and demos"""
10
11 __docformat__ = 'restructuredtext'
12
13 import numpy as N
14
15 from sets import Set
16
17 from mvpa.datasets import Dataset
18
19 if __debug__:
20 from mvpa.base import debug
21
23 """Replicate datasets multiple times raising different chunks
24
25 Given some randomized (noisy) generator of a dataset with a single
26 chunk call generator multiple times and place results into a
27 distinct chunks
28 """
29 for chunk in xrange(n_chunks):
30 dataset_ = func(*args, **kwargs)
31 dataset_.chunks[:] = chunk + 1
32 if chunk == 0:
33 dataset = dataset_
34 else:
35 dataset += dataset_
36
37 return dataset
38
39
41 """Create a very simple dataset with 2 features and 3 labels
42 """
43 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
44 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
45 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
46 [12, 1]]
47 regs = ([1] * 8) + ([2] * 8) + ([3] * 8)
48
49 return Dataset(samples=data, labels=regs)
50
51
53 """Very simple binary (2 labels) dataset
54 """
55 data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
56 [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
57 [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
58 [12, 1]]
59 regs = ([0] * 12) + ([1] * 12)
60
61 return Dataset(samples=data, labels=regs)
62
63
64
65 -def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
66 means=None, nonbogus_features=None, snr=3.0):
67 """Generate a univariate dataset with normal noise and specified means.
68
69 :Keywords:
70 perlabel : int
71 Number of samples per each label
72 nlabels : int
73 Number of labels in the dataset
74 nfeatures : int
75 Total number of features (including bogus features which carry
76 no label-related signal)
77 nchunks : int
78 Number of chunks (perlabel should be multiple of nchunks)
79 means : None or list of float or ndarray
80 Specified means for each of features among nfeatures.
81 nonbogus_features : None or list of int
82 Indexes of non-bogus features (1 per label)
83 snr : float
84 Signal-to-noise ration assuming that signal has std 1.0 so we
85 just divide random normal noise by snr
86
87 Probably it is a generalization of pureMultivariateSignal where
88 means=[ [0,1], [1,0] ]
89
90 Specify either means or nonbogus_features so means get assigned
91 accordingly
92 """
93
94 data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
95 if (means is None) and (not nonbogus_features is None):
96 if len(nonbogus_features) > nlabels:
97 raise ValueError, "Can't assign simply a feature to a " + \
98 "class: more nonbogus_features than labels"
99 means = N.zeros((len(nonbogus_features), nfeatures))
100
101 for i in xrange(len(nonbogus_features)):
102 means[i, nonbogus_features[i]] = 1.0
103 if not means is None:
104
105 data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
106
107
108 data = 1.0/(N.max(N.abs(data))) * data
109 labels = N.concatenate([N.repeat('L%d' % i, perlabel)
110 for i in range(nlabels)])
111 chunks = N.concatenate([N.repeat(range(nchunks),
112 perlabel/nchunks) for i in range(nlabels)])
113 ds = Dataset(samples=data, labels=labels, chunks=chunks, labels_map=True)
114 ds.nonbogus_features = nonbogus_features
115 return ds
116
118 """ Create a 2d dataset with a clear multivariate signal, but no
119 univariate information.
120
121 ::
122
123 %%%%%%%%%
124 % O % X %
125 %%%%%%%%%
126 % X % O %
127 %%%%%%%%%
128 """
129
130
131 data = N.random.normal(size=(4*patterns, 2))
132
133
134 data[:2*patterns, 1] += signal2noise
135
136 data[2*patterns:4*patterns, 1] -= signal2noise
137 data[:patterns, 0] -= signal2noise
138 data[2*patterns:3*patterns, 0] -= signal2noise
139 data[patterns:2*patterns, 0] += signal2noise
140 data[3*patterns:4*patterns, 0] += signal2noise
141
142
143 regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))
144
145 return Dataset(samples=data, labels=regs, chunks=chunks)
146
147
148 -def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
149 perlabel=50, activation_probability_steps=1,
150 randomseed=None, randomvoxels=False):
151 """ NOT FINISHED """
152 raise NotImplementedError
153
154 if dataset is None and labels is None:
155 raise ValueError, \
156 "Provide at least labels or a background dataset"
157
158 if dataset is None:
159 nlabels = len(labels)
160 else:
161 nchunks = len(dataset.uniquechunks)
162
163 N.random.seed(randomseed)
164
165
166
167 if randomvoxels:
168 indexes = N.random.permutation(dataset.nfeatures)
169 else:
170 indexes = N.arange(dataset.nfeatures)
171
172 allind, maps = [], []
173 if __debug__:
174 debug('DG', "Ugly creation of the copy of background")
175
176 dtype = dataset.samples.dtype
177 if not N.issubdtype(dtype, N.float):
178 dtype = N.float
179 totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)
180
181 for l in xrange(len(labels)):
182 label = labels[l]
183 if __debug__:
184 debug('DG', "Simulating independent labels for %s" % label)
185
186
187 labelids = dataset.idsbylabels(label)
188
189
190
191 nfeatures = perlabel * activation_probability_steps
192 ind, indexes = indexes[0:nfeatures], \
193 indexes[nfeatures+1:]
194 allind += list(ind)
195
196
197
198 ds = dataset.selectFeatures(ind)
199 ds.samples[:] = 0.0
200
201
202 prob = [1.0 - x*1.0/activation_probability_steps
203 for x in xrange(activation_probability_steps)]
204
205
206 probabilities = N.repeat(prob, perlabel)
207 if __debug__:
208 debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))
209
210 for chunk in ds.uniquechunks:
211 chunkids = ds.idsbychunks(chunk)
212 ids = list(Set(chunkids).intersection(Set(labelids)))
213 chunkvalue = N.random.uniform()
214
215 for id_ in ids:
216 ds.samples[id_, :] = \
217 (chunkvalue <= probabilities).astype('float')
218
219 maps.append(N.array(probabilities, copy=True))
220
221 signal = ds.map2Nifti(ds.samples)
222 totalsignal[:, ind] += ds.samples
223
224
225 wfeatures = dataset.samples[:, allind]
226 meanstd = N.mean(N.std(wfeatures, 1))
227 if __debug__:
228 debug('DG', "Mean deviation is %f" % meanstd)
229
230 totalsignal *= meanstd * options.snr
231
232 dataset.samples += totalsignal
233
234 return dataset
235
240
241
243 """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)
244
245 Was originally created in order to test the correctness of the
246 implementation of kernel ARD. For full details see:
247 http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard
248
249 x_1 picked randomly in [-1.932, -0.453]
250 x_2 picked randomly in [0.534, 3.142]
251 r_1 = 2.0
252 r_2 = 1.3
253 f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
254 etc.
255
256 Expected relevances:
257 ell_1 1.804377
258 ell_2 1.963956
259 ell_3 8.884361
260 ell_4 34.417657
261 ell_5 1081.610451
262 ell_6 375.445823
263 sigma_f 2.379139
264 sigma_n 0.050835
265 """
266 intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
267 r = N.array([2.0, 1.3])
268 x = N.random.rand(size, 2)
269 x *= N.array(intervals[:, 1]-intervals[:, 0])
270 x += N.array(intervals[:, 0])
271 if __debug__:
272 for i in xrange(2):
273 debug('DG', '%d columnt Min: %g Max: %g' %
274 (i, x[:, i].min(), x[:, i].max()))
275 y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \
276 N.random.randn(size)*N.sqrt(0.0025)
277 y -= y.mean()
278 x34 = x + N.random.randn(size, 2)*0.02
279 x56 = N.random.randn(size, 2)
280 x = N.hstack([x, x34, x56])
281 return Dataset(samples=x, labels=y)
282
283
284 -def sinModulated(n_instances, n_features,
285 flat=False, noise=0.4):
286 """ Generate a (quite) complex multidimensional non-linear dataset
287
288 Used for regression testing. In the data label is a sin of a x^2 +
289 uniform noise
290 """
291 if flat:
292 data = (N.arange(0.0, 1.0, 1.0/n_instances)*N.pi)
293 data.resize(n_instances, n_features)
294 else:
295 data = N.random.rand(n_instances, n_features)*N.pi
296 label = N.sin((data**2).sum(1)).round()
297 label += N.random.rand(label.size)*noise
298 return Dataset(samples=data, labels=label)
299
300 -def chirpLinear(n_instances, n_features=4, n_nonbogus_features=2,
301 data_noise=0.4, noise=0.1):
302 """ Generates simple dataset for linear regressions
303
304 Generates chirp signal, populates n_nonbogus_features out of
305 n_features with it with different noise level and then provides
306 signal itself with additional noise as labels
307 """
308 x = N.linspace(0, 1, n_instances)
309 y = N.sin((10 * N.pi * x **2))
310
311 data = N.random.normal(size=(n_instances, n_features ))*data_noise
312 for i in xrange(n_nonbogus_features):
313 data[:, i] += y[:]
314
315 labels = y + N.random.normal(size=(n_instances,))*noise
316
317 return Dataset(samples=data, labels=labels)
318
319
320 -def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
321 """Generate a dataset from a linear function with AWGN
322 (Added White Gaussian Noise).
323
324 It can be multidimensional if 'slope' is a vector. If flat is True
325 (in 1 dimesion) generate equally spaces samples instead of random
326 ones. This is useful for the test phase.
327 """
328 dimensions = 1
329 if isinstance(slope, N.ndarray):
330 dimensions = slope.size
331
332 if flat and dimensions == 1:
333 x = N.linspace(0, 1, size)[:, N.newaxis]
334 else:
335 x = N.random.rand(size, dimensions)
336
337 y = N.dot(x, slope)[:, N.newaxis] \
338 + (N.random.randn(*(x.shape[0], 1)) * noise_std) + intercept
339
340 return Dataset(samples=x, labels=y)
341
342
343 -def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1):
344 """
345 """
346 x = []
347 y = []
348 labels = []
349 for fx in sfx:
350 nx = N.random.normal(size=size_per_fx)
351 ny = fx(nx) + N.random.normal(size=nx.shape, scale=noise_std)
352 x.append(nx)
353 y.append(ny)
354
355
356 labels.append(N.array(ny < dfx(nx), dtype='int'))
357
358 samples = N.array((N.hstack(x), N.hstack(y))).squeeze().T
359 labels = N.hstack(labels).squeeze().T
360
361 samples += N.array(center)
362
363 return Dataset(samples=samples, labels=labels)
364