# NOTE(review): the original top-of-file header (license / editor boilerplate)
# was reduced to bare line numbers by extraction garbling here -- restore it
# from the upstream repository before committing.
"""Miscellaneous data generators for unittests and demos"""

__docformat__ = 'restructuredtext'

import numpy as N

from sets import Set

from mvpa.datasets import Dataset

if __debug__:
    from mvpa.misc import debug
def dumbFeatureDataset():
    """Create a very simple dataset with 2 features and 3 labels.

    :Returns:
      `Dataset` with 24 samples, labels 1/2/3 (8 samples each).

    NOTE(review): the original ``def`` line was lost to extraction
    garbling; name reconstructed -- confirm against upstream.
    """
    # 24 samples: feature 0 runs 1..12, feature 1 alternates 0/1
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    # three labels, 8 consecutive samples per label
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return Dataset(samples=data, labels=regs)
32
def dumbFeatureBinaryDataset():
    """Very simple binary (2 labels) dataset.

    :Returns:
      `Dataset` with 24 samples, labels 0/1 (12 samples each).

    NOTE(review): the original ``def`` line was lost to extraction
    garbling; name reconstructed -- confirm against upstream.
    """
    # same samples as dumbFeatureDataset: feature 0 runs 1..12,
    # feature 1 alternates 0/1
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    # binary labels: first half 0, second half 1
    regs = ([0] * 12) + ([1] * 12)

    return Dataset(samples=data, labels=regs)
44
45
46
def normalFeatureDataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                         means=None, nonbogus_features=None, snr=1.0):
    """Generate a dataset where each label is some normally
    distributed beastie around specified mean (0 if None).

    snr is assuming that signal has std 1.0 so we just divide noise by snr

    Probably it is a generalization of pureMultivariateSignal where
    means=[ [0,1], [1,0] ]

    Specify either means or nonbogus_features so means get assigned
    accordingly.

    :Parameters:
      perlabel : int
        Number of samples per label.
      nlabels : int
        Number of distinct labels.
      nfeatures : int
        Total number of features per sample.
      nchunks : int
        Number of chunks; perlabel should be divisible by nchunks.
      means : array or None
        Per-label means, shape (nlabels, nfeatures).
      nonbogus_features : sequence or None
        For each label, the index of its single informative feature.
      snr : float
        Signal-to-noise ratio (signal std assumed 1.0).
    """
    # noise with std 1/sqrt(snr) so a unit-variance signal yields the
    # requested SNR
    data = N.random.standard_normal((perlabel*nlabels, nfeatures))/N.sqrt(snr)
    if (means is None) and (nonbogus_features is not None):
        if len(nonbogus_features) > nlabels:
            raise ValueError("Can't assign simply a feature to a "
                             "class: more nonbogus_features than labels")
        # BUG FIX: allocate one row per *label* (was one per nonbogus
        # feature), so that N.repeat below yields perlabel*nlabels rows
        # matching `data` even when len(nonbogus_features) < nlabels
        means = N.zeros((nlabels, nfeatures))
        # assign each listed label a unit mean on its designated feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if means is not None:
        # add each label's mean to its block of perlabel samples
        data += N.repeat(N.array(means, ndmin=2), perlabel, axis=0)
    # bring the data 'under 1' -- some classifiers otherwise struggle
    data = 1.0/(N.max(N.abs(data))) * data
    labels = N.concatenate([N.repeat(i, perlabel) for i in range(nlabels)])
    # perlabel/nchunks samples of each label per chunk (integer division)
    chunks = N.concatenate([N.repeat(range(nchunks),
                            perlabel/nchunks) for i in range(nlabels)])
    ds = Dataset(samples=data, labels=labels, chunks=chunks)
    # remember which features carry signal, for consumers/tests
    ds.nonbogus_features = nonbogus_features
    return ds
def pureMultivariateSignal(patterns, signal2noise=1.5, chunks=None):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    %%%%%%%%%
    % O % X %
    %%%%%%%%%
    % X % O %
    %%%%%%%%%

    :Parameters:
      patterns : int
        Number of samples per quadrant (4*patterns samples total).
      signal2noise : float
        Offset applied to the unit-variance noise.
      chunks
        Passed through to the resulting `Dataset`.

    NOTE(review): the original ``def`` line was lost to extraction
    garbling; signature reconstructed from the positional callers
    below -- confirm defaults against upstream.
    """
    # unit-variance Gaussian noise for the four quadrants
    data = N.random.normal(size=(4*patterns, 2))

    # feature 1: first half shifted up, second half shifted down
    data[:2*patterns, 1] += signal2noise
    data[2*patterns:4*patterns, 1] -= signal2noise
    # feature 0 alternates sign within each half -> XOR-like layout,
    # so neither feature alone separates the labels
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # labels cover diagonal quadrant pairs: 0 X 1 1 X 0
    regs = N.array(([0] * patterns) + ([1] * 2 * patterns) + ([0] * patterns))

    return Dataset(samples=data, labels=regs, chunks=chunks)
110
111
def normalFeatureDataset__(dataset=None, labels=None, nchunks=None,
                           perlabel=50, activation_probability_steps=1,
                           randomseed=None, randomvoxels=False):
    """ NOT FINISHED

    Draft generator that overlays simulated label-specific signal on a
    background dataset.  Raises `NotImplementedError` unconditionally;
    everything below the raise is unreachable draft code kept for
    reference.
    """
    raise NotImplementedError

    if dataset is None and labels is None:
        raise ValueError(
            "Provide at least labels or a background dataset")

    if dataset is None:
        nlabels = len(labels)
    else:
        nchunks = len(dataset.uniquechunks)

    N.random.seed(randomseed)

    # sequence of feature indexes from which per-label features are drawn
    if randomvoxels:
        indexes = N.random.permutation(dataset.nfeatures)
    else:
        indexes = N.arange(dataset.nfeatures)

    allind, maps = [], []
    if __debug__:
        debug('DG', "Ugly creation of the copy of background")

    # accumulate signal in float even if the background is integer
    dtype = dataset.samples.dtype
    if not N.issubdtype(dtype, N.float):
        dtype = N.float
    totalsignal = N.zeros(dataset.samples.shape, dtype=dtype)

    for l in range(len(labels)):
        label = labels[l]
        if __debug__:
            debug('DG', "Simulating independent labels for %s" % label)

        # sample ids belonging to this label
        labelids = dataset.idsbylabels(label)

        # take this label's slice of feature indexes off the front
        nfeatures = perlabel * activation_probability_steps
        # BUG FIX: slice previously started at nfeatures+1, silently
        # dropping one feature index per label
        ind, indexes = indexes[0:nfeatures], indexes[nfeatures:]
        allind += list(ind)

        # working copy restricted to the chosen features, zeroed out
        ds = dataset.selectFeatures(ind)
        ds.samples[:] = 0.0

        # tiered activation probabilities: 1.0, 1-1/steps, 1-2/steps, ...
        prob = [1.0 - x*1.0/activation_probability_steps
                for x in range(activation_probability_steps)]

        # one probability per feature: each tier repeated perlabel times
        probabilities = N.repeat(prob, perlabel)
        if __debug__:
            debug('DG', 'For prob=%s probabilities=%s' % (prob, probabilities))

        for chunk in ds.uniquechunks:
            chunkids = ds.idsbychunks(chunk)
            # this label's samples within this chunk
            ids = list(set(chunkids).intersection(set(labelids)))
            # one uniform draw per chunk decides which tiers activate
            chunkvalue = N.random.uniform()

            for id_ in ids:
                ds.samples[id_, :] = \
                    (chunkvalue <= probabilities).astype('float')

        maps.append(N.array(probabilities, copy=True))

        # XXX unused result -- draft leftover; also couples to NIfTI mapping
        signal = ds.map2Nifti(ds.samples)
        totalsignal[:, ind] += ds.samples

    # scale total signal relative to the background's mean std
    wfeatures = dataset.samples[:, allind]
    meanstd = N.mean(N.std(wfeatures, 1))
    if __debug__:
        debug('DG', "Mean deviation is %f" % meanstd)

    # XXX `options` is not defined in this module -- would raise NameError
    # if this code were ever reached; needs an snr parameter instead
    totalsignal *= meanstd * options.snr

    dataset.samples += totalsignal

    return dataset
def getMVPattern(s2n):
    """Simple multivariate dataset.

    Six runs of `pureMultivariateSignal` (5 patterns each, chunk ids
    1..6) combined into a single dataset via dataset addition.

    :Parameters:
      s2n : float
        Signal-to-noise offset forwarded to `pureMultivariateSignal`.

    NOTE(review): the original ``def`` line was lost to extraction
    garbling; name and signature reconstructed -- confirm upstream.
    """
    run1 = pureMultivariateSignal(5, s2n, 1)
    run2 = pureMultivariateSignal(5, s2n, 2)
    run3 = pureMultivariateSignal(5, s2n, 3)
    run4 = pureMultivariateSignal(5, s2n, 4)
    run5 = pureMultivariateSignal(5, s2n, 5)
    run6 = pureMultivariateSignal(5, s2n, 6)

    # Dataset supports '+' to concatenate runs into one dataset
    data = run1 + run2 + run3 + run4 + run5 + run6

    return data
213
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD. For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835

    NOTE(review): the original ``def`` line was lost to extraction
    garbling; name and signature reconstructed -- confirm upstream.
    """
    # sampling intervals for the two informative inputs x_1, x_2
    intervals = N.array([[-1.932, -0.453], [0.534, 3.142]])
    r = N.array([2.0, 1.3])
    # uniform samples rescaled/shifted into the per-column intervals
    x = N.random.rand(size, 2)
    x *= N.array(intervals[:, 1]-intervals[:, 0])
    x += N.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d column Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # NOTE(review): the docstring states r_1*cos(x_1) + r_2*cos(x_1+x_2),
    # but this computes r_1*cos(x_1 + r_2*cos(x_1+x_2)); kept as-is --
    # confirm against the reference implementation before changing.
    y = r[0]*N.cos(x[:, 0] + r[1]*N.cos(x.sum(1))) + \
        N.random.randn(size)*N.sqrt(0.0025)
    y -= y.mean()
    # x_3, x_4: noisy copies of x_1, x_2; x_5, x_6: pure noise features
    x34 = x + N.random.randn(size, 2)*0.02
    x56 = N.random.randn(size, 2)
    x = N.hstack([x, x34, x56])
    return Dataset(samples=x, labels=y)
255