1
2
3
4
5
6
7
8
9 """Distance functions to be used in kernels and elsewhere
10 """
11
12 __docformat__ = 'restructuredtext'
13
14 import numpy as N
15 from mvpa.base import externals
16
17 if __debug__:
18 from mvpa.base import debug, warning
19
20
def cartesianDistance(a, b):
    """Return Cartesian (Euclidean) distance between points `a` and `b`.

    :Parameters:
      `a` : N.ndarray
        first point (vector of coordinates)
      `b` : N.ndarray
        second point, same shape as `a`
    """
    return N.linalg.norm(a - b)
25
26
def absminDistance(a, b):
    """Return distance max(|a - b|) (Chebyshev / L-infinity norm).

    XXX There must be better name!

    Useful to select a whole cube of a given "radius".

    :Parameters:
      `a` : N.ndarray
        first point
      `b` : N.ndarray
        second point, same shape as `a`
    """
    return max(abs(a - b))
34
35
def manhattenDistance(a, b):
    """Return Manhattan (city-block, L1) distance between `a` and `b`.

    :Parameters:
      `a` : N.ndarray
        first point
      `b` : N.ndarray
        second point, same shape as `a`
    """
    return sum(abs(a - b))
40
41
def mahalanobisDistance(x, y=None, w=None):
    """Calculate Mahalanobis distance of the pairs of points.

    :Parameters:
      `x`
        first list of points. Rows are samples, columns are
        features.
      `y`
        second list of points (optional)
      `w` : N.ndarray
        optional inverse covariance matrix between the points. It is
        computed if not given

    Inverse covariance matrix can be calculated with the following

      w = N.linalg.solve(N.cov(x.T), N.identity(x.shape[1]))

    or

      w = N.linalg.inv(N.cov(x.T))
    """
    if y is None:
        # symmetric case: pairwise distances within a single set of points
        if w is None:
            w = N.linalg.inv(N.cov(x.T))

        mx, nx = x.shape

        d = N.zeros((mx, mx), dtype=N.float32)
        for i in range(mx-1):
            # difference of sample i to every following sample
            xi = x[i, :]
            xi = xi[N.newaxis, :].repeat(mx-i-1, axis=0)
            dc = x[i+1:mx, :] - xi
            # squared Mahalanobis distance dc * w * dc^T, one value per row
            d[i+1:mx, i] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
            # mirror into the upper triangle -- the matrix is symmetric
            d[i, i+1:mx] = d[i+1:mx, i].T
    else:
        # asymmetric case: distances between points of `x` and points of `y`
        if w is None:
            # estimate covariance on the pooled samples of both sets
            w = N.linalg.inv(N.cov(N.concatenate((x, y)).T))

        mx, nx = x.shape
        my, ny = y.shape

        d = N.zeros((mx, my), dtype=N.float32)

        # sweep over the smaller of the two sets to minimize Python loop count
        if mx <= my:
            for i in range(mx):
                xi = x[i, :]
                xi = xi[N.newaxis, :].repeat(my, axis=0)
                dc = xi - y
                d[i, :] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
        else:
            for j in range(my):
                yj = y[j, :]
                yj = yj[N.newaxis, :].repeat(mx, axis=0)
                dc = x - yj
                d[:, j] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))

    # `d` holds squared distances up to this point
    return N.sqrt(d)
127
128
def squared_euclidean_distance(data1, data2=None, weight=None):
    """Compute weighted squared euclidean distance matrix between two datasets.

    :Parameters:
      data1 : N.ndarray
        first dataset
      data2 : N.ndarray
        second dataset. If None, compute the euclidean distance between
        the first dataset versus itself.
        (Defaults to None)
      weight : N.ndarray
        vector of weights, each one associated to each dimension of the
        dataset (Defaults to None)
    """
    if __debug__:
        # integer arithmetic would silently lose precision in the dot products
        # below, hence warn.  N.floating replaces the string form 'f', which
        # modern numpy no longer accepts and which never matched float64
        if not N.issubdtype(data1.dtype, N.floating) \
               or (data2 is not None and not N.issubdtype(data2.dtype, N.floating)):
            warning('Computing euclidean distance on integer data ' \
                    'is not supported.')

    # pre-apply the weights once, so the expansion below stays a plain
    # matrix product
    if weight is None:
        data1w = data1
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2
    else:
        data1w = data1 * weight
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2 * weight

    # ||a - b||^2 = ||a||^2 - 2*a.b + ||b||^2, weighted; broadcasting over
    # the [:, None] column vector yields the full pairwise matrix
    squared_euclidean_distance_matrix = \
        (data1w * data1).sum(1)[:, None] \
        -2 * N.dot(data1w, data2.T) \
        + (data2 * data2w).sum(1)

    # floating point cancellation can yield tiny negative values -- clip them
    less0 = squared_euclidean_distance_matrix < 0
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        less0num = N.sum(less0)
        if less0num > 0:
            norm0 = N.linalg.norm(squared_euclidean_distance_matrix[less0])
            totalnorm = N.linalg.norm(squared_euclidean_distance_matrix)
            if totalnorm != 0 and norm0 / totalnorm > 1e-8:
                warning("Found %d elements out of %d unstable (<0) in " \
                        "computation of squared_euclidean_distance_matrix. " \
                        "Their norm is %s when total norm is %s" % \
                        (less0num, N.sum(less0.shape), norm0, totalnorm))
    squared_euclidean_distance_matrix[less0] = 0
    return squared_euclidean_distance_matrix
205
206
def pnorm_w_python(data1, data2=None, weight=None, p=2,
                   heuristic='auto', use_sq_euclidean=True):
    """Weighted p-norm between two datasets (pure Python implementation)

    ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

    :Parameters:
      data1 : N.ndarray
        First dataset
      data2 : N.ndarray or None
        Optional second dataset
      weight : N.ndarray or None
        Optional weights per 2nd dimension (features)
      p
        Power
      heuristic : basestring
        Which heuristic to use:
         * 'samples' -- python sweep over 0th dim
         * 'features' -- python sweep over 1st dim
         * 'auto' decides automatically. If # of features (shape[1]) is much
           larger than # of samples (shape[0]) -- use 'samples', and use
           'features' otherwise
      use_sq_euclidean : bool
        Either to use squared_euclidean_distance_matrix for computation if p==2
    """
    # NB: `is None`, not `== None` -- the latter is an elementwise
    #     comparison on ndarrays
    if weight is None:
        weight = N.ones(data1.shape[1], 'd')

    if p == 2 and use_sq_euclidean:
        # delegate to the fast vectorized path; weights enter squared there
        return N.sqrt(squared_euclidean_distance(data1=data1, data2=data2,
                                                 weight=weight**2))

    if data2 is None:
        data2 = data1

    S1, F1 = data1.shape[:2]
    S2, F2 = data2.shape[:2]

    if not (F1 == F2 == weight.size):
        raise ValueError(
            "Datasets should have same #columns == #weights. Got "
            "%d %d %d" % (F1, F2, weight.size))
    d = N.zeros((S1, S2), 'd')

    # power/anti-power functions; identity for p==1 avoids useless ** calls
    if p == 1:
        pf = lambda x: x
        af = lambda x: x
    else:
        pf = lambda x: x ** p
        af = lambda x: x ** (1.0/p)

    if heuristic == 'auto':
        # wide data (many features per sample) -> sweep over samples;
        # // keeps the original integer-division semantics
        heuristic = {False: 'samples',
                     True: 'features'}[(F1 // S1) < 500]

    if heuristic == 'features':
        # accumulate the contribution of one feature at a time
        for NF in range(F1):
            d += pf(N.abs(N.subtract.outer(data1[:, NF],
                                           data2[:, NF]))*weight[NF])
    elif heuristic == 'samples':
        # one row of the distance matrix per iteration
        for NS in range(S1):
            dfw = pf(N.abs(data1[NS] - data2) * weight)
            d[NS] = N.sum(dfw, axis=1)
    else:
        raise ValueError("Unknown heuristic '%s'. Need one of "
                         "'auto', 'samples', 'features'" % heuristic)
    return af(d)
285
286
if externals.exists('weave'):
    from scipy import weave
    from scipy.weave import converters

    def pnorm_w(data1, data2=None, weight=None, p=2):
        """Weighted p-norm between two datasets (scipy.weave implementation)

        ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

        :Parameters:
          data1 : N.ndarray
            First dataset
          data2 : N.ndarray or None
            Optional second dataset
          weight : N.ndarray or None
            Optional weights per 2nd dimension (features)
          p
            Power
        """
        # NB: `is None`, not `== None` -- the latter is elementwise on ndarrays
        if weight is None:
            weight = N.ones(data1.shape[1], 'd')
        S1, F1 = data1.shape[:2]
        code = ""
        if data2 is None or id(data1) == id(data2):
            # symmetric case -- compute only the upper triangle in C
            if not (F1 == weight.size):
                raise ValueError(
                    "Dataset should have same #columns == #weights. Got "
                    "%d %d" % (F1, weight.size))
            F = F1
            d = N.zeros((S1, S1), 'd')
            try:
                # specialized inner statements for the common powers 1 and 2
                code_peritem = \
                    {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data1(j,t))",
                     2.0 : "tmp2 = weight(t)*(data1(i,t)-data1(j,t));" \
                           " tmp = tmp + tmp2*tmp2"}[p]
            except KeyError:
                # generic power falls back to pow()
                code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data1(j,t)),p)"

            code = """
            int i,j,t;
            double tmp, tmp2;
            for (i=0; i<S1-1; i++) {
                for (j=i+1; j<S1; j++) {
                    tmp = 0.0;
                    for(t=0; t<F; t++) {
                        %s;
                        }
                    d(i,j) = tmp;
                    }
                }
            return_val = 0;
            """ % code_peritem

            counter = weave.inline(code,
                                   ['data1', 'S1', 'F', 'weight', 'd', 'p'],
                                   type_converters=converters.blitz,
                                   compiler = 'gcc')
            # mirror the computed upper triangle into the lower one
            d = d + N.triu(d).T
            return d**(1.0/p)

        # asymmetric case -- full S1 x S2 matrix
        S2, F2 = data2.shape[:2]
        if not (F1 == F2 == weight.size):
            raise ValueError(
                "Datasets should have same #columns == #weights. Got "
                "%d %d %d" % (F1, F2, weight.size))
        F = F1
        d = N.zeros((S1, S2), 'd')
        try:
            code_peritem = \
                {1.0 : "tmp = tmp+weight(t)*fabs(data1(i,t)-data2(j,t))",
                 2.0 : "tmp2 = weight(t)*(data1(i,t)-data2(j,t));" \
                       " tmp = tmp + tmp2*tmp2"}[p]
        except KeyError:
            code_peritem = "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data2(j,t)),p)"

        code = """
        int i,j,t;
        double tmp, tmp2;
        for (i=0; i<S1; i++) {
            for (j=0; j<S2; j++) {
                tmp = 0.0;
                for(t=0; t<F; t++) {
                    %s;
                    }
                d(i,j) = tmp;
                }
            }
        return_val = 0;

        """ % code_peritem

        counter = weave.inline(code,
                               ['data1', 'data2', 'S1', 'S2',
                                'F', 'weight', 'd', 'p'],
                               type_converters=converters.blitz,
                               compiler = 'gcc')
        return d**(1.0/p)

else:
    # scipy.weave is not available -- fall back to the pure Python version
    pnorm_w = pnorm_w_python
393