Source Code for Module mvpa.clfs.distance

#emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Distance functions to be used in kernels and elsewhere
"""

__docformat__ = 'restructuredtext'

import numpy as N
from mvpa.base import externals

if __debug__:
    from mvpa.base import debug, warning

def cartesianDistance(a, b):
    """Return Cartesian distance between a and b
    """
    return N.linalg.norm(a-b)

def absminDistance(a, b):
    """Returns distance max(\|a-b\|)
    XXX There must be a better name!

    Useful to select a whole cube of a given "radius"
    """
    return max(abs(a-b))

def manhattenDistance(a, b):
    """Return Manhattan distance between a and b
    """
    return sum(abs(a-b))

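A minimal usage sketch of the three element-wise distances above, on made-up example vectors (expected values are given only as comments):

    >>> a = N.array([1., 2., 3.])
    >>> b = N.array([4., 0., 3.])
    >>> cartesianDistance(a, b)   # N.sqrt(3**2 + 2**2 + 0**2) ~= 3.61
    >>> absminDistance(a, b)      # 3.0 -- b lies within a "cube" of radius 3 around a
    >>> manhattenDistance(a, b)   # 3 + 2 + 0 == 5.0
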
def mahalanobisDistance(x, y=None, w=None):
    """Calculate Mahalanobis distance of the pairs of points.

    :Parameters:
      `x`
        first list of points. Rows are samples, columns are
        features.
      `y`
        second list of points (optional)
      `w` : N.ndarray
        optional inverse covariance matrix between the points. It is
        computed if not given

    Inverse covariance matrix can be calculated with the following

      w = N.linalg.solve(N.cov(x.T), N.identity(x.shape[1]))

    or

      w = N.linalg.inv(N.cov(x.T))
    """
    # see if pairwise between two matrices or just within a single matrix
    if y is None:
        # pairwise distances within a single matrix
        # calculate the inverse covariance matrix if necessary
        if w is None:
            w = N.linalg.inv(N.cov(x.T))

        # get some shapes of the data
        mx, nx = x.shape
        #mw, nw = w.shape

        # allocate for the matrix to fill
        d = N.zeros((mx, mx), dtype=N.float32)
        for i in range(mx-1):
            # get the current row to compare
            xi = x[i, :]
            # replicate the row
            xi = xi[N.newaxis, :].repeat(mx-i-1, axis=0)
            # take the difference to all remaining rows
            dc = x[i+1:mx, :] - xi
            # scale the difference by the inverse covariance
            d[i+1:mx, i] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
            # fill the other direction of the matrix
            d[i, i+1:mx] = d[i+1:mx, i].T
    else:
        # distances between two matrices
        # calculate the inverse covariance matrix if necessary
        if w is None:
            # calculate over all points
            w = N.linalg.inv(N.cov(N.concatenate((x, y)).T))

        # get some shapes of the data
        mx, nx = x.shape
        my, ny = y.shape

        # allocate for the matrix to fill
        d = N.zeros((mx, my), dtype=N.float32)

        # loop over shorter of two dimensions
        if mx <= my:
            # loop over the x patterns
            for i in range(mx):
                # get the current row to compare
                xi = x[i, :]
                # replicate the row
                xi = xi[N.newaxis, :].repeat(my, axis=0)
                # take the difference to all rows of y
                dc = xi - y
                # scale the difference by the inverse covariance
                d[i, :] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))
        else:
            # loop over the y patterns
            for j in range(my):
                # get the current row to compare
                yj = y[j, :]
                # replicate the row
                yj = yj[N.newaxis, :].repeat(mx, axis=0)
                # take the difference to all rows of x
                dc = x - yj
                # scale the difference by the inverse covariance
                d[:, j] = N.real(N.sum((N.inner(dc, w) * N.conj(dc)), 1))

    # return the dist
    return N.sqrt(d)

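A minimal usage sketch for mahalanobisDistance on made-up random data; the inverse covariance w is estimated from the data unless it is passed in explicitly:

    >>> x = N.random.normal(size=(20, 3))       # 20 samples, 3 features
    >>> d = mahalanobisDistance(x)              # (20, 20) pairwise distances within x
    >>> w = N.linalg.inv(N.cov(x.T))            # precompute the inverse covariance once ...
    >>> y = N.random.normal(size=(5, 3))
    >>> d_xy = mahalanobisDistance(x, y, w=w)   # ... and reuse it: (20, 5) distances between x and y
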
def squared_euclidean_distance(data1, data2=None, weight=None):
    """Compute weighted squared euclidean distance matrix between two datasets.

    :Parameters:
      data1 : N.ndarray
          first dataset
      data2 : N.ndarray
          second dataset. If None, compute the squared euclidean distance
          between the first dataset and itself.
          (Defaults to None)
      weight : N.ndarray
          vector of weights, each one associated to each dimension of the
          dataset (Defaults to None)
    """
    if __debug__:
        # check if both datasets are floating point
        if not N.issubdtype(data1.dtype, 'f') \
           or (data2 is not None and not N.issubdtype(data2.dtype, 'f')):
            warning('Computing euclidean distance on integer data ' \
                    'is not supported.')

    # removed for efficiency (see below)
    #if weight is None:
    #    weight = N.ones(data1.shape[1], 'd') # unitary weight

    # In the following you can find faster implementations of this
    # basic code:
    #
    # squared_euclidean_distance_matrix = \
    #     N.zeros((data1.shape[0], data2.shape[0]), 'd')
    # for i in range(size1):
    #     for j in range(size2):
    #         squared_euclidean_distance_matrix[i, j] = \
    #             ((data1[i, :]-data2[j, :])**2*weight).sum()

    # Fast computation of distance matrix in Python+NumPy,
    # adapted from Bill Baxter's post on [numpy-discussion].
    # Basically: (x-y)**2*w = x*w*x - 2*x*w*y + y*y*w

    # based on value of weight and data2 we might save on computation
    # and resources
    if weight is None:
        data1w = data1
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2
    else:
        data1w = data1 * weight
        if data2 is None:
            data2, data2w = data1, data1w
        else:
            data2w = data2 * weight

    squared_euclidean_distance_matrix = \
        (data1w * data1).sum(1)[:, None] \
        - 2 * N.dot(data1w, data2.T) \
        + (data2 * data2w).sum(1)

    # correction to some possible numerical instabilities:
    less0 = squared_euclidean_distance_matrix < 0
    if __debug__ and 'CHECK_STABILITY' in debug.active:
        less0num = N.sum(less0)
        if less0num > 0:
            norm0 = N.linalg.norm(squared_euclidean_distance_matrix[less0])
            totalnorm = N.linalg.norm(squared_euclidean_distance_matrix)
            if totalnorm != 0 and norm0 / totalnorm > 1e-8:
                warning("Found %d elements out of %d unstable (<0) in " \
                        "computation of squared_euclidean_distance_matrix. " \
                        "Their norm is %s when total norm is %s" % \
                        (less0num, less0.size, norm0, totalnorm))
    squared_euclidean_distance_matrix[less0] = 0
    return squared_euclidean_distance_matrix

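The vectorized expansion above is equivalent to the per-pair definition sum(weight*(x-y)**2); a small sketch with made-up data checks one entry against the naive formula:

    >>> data1 = N.random.rand(4, 3)
    >>> data2 = N.random.rand(5, 3)
    >>> weight = N.array([1., .5, 2.])
    >>> d2 = squared_euclidean_distance(data1, data2, weight)   # shape (4, 5)
    >>> naive = ((data1[0] - data2[0])**2 * weight).sum()
    >>> N.allclose(d2[0, 0], naive)                             # should be True up to numerical precision
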
def pnorm_w_python(data1, data2=None, weight=None, p=2,
                   heuristic='auto', use_sq_euclidean=True):
    """Weighted p-norm between two datasets (pure Python implementation)

    ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

    :Parameters:
      data1 : N.ndarray
        First dataset
      data2 : N.ndarray or None
        Optional second dataset
      weight : N.ndarray or None
        Optional weights per 2nd dimension (features)
      p
        Power
      heuristic : basestring
        Which heuristic to use:
         * 'samples' -- python sweep over 0th dim
         * 'features' -- python sweep over 1st dim
         * 'auto' decides automatically. If # of features (shape[1]) is much
           larger than # of samples (shape[0]) -- use 'samples', and use
           'features' otherwise
      use_sq_euclidean : bool
        Whether to use squared_euclidean_distance for computation if p==2
    """
    if weight is None:
        weight = N.ones(data1.shape[1], 'd')

    if p == 2 and use_sq_euclidean:
        return N.sqrt(squared_euclidean_distance(data1=data1, data2=data2,
                                                 weight=weight**2))

    if data2 is None:
        data2 = data1

    S1, F1 = data1.shape[:2]
    S2, F2 = data2.shape[:2]
    # sanity check
    if not (F1 == F2 == weight.size):
        raise ValueError("Datasets should have same #columns == #weights. "
                         "Got %d %d %d" % (F1, F2, weight.size))
    d = N.zeros((S1, S2), 'd')

    # Adjust local functions for specific p values
    # pf - power function
    # af - after function
    if p == 1:
        pf = lambda x: x
        af = lambda x: x
    else:
        pf = lambda x: x ** p
        af = lambda x: x ** (1.0/p)

    # heuristic 'auto' might need to be adjusted
    if heuristic == 'auto':
        heuristic = {False: 'samples',
                     True: 'features'}[(F1/S1) < 500]

    if heuristic == 'features':
        # Efficient implementation if the number of features is small
        for NF in range(F1):
            d += pf(N.abs(N.subtract.outer(data1[:, NF],
                                           data2[:, NF]))*weight[NF])
    elif heuristic == 'samples':
        # Efficient implementation if the number of features is much larger
        # than the number of samples
        for NS in xrange(S1):
            dfw = pf(N.abs(data1[NS] - data2) * weight)
            d[NS] = N.sum(dfw, axis=1)
    else:
        raise ValueError("Unknown heuristic '%s'. Need one of "
                         "'auto', 'samples', 'features'" % heuristic)
    return af(d)

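A minimal sketch (made-up random data) of how the code paths above relate: for p=2 with use_sq_euclidean=True the function simply takes the square root of squared_euclidean_distance, while the explicit 'features'/'samples' sweeps handle the general case:

    >>> data1 = N.random.rand(10, 4)
    >>> data2 = N.random.rand(6, 4)
    >>> d_l1 = pnorm_w_python(data1, data2, p=1)    # weighted L1 (here unit weights), shape (10, 6)
    >>> d_a = pnorm_w_python(data1, data2, p=2)     # fast path via squared_euclidean_distance
    >>> d_b = pnorm_w_python(data1, data2, p=2,
    ...                      use_sq_euclidean=False, heuristic='samples')
    >>> N.allclose(d_a, d_b)                        # both p=2 paths should agree up to precision
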

if externals.exists('weave'):
    from scipy import weave
    from scipy.weave import converters

    def pnorm_w(data1, data2=None, weight=None, p=2):
        """Weighted p-norm between two datasets (scipy.weave implementation)

        ||x - x'||_w = (\sum_{i=1...N} (w_i*|x_i - x'_i|)**p)**(1/p)

        :Parameters:
          data1 : N.ndarray
            First dataset
          data2 : N.ndarray or None
            Optional second dataset
          weight : N.ndarray or None
            Optional weights per 2nd dimension (features)
          p
            Power
        """

        if weight is None:
            weight = N.ones(data1.shape[1], 'd')
        S1, F1 = data1.shape[:2]
        code = ""
        if data2 is None or id(data1) == id(data2):
            if not (F1 == weight.size):
                raise ValueError("Dataset should have same #columns == "
                                 "#weights. Got %d %d" % (F1, weight.size))
            F = F1
            d = N.zeros((S1, S1), 'd')
            try:
                code_peritem = \
                    {1.0: "tmp = tmp+weight(t)*fabs(data1(i,t)-data1(j,t))",
                     2.0: "tmp2 = weight(t)*(data1(i,t)-data1(j,t));"
                          " tmp = tmp + tmp2*tmp2"}[p]
            except KeyError:
                code_peritem = \
                    "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data1(j,t)),p)"

            code = """
            int i,j,t;
            double tmp, tmp2;
            for (i=0; i<S1-1; i++) {
                for (j=i+1; j<S1; j++) {
                    tmp = 0.0;
                    for(t=0; t<F; t++) {
                        %s;
                    }
                    d(i,j) = tmp;
                }
            }
            return_val = 0;
            """ % code_peritem

            counter = weave.inline(code,
                                   ['data1', 'S1', 'F', 'weight', 'd', 'p'],
                                   type_converters=converters.blitz,
                                   compiler='gcc')
            d = d + N.triu(d).T # copy upper part to lower part
            return d**(1.0/p)

        S2, F2 = data2.shape[:2]
        if not (F1 == F2 == weight.size):
            raise ValueError("Datasets should have same #columns == "
                             "#weights. Got %d %d %d" % (F1, F2, weight.size))
        F = F1
        d = N.zeros((S1, S2), 'd')
        try:
            code_peritem = \
                {1.0: "tmp = tmp+weight(t)*fabs(data1(i,t)-data2(j,t))",
                 2.0: "tmp2 = weight(t)*(data1(i,t)-data2(j,t));"
                      " tmp = tmp + tmp2*tmp2"}[p]
        except KeyError:
            code_peritem = \
                "tmp = tmp+pow(weight(t)*fabs(data1(i,t)-data2(j,t)),p)"

        code = """
        int i,j,t;
        double tmp, tmp2;
        for (i=0; i<S1; i++) {
            for (j=0; j<S2; j++) {
                tmp = 0.0;
                for(t=0; t<F; t++) {
                    %s;
                }
                d(i,j) = tmp;
            }
        }
        return_val = 0;
        """ % code_peritem

        counter = weave.inline(code,
                               ['data1', 'data2', 'S1', 'S2',
                                'F', 'weight', 'd', 'p'],
                               type_converters=converters.blitz,
                               compiler='gcc')
        return d**(1.0/p)

else:
    # Bind pure python implementation
    pnorm_w = pnorm_w_python
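Client code can simply call pnorm_w and gets whichever implementation was bound at import time; a minimal sketch with made-up data:

    >>> import numpy as N
    >>> from mvpa.clfs.distance import pnorm_w, pnorm_w_python
    >>> data = N.random.rand(8, 5)
    >>> d = pnorm_w(data, p=2)              # weave-compiled version if scipy.weave is available
    >>> d_py = pnorm_w_python(data, p=2)    # pure-Python implementation, always available
    >>> N.allclose(d, d_py)                 # both should agree up to numerical precision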