00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include "kernel/LocalAlignmentStringKernel.h"
00028
00029
00030
00031
00032
00033 #define NAA 20
00034 #define NLET 26
00035 static const char *aaList= "ARNDCQEGHILKMFPSTWYV";
00036
00037
00038
00039
00040
00041 #define OPENING 12
00042 #define EXTENSION 2
00043
00044
00045 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00046 6,
00047 -2, 8,
00048 -2, -1, 9,
00049 -3, -2, 2, 9,
00050 -1, -5, -4, -5, 13,
00051 -1, 1, 0, 0, -4, 8,
00052 -1, 0, 0, 2, -5, 3, 7,
00053 0, -3, -1, -2, -4, -3, -3, 8,
00054 -2, 0, 1, -2, -4, 1, 0, -3, 11,
00055 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
00056 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
00057 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
00058 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
00059 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
00060 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
00061 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
00062 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
00063 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
00064 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
00065 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
00066
00067
00068 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00069
00070
00071
00072
00073
00074 #define SCALING 0.1
00075
00076
00077
00078
00079
00080
00081
00082 #define LOGP(x,y) LogSum(x,y)
00083
00084
00085
00086
00087
00088
00089
00090 #define LOG0 -10000
00091 #define INTSCALE 1000.0
00092
00093 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00094
00095 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00096 : CStringKernel<char>(size), initialized(false)
00097 {
00098 scaled_blosum=new int32_t[sizeof(blosum)];
00099 init_logsum();
00100 initialize();
00101 }
00102
00103 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00104 CStringFeatures<char>* l, CStringFeatures<char>* r)
00105 : CStringKernel<char>(10), initialized(false)
00106 {
00107 scaled_blosum=new int32_t[sizeof(blosum)];
00108 init_logsum();
00109 initialize();
00110 init(l, r);
00111 }
00112
00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00114 {
00115 cleanup();
00116 }
00117
00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00119 {
00120 CStringKernel<char>::init(l, r);
00121 initialized = true;
00122 return init_normalizer();
00123 }
00124
00125 void CLocalAlignmentStringKernel::cleanup()
00126 {
00127 delete[] scaled_blosum;
00128 scaled_blosum=NULL;
00129
00130 free(isAA);
00131 isAA=NULL;
00132 free(aaIndex);
00133 aaIndex=NULL;
00134
00135 CKernel::cleanup();
00136 }
00137
00138
00139
00140
00141 void CLocalAlignmentStringKernel::init_logsum(void){
00142 int32_t i;
00143 for (i = 0; i < LOGSUM_TBL; i++)
00144 logsum_lookup[i] = (int32_t) (INTSCALE*
00145 (log(1.+exp( (float32_t) -i/INTSCALE))));
00146 }
00147
00148 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00149 {
00150 int32_t diff;
00151 static int32_t firsttime=1;
00152
00153 if (firsttime)
00154 {
00155 init_logsum();
00156 firsttime =0;
00157 }
00158
00159 diff=p1-p2;
00160 if (diff>=LOGSUM_TBL) return p1;
00161 else if (diff<=-LOGSUM_TBL) return p2;
00162 else if (diff>0) return p1+logsum_lookup[diff];
00163 else return p2+logsum_lookup[-diff];
00164 }
00165
00166
00167 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00168 {
00169 if (p1 > p2)
00170 return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00171 else
00172 return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00173 }
00174
00175
00176 void CLocalAlignmentStringKernel::initialize(void)
00177
00178 {
00179 register int32_t i;
00180
00181
00182 if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL)
00183 SG_ERROR("run out o memory");
00184 for (i=0;i<NAA;i++)
00185 aaIndex[aaList[i]-'A']=i;
00186
00187
00188 if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL)
00189 SG_ERROR("run out of memory");
00190 for (i=0;i<NAA;i++)
00191 isAA[(int32_t)aaList[i]]=1;
00192
00193
00194 for (i=0 ; i<NAA*(NAA+1)/2; i++)
00195 scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE);
00196
00197
00198
00199 opening = (int32_t) floor(OPENING * SCALING*INTSCALE);
00200 extension = (int32_t) floor(EXTENSION * SCALING*INTSCALE);
00201 }
00202
00203
00204
00205
00206
00207
00208 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00209 int32_t* aaX, int32_t* aaY,
00210 int32_t nX, int32_t nY )
00211 {
00212 register int32_t
00213 i,j,
00214 cur, old,
00215 curpos, frompos;
00216
00217 int32_t
00218 *logX,
00219 *logY,
00220 *logM,
00221 *logX2,
00222 *logY2,
00223
00224 aux , aux2;
00225 int32_t
00226 cl;
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238 cl = nY+1;
00239
00240 logM=new int32_t[2*cl];
00241 logX=new int32_t[2*cl];
00242 logY=new int32_t[2*cl];
00243 logX2=new int32_t[2*cl];
00244 logY2=new int32_t[2*cl];
00245
00246
00247
00248
00249
00250
00251 for (j=0;j<cl;j++) {
00252 logM[j]=LOG0;
00253 logX[j]=LOG0;
00254 logY[j]=LOG0;
00255 logX2[j]=LOG0;
00256 logY2[j]=LOG0;
00257
00258 }
00259
00260
00261 cur = 1;
00262 old = 0;
00263
00264
00265
00266
00267
00268
00269
00270 for (i=1;i<=nX;i++) {
00271
00272
00273 curpos = cur*cl;
00274 logM[curpos] = LOG0;
00275 logX[curpos] = LOG0;
00276 logY[curpos] = LOG0;
00277 logX2[curpos] = LOG0;
00278 logY2[curpos] = LOG0;
00279
00280
00281 for (j=1;j<=nY;j++) {
00282
00283 curpos = cur*cl + j;
00284
00285
00286
00287
00288 frompos = old*cl + j;
00289
00290
00291 logX[curpos] = LOGP( - opening + logM[frompos] , - extension + logX[frompos] );
00292
00293
00294
00295 logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00296
00297
00298
00299
00300 frompos = cur*cl + j-1;
00301
00302
00303 aux = LOGP( - opening + logM[frompos] , - extension + logY[frompos] );
00304 logY[curpos] = LOGP( aux , - opening + logX[frompos] );
00305
00306
00307 aux = LOGP( logM[frompos] , logY2[frompos] );
00308 logY2[curpos] = LOGP( aux , logX2[frompos] );
00309
00310
00311
00312
00313 frompos = old*cl + j-1;
00314
00315 aux = LOGP( logX[frompos] , logY[frompos] );
00316 aux2 = LOGP( 0 , logM[frompos] );
00317 logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00318
00319
00320
00321
00322 }
00323
00324
00325
00326 cur = 1-cur;
00327 old = 1-old;
00328
00329 }
00330
00331
00332
00333
00334
00335 curpos = old*cl + nY;
00336 aux = LOGP( logX2[curpos] , logY2[curpos] );
00337 aux2 = LOGP( 0 , logM[curpos] );
00338
00339
00340
00341 delete[] logM;
00342 delete[] logX;
00343 delete[] logY;
00344 delete[] logX2;
00345 delete[] logY2;
00346
00347
00348 return (float32_t)LOGP(aux,aux2)/INTSCALE;
00349 }
00350
00351
00352
00353
00354
00355
00356
00357
00358 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00359 {
00360 int32_t *aax,*aay;
00361 int32_t lx=0,ly=0;
00362 int32_t i,j;
00363
00364
00365 if (isAA == NULL)
00366 initialize();
00367
00368 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx);
00369 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly);
00370 ASSERT(x && y);
00371
00372 if ((lx<1) || (ly<1))
00373 SG_ERROR("empty chain");
00374
00375
00376
00377 if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL)
00378 SG_ERROR("run out of memory");
00379 if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL)
00380 SG_ERROR("run out of memory");
00381
00382
00383
00384 j=0;
00385 for (i=0 ; i<lx ; i++)
00386 if (isAA[toupper(x[i])])
00387 aax[j++] = aaIndex[toupper(x[i])-'A'];
00388 lx = j;
00389 j=0;
00390 for (i=0 ; i<ly ; i++)
00391 if (isAA[toupper(y[i])])
00392 aay[j++] = aaIndex[toupper(y[i])-'A'];
00393 ly = j;
00394
00395
00396
00397 float64_t result=LAkernelcompute(aax,aay,lx,ly);
00398
00399
00400 free(aax);
00401 free(aay);
00402
00403 return result;
00404 }