00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include "kernel/LocalAlignmentStringKernel.h"
00028
00029
00030
00031
00032
00033 #define NAA 20
00034 #define NLET 26
00035 static const char *aaList= "ARNDCQEGHILKMFPSTWYV";
00036
00037
00038
00039
00040
00041 #define OPENING 12
00042 #define EXTENSION 2
00043
00044
00045 const int CLocalAlignmentStringKernel::blosum[] = {
00046 6,
00047 -2, 8,
00048 -2, -1, 9,
00049 -3, -2, 2, 9,
00050 -1, -5, -4, -5, 13,
00051 -1, 1, 0, 0, -4, 8,
00052 -1, 0, 0, 2, -5, 3, 7,
00053 0, -3, -1, -2, -4, -3, -3, 8,
00054 -2, 0, 1, -2, -4, 1, 0, -3, 11,
00055 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
00056 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
00057 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
00058 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
00059 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
00060 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
00061 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
00062 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
00063 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
00064 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
00065 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
00066
00067
00068 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00069
00070
00071
00072
00073
00074 #define SCALING 0.1
00075
00076
00077
00078
00079
00080
00081
00082 #define LOGP(x,y) LogSum(x,y)
00083
00084
00085
00086
00087
00088
00089
00090 #define LOG0 -10000
00091 #define INTSCALE 1000.0
00092
00093 int CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00094
00095 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(INT size)
00096 : CStringKernel<CHAR>(size), initialized(false)
00097 {
00098 scaled_blosum=new int[sizeof(blosum)];
00099 init_logsum();
00100 initialize();
00101 }
00102
00103 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00104 CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r)
00105 : CStringKernel<CHAR>(10), initialized(false)
00106 {
00107 scaled_blosum=new int[sizeof(blosum)];
00108 init_logsum();
00109 initialize();
00110 init(l, r);
00111 }
00112
00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00114 {
00115 cleanup();
00116 }
00117
00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00119 {
00120 bool result = CStringKernel<CHAR>::init(l, r);
00121 initialized = true;
00122 return result;
00123 }
00124
00125 void CLocalAlignmentStringKernel::cleanup()
00126 {
00127 delete[] scaled_blosum;
00128 scaled_blosum=NULL;
00129
00130 free(isAA);
00131 isAA=NULL;
00132 free(aaIndex);
00133 aaIndex=NULL;
00134
00135 CKernel::cleanup();
00136 }
00137
00138
00139
00140
00141 void CLocalAlignmentStringKernel::init_logsum(void){
00142 int i;
00143 for (i = 0; i < LOGSUM_TBL; i++)
00144 logsum_lookup[i] = (int) (INTSCALE*
00145 (log(1.+exp( (float) -i/INTSCALE))));
00146 }
00147
00148 int CLocalAlignmentStringKernel::LogSum(int p1, int p2){
00149 int diff;
00150 static int firsttime = 1;
00151 if (firsttime) {init_logsum(); firsttime = 0;}
00152 diff = p1 - p2;
00153 if (diff >= LOGSUM_TBL) return p1;
00154 else if (diff <= -LOGSUM_TBL) return p2;
00155 else if (diff > 0) return p1+logsum_lookup[diff];
00156 else return p2+logsum_lookup[-diff];
00157 }
00158
00159
00160 float CLocalAlignmentStringKernel::LogSum2(float p1, float p2)
00161 {
00162 if (p1 > p2)
00163 return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00164 else
00165 return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00166 }
00167
00168
00169 void CLocalAlignmentStringKernel::initialize(void)
00170
00171 {
00172 register int i;
00173
00174
00175 if ((aaIndex=(int *)calloc(NLET,sizeof(int))) == NULL)
00176 SG_ERROR("run out o memory");
00177 for (i=0;i<NAA;i++)
00178 aaIndex[aaList[i]-'A']=i;
00179
00180
00181 if ((isAA=(int *)calloc(256,sizeof(int))) == NULL)
00182 SG_ERROR("run out of memory");
00183 for (i=0;i<NAA;i++)
00184 isAA[(int)aaList[i]]=1;
00185
00186
00187 for (i=0 ; i<NAA*(NAA+1)/2; i++)
00188 scaled_blosum[i] = (int) floor(blosum[i]*SCALING*INTSCALE);
00189
00190
00191
00192 opening = (int) floor(OPENING * SCALING*INTSCALE);
00193 extension = (int) floor(EXTENSION * SCALING*INTSCALE);
00194 }
00195
00196
00197
00198 DREAL CLocalAlignmentStringKernel::LAkernelcompute(int* aaX, int* aaY,
00199
00200
00201
00202 int nX, int nY
00203 )
00204 {
00205 register int
00206 i,j,
00207 cur, old,
00208 curpos, frompos;
00209
00210 int
00211 *logX,
00212 *logY,
00213 *logM,
00214 *logX2,
00215 *logY2,
00216
00217 aux , aux2;
00218 int
00219 cl;
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231 cl = nY+1;
00232
00233 logM=new int[2*cl];
00234 logX=new int[2*cl];
00235 logY=new int[2*cl];
00236 logX2=new int[2*cl];
00237 logY2=new int[2*cl];
00238
00239
00240
00241
00242
00243
00244 for (j=0;j<cl;j++) {
00245 logM[j]=LOG0;
00246 logX[j]=LOG0;
00247 logY[j]=LOG0;
00248 logX2[j]=LOG0;
00249 logY2[j]=LOG0;
00250
00251 }
00252
00253
00254 cur = 1;
00255 old = 0;
00256
00257
00258
00259
00260
00261
00262
00263 for (i=1;i<=nX;i++) {
00264
00265
00266 curpos = cur*cl;
00267 logM[curpos] = LOG0;
00268 logX[curpos] = LOG0;
00269 logY[curpos] = LOG0;
00270 logX2[curpos] = LOG0;
00271 logY2[curpos] = LOG0;
00272
00273
00274 for (j=1;j<=nY;j++) {
00275
00276 curpos = cur*cl + j;
00277
00278
00279
00280
00281 frompos = old*cl + j;
00282
00283
00284 logX[curpos] = LOGP( - opening + logM[frompos] , - extension + logX[frompos] );
00285
00286
00287
00288 logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00289
00290
00291
00292
00293 frompos = cur*cl + j-1;
00294
00295
00296 aux = LOGP( - opening + logM[frompos] , - extension + logY[frompos] );
00297 logY[curpos] = LOGP( aux , - opening + logX[frompos] );
00298
00299
00300 aux = LOGP( logM[frompos] , logY2[frompos] );
00301 logY2[curpos] = LOGP( aux , logX2[frompos] );
00302
00303
00304
00305
00306 frompos = old*cl + j-1;
00307
00308 aux = LOGP( logX[frompos] , logY[frompos] );
00309 aux2 = LOGP( 0 , logM[frompos] );
00310 logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00311
00312
00313
00314
00315 }
00316
00317
00318
00319 cur = 1-cur;
00320 old = 1-old;
00321
00322 }
00323
00324
00325
00326
00327
00328 curpos = old*cl + nY;
00329 aux = LOGP( logX2[curpos] , logY2[curpos] );
00330 aux2 = LOGP( 0 , logM[curpos] );
00331
00332
00333
00334 delete[] logM;
00335 delete[] logX;
00336 delete[] logY;
00337 delete[] logX2;
00338 delete[] logY2;
00339
00340
00341 return (float)LOGP(aux,aux2)/INTSCALE;
00342 }
00343
00344
00345
00346
00347
00348
00349
00350
00351 DREAL CLocalAlignmentStringKernel::compute(INT idx_x, INT idx_y)
00352 {
00353 int *aax,*aay;
00354 int lx=0,ly=0;
00355 int i,j;
00356
00357
00358 if (isAA == NULL)
00359 initialize();
00360
00361 CHAR* x=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx_x, lx);
00362 CHAR* y=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx_y, ly);
00363 ASSERT(x && y);
00364
00365 if ((lx<1) || (ly<1))
00366 SG_ERROR("empty chain");
00367
00368
00369
00370 if ((aax=(int *)calloc(lx,sizeof(int))) == NULL)
00371 SG_ERROR("run out of memory");
00372 if ((aay=(int *)calloc(ly,sizeof(int))) == NULL)
00373 SG_ERROR("run out of memory");
00374
00375
00376
00377 j=0;
00378 for (i=0 ; i<lx ; i++)
00379 if (isAA[toupper(x[i])])
00380 aax[j++] = aaIndex[toupper(x[i])-'A'];
00381 lx = j;
00382 j=0;
00383 for (i=0 ; i<ly ; i++)
00384 if (isAA[toupper(y[i])])
00385 aay[j++] = aaIndex[toupper(y[i])-'A'];
00386 ly = j;
00387
00388
00389
00390 DREAL result=LAkernelcompute(aax,aay,lx,ly);
00391
00392
00393 free(aax);
00394 free(aay);
00395
00396 return result;
00397 }