LocalAlignmentStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * Compute the local alignment kernel
00003  *
00004  * Largely based on LAkernel.c (version 0.3)
00005  *
00006  * Copyright 2003 Jean-Philippe Vert
00007  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
00008  *
00009  * Shogun specific adjustments Written (W) 2007-2008 Soeren Sonnenburg
00010  * 
00011  * Reference:
00012  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
00013  * detection using string alignment kernels", Bioinformatics,
00014  * vol.20, p.1682-1689, 2004.
00015  * 
00016  * This program is free software; you can redistribute it and/or modify
00017  * it under the terms of the GNU General Public License as published by
00018  * the Free Software Foundation; either version 3 of the License, or
00019  * (at your option) any later version.
00020  */
00021 
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include "kernel/LocalAlignmentStringKernel.h"
00028 
00029 /****************/
00030 /* The alphabet */
00031 /****************/
00032 
00033 #define NAA 20                                  /* Number of amino-acids */
00034 #define NLET 26                                 /* Number of letters in the alphabet */
00035 static const char *aaList= "ARNDCQEGHILKMFPSTWYV";    /* The list of amino acids */
00036 
00037 /*****************/
00038 /* SW parameters */
00039 /*****************/
00040 
00041 #define OPENING 12                              /* Gap opening penalty */
00042 #define EXTENSION 2                             /* Gap extension penalty */
00043 
00044 /* mutation matrix */
00045 const int CLocalAlignmentStringKernel::blosum[] = {
00046   6,
00047  -2,   8,
00048  -2,  -1,   9,
00049   -3,  -2,   2,   9,
00050   -1,  -5,  -4,  -5,  13,
00051   -1,   1,   0,   0,  -4,   8,
00052   -1,   0,   0,   2,  -5,   3,   7,
00053   0,  -3,  -1,  -2,  -4,  -3,  -3,   8,
00054   -2,   0,   1,  -2,  -4,   1,   0,  -3,  11,
00055   -2,  -5,  -5,  -5,  -2,  -4,  -5,  -6,  -5,   6,
00056   -2,  -3,  -5,  -5,  -2,  -3,  -4,  -5,  -4,   2,   6,
00057   -1,   3,   0,  -1,  -5,   2,   1,  -2,  -1,  -4,  -4,   7,
00058   -1,  -2,  -3,  -5,  -2,  -1,  -3,  -4,  -2,   2,   3,  -2,   8,
00059   -3,  -4,  -5,  -5,  -4,  -5,  -5,  -5,  -2,   0,   1,  -5,   0,   9,
00060   -1,  -3,  -3,  -2,  -4,  -2,  -2,  -3,  -3,  -4,  -4,  -2,  -4,  -5,  11,
00061   2,  -1,   1,   0,  -1,   0,   0,   0,  -1,  -4,  -4,   0,  -2,  -4,  -1,   6,
00062   0,  -2,   0,  -2,  -1,  -1,  -1,  -2,  -3,  -1,  -2,  -1,  -1,  -3,  -2,   2,   7,
00063   -4,  -4,  -6,  -6,  -3,  -3,  -4,  -4,  -4,  -4,  -2,  -4,  -2,   1,  -6,  -4,  -4,  16,
00064   -3,  -3,  -3,  -5,  -4,  -2,  -3,  -5,   3,  -2,  -2,  -3,  -1,   4,  -4,  -3,  -2,   3,  10,
00065   0,  -4,  -4,  -5,  -1,  -3,  -4,  -5,  -5,   4,   1,  -3,   1,  -1,  -4,  -2,   0,  -4,  -2,   6};
00066 
00067 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
00068 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00069 
00070 /*********************
00071  * Kernel parameters *
00072  *********************/
00073 
00074 #define SCALING 0.1           /* Factor to scale all SW parameters */
00075 
00076 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
00077 /* If x=log(a) and y=log(b), compute log(a+b) : */
00078 /*
00079 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
00080 */
00081 
00082 #define LOGP(x,y) LogSum(x,y)
00083 
00084 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
00085 /*
00086 #define LOGP(x,y) (((x)>(y))?(x):(y))
00087 */
00088 
00089 /* Usefule constants */
00090 #define LOG0 -10000          /* log(0) */
00091 #define INTSCALE 1000.0      /* critical for speed and precise computation*/
00092 
00093 int CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00094 
00095 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(INT size)
00096 : CStringKernel<CHAR>(size), initialized(false)
00097 {
00098     scaled_blosum=new int[sizeof(blosum)];
00099     init_logsum();
00100     initialize();
00101 }
00102 
00103 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00104     CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r)
00105 : CStringKernel<CHAR>(10), initialized(false)
00106 {
00107     scaled_blosum=new int[sizeof(blosum)];
00108     init_logsum();
00109     initialize();
00110     init(l, r);
00111 }
00112 
00113 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00114 {
00115     cleanup();
00116 }
00117 
00118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00119 {
00120     bool result = CStringKernel<CHAR>::init(l, r);
00121     initialized = true;
00122     return result;
00123 }
00124 
00125 void CLocalAlignmentStringKernel::cleanup()
00126 {
00127     delete[] scaled_blosum;
00128     scaled_blosum=NULL;
00129 
00130     free(isAA);
00131     isAA=NULL;
00132     free(aaIndex);
00133     aaIndex=NULL;
00134 
00135     CKernel::cleanup();
00136 }
00137 
00138 /* LogSum - default log funciotion. fast, but not exact */
00139 /* LogSum2 - precise, but slow. Note that these two functions need different figure types  */
00140 
00141 void CLocalAlignmentStringKernel::init_logsum(void){
00142   int i;
00143   for (i = 0; i < LOGSUM_TBL; i++) 
00144     logsum_lookup[i] = (int) (INTSCALE*
00145                    (log(1.+exp( (float) -i/INTSCALE))));
00146 }
00147 
00148 int CLocalAlignmentStringKernel::LogSum(int p1, int p2){
00149   int diff;
00150   static int firsttime = 1;
00151   if (firsttime) {init_logsum(); firsttime = 0;}
00152   diff = p1 - p2;
00153   if      (diff >=  LOGSUM_TBL) return p1;
00154   else if (diff <= -LOGSUM_TBL) return p2;
00155   else if (diff > 0)            return p1+logsum_lookup[diff];
00156   else                          return p2+logsum_lookup[-diff];
00157 }
00158 
00159 
00160 float CLocalAlignmentStringKernel::LogSum2(float p1, float p2)
00161 {
00162   if (p1 > p2)
00163     return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00164   else
00165     return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00166 }
00167 
00168 
00169 void CLocalAlignmentStringKernel::initialize(void)
00170      /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
00171 {
00172   register int i;
00173 
00174   /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
00175   if ((aaIndex=(int *)calloc(NLET,sizeof(int))) == NULL)
00176     SG_ERROR("run out o memory");
00177   for (i=0;i<NAA;i++) 
00178     aaIndex[aaList[i]-'A']=i;
00179   
00180   /* Initialization of the array which indicates whether a char is an amino-acid */
00181   if ((isAA=(int *)calloc(256,sizeof(int))) == NULL)
00182     SG_ERROR("run out of memory");
00183   for (i=0;i<NAA;i++) 
00184     isAA[(int)aaList[i]]=1;
00185 
00186   /* Scale the blossum matrix */
00187   for (i=0 ; i<NAA*(NAA+1)/2; i++)
00188       scaled_blosum[i] = (int) floor(blosum[i]*SCALING*INTSCALE);
00189 
00190 
00191   /* Scale of gap penalties */
00192   opening = (int) floor(OPENING * SCALING*INTSCALE);
00193   extension = (int) floor(EXTENSION * SCALING*INTSCALE);
00194 }
00195 
00196 
00197 
00198 DREAL CLocalAlignmentStringKernel::LAkernelcompute(int* aaX, int* aaY, /* Implementation of the
00199                                 convolution kernel which generalizes the Smith-Waterman algorithm */
00200         /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the
00201          * position of the amino-acid in the variable 'aaList') */
00202         int nX, int nY /* the lengths of both sequences */
00203         )
00204 {
00205    register int
00206     i,j,                /* loop indexes */
00207     cur, old,           /* to indicate the array to use (0 or 1) */
00208     curpos, frompos;    /* position in an array */
00209 
00210    int
00211     *logX,           /* arrays to store the log-values of each state */
00212     *logY,
00213     *logM,
00214     *logX2,
00215     *logY2,
00216 
00217     aux , aux2;/* , aux3 , aux4 , aux5;*/
00218   int
00219     cl;                /* length of a column for the dynamic programming */
00220 
00221   /*
00222   printf("now computing pairHMM between %d and %d:\n",nX,nY);
00223   for (i=0;i<nX;printf("%d ",aaX[i++]));
00224   printf("\n and \n");
00225   for (i=0;i<nY;printf("%d ",aaY[i++]));
00226   printf("\n");
00227   */
00228 
00229   /* Initialization of the arrays */
00230   /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
00231   cl = nY+1;           /* each column stores the positions in the aaY sequence, plus a position at zero */
00232 
00233   logM=new int[2*cl];
00234   logX=new int[2*cl];
00235   logY=new int[2*cl];
00236   logX2=new int[2*cl];
00237   logY2=new int[2*cl];
00238 
00239   /************************************************/
00240   /* First iteration : initialization of column 0 */
00241   /************************************************/
00242   /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
00243 
00244   for (j=0;j<cl;j++) {
00245     logM[j]=LOG0;
00246     logX[j]=LOG0;
00247     logY[j]=LOG0;
00248     logX2[j]=LOG0;
00249     logY2[j]=LOG0;
00250 
00251   }
00252 
00253   /* Update column order */
00254   cur = 1;      /* Indexes [0..cl-1] are used to process the next column */
00255   old = 0;      /* Indexes [cl..2*cl-1] were used for column 0 */
00256 
00257 
00258   /************************************************/
00259   /* Next iterations : processing columns 1 .. nX */
00260   /************************************************/
00261 
00262   /* Main loop to vary the position in aaX : i=1..nX */
00263   for (i=1;i<=nX;i++) {
00264 
00265     /* Special update for positions (i=1..nX,j=0) */
00266     curpos = cur*cl;                  /* index of the state (i,0) */
00267     logM[curpos] = LOG0; 
00268     logX[curpos] = LOG0; 
00269     logY[curpos] = LOG0; 
00270     logX2[curpos] = LOG0; 
00271     logY2[curpos] = LOG0; 
00272 
00273     /* Secondary loop to vary the position in aaY : j=1..nY */
00274     for (j=1;j<=nY;j++) {
00275 
00276       curpos = cur*cl + j;            /* index of the state (i,j) */
00277 
00278       /* Update for states which emit X only */
00279       /***************************************/
00280 
00281       frompos = old*cl + j;            /* index of the state (i-1,j) */
00282       
00283       /* State RX */
00284       logX[curpos] = LOGP( - opening + logM[frompos] , - extension + logX[frompos] );
00285       /*      printf("%.5f\n",logX[curpos]);*/
00286       /*      printf("%.5f\n",logX_B[curpos]);*/
00287       /* State RX2 */
00288       logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00289 
00290       /* Update for states which emit Y only */
00291       /***************************************/
00292 
00293       frompos = cur*cl + j-1;          /* index of the state (i,j-1) */
00294 
00295       /* State RY */
00296       aux = LOGP( - opening + logM[frompos] , - extension + logY[frompos] );
00297       logY[curpos] = LOGP( aux , - opening + logX[frompos] );
00298 
00299       /* State RY2 */
00300       aux = LOGP( logM[frompos] , logY2[frompos] );
00301       logY2[curpos] = LOGP( aux , logX2[frompos] );
00302 
00303       /* Update for states which emit X and Y */
00304       /****************************************/
00305 
00306       frompos = old*cl + j-1;          /* index of the state (i-1,j-1) */
00307 
00308       aux = LOGP( logX[frompos] , logY[frompos] );
00309       aux2 = LOGP( 0 , logM[frompos] );
00310       logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00311       
00312       /*      printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
00313        */
00314 
00315     }  /* end of j=1:nY loop */
00316 
00317 
00318     /* Update the culumn order */
00319     cur = 1-cur;
00320     old = 1-old;
00321 
00322   }  /* end of j=1:nX loop */
00323 
00324 
00325   /* Termination */
00326   /***************/
00327 
00328   curpos = old*cl + nY;                /* index of the state (nX,nY) */
00329   aux = LOGP( logX2[curpos] , logY2[curpos] );
00330   aux2 = LOGP( 0 , logM[curpos] );
00331   /*  kernel_value = LOGP( aux , aux2 );*/
00332 
00333   /* Memory release */
00334     delete[] logM;
00335     delete[] logX;
00336     delete[] logY;
00337     delete[] logX2;
00338     delete[] logY2;
00339 
00340   /* Return the logarithm of the kernel */
00341   return (float)LOGP(aux,aux2)/INTSCALE;
00342 }
00343 
00344 /********************/
00345 /* Public functions */
00346 /********************/
00347 
00348 
00349 /* Return the log-probability of two sequences x and y under a pair HMM model */
00350 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
00351 DREAL CLocalAlignmentStringKernel::compute(INT idx_x, INT idx_y)
00352 {
00353   int *aax,*aay;  /* to convert x and y into sequences of amino-acid indexes */
00354   int lx=0,ly=0;       /* lengths of x and y */
00355   int i,j;
00356 
00357   /* If necessary, initialize static variables */
00358   if (isAA == NULL)
00359     initialize();
00360 
00361   CHAR* x=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx_x, lx);
00362   CHAR* y=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx_y, ly);
00363   ASSERT(x && y);
00364 
00365   if ((lx<1) || (ly<1))
00366     SG_ERROR("empty chain");
00367 
00368   /* Create aax and aay */
00369 
00370   if ((aax=(int *)calloc(lx,sizeof(int))) == NULL)
00371     SG_ERROR("run out of memory");
00372   if ((aay=(int *)calloc(ly,sizeof(int))) == NULL)
00373     SG_ERROR("run out of memory");
00374 
00375   /* Extract the characters corresponding to aminoacids and keep their indexes */
00376 
00377   j=0;
00378   for (i=0 ; i<lx ; i++) 
00379     if (isAA[toupper(x[i])])
00380       aax[j++] = aaIndex[toupper(x[i])-'A'];
00381   lx = j;
00382   j=0;
00383   for (i=0 ; i<ly ; i++)
00384     if (isAA[toupper(y[i])])
00385       aay[j++] = aaIndex[toupper(y[i])-'A'];
00386   ly = j;
00387 
00388 
00389   /* Compute the pair HMM score */
00390   DREAL result=LAkernelcompute(aax,aay,lx,ly);
00391 
00392   /* Release memory */
00393   free(aax);
00394   free(aay);
00395 
00396   return result;
00397 }

SHOGUN Machine Learning Toolbox - Documentation