OligoKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2008 Christian Igel, Tobias Glasmachers
00008  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
00009  *
00010  * Shogun adjustments (w) 2008 Soeren Sonnenburg
00011  */
00012 #include "kernel/OligoKernel.h"
00013 #include "kernel/SqrtDiagKernelNormalizer.h"
00014 #include "features/StringFeatures.h"
00015 
00016 #include <map>
00017 #include <cmath>
00018 #include <vector>
00019 #include <algorithm>
00020 #include <iostream>
00021 
00022 using namespace std;
00023 
00024 COligoKernel::COligoKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00025 : CStringKernel<char>(cache_sz), k(kmer_len), width(w)
00026 {
00027     set_normalizer(new CSqrtDiagKernelNormalizer());
00028 }
00029 
00030 COligoKernel::~COligoKernel()
00031 {
00032 
00033 }
00034 
00035 bool COligoKernel::init(CFeatures* l, CFeatures* r)
00036 {
00037     CStringKernel<char>::init(l,r);
00038     return init_normalizer();
00039 }
00040 
00041 bool COligoKernel::cmpOligos_(
00042     pair<int32_t, float64_t> a, pair<int32_t, float64_t> b)
00043 {
00044     return (a.second < b.second);
00045 }
00046 
00047 void COligoKernel::encodeOligo(
00048     const string& sequence, uint32_t k_mer_length,
00049     const string& allowed_characters,
00050     vector< pair<int32_t, float64_t> >& values)
00051 {
00052     float64_t oligo_value = 0.;
00053     float64_t factor      = 1.;
00054     map<string::value_type, uint32_t> residue_values;
00055     uint32_t counter = 0;
00056     uint32_t number_of_residues = allowed_characters.size();
00057     uint32_t sequence_length = sequence.size();
00058     bool sequence_ok = true;
00059 
00060     // checking if sequence contains illegal characters
00061     for (uint32_t i = 0; i < sequence.size(); ++i)
00062     {
00063         if (allowed_characters.find(sequence.at(i)) == string::npos)
00064             sequence_ok = false;
00065     }
00066 
00067     if (sequence_ok && k_mer_length <= sequence_length)
00068     {
00069         values.resize(sequence_length - k_mer_length + 1,
00070             pair<int32_t, float64_t>());
00071         for (uint32_t i = 0; i < number_of_residues; ++i)
00072         {   
00073             residue_values.insert(make_pair(allowed_characters[i], counter));
00074             ++counter;
00075         }
00076         for (int32_t k = k_mer_length - 1; k >= 0; k--)
00077         {
00078             oligo_value += factor * residue_values[sequence[k]];
00079             factor *= number_of_residues;
00080         }
00081         factor /= number_of_residues;
00082         counter = 0;
00083         values[counter].first = 1;
00084         values[counter].second = oligo_value;
00085         ++counter;
00086 
00087         for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00088         {
00089             oligo_value -= factor * residue_values[sequence[j - 1]];
00090             oligo_value = oligo_value * number_of_residues +
00091                 residue_values[sequence[j + k_mer_length - 1]];
00092 
00093             values[counter].first = j + 1;
00094             values[counter].second = oligo_value ;
00095             ++counter;
00096         }
00097         stable_sort(values.begin(), values.end(), cmpOligos_);
00098     }
00099     else
00100     {
00101         values.clear();
00102     }   
00103 }
00104 
00105 void COligoKernel::getSequences(
00106     const vector<string>& sequences, uint32_t k_mer_length,
00107     const string& allowed_characters,
00108     vector< vector< pair<int32_t, float64_t> > >& encoded_sequences)
00109 {
00110     vector< pair<int32_t, float64_t> > temp_vector;
00111     encoded_sequences.resize(sequences.size(),
00112         vector< pair<int32_t, float64_t> >());
00113 
00114     for (uint32_t i = 0; i < sequences.size(); ++i)
00115     {
00116         encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00117         encoded_sequences[i] = temp_vector;
00118     }
00119 }
00120 
00121 void COligoKernel::getExpFunctionCache(
00122     float64_t sigma, uint32_t sequence_length, vector<float64_t>& cache)
00123 {
00124     cache.resize(sequence_length, 0.);
00125     cache[0] = 1;
00126     for (uint32_t i = 1; i < sequence_length - 1; i++)
00127     {
00128         cache[i] = exp((-1 / (4.0 * sigma * sigma)) * i * i);
00129     }
00130 }
00131 
00132 float64_t COligoKernel::kernelOligoFast(
00133     const vector< pair<int32_t, float64_t> >& x,
00134     const vector< pair<int32_t, float64_t> >& y,
00135     const vector<float64_t>& gauss_table, int32_t max_distance)
00136 {
00137     float64_t kernel = 0;
00138     int32_t  i1     = 0;
00139     int32_t  i2     = 0;
00140     int32_t  c1     = 0;
00141     uint32_t x_size = x.size();
00142     uint32_t y_size = y.size();
00143 
00144     while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00145     {
00146         if (x[i1].second == y[i2].second)
00147         {
00148             if (max_distance < 0
00149                     || (abs(x[i1].first - y[i2].first)) <= max_distance)
00150             {
00151                 kernel += gauss_table.at(abs((x[i1].first - y[i2].first)));
00152                 if (x[i1].second == x[i1 + 1].second)
00153                 {
00154                     i1++;
00155                     c1++;
00156                 }
00157                 else if (y[i2].second == y[i2 + 1].second)
00158                 {
00159                     i2++;
00160                     i1 -= c1;
00161                     c1 = 0;
00162                 }
00163                 else
00164                 {
00165                     i1++;
00166                     i2++;
00167                 }
00168             }
00169             else
00170             {
00171                 if (x[i1].first < y[i2].first)
00172                 {
00173                     if (x[i1].second == x[i1 + 1].second)
00174                     {
00175                         i1++;
00176                     }
00177                     else if (y[i2].second == y[i2 + 1].second)
00178                     {
00179                         while(y[i2++].second == y[i2].second)
00180                         {
00181                             ;
00182                         }
00183                         ++i1;
00184                         c1 = 0;
00185                     }
00186                     else
00187                     {
00188                         i1++;
00189                         i2++;
00190                         c1 = 0;
00191                     }
00192                 }
00193                 else
00194                 {
00195                     i2++;
00196                     i1 -= c1;
00197                     c1 = 0;
00198                 }
00199             }
00200         }
00201         else
00202         {
00203             if (x[i1].second < y[i2].second)
00204                 i1++;
00205             else
00206                 i2++;
00207             c1 = 0;
00208         }
00209     }
00210     return kernel;
00211 }       
00212 
00213 
00214 float64_t COligoKernel::kernelOligo(
00215         const vector< pair<int32_t, float64_t> >& x,
00216         const vector< pair<int32_t, float64_t> >& y,
00217         float64_t sigma_square)
00218 {
00219     float64_t   kernel = 0;
00220     int32_t  i1     = 0;
00221     int32_t  i2     = 0;
00222     int32_t  c1     = 0;
00223     uint32_t x_size = x.size();
00224     uint32_t y_size = y.size();
00225 
00226     while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00227     {
00228         if (x[i1].second == y[i2].second)
00229         {
00230             kernel += exp(-1 * (x[i1].first - y[i2].first) * (x[i1].first - y[i2].first) / (4 * sigma_square));
00231 
00232             if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second)
00233             {
00234                 i1++;
00235                 c1++;
00236             }
00237             else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second)
00238             {
00239                 i2++;
00240                 i1 -= c1;
00241                 c1 = 0;
00242             }
00243             else
00244             {
00245                 i1++;
00246                 i2++;
00247             }
00248         }
00249         else
00250         {
00251             if (x[i1].second < y[i2].second)
00252                 i1++;
00253             else
00254                 i2++;
00255 
00256             c1 = 0;
00257         }
00258     }
00259     return kernel;
00260 }       
00261 
00262 float64_t COligoKernel::compute(int32_t idx_a, int32_t idx_b)
00263 {
00264     int32_t alen, blen;
00265     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00266     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00267     vector< pair<int32_t, float64_t> > aenc;
00268     vector< pair<int32_t, float64_t> > benc;
00269     encodeOligo(string(avec, alen), k, "ACGT", aenc);
00270     encodeOligo(string(bvec, alen), k, "ACGT", benc);
00271     float64_t result=kernelOligo(aenc, benc, width);
00272     return result;
00273 }
00274 

SHOGUN Machine Learning Toolbox - Documentation