00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "kernel/OligoKernel.h"
00013 #include "kernel/SqrtDiagKernelNormalizer.h"
00014 #include "features/StringFeatures.h"
00015
00016 #include <map>
00017 #include <cmath>
00018 #include <vector>
00019 #include <algorithm>
00020 #include <iostream>
00021
00022 using namespace std;
00023
00024 COligoKernel::COligoKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
00025 : CStringKernel<char>(cache_sz), k(kmer_len), width(w)
00026 {
00027 set_normalizer(new CSqrtDiagKernelNormalizer());
00028 }
00029
00030 COligoKernel::~COligoKernel()
00031 {
00032
00033 }
00034
00035 bool COligoKernel::init(CFeatures* l, CFeatures* r)
00036 {
00037 CStringKernel<char>::init(l,r);
00038 return init_normalizer();
00039 }
00040
00041 bool COligoKernel::cmpOligos_(
00042 pair<int32_t, float64_t> a, pair<int32_t, float64_t> b)
00043 {
00044 return (a.second < b.second);
00045 }
00046
00047 void COligoKernel::encodeOligo(
00048 const string& sequence, uint32_t k_mer_length,
00049 const string& allowed_characters,
00050 vector< pair<int32_t, float64_t> >& values)
00051 {
00052 float64_t oligo_value = 0.;
00053 float64_t factor = 1.;
00054 map<string::value_type, uint32_t> residue_values;
00055 uint32_t counter = 0;
00056 uint32_t number_of_residues = allowed_characters.size();
00057 uint32_t sequence_length = sequence.size();
00058 bool sequence_ok = true;
00059
00060
00061 for (uint32_t i = 0; i < sequence.size(); ++i)
00062 {
00063 if (allowed_characters.find(sequence.at(i)) == string::npos)
00064 sequence_ok = false;
00065 }
00066
00067 if (sequence_ok && k_mer_length <= sequence_length)
00068 {
00069 values.resize(sequence_length - k_mer_length + 1,
00070 pair<int32_t, float64_t>());
00071 for (uint32_t i = 0; i < number_of_residues; ++i)
00072 {
00073 residue_values.insert(make_pair(allowed_characters[i], counter));
00074 ++counter;
00075 }
00076 for (int32_t k = k_mer_length - 1; k >= 0; k--)
00077 {
00078 oligo_value += factor * residue_values[sequence[k]];
00079 factor *= number_of_residues;
00080 }
00081 factor /= number_of_residues;
00082 counter = 0;
00083 values[counter].first = 1;
00084 values[counter].second = oligo_value;
00085 ++counter;
00086
00087 for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
00088 {
00089 oligo_value -= factor * residue_values[sequence[j - 1]];
00090 oligo_value = oligo_value * number_of_residues +
00091 residue_values[sequence[j + k_mer_length - 1]];
00092
00093 values[counter].first = j + 1;
00094 values[counter].second = oligo_value ;
00095 ++counter;
00096 }
00097 stable_sort(values.begin(), values.end(), cmpOligos_);
00098 }
00099 else
00100 {
00101 values.clear();
00102 }
00103 }
00104
00105 void COligoKernel::getSequences(
00106 const vector<string>& sequences, uint32_t k_mer_length,
00107 const string& allowed_characters,
00108 vector< vector< pair<int32_t, float64_t> > >& encoded_sequences)
00109 {
00110 vector< pair<int32_t, float64_t> > temp_vector;
00111 encoded_sequences.resize(sequences.size(),
00112 vector< pair<int32_t, float64_t> >());
00113
00114 for (uint32_t i = 0; i < sequences.size(); ++i)
00115 {
00116 encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
00117 encoded_sequences[i] = temp_vector;
00118 }
00119 }
00120
00121 void COligoKernel::getExpFunctionCache(
00122 float64_t sigma, uint32_t sequence_length, vector<float64_t>& cache)
00123 {
00124 cache.resize(sequence_length, 0.);
00125 cache[0] = 1;
00126 for (uint32_t i = 1; i < sequence_length - 1; i++)
00127 {
00128 cache[i] = exp((-1 / (4.0 * sigma * sigma)) * i * i);
00129 }
00130 }
00131
00132 float64_t COligoKernel::kernelOligoFast(
00133 const vector< pair<int32_t, float64_t> >& x,
00134 const vector< pair<int32_t, float64_t> >& y,
00135 const vector<float64_t>& gauss_table, int32_t max_distance)
00136 {
00137 float64_t kernel = 0;
00138 int32_t i1 = 0;
00139 int32_t i2 = 0;
00140 int32_t c1 = 0;
00141 uint32_t x_size = x.size();
00142 uint32_t y_size = y.size();
00143
00144 while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00145 {
00146 if (x[i1].second == y[i2].second)
00147 {
00148 if (max_distance < 0
00149 || (abs(x[i1].first - y[i2].first)) <= max_distance)
00150 {
00151 kernel += gauss_table.at(abs((x[i1].first - y[i2].first)));
00152 if (x[i1].second == x[i1 + 1].second)
00153 {
00154 i1++;
00155 c1++;
00156 }
00157 else if (y[i2].second == y[i2 + 1].second)
00158 {
00159 i2++;
00160 i1 -= c1;
00161 c1 = 0;
00162 }
00163 else
00164 {
00165 i1++;
00166 i2++;
00167 }
00168 }
00169 else
00170 {
00171 if (x[i1].first < y[i2].first)
00172 {
00173 if (x[i1].second == x[i1 + 1].second)
00174 {
00175 i1++;
00176 }
00177 else if (y[i2].second == y[i2 + 1].second)
00178 {
00179 while(y[i2++].second == y[i2].second)
00180 {
00181 ;
00182 }
00183 ++i1;
00184 c1 = 0;
00185 }
00186 else
00187 {
00188 i1++;
00189 i2++;
00190 c1 = 0;
00191 }
00192 }
00193 else
00194 {
00195 i2++;
00196 i1 -= c1;
00197 c1 = 0;
00198 }
00199 }
00200 }
00201 else
00202 {
00203 if (x[i1].second < y[i2].second)
00204 i1++;
00205 else
00206 i2++;
00207 c1 = 0;
00208 }
00209 }
00210 return kernel;
00211 }
00212
00213
00214 float64_t COligoKernel::kernelOligo(
00215 const vector< pair<int32_t, float64_t> >& x,
00216 const vector< pair<int32_t, float64_t> >& y,
00217 float64_t sigma_square)
00218 {
00219 float64_t kernel = 0;
00220 int32_t i1 = 0;
00221 int32_t i2 = 0;
00222 int32_t c1 = 0;
00223 uint32_t x_size = x.size();
00224 uint32_t y_size = y.size();
00225
00226 while ((uint32_t) i1 < x_size && (uint32_t) i2 < y_size)
00227 {
00228 if (x[i1].second == y[i2].second)
00229 {
00230 kernel += exp(-1 * (x[i1].first - y[i2].first) * (x[i1].first - y[i2].first) / (4 * sigma_square));
00231
00232 if (((uint32_t) i1+1) < x_size && x[i1].second == x[i1 + 1].second)
00233 {
00234 i1++;
00235 c1++;
00236 }
00237 else if (((uint32_t) i2+1) <y_size && y[i2].second == y[i2 + 1].second)
00238 {
00239 i2++;
00240 i1 -= c1;
00241 c1 = 0;
00242 }
00243 else
00244 {
00245 i1++;
00246 i2++;
00247 }
00248 }
00249 else
00250 {
00251 if (x[i1].second < y[i2].second)
00252 i1++;
00253 else
00254 i2++;
00255
00256 c1 = 0;
00257 }
00258 }
00259 return kernel;
00260 }
00261
00262 float64_t COligoKernel::compute(int32_t idx_a, int32_t idx_b)
00263 {
00264 int32_t alen, blen;
00265 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00266 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00267 vector< pair<int32_t, float64_t> > aenc;
00268 vector< pair<int32_t, float64_t> > benc;
00269 encodeOligo(string(avec, alen), k, "ACGT", aenc);
00270 encodeOligo(string(bvec, alen), k, "ACGT", benc);
00271 float64_t result=kernelOligo(aenc, benc, width);
00272 return result;
00273 }
00274