WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022 
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026 
00027 struct S_THREAD_PARAM
00028 {
00029     int32_t* vec;
00030     float64_t* result;
00031     float64_t* weights;
00032     CWeightedDegreeStringKernel* kernel;
00033     CTrie<DNATrie>* tries;
00034     float64_t factor;
00035     int32_t j;
00036     int32_t start;
00037     int32_t end;
00038     int32_t length;
00039     int32_t* vec_idx;
00040 };
00041 
00042 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00043     int32_t degree_, EWDKernType type_)
00044 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00045     weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00046     max_mismatch(0), seq_length(0), block_computation(true),
00047     num_block_weights_external(0), block_weights_external(NULL),
00048     block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00049     tree_initialized(false), alphabet(NULL)
00050 {
00051     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00052     lhs=NULL;
00053     rhs=NULL;
00054 
00055     if (type!=E_EXTERNAL)
00056         set_wd_weights_by_type(type);
00057 
00058     set_normalizer(new CFirstElementKernelNormalizer());
00059 }
00060 
00061 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00062     float64_t *weights_, int32_t degree_)
00063 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00064     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00065     max_mismatch(0), seq_length(0), block_computation(true),
00066     num_block_weights_external(0), block_weights_external(NULL),
00067     block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00068     tree_initialized(false), alphabet(NULL)
00069 {
00070     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00071     lhs=NULL;
00072     rhs=NULL;
00073 
00074     weights=new float64_t[degree*(1+max_mismatch)];
00075     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00076         weights[i]=weights_[i];
00077     set_normalizer(new CFirstElementKernelNormalizer());
00078 }
00079 
00080 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00081     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00082 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00083     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00084     max_mismatch(0), seq_length(0), block_computation(true),
00085     num_block_weights_external(0), block_weights_external(NULL),
00086     block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00087     tree_initialized(false), alphabet(NULL)
00088 {
00089     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00090 
00091     set_wd_weights_by_type(type);
00092     set_normalizer(new CFirstElementKernelNormalizer());
00093     init(l, r);
00094 }
00095 
00096 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00097 {
00098     cleanup();
00099 
00100     delete[] weights;
00101     weights=NULL;
00102 
00103     delete[] block_weights;
00104     block_weights=NULL;
00105 
00106     delete[] position_weights;
00107     position_weights=NULL;
00108 
00109     delete[] weights_buffer;
00110     weights_buffer=NULL;
00111 }
00112 
00113 
00114 void CWeightedDegreeStringKernel::remove_lhs()
00115 { 
00116     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00117     delete_optimization();
00118 
00119     if (tries!=NULL)
00120         tries->destroy();
00121 
00122     CKernel::remove_lhs();
00123 }
00124 
00125 void CWeightedDegreeStringKernel::create_empty_tries()
00126 {
00127     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00128 
00129     if (tries!=NULL)
00130     {
00131         tries->destroy() ;
00132         tries->create(seq_length, max_mismatch==0) ;
00133     }
00134 }
00135   
00136 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00137 {
00138     int32_t lhs_changed=(lhs!=l);
00139     int32_t rhs_changed=(rhs!=r);
00140 
00141     CStringKernel<char>::init(l,r);
00142 
00143     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00144     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00145 
00146     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00147     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00148 
00149     int32_t len=sf_l->get_max_vector_length();
00150     if (lhs_changed && !sf_l->have_same_length(len))
00151         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00152 
00153     if (rhs_changed && !sf_r->have_same_length(len))
00154         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00155 
00156     delete alphabet;
00157     alphabet=new CAlphabet(sf_l->get_alphabet());
00158     CAlphabet* ralphabet=(sf_r->get_alphabet());
00159     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00160         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00161 
00162     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00163     SG_UNREF(ralphabet);
00164 
00165     if (tries!=NULL) {
00166         tries->delete_trees(max_mismatch==0);
00167         delete tries;
00168     }
00169     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00170     create_empty_tries();
00171 
00172     init_block_weights();
00173 
00174     return init_normalizer();
00175 }
00176 
00177 void CWeightedDegreeStringKernel::cleanup()
00178 {
00179     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00180     delete_optimization();
00181 
00182     delete[] block_weights;
00183     block_weights=NULL;
00184 
00185     if (tries!=NULL)
00186     {
00187         tries->destroy();
00188         delete tries;
00189         tries=NULL;
00190     }
00191 
00192     seq_length=0;
00193     tree_initialized = false;
00194 
00195     delete alphabet;
00196     alphabet=NULL;
00197 
00198     CKernel::cleanup();
00199 }
00200 
00201 bool CWeightedDegreeStringKernel::load_init(FILE* src)
00202 {
00203     return false;
00204 }
00205 
00206 bool CWeightedDegreeStringKernel::save_init(FILE* dest)
00207 {
00208     return false;
00209 }
00210   
00211 
00212 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00213 {
00214     if (tree_num<0)
00215         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00216 
00217     delete_optimization();
00218 
00219     if (tree_num<0)
00220         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00221 
00222     for (int32_t i=0; i<count; i++)
00223     {
00224         if (tree_num<0)
00225         {
00226             if ( (i % (count/10+1)) == 0)
00227                 SG_PROGRESS(i, 0, count);
00228             
00229             if (max_mismatch==0)
00230                 add_example_to_tree(IDX[i], alphas[i]) ;
00231             else
00232                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00233 
00234             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00235         }
00236         else
00237         {
00238             if (max_mismatch==0)
00239                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00240             else
00241                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00242         }
00243     }
00244     
00245     if (tree_num<0)
00246         SG_DONE();
00247 
00248     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00249 
00250     set_is_initialized(true) ;
00251     return true ;
00252 }
00253 
00254 bool CWeightedDegreeStringKernel::delete_optimization()
00255 { 
00256     if (get_is_initialized())
00257     {
00258         if (tries!=NULL)
00259             tries->delete_trees(max_mismatch==0); 
00260         set_is_initialized(false);
00261         return true;
00262     }
00263     
00264     return false;
00265 }
00266 
00267 
00268 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00269     char* avec, int32_t alen, char* bvec, int32_t blen)
00270 {
00271     float64_t sum = 0.0;
00272 
00273     for (int32_t i=0; i<alen; i++)
00274     {
00275         float64_t sumi = 0.0;
00276         int32_t mismatches=0;
00277 
00278         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00279         {
00280             if (avec[i+j]!=bvec[i+j])
00281             {
00282                 mismatches++ ;
00283                 if (mismatches>max_mismatch)
00284                     break ;
00285             } ;
00286             sumi += weights[j+degree*mismatches];
00287         }
00288         if (position_weights!=NULL)
00289             sum+=position_weights[i]*sumi ;
00290         else
00291             sum+=sumi ;
00292     }
00293     return sum ;
00294 }
00295 
00296 float64_t CWeightedDegreeStringKernel::compute_using_block(
00297     char* avec, int32_t alen, char* bvec, int32_t blen)
00298 {
00299     ASSERT(alen==blen);
00300 
00301     float64_t sum=0;
00302     int32_t match_len=-1;
00303 
00304     for (int32_t i=0; i<alen; i++)
00305     {
00306         if (avec[i]==bvec[i])
00307             match_len++;
00308         else
00309         {
00310             if (match_len>=0)
00311                 sum+=block_weights[match_len];
00312             match_len=-1;
00313         }
00314     }
00315 
00316     if (match_len>=0)
00317         sum+=block_weights[match_len];
00318 
00319     return sum;
00320 }
00321 
00322 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00323     char* avec, int32_t alen, char* bvec, int32_t blen)
00324 {
00325     float64_t sum = 0.0;
00326 
00327     for (int32_t i=0; i<alen; i++)
00328     {
00329         float64_t sumi = 0.0;
00330 
00331         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00332         {
00333             if (avec[i+j]!=bvec[i+j])
00334                 break ;
00335             sumi += weights[j];
00336         }
00337         if (position_weights!=NULL)
00338             sum+=position_weights[i]*sumi ;
00339         else
00340             sum+=sumi ;
00341     }
00342     return sum ;
00343 }
00344 
00345 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00346     char* avec, int32_t alen, char* bvec, int32_t blen)
00347 {
00348     float64_t sum = 0.0;
00349 
00350     for (int32_t i=0; i<alen; i++)
00351     {
00352         float64_t sumi=0.0;
00353         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00354         {
00355             if (avec[i+j]!=bvec[i+j])
00356                 break;
00357             sumi += weights[i*degree+j];
00358         }
00359         if (position_weights!=NULL)
00360             sum += position_weights[i]*sumi ;
00361         else
00362             sum += sumi ;
00363     }
00364 
00365     return sum ;
00366 }
00367 
00368 
00369 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00370 {
00371     int32_t alen, blen;
00372     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00373     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00374     float64_t result=0;
00375 
00376     if (max_mismatch==0 && length==0 && block_computation)
00377         result=compute_using_block(avec, alen, bvec, blen);
00378     else
00379     {
00380         if (max_mismatch>0)
00381             result=compute_with_mismatch(avec, alen, bvec, blen);
00382         else if (length==0)
00383             result=compute_without_mismatch(avec, alen, bvec, blen);
00384         else
00385             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00386     }
00387 
00388     return result;
00389 }
00390 
00391 
00392 void CWeightedDegreeStringKernel::add_example_to_tree(
00393     int32_t idx, float64_t alpha)
00394 {
00395     ASSERT(alphabet);
00396     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00397 
00398     int32_t len=0;
00399     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00400     ASSERT(max_mismatch==0);
00401     int32_t *vec=new int32_t[len];
00402 
00403     for (int32_t i=0; i<len; i++)
00404         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00405     
00406     if (length == 0 || max_mismatch > 0)
00407     {
00408         for (int32_t i=0; i<len; i++)
00409         {
00410             float64_t alpha_pw=alpha;
00411             /*if (position_weights!=NULL)
00412               alpha_pw *= position_weights[i] ;*/
00413             if (alpha_pw==0.0)
00414                 continue;
00415             ASSERT(tries);
00416             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00417         }
00418     }
00419     else
00420     {
00421         for (int32_t i=0; i<len; i++)
00422         {
00423             float64_t alpha_pw=alpha;
00424             /*if (position_weights!=NULL) 
00425               alpha_pw = alpha*position_weights[i] ;*/
00426             if (alpha_pw==0.0)
00427                 continue ;
00428             ASSERT(tries);
00429             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00430         }
00431     }
00432     delete[] vec ;
00433     tree_initialized=true ;
00434 }
00435 
00436 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00437     int32_t idx, float64_t alpha, int32_t tree_num) 
00438 {
00439     ASSERT(alphabet);
00440     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00441 
00442     int32_t len ;
00443     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00444     ASSERT(max_mismatch==0);
00445     int32_t *vec = new int32_t[len] ;
00446 
00447     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00448         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00449     
00450     ASSERT(tries);
00451     if (alpha!=0.0)
00452         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00453 
00454     delete[] vec ;
00455     tree_initialized=true ;
00456 }
00457 
00458 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00459 {
00460     ASSERT(tries);
00461     ASSERT(alphabet);
00462     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00463 
00464     int32_t len ;
00465     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00466     
00467     int32_t *vec = new int32_t[len] ;
00468     
00469     for (int32_t i=0; i<len; i++)
00470         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00471     
00472     for (int32_t i=0; i<len; i++)
00473     {
00474         if (alpha!=0.0)
00475             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00476     }
00477     
00478     delete[] vec ;
00479     tree_initialized=true ;
00480 }
00481 
00482 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00483     int32_t idx, float64_t alpha, int32_t tree_num)
00484 {
00485     ASSERT(tries);
00486     ASSERT(alphabet);
00487     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00488 
00489     int32_t len=0;
00490     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00491     int32_t *vec=new int32_t[len];
00492 
00493     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00494         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00495 
00496     if (alpha!=0.0)
00497     {
00498         tries->add_example_to_tree_mismatch_recursion(
00499             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00500             0, 0, max_mismatch, weights);
00501     }
00502 
00503     delete[] vec;
00504     tree_initialized=true;
00505 }
00506 
00507 
00508 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00509 {
00510     ASSERT(alphabet);
00511     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00512 
00513     int32_t len=0;
00514     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00515     ASSERT(char_vec && len>0);
00516     int32_t *vec=new int32_t[len];
00517 
00518     for (int32_t i=0; i<len; i++)
00519         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00520 
00521     float64_t sum=0;
00522     ASSERT(tries);
00523     for (int32_t i=0; i<len; i++)
00524         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00525 
00526     delete[] vec;
00527     return normalizer->normalize_rhs(sum, idx);
00528 }
00529 
00530 void CWeightedDegreeStringKernel::compute_by_tree(
00531     int32_t idx, float64_t* LevelContrib)
00532 {
00533     ASSERT(alphabet);
00534     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00535 
00536     int32_t len ;
00537     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00538 
00539     int32_t *vec = new int32_t[len] ;
00540 
00541     for (int32_t i=0; i<len; i++)
00542         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00543 
00544     ASSERT(tries);
00545     for (int32_t i=0; i<len; i++)
00546     {
00547         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00548                 normalizer->normalize_rhs(1.0, idx),
00549                 mkl_stepsize, weights, (length!=0));
00550     }
00551 
00552     delete[] vec ;
00553 }
00554 
00555 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00556 {
00557     ASSERT(tries);
00558     return tries->compute_abs_weights(len);
00559 }
00560 
00561 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00562 {
00563     ASSERT(degree>0);
00564     ASSERT(p_type==E_WD); 
00565 
00566     delete[] weights;
00567     weights=new float64_t[degree];
00568     if (weights)
00569     {
00570         int32_t i;
00571         float64_t sum=0;
00572         for (i=0; i<degree; i++)
00573         {
00574             weights[i]=degree-i;
00575             sum+=weights[i];
00576         }
00577         for (i=0; i<degree; i++)
00578             weights[i]/=sum;
00579 
00580         for (i=0; i<degree; i++)
00581         {
00582             for (int32_t j=1; j<=max_mismatch; j++)
00583             {
00584                 if (j<i+1)
00585                 {
00586                     int32_t nk=CMath::nchoosek(i+1, j);
00587                     weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00588                 }
00589                 else
00590                     weights[i+j*degree]= 0;
00591             }
00592         }
00593 
00594         if (which_degree>=0)
00595         {
00596             ASSERT(which_degree<degree);
00597             for (i=0; i<degree; i++)
00598             {
00599                 if (i!=which_degree)
00600                     weights[i]=0;
00601                 else
00602                     weights[i]=1;
00603             }
00604         }
00605         return true;
00606     }
00607     else
00608         return false;
00609 }
00610 
00611 bool CWeightedDegreeStringKernel::set_weights(
00612     float64_t* ws, int32_t d, int32_t len)
00613 {
00614     SG_DEBUG("degree = %i  d=%i\n", degree, d);
00615     degree=d;
00616     ASSERT(tries);
00617     tries->set_degree(degree);
00618     length=len;
00619     
00620     if (length==0) length=1;
00621     int32_t num_weights=degree*(length+max_mismatch);
00622     delete[] weights;
00623     weights=new float64_t[num_weights];
00624     if (weights)
00625     {
00626         for (int32_t i=0; i<num_weights; i++) {
00627             if (ws[i]) // len(ws) might be != num_weights?
00628                 weights[i]=ws[i];
00629         }
00630         return true;
00631     }
00632     else
00633         return false;
00634 }
00635 
00636 bool CWeightedDegreeStringKernel::set_position_weights(
00637     float64_t* pws, int32_t len)
00638 {
00639     if (len==0)
00640     {
00641         delete[] position_weights;
00642         position_weights=NULL;
00643         ASSERT(tries);
00644         tries->set_position_weights(position_weights);
00645     }
00646 
00647     if (seq_length!=len)
00648         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00649 
00650     delete[] position_weights;
00651     position_weights=new float64_t[len];
00652     ASSERT(tries);
00653     tries->set_position_weights(position_weights);
00654     
00655     if (position_weights)
00656     {
00657         for (int32_t i=0; i<len; i++)
00658             position_weights[i]=pws[i];
00659         return true;
00660     }
00661     else
00662         return false;
00663 }
00664 
00665 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00666 {
00667     delete[] block_weights;
00668     block_weights=new float64_t[CMath::max(seq_length,degree)];
00669 
00670     if (block_weights)
00671     {
00672         int32_t k;
00673         float64_t d=degree; // use float to evade rounding errors below
00674 
00675         for (k=0; k<degree; k++)
00676             block_weights[k]=
00677                 (-pow(k, 3)+(3*d-3)*pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00678         for (k=degree; k<seq_length; k++)
00679             block_weights[k]=(-d+3*k+4)/3;
00680     }
00681 
00682     return (block_weights!=NULL);
00683 }
00684 
00685 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00686 {
00687     ASSERT(weights);
00688     delete[] block_weights;
00689     block_weights=new float64_t[CMath::max(seq_length,degree)];
00690 
00691     if (block_weights)
00692     {
00693         int32_t i=0;
00694         block_weights[0]=weights[0];
00695         for (i=1; i<CMath::max(seq_length,degree); i++)
00696             block_weights[i]=0;
00697 
00698         for (i=1; i<CMath::max(seq_length,degree); i++)
00699         {
00700             block_weights[i]=block_weights[i-1];
00701 
00702             float64_t contrib=0;
00703             for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00704                 contrib+=weights[j];
00705 
00706             block_weights[i]+=contrib;
00707         }
00708     }
00709 
00710     return (block_weights!=NULL);
00711 }
00712 
00713 bool CWeightedDegreeStringKernel::init_block_weights_const()
00714 {
00715     delete[] block_weights;
00716     block_weights=new float64_t[seq_length];
00717 
00718     if (block_weights)
00719     {
00720         for (int32_t i=1; i<seq_length+1 ; i++)
00721             block_weights[i-1]=1.0/seq_length;
00722     }
00723 
00724     return (block_weights!=NULL);
00725 }
00726 
00727 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00728 {
00729     delete[] block_weights;
00730     block_weights=new float64_t[seq_length];
00731 
00732     if (block_weights)
00733     {
00734         for (int32_t i=1; i<seq_length+1 ; i++)
00735             block_weights[i-1]=degree*i;
00736     }
00737 
00738     return (block_weights!=NULL);
00739 }
00740 
00741 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00742 {
00743     delete[] block_weights;
00744     block_weights=new float64_t[seq_length];
00745 
00746     if (block_weights)
00747     {
00748         for (int32_t i=1; i<degree+1 ; i++)
00749             block_weights[i-1]=((float64_t) i)*i;
00750 
00751         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00752             block_weights[i-1]=i;
00753     }
00754 
00755     return (block_weights!=NULL);
00756 }
00757 
00758 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00759 {
00760     delete[] block_weights;
00761     block_weights=new float64_t[seq_length];
00762 
00763     if (block_weights)
00764     {
00765         for (int32_t i=1; i<degree+1 ; i++)
00766             block_weights[i-1]=((float64_t) i)*i*i;
00767 
00768         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00769             block_weights[i-1]=i;
00770     }
00771 
00772     return (block_weights!=NULL);
00773 }
00774 
00775 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00776 {
00777     delete[] block_weights;
00778     block_weights=new float64_t[seq_length];
00779 
00780     if (block_weights)
00781     {
00782         for (int32_t i=1; i<degree+1 ; i++)
00783             block_weights[i-1]=exp(((float64_t) i/10.0));
00784 
00785         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00786             block_weights[i-1]=i;
00787     }
00788 
00789     return (block_weights!=NULL);
00790 }
00791 
00792 bool CWeightedDegreeStringKernel::init_block_weights_log()
00793 {
00794     delete[] block_weights;
00795     block_weights=new float64_t[seq_length];
00796 
00797     if (block_weights)
00798     {
00799         for (int32_t i=1; i<degree+1 ; i++)
00800             block_weights[i-1]=pow(log(i),2);
00801 
00802         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00803             block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
00804     }
00805 
00806     return (block_weights!=NULL);
00807 }
00808 
00809 bool CWeightedDegreeStringKernel::init_block_weights_external()
00810 {
00811     if (block_weights_external && (seq_length == num_block_weights_external) )
00812     {
00813         delete[] block_weights;
00814         block_weights=new float64_t[seq_length];
00815 
00816         if (block_weights)
00817         {
00818             for (int32_t i=0; i<seq_length; i++)
00819                 block_weights[i]=block_weights_external[i];
00820         }
00821     }
00822     else {
00823       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00824    }
00825     return (block_weights!=NULL);
00826 }
00827 
00828 bool CWeightedDegreeStringKernel::init_block_weights()
00829 {
00830     switch (type)
00831     {
00832         case E_WD:
00833             return init_block_weights_from_wd();
00834         case E_EXTERNAL:
00835             return init_block_weights_from_wd_external();
00836         case E_BLOCK_CONST:
00837             return init_block_weights_const();
00838         case E_BLOCK_LINEAR:
00839             return init_block_weights_linear();
00840         case E_BLOCK_SQPOLY:
00841             return init_block_weights_sqpoly();
00842         case E_BLOCK_CUBICPOLY:
00843             return init_block_weights_cubicpoly();
00844         case E_BLOCK_EXP:
00845             return init_block_weights_exp();
00846         case E_BLOCK_LOG:
00847             return init_block_weights_log();
00848         case E_BLOCK_EXTERNAL:
00849             return init_block_weights_external();
00850         default:
00851             return false;
00852     };
00853 }
00854 
00855 
00856 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00857 {
00858     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00859     int32_t j=params->j;
00860     CWeightedDegreeStringKernel* wd=params->kernel;
00861     CTrie<DNATrie>* tries=params->tries;
00862     float64_t* weights=params->weights;
00863     int32_t length=params->length;
00864     int32_t* vec=params->vec;
00865     float64_t* result=params->result;
00866     float64_t factor=params->factor;
00867     int32_t* vec_idx=params->vec_idx;
00868 
00869     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00870     CAlphabet* alpha=wd->alphabet;
00871 
00872     for (int32_t i=params->start; i<params->end; i++)
00873     {
00874         int32_t len=0;
00875         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
00876         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00877             vec[k]=alpha->remap_to_bin(char_vec[k]);
00878 
00879         ASSERT(tries);
00880 
00881         result[i]+=factor*
00882             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00883     }
00884 
00885     SG_UNREF(rhs_feat);
00886 
00887     return NULL;
00888 }
00889 
00890 void CWeightedDegreeStringKernel::compute_batch(
00891     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00892     int32_t* IDX, float64_t* alphas, float64_t factor)
00893 {
00894     ASSERT(tries);
00895     ASSERT(alphabet);
00896     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00897     ASSERT(rhs);
00898     ASSERT(num_vec<=rhs->get_num_vectors());
00899     ASSERT(num_vec>0);
00900     ASSERT(vec_idx);
00901     ASSERT(result);
00902     create_empty_tries();
00903 
00904     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00905     ASSERT(num_feat>0);
00906     int32_t num_threads=parallel.get_num_threads();
00907     ASSERT(num_threads>0);
00908     int32_t* vec=new int32_t[num_threads*num_feat];
00909 
00910     if (num_threads < 2)
00911     {
00912 #ifdef CYGWIN
00913         for (int32_t j=0; j<num_feat; j++)
00914 #else
00915         CSignal::clear_cancel();
00916         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00917 #endif
00918         {
00919             init_optimization(num_suppvec, IDX, alphas, j);
00920             S_THREAD_PARAM params;
00921             params.vec=vec;
00922             params.result=result;
00923             params.weights=weights;
00924             params.kernel=this;
00925             params.tries=tries;
00926             params.factor=factor;
00927             params.j=j;
00928             params.start=0;
00929             params.end=num_vec;
00930             params.length=length;
00931             params.vec_idx=vec_idx;
00932             compute_batch_helper((void*) &params);
00933 
00934             SG_PROGRESS(j,0,num_feat);
00935         }
00936     }
00937 #ifndef WIN32
00938     else
00939     {
00940         CSignal::clear_cancel();
00941         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00942         {
00943             init_optimization(num_suppvec, IDX, alphas, j);
00944             pthread_t threads[num_threads-1];
00945             S_THREAD_PARAM params[num_threads];
00946             int32_t step= num_vec/num_threads;
00947             int32_t t;
00948 
00949             for (t=0; t<num_threads-1; t++)
00950             {
00951                 params[t].vec=&vec[num_feat*t];
00952                 params[t].result=result;
00953                 params[t].weights=weights;
00954                 params[t].kernel=this;
00955                 params[t].tries=tries;
00956                 params[t].factor=factor;
00957                 params[t].j=j;
00958                 params[t].start = t*step;
00959                 params[t].end = (t+1)*step;
00960                 params[t].length=length;
00961                 params[t].vec_idx=vec_idx;
00962                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00963             }
00964             params[t].vec=&vec[num_feat*t];
00965             params[t].result=result;
00966             params[t].weights=weights;
00967             params[t].kernel=this;
00968             params[t].tries=tries;
00969             params[t].factor=factor;
00970             params[t].j=j;
00971             params[t].start=t*step;
00972             params[t].end=num_vec;
00973             params[t].length=length;
00974             params[t].vec_idx=vec_idx;
00975             compute_batch_helper((void*) &params[t]);
00976 
00977             for (t=0; t<num_threads-1; t++)
00978                 pthread_join(threads[t], NULL);
00979             SG_PROGRESS(j,0,num_feat);
00980         }
00981     }
00982 #endif
00983 
00984     delete[] vec;
00985 
00986     //really also free memory as this can be huge on testing especially when
00987     //using the combined kernel
00988     create_empty_tries();
00989 }
00990 
00991 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00992 {
00993     if (type==E_EXTERNAL && max!=0) {
00994         return false;
00995     }
00996 
00997     max_mismatch=max;
00998 
00999     if (lhs!=NULL && rhs!=NULL)
01000         return init(lhs, rhs);
01001     else
01002         return true;
01003 }

SHOGUN Machine Learning Toolbox - Documentation