WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "features/Features.h"
00020 #include "features/StringFeatures.h"
00021 
00022 #ifndef WIN32
00023 #include <pthread.h>
00024 #endif
00025 
00026 struct S_THREAD_PARAM
00027 {
00028     INT* vec;
00029     DREAL* result;
00030     DREAL* weights;
00031     CWeightedDegreeStringKernel* kernel;
00032     CTrie<DNATrie>* tries;
00033     DREAL factor;
00034     INT j;
00035     INT start;
00036     INT end;
00037     INT length;
00038     INT* vec_idx;
00039 };
00040 
00041 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00042     INT degree_, EWDKernType type_)
00043 : CStringKernel<CHAR>(10),weights(NULL),position_weights(NULL),
00044     weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00045     max_mismatch(0), seq_length(0), initialized(false),
00046     use_normalization(true),block_computation(true),
00047     normalization_const(1.0), num_block_weights_external(0),
00048     block_weights_external(NULL), block_weights(NULL), type(type_),
00049     which_degree(-1), tries(NULL), tree_initialized(false), alphabet(NULL)
00050 {
00051     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00052     lhs=NULL;
00053     rhs=NULL;
00054 
00055     if (type!=E_EXTERNAL)
00056         set_wd_weights_by_type(type);
00057 }
00058 
00059 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00060     DREAL *weights_, INT degree_)
00061 : CStringKernel<CHAR>(10), weights(NULL), position_weights(NULL),
00062     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00063     max_mismatch(0), seq_length(0), initialized(false),
00064     use_normalization(true), block_computation(true),
00065     normalization_const(1.0), num_block_weights_external(0),
00066     block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00067     which_degree(-1), tries(NULL), tree_initialized(false), alphabet(NULL)
00068 {
00069     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00070     lhs=NULL;
00071     rhs=NULL;
00072 
00073     weights=new DREAL[degree*(1+max_mismatch)];
00074     for (INT i=0; i<degree*(1+max_mismatch); i++)
00075         weights[i]=weights_[i];
00076 }
00077 
00078 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00079     CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r, INT degree_)
00080 : CStringKernel<CHAR>(10), weights(NULL), position_weights(NULL),
00081     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00082     max_mismatch(0), seq_length(0), initialized(false),
00083     use_normalization(true), block_computation(true),
00084     normalization_const(1.0), num_block_weights_external(0),
00085     block_weights_external(NULL), block_weights(NULL), type(E_WD),
00086     which_degree(-1), tries(NULL), tree_initialized(false), alphabet(NULL)
00087 {
00088     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00089 
00090     set_wd_weights_by_type(type);
00091     init(l, r);
00092 }
00093 
00094 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00095 {
00096     cleanup();
00097 
00098     delete[] weights;
00099     weights=NULL;
00100 
00101     delete[] block_weights;
00102     block_weights=NULL;
00103 
00104     delete[] position_weights;
00105     position_weights=NULL;
00106 
00107     delete[] weights_buffer;
00108     weights_buffer=NULL;
00109 }
00110 
00111 
00112 void CWeightedDegreeStringKernel::remove_lhs()
00113 { 
00114     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00115     delete_optimization();
00116     initialized = false;
00117 
00118     if (tries!=NULL)
00119         tries->destroy();
00120 
00121     CKernel::remove_lhs();
00122 }
00123 
00124 void CWeightedDegreeStringKernel::create_empty_tries()
00125 {
00126     seq_length=((CStringFeatures<CHAR>*) lhs)->get_max_vector_length();
00127 
00128     if (tries!=NULL)
00129     {
00130         tries->destroy() ;
00131         tries->create(seq_length, max_mismatch==0) ;
00132     }
00133 }
00134   
00135 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00136 {
00137     INT lhs_changed=(lhs!=l);
00138     INT rhs_changed=(rhs!=r);
00139 
00140     bool result=CStringKernel<CHAR>::init(l,r);
00141     initialized=false;
00142 
00143     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00144     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00145 
00146     CStringFeatures<CHAR>* sf_l=(CStringFeatures<CHAR>*) l;
00147     CStringFeatures<CHAR>* sf_r=(CStringFeatures<CHAR>*) r;
00148 
00149     INT len=sf_l->get_max_vector_length();
00150     if (lhs_changed && !sf_l->have_same_length(len))
00151         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00152 
00153     if (rhs_changed && !sf_r->have_same_length(len))
00154         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00155 
00156     delete alphabet;
00157     alphabet=new CAlphabet(sf_l->get_alphabet());
00158     CAlphabet* ralphabet=(sf_r->get_alphabet());
00159     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00160         properties &= ((ULONG) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00161 
00162     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00163     SG_UNREF(ralphabet);
00164 
00165     if (tries!=NULL) {
00166         tries->delete_trees(max_mismatch==0);
00167         delete tries;
00168     }
00169     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00170     create_empty_tries();
00171 
00172     init_block_weights();
00173 
00174     if (use_normalization)
00175         normalization_const=block_weights[seq_length-1];
00176     else
00177         normalization_const=1.0;
00178     
00179     initialized=true;
00180     return result;
00181 }
00182 
00183 void CWeightedDegreeStringKernel::cleanup()
00184 {
00185     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00186     delete_optimization();
00187 
00188     delete[] block_weights;
00189     block_weights=NULL;
00190 
00191     if (tries!=NULL)
00192     {
00193         tries->destroy();
00194         delete tries;
00195         tries=NULL;
00196     }
00197 
00198     seq_length=0;
00199     initialized=false;
00200     tree_initialized = false;
00201 
00202     delete alphabet;
00203     alphabet=NULL;
00204 
00205     CKernel::cleanup();
00206 }
00207 
00208 bool CWeightedDegreeStringKernel::load_init(FILE* src)
00209 {
00210     return false;
00211 }
00212 
00213 bool CWeightedDegreeStringKernel::save_init(FILE* dest)
00214 {
00215     return false;
00216 }
00217   
00218 
00219 bool CWeightedDegreeStringKernel::init_optimization(INT count, INT* IDX, DREAL* alphas, INT tree_num)
00220 {
00221     if (tree_num<0)
00222         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00223 
00224     delete_optimization();
00225 
00226     if (tree_num<0)
00227         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00228 
00229     int i=0;
00230     for (i=0; i<count; i++)
00231     {
00232         if (tree_num<0)
00233         {
00234             if ( (i % (count/10+1)) == 0)
00235                 SG_PROGRESS(i, 0, count);
00236             
00237             if (max_mismatch==0)
00238                 add_example_to_tree(IDX[i], alphas[i]) ;
00239             else
00240                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00241 
00242             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00243         }
00244         else
00245         {
00246             if (max_mismatch==0)
00247                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00248             else
00249                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00250         }
00251     }
00252     
00253     if (tree_num<0)
00254         SG_DONE();
00255 
00256     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00257 
00258     set_is_initialized(true) ;
00259     return true ;
00260 }
00261 
00262 bool CWeightedDegreeStringKernel::delete_optimization()
00263 { 
00264     if (get_is_initialized())
00265     {
00266         if (tries!=NULL)
00267             tries->delete_trees(max_mismatch==0); 
00268         set_is_initialized(false);
00269         return true;
00270     }
00271     
00272     return false;
00273 }
00274 
00275 
00276 DREAL CWeightedDegreeStringKernel::compute_with_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00277 {
00278     DREAL sum = 0.0 ;
00279     
00280     for (INT i=0; i<alen; i++)
00281     {
00282         DREAL sumi = 0.0 ;
00283         INT mismatches=0 ;
00284         
00285         for (INT j=0; (i+j<alen) && (j<degree); j++)
00286         {
00287             if (avec[i+j]!=bvec[i+j])
00288             {
00289                 mismatches++ ;
00290                 if (mismatches>max_mismatch)
00291                     break ;
00292             } ;
00293             sumi += weights[j+degree*mismatches];
00294         }
00295         if (position_weights!=NULL)
00296             sum+=position_weights[i]*sumi ;
00297         else
00298             sum+=sumi ;
00299     }
00300     return sum ;
00301 }
00302 
00303 DREAL CWeightedDegreeStringKernel::compute_using_block(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00304 {
00305     ASSERT(alen==blen);
00306 
00307     DREAL sum=0;
00308     INT match_len=-1;
00309 
00310     for (INT i=0; i<alen; i++)
00311     {
00312         if (avec[i]==bvec[i])
00313             match_len++;
00314         else
00315         {
00316             if (match_len>=0)
00317                 sum+=block_weights[match_len];
00318             match_len=-1;
00319         }
00320     }
00321 
00322     if (match_len>=0)
00323         sum+=block_weights[match_len];
00324 
00325     return sum;
00326 }
00327 
00328 DREAL CWeightedDegreeStringKernel::compute_without_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00329 {
00330     DREAL sum = 0.0 ;
00331     
00332     for (INT i=0; i<alen; i++)
00333     {
00334         DREAL sumi = 0.0 ;
00335         
00336         for (INT j=0; (i+j<alen) && (j<degree); j++)
00337         {
00338             if (avec[i+j]!=bvec[i+j])
00339                 break ;
00340             sumi += weights[j];
00341         }
00342         if (position_weights!=NULL)
00343             sum+=position_weights[i]*sumi ;
00344         else
00345             sum+=sumi ;
00346     }
00347     return sum ;
00348 }
00349 
00350 DREAL CWeightedDegreeStringKernel::compute_without_mismatch_matrix(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00351 {
00352     DREAL sum = 0.0 ;
00353 
00354     for (INT i=0; i<alen; i++)
00355     {
00356         DREAL sumi=0.0 ;
00357         for (INT j=0; (i+j<alen) && (j<degree); j++)
00358         {
00359             if (avec[i+j]!=bvec[i+j])
00360                 break;
00361             sumi += weights[i*degree+j];
00362         }
00363         if (position_weights!=NULL)
00364             sum += position_weights[i]*sumi ;
00365         else
00366             sum += sumi ;
00367     }
00368 
00369     return sum ;
00370 }
00371 
00372 
00373 DREAL CWeightedDegreeStringKernel::compute(INT idx_a, INT idx_b)
00374 {
00375     INT alen, blen;
00376     CHAR* avec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx_a, alen);
00377     CHAR* bvec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx_b, blen);
00378     DREAL result=0;
00379 
00380     if (max_mismatch==0 && length==0 && block_computation)
00381         result=compute_using_block(avec, alen, bvec, blen);
00382     else
00383     {
00384         if (max_mismatch>0)
00385             result=compute_with_mismatch(avec, alen, bvec, blen);
00386         else if (length==0)
00387             result=compute_without_mismatch(avec, alen, bvec, blen);
00388         else
00389             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00390     }
00391 
00392     return result/normalization_const;
00393 }
00394 
00395 
00396 void CWeightedDegreeStringKernel::add_example_to_tree(INT idx, DREAL alpha)
00397 {
00398     ASSERT(alphabet);
00399     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00400 
00401     INT len=0;
00402     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00403     ASSERT(max_mismatch==0);
00404     INT *vec=new INT[len];
00405 
00406     for (INT i=0; i<len; i++)
00407         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00408     
00409     if (length == 0 || max_mismatch > 0)
00410     {
00411         for (INT i=0; i<len; i++)
00412         {
00413             DREAL alpha_pw=alpha;
00414             /*if (position_weights!=NULL)
00415               alpha_pw *= position_weights[i] ;*/
00416             if (alpha_pw==0.0)
00417                 continue;
00418             ASSERT(tries);
00419             tries->add_to_trie(i, 0, vec, alpha_pw, weights, (length!=0));
00420         }
00421     }
00422     else
00423     {
00424         for (INT i=0; i<len; i++)
00425         {
00426             DREAL alpha_pw=alpha;
00427             /*if (position_weights!=NULL) 
00428               alpha_pw = alpha*position_weights[i] ;*/
00429             if (alpha_pw==0.0)
00430                 continue ;
00431             ASSERT(tries);
00432             tries->add_to_trie(i, 0, vec, alpha_pw, weights, (length!=0));
00433         }
00434     }
00435     delete[] vec ;
00436     tree_initialized=true ;
00437 }
00438 
00439 void CWeightedDegreeStringKernel::add_example_to_single_tree(INT idx, DREAL alpha, INT tree_num) 
00440 {
00441     ASSERT(alphabet);
00442     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00443 
00444     INT len ;
00445     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00446     ASSERT(max_mismatch==0);
00447     INT *vec = new INT[len] ;
00448 
00449     for (INT i=tree_num; i<tree_num+degree && i<len; i++)
00450         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00451     
00452     ASSERT(tries);
00453     if (alpha!=0.0)
00454         tries->add_to_trie(tree_num, 0, vec, alpha, weights, (length!=0));
00455 
00456     delete[] vec ;
00457     tree_initialized=true ;
00458 }
00459 
00460 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(INT idx, DREAL alpha)
00461 {
00462     ASSERT(tries);
00463     ASSERT(alphabet);
00464     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00465 
00466     INT len ;
00467     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00468     
00469     INT *vec = new INT[len] ;
00470     
00471     for (INT i=0; i<len; i++)
00472         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00473     
00474     for (INT i=0; i<len; i++)
00475     {
00476         if (alpha!=0.0)
00477             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, alpha, &vec[i], len-i, 0, 0, max_mismatch, weights);
00478     }
00479     
00480     delete[] vec ;
00481     tree_initialized=true ;
00482 }
00483 
00484 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(INT idx, DREAL alpha, INT tree_num)
00485 {
00486     ASSERT(tries);
00487     ASSERT(alphabet);
00488     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00489 
00490     INT len=0;
00491     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00492     INT *vec=new INT[len];
00493 
00494     for (INT i=tree_num; i<len && i<tree_num+degree; i++)
00495         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00496 
00497     if (alpha!=0.0)
00498     {
00499         tries->add_example_to_tree_mismatch_recursion(
00500             NO_CHILD, tree_num, alpha, &vec[tree_num], len-tree_num,
00501             0, 0, max_mismatch, weights);
00502     }
00503 
00504     delete[] vec;
00505     tree_initialized=true;
00506 }
00507 
00508 
00509 DREAL CWeightedDegreeStringKernel::compute_by_tree(INT idx)
00510 {
00511     ASSERT(alphabet);
00512     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00513 
00514     INT len=0;
00515     CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00516     ASSERT(char_vec && len>0);
00517     INT *vec=new INT[len];
00518 
00519     for (INT i=0; i<len; i++)
00520         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00521 
00522     DREAL sum=0;
00523     ASSERT(tries);
00524     for (INT i=0; i<len; i++)
00525         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00526 
00527     delete[] vec;
00528     return sum/normalization_const;
00529 }
00530 
00531 void CWeightedDegreeStringKernel::compute_by_tree(INT idx, DREAL* LevelContrib)
00532 {
00533     ASSERT(alphabet);
00534     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00535 
00536     INT len ;
00537     CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00538 
00539     INT *vec = new INT[len] ;
00540 
00541     for (INT i=0; i<len; i++)
00542         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00543 
00544     ASSERT(tries);
00545     for (INT i=0; i<len; i++)
00546       tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 1.0/normalization_const, mkl_stepsize, weights, (length!=0));
00547 
00548     delete[] vec ;
00549 }
00550 
00551 DREAL *CWeightedDegreeStringKernel::compute_abs_weights(int &len)
00552 {
00553     ASSERT(tries);
00554     return tries->compute_abs_weights(len);
00555 }
00556 
00557 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00558 {
00559     ASSERT(degree>0);
00560     ASSERT(p_type==E_WD); 
00561 
00562     delete[] weights;
00563     weights=new DREAL[degree];
00564     if (weights)
00565     {
00566         INT i;
00567         DREAL sum=0;
00568         for (i=0; i<degree; i++)
00569         {
00570             weights[i]=degree-i;
00571             sum+=weights[i];
00572         }
00573         for (i=0; i<degree; i++)
00574             weights[i]/=sum;
00575 
00576         for (i=0; i<degree; i++)
00577         {
00578             for (INT j=1; j<=max_mismatch; j++)
00579             {
00580                 if (j<i+1)
00581                 {
00582                     INT nk=CMath::nchoosek(i+1, j);
00583                     weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00584                 }
00585                 else
00586                     weights[i+j*degree]= 0;
00587             }
00588         }
00589 
00590         if (which_degree>=0)
00591         {
00592             ASSERT(which_degree<degree);
00593             for (i=0; i<degree; i++)
00594             {
00595                 if (i!=which_degree)
00596                     weights[i]=0;
00597                 else
00598                     weights[i]=1;
00599             }
00600         }
00601         return true;
00602     }
00603     else
00604         return false;
00605 }
00606 
00607 bool CWeightedDegreeStringKernel::set_weights(DREAL* ws, INT d, INT len)
00608 {
00609     SG_DEBUG("degree = %i  d=%i\n", degree, d);
00610     degree=d;
00611     ASSERT(tries);
00612     tries->set_degree(degree);
00613     length=len;
00614     
00615     if (length==0) length=1;
00616     INT num_weights=degree*(length+max_mismatch);
00617     delete[] weights;
00618     weights=new DREAL[num_weights];
00619     if (weights)
00620     {
00621         for (INT i=0; i<num_weights; i++) {
00622             if (ws[i]) // len(ws) might be != num_weights?
00623                 weights[i]=ws[i];
00624         }
00625         return true;
00626     }
00627     else
00628         return false;
00629 }
00630 
00631 bool CWeightedDegreeStringKernel::set_position_weights(DREAL* pws, INT len)
00632 {
00633     if (len==0)
00634     {
00635         delete[] position_weights;
00636         position_weights=NULL;
00637         ASSERT(tries);
00638         tries->set_position_weights(position_weights);
00639     }
00640 
00641     if (seq_length!=len)
00642         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00643 
00644     delete[] position_weights;
00645     position_weights=new DREAL[len];
00646     ASSERT(tries);
00647     tries->set_position_weights(position_weights);
00648     
00649     if (position_weights)
00650     {
00651         for (int i=0; i<len; i++)
00652             position_weights[i]=pws[i];
00653         return true;
00654     }
00655     else
00656         return false;
00657 }
00658 
00659 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00660 {
00661     delete[] block_weights;
00662     block_weights=new DREAL[CMath::max(seq_length,degree)];
00663 
00664     if (block_weights)
00665     {
00666         double deg=degree;
00667         INT k;
00668         for (k=0; k<degree ; k++)
00669             block_weights[k]=(-pow(k,3) + (3*deg-3)*pow(k,2) + (9*deg-2)*k + 6*deg) / (3*deg*(deg+1));
00670         for (k=degree; k<seq_length ; k++)
00671             block_weights[k]=(-deg+3*k+4)/3;
00672     }
00673 
00674     return (block_weights!=NULL);
00675 }
00676 
00677 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00678 {
00679     ASSERT(weights);
00680     delete[] block_weights;
00681     block_weights=new DREAL[CMath::max(seq_length,degree)];
00682 
00683     if (block_weights)
00684     {
00685         INT i=0;
00686         block_weights[0]=weights[0];
00687         for (i=1; i<CMath::max(seq_length,degree); i++)
00688             block_weights[i]=0;
00689 
00690         for (i=1; i<CMath::max(seq_length,degree); i++)
00691         {
00692             block_weights[i]=block_weights[i-1];
00693 
00694             DREAL contrib=0;
00695             for (INT j=0; j<CMath::min(degree,i+1); j++)
00696                 contrib+=weights[j];
00697 
00698             block_weights[i]+=contrib;
00699         }
00700     }
00701 
00702     return (block_weights!=NULL);
00703 }
00704 
00705 bool CWeightedDegreeStringKernel::init_block_weights_const()
00706 {
00707     delete[] block_weights;
00708     block_weights=new DREAL[seq_length];
00709 
00710     if (block_weights)
00711     {
00712         for (int i=1; i<seq_length+1 ; i++)
00713             block_weights[i-1]=1.0/seq_length;
00714     }
00715 
00716     return (block_weights!=NULL);
00717 }
00718 
00719 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00720 {
00721     delete[] block_weights;
00722     block_weights=new DREAL[seq_length];
00723 
00724     if (block_weights)
00725     {
00726         for (int i=1; i<seq_length+1 ; i++)
00727             block_weights[i-1]=degree*i;
00728     }
00729 
00730     return (block_weights!=NULL);
00731 }
00732 
00733 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00734 {
00735     delete[] block_weights;
00736     block_weights=new DREAL[seq_length];
00737 
00738     if (block_weights)
00739     {
00740         for (int i=1; i<degree+1 ; i++)
00741             block_weights[i-1]=((double) i)*i;
00742 
00743         for (int i=degree+1; i<seq_length+1 ; i++)
00744             block_weights[i-1]=i;
00745     }
00746 
00747     return (block_weights!=NULL);
00748 }
00749 
00750 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00751 {
00752     delete[] block_weights;
00753     block_weights=new DREAL[seq_length];
00754 
00755     if (block_weights)
00756     {
00757         for (int i=1; i<degree+1 ; i++)
00758             block_weights[i-1]=((double) i)*i*i;
00759 
00760         for (int i=degree+1; i<seq_length+1 ; i++)
00761             block_weights[i-1]=i;
00762     }
00763 
00764     return (block_weights!=NULL);
00765 }
00766 
00767 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00768 {
00769     delete[] block_weights;
00770     block_weights=new DREAL[seq_length];
00771 
00772     if (block_weights)
00773     {
00774         for (int i=1; i<degree+1 ; i++)
00775             block_weights[i-1]=exp(((double) i/10.0));
00776 
00777         for (int i=degree+1; i<seq_length+1 ; i++)
00778             block_weights[i-1]=i;
00779     }
00780 
00781     return (block_weights!=NULL);
00782 }
00783 
00784 bool CWeightedDegreeStringKernel::init_block_weights_log()
00785 {
00786     delete[] block_weights;
00787     block_weights=new DREAL[seq_length];
00788 
00789     if (block_weights)
00790     {
00791         for (int i=1; i<degree+1 ; i++)
00792             block_weights[i-1]=pow(log(i),2);
00793 
00794         for (int i=degree+1; i<seq_length+1 ; i++)
00795             block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
00796     }
00797 
00798     return (block_weights!=NULL);
00799 }
00800 
00801 bool CWeightedDegreeStringKernel::init_block_weights_external()
00802 {
00803     if (block_weights_external && (seq_length == num_block_weights_external) )
00804     {
00805         delete[] block_weights;
00806         block_weights=new DREAL[seq_length];
00807 
00808         if (block_weights)
00809         {
00810             for (int i=0; i<seq_length; i++)
00811                 block_weights[i]=block_weights_external[i];
00812         }
00813     }
00814     else {
00815       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00816    }
00817     return (block_weights!=NULL);
00818 }
00819 
00820 bool CWeightedDegreeStringKernel::init_block_weights()
00821 {
00822     switch (type)
00823     {
00824         case E_WD:
00825             return init_block_weights_from_wd();
00826         case E_EXTERNAL:
00827             return init_block_weights_from_wd_external();
00828         case E_BLOCK_CONST:
00829             return init_block_weights_const();
00830         case E_BLOCK_LINEAR:
00831             return init_block_weights_linear();
00832         case E_BLOCK_SQPOLY:
00833             return init_block_weights_sqpoly();
00834         case E_BLOCK_CUBICPOLY:
00835             return init_block_weights_cubicpoly();
00836         case E_BLOCK_EXP:
00837             return init_block_weights_exp();
00838         case E_BLOCK_LOG:
00839             return init_block_weights_log();
00840         case E_BLOCK_EXTERNAL:
00841             return init_block_weights_external();
00842         default:
00843             return false;
00844     };
00845 }
00846 
00847 
00848 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00849 {
00850     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00851     INT j=params->j;
00852     CWeightedDegreeStringKernel* wd=params->kernel;
00853     CTrie<DNATrie>* tries=params->tries;
00854     DREAL* weights=params->weights;
00855     INT length=params->length;
00856     INT* vec=params->vec;
00857     DREAL* result=params->result;
00858     DREAL factor=params->factor;
00859     INT* vec_idx=params->vec_idx;
00860 
00861     CStringFeatures<CHAR>* rhs_feat=((CStringFeatures<CHAR>*) wd->get_rhs());
00862     CAlphabet* alpha=wd->alphabet;
00863 
00864     for (INT i=params->start; i<params->end; i++)
00865     {
00866         INT len=0;
00867         CHAR* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
00868         for (INT k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00869             vec[k]=alpha->remap_to_bin(char_vec[k]);
00870 
00871         ASSERT(tries);
00872 
00873         result[i]+=factor*
00874             tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0))/
00875             wd->get_normalization_const();
00876     }
00877 
00878     SG_UNREF(rhs_feat);
00879 
00880     return NULL;
00881 }
00882 
00883 void CWeightedDegreeStringKernel::compute_batch(INT num_vec, INT* vec_idx, DREAL* result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL factor)
00884 {
00885     ASSERT(tries);
00886     ASSERT(alphabet);
00887     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00888     ASSERT(rhs);
00889     ASSERT(num_vec<=rhs->get_num_vectors());
00890     ASSERT(num_vec>0);
00891     ASSERT(vec_idx);
00892     ASSERT(result);
00893     create_empty_tries();
00894 
00895     INT num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
00896     ASSERT(num_feat>0);
00897     INT num_threads=parallel.get_num_threads();
00898     ASSERT(num_threads>0);
00899     INT* vec=new INT[num_threads*num_feat];
00900 
00901     if (num_threads < 2)
00902     {
00903 #ifdef CYGWIN
00904         for (INT j=0; j<num_feat; j++)
00905 #else
00906         CSignal::clear() ;
00907         for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00908 #endif
00909         {
00910             init_optimization(num_suppvec, IDX, alphas, j);
00911             S_THREAD_PARAM params;
00912             params.vec=vec;
00913             params.result=result;
00914             params.weights=weights;
00915             params.kernel=this;
00916             params.tries=tries;
00917             params.factor=factor;
00918             params.j=j;
00919             params.start=0;
00920             params.end=num_vec;
00921             params.length=length;
00922             params.vec_idx=vec_idx;
00923             compute_batch_helper((void*) &params);
00924 
00925             SG_PROGRESS(j,0,num_feat);
00926         }
00927     }
00928 #ifndef WIN32
00929     else
00930     {
00931         CSignal::clear() ;
00932         for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00933         {
00934             init_optimization(num_suppvec, IDX, alphas, j);
00935             pthread_t threads[num_threads-1];
00936             S_THREAD_PARAM params[num_threads];
00937             INT step= num_vec/num_threads;
00938             INT t;
00939 
00940             for (t=0; t<num_threads-1; t++)
00941             {
00942                 params[t].vec=&vec[num_feat*t];
00943                 params[t].result=result;
00944                 params[t].weights=weights;
00945                 params[t].kernel=this;
00946                 params[t].tries=tries;
00947                 params[t].factor=factor;
00948                 params[t].j=j;
00949                 params[t].start = t*step;
00950                 params[t].end = (t+1)*step;
00951                 params[t].length=length;
00952                 params[t].vec_idx=vec_idx;
00953                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00954             }
00955             params[t].vec=&vec[num_feat*t];
00956             params[t].result=result;
00957             params[t].weights=weights;
00958             params[t].kernel=this;
00959             params[t].tries=tries;
00960             params[t].factor=factor;
00961             params[t].j=j;
00962             params[t].start=t*step;
00963             params[t].end=num_vec;
00964             params[t].length=length;
00965             params[t].vec_idx=vec_idx;
00966             compute_batch_helper((void*) &params[t]);
00967 
00968             for (t=0; t<num_threads-1; t++)
00969                 pthread_join(threads[t], NULL);
00970             SG_PROGRESS(j,0,num_feat);
00971         }
00972     }
00973 #endif
00974 
00975     delete[] vec;
00976 
00977     //really also free memory as this can be huge on testing especially when
00978     //using the combined kernel
00979     create_empty_tries();
00980 }
00981 
00982 bool CWeightedDegreeStringKernel::set_max_mismatch(INT max)
00983 {
00984     if (type==E_EXTERNAL && max!=0) {
00985         return false;
00986     }
00987 
00988     max_mismatch=max;
00989 
00990     if (lhs!=NULL && rhs!=NULL)
00991         return init(lhs, rhs);
00992     else
00993         return true;
00994 }
00995 

SHOGUN Machine Learning Toolbox - Documentation