WeightedDegreePositionStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "features/Features.h"
00020 #include "features/StringFeatures.h"
00021 
00022 #include "classifier/svm/SVM.h"
00023 
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027 
00028 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00029 
00030 template <class Trie> struct S_THREAD_PARAM 
00031 {
00032     INT* vec;
00033     DREAL* result;
00034     DREAL* weights;
00035     CWeightedDegreePositionStringKernel* kernel;
00036     CTrie<Trie>* tries;
00037     DREAL factor;
00038     INT j;
00039     INT start;
00040     INT end;
00041     INT length;
00042     INT max_shift;
00043     INT* shift;
00044     INT* vec_idx;
00045 };
00046 
00047 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00048     INT size, INT d, INT mm, bool un, INT mkls)
00049 : CStringKernel<CHAR>(size), weights(NULL), position_weights(NULL),
00050     position_weights_lhs(NULL), position_weights_rhs(NULL),
00051     weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00052     max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00053     initialized(false), use_normalization(un),
00054     normalization_const(1.0), num_block_weights_external(0),
00055     block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00056     tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00057     m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00058     m_poim_result_len(0), alphabet(NULL)
00059 {
00060     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00061     set_wd_weights();
00062     ASSERT(weights);
00063 }
00064 
00065 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00066     INT size, DREAL* w, INT d, INT mm, INT* s, INT sl, bool un, INT mkls)
00067 : CStringKernel<CHAR>(size), weights(NULL), position_weights(NULL),
00068     position_weights_lhs(NULL), position_weights_rhs(NULL),
00069     weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00070     max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00071     initialized(false), use_normalization(un),
00072     normalization_const(1.0), num_block_weights_external(0),
00073     block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00074     tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00075     m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00076     m_poim_result_len(0), alphabet(NULL)
00077 {
00078     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00079 
00080     weights=new DREAL[d*(1+max_mismatch)];
00081     for (INT i=0; i<d*(1+max_mismatch); i++)
00082         weights[i]=w[i];
00083 
00084     set_shifts(s, sl);
00085 }
00086 
00087 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00088     CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r, INT d)
00089 : CStringKernel<CHAR>(10), weights(NULL), position_weights(NULL),
00090     position_weights_lhs(NULL), position_weights_rhs(NULL),
00091     weights_buffer(NULL), mkl_stepsize(1), degree(d), length(0),
00092     max_mismatch(0), seq_length(0), shift(NULL), shift_len(0),
00093     initialized(false), use_normalization(true),
00094     normalization_const(1.0), num_block_weights_external(0),
00095     block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00096     tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00097     m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00098     m_poim_result_len(0), alphabet(NULL)
00099 {
00100     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00101     set_wd_weights();
00102     ASSERT(weights);
00103 
00104     init(l, r);
00105 }
00106 
00107 
00108 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00109 {
00110     cleanup();
00111     cleanup_POIM2();
00112 
00113     delete[] shift;
00114     shift=NULL;
00115 
00116     delete[] weights;
00117     weights=NULL ;
00118 
00119     delete[] block_weights;
00120     block_weights=NULL;
00121 
00122     delete[] position_weights;
00123     position_weights=NULL;
00124 
00125     delete[] position_weights_lhs;
00126     position_weights_lhs=NULL;
00127 
00128     delete[] position_weights_rhs;
00129     position_weights_rhs=NULL;
00130 
00131     delete[] weights_buffer;
00132     weights_buffer=NULL;
00133 }
00134 
00135 void CWeightedDegreePositionStringKernel::remove_lhs()
00136 {
00137     SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00138     delete_optimization();
00139     initialized = false;
00140 
00141     tries.destroy();
00142     poim_tries.destroy();
00143 
00144     CKernel::remove_lhs();
00145 }
00146 
00147 void CWeightedDegreePositionStringKernel::create_empty_tries()
00148 {
00149     ASSERT(lhs);
00150     seq_length = ((CStringFeatures<CHAR>*) lhs)->get_max_vector_length();
00151 
00152     if (opt_type==SLOWBUTMEMEFFICIENT)
00153     {
00154         tries.create(seq_length, true); 
00155         poim_tries.create(seq_length, true); 
00156     }
00157     else if (opt_type==FASTBUTMEMHUNGRY)
00158     {
00159         tries.create(seq_length, false);  // still buggy
00160         poim_tries.create(seq_length, false);  // still buggy
00161     }
00162     else
00163         SG_ERROR( "unknown optimization type\n");
00164 }
00165 
00166 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00167 {
00168     INT lhs_changed = (lhs!=l) ;
00169     INT rhs_changed = (rhs!=r) ;
00170 
00171     bool result=CStringKernel<CHAR>::init(l,r);
00172 
00173     SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00174     SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00175 
00176     CStringFeatures<CHAR>* sf_l=(CStringFeatures<CHAR>*) l;
00177     CStringFeatures<CHAR>* sf_r=(CStringFeatures<CHAR>*) r;
00178 
00179     /* set shift */
00180     if (shift_len==0) {
00181         shift_len=sf_l->get_vector_length(0);
00182         INT *shifts=new INT[shift_len];
00183         for (INT i=0; i<shift_len; i++) {
00184             shifts[i]=1;
00185         }
00186         set_shifts(shifts, shift_len);
00187         delete[] shifts;
00188     }
00189 
00190 
00191     INT len=sf_l->get_max_vector_length();
00192     if (lhs_changed && !sf_l->have_same_length(len))
00193         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00194 
00195     if (rhs_changed && !sf_r->have_same_length(len))
00196         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00197 
00198     delete alphabet;
00199     alphabet= new CAlphabet(sf_l->get_alphabet());
00200     CAlphabet* ralphabet=sf_r->get_alphabet();
00201     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00202         properties &= ((ULONG) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00203 
00204     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00205     SG_UNREF(ralphabet);
00206 
00207     //whenever init is called also init tries and block weights
00208     if (!initialized)
00209     {
00210         create_empty_tries();
00211         init_block_weights();
00212     }
00213 
00214     //whenever lhs changes also recompute normalisation const
00215     if (lhs_changed)
00216     {
00217         normalization_const=1.0;
00218         if (use_normalization)
00219             normalization_const=compute(0,0);
00220     } 
00221 
00222     SG_DEBUG( "use normalization:%d (const:%f)\n", (use_normalization) ? 1 : 0,
00223             normalization_const);
00224 
00225     initialized = true;
00226     return result;
00227 }
00228 
00229 void CWeightedDegreePositionStringKernel::cleanup()
00230 {
00231     SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00232     delete_optimization();
00233 
00234     delete[] block_weights;
00235     block_weights=NULL;
00236 
00237     tries.destroy();
00238     poim_tries.destroy();
00239 
00240     seq_length = 0;
00241     initialized = false;
00242     tree_initialized = false;
00243 
00244     delete alphabet;
00245     alphabet=NULL;
00246 
00247     CKernel::cleanup();
00248 }
00249 
00250 bool CWeightedDegreePositionStringKernel::load_init(FILE* src)
00251 {
00252     return false;
00253 }
00254 
00255 bool CWeightedDegreePositionStringKernel::save_init(FILE* dest)
00256 {
00257     return false;
00258 }
00259 
00260 bool CWeightedDegreePositionStringKernel::init_optimization(INT p_count, INT * IDX, DREAL * alphas, INT tree_num, INT upto_tree)
00261 {
00262     ASSERT(position_weights_lhs==NULL);
00263     ASSERT(position_weights_rhs==NULL);
00264 
00265     if (upto_tree<0)
00266         upto_tree=tree_num;
00267 
00268     if (max_mismatch!=0)
00269     {
00270         SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00271         return false ;
00272     }
00273 
00274     if (tree_num<0)
00275         SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00276 
00277     delete_optimization();
00278 
00279     if (tree_num<0)
00280         SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00281 
00282     int i=0;
00283     for (i=0; i<p_count; i++)
00284     {
00285         if (tree_num<0)
00286         {
00287             if ( (i % (p_count/10+1)) == 0)
00288                 SG_PROGRESS(i,0,p_count);
00289             add_example_to_tree(IDX[i], alphas[i]);
00290         }
00291         else
00292         {
00293             for (INT t=tree_num; t<=upto_tree; t++)
00294                 add_example_to_single_tree(IDX[i], alphas[i], t);
00295         }
00296     }
00297 
00298     if (tree_num<0)
00299         SG_DONE();
00300 
00301     set_is_initialized(true) ;
00302     return true ;
00303 }
00304 
00305 bool CWeightedDegreePositionStringKernel::delete_optimization() 
00306 { 
00307     if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes())) 
00308     {
00309         tries.set_use_compact_terminal_nodes(false) ;
00310         SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00311     }
00312 
00313     if (get_is_initialized())
00314     {
00315         if (opt_type==SLOWBUTMEMEFFICIENT)
00316             tries.delete_trees(true); 
00317         else if (opt_type==FASTBUTMEMHUNGRY)
00318             tries.delete_trees(false);  // still buggy
00319         else {
00320             SG_ERROR( "unknown optimization type\n");
00321         }
00322         set_is_initialized(false);
00323 
00324         return true;
00325     }
00326 
00327     return false;
00328 }
00329 
00330 DREAL CWeightedDegreePositionStringKernel::compute_with_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen) 
00331 {
00332     DREAL max_shift_vec[max_shift];
00333     DREAL sum0=0 ;
00334     for (INT i=0; i<max_shift; i++)
00335         max_shift_vec[i]=0 ;
00336     
00337     // no shift
00338     for (INT i=0; i<alen; i++)
00339     {
00340         if ((position_weights!=NULL) && (position_weights[i]==0.0))
00341             continue ;
00342         
00343         INT mismatches=0;
00344         DREAL sumi = 0.0 ;
00345         for (INT j=0; (j<degree) && (i+j<alen); j++)
00346         {
00347             if (avec[i+j]!=bvec[i+j])
00348             {
00349                 mismatches++ ;
00350                 if (mismatches>max_mismatch)
00351                     break ;
00352             } ;
00353             sumi += weights[j+degree*mismatches];
00354         }
00355         if (position_weights!=NULL)
00356             sum0 += position_weights[i]*sumi ;
00357         else
00358             sum0 += sumi ;
00359     } ;
00360     
00361     for (INT i=0; i<alen; i++)
00362     {
00363         for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00364         {
00365             if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00366                 continue ;
00367             
00368             DREAL sumi1 = 0.0 ;
00369             // shift in sequence a
00370             INT mismatches=0;
00371             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00372             {
00373                 if (avec[i+j+k]!=bvec[i+j])
00374                 {
00375                     mismatches++ ;
00376                     if (mismatches>max_mismatch)
00377                         break ;
00378                 } ;
00379                 sumi1 += weights[j+degree*mismatches];
00380             }
00381             DREAL sumi2 = 0.0 ;
00382             // shift in sequence b
00383             mismatches=0;
00384             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00385             {
00386                 if (avec[i+j]!=bvec[i+j+k])
00387                 {
00388                     mismatches++ ;
00389                     if (mismatches>max_mismatch)
00390                         break ;
00391                 } ;
00392                 sumi2 += weights[j+degree*mismatches];
00393             }
00394             if (position_weights!=NULL)
00395                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00396             else
00397                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00398         } ;
00399     }
00400     
00401     DREAL result = sum0 ;
00402     for (INT i=0; i<max_shift; i++)
00403         result += max_shift_vec[i]/(2*(i+1)) ;
00404     
00405     return result ;
00406 }
00407 
00408 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen) 
00409 {
00410     DREAL max_shift_vec[max_shift];
00411     DREAL sum0=0 ;
00412     for (INT i=0; i<max_shift; i++)
00413         max_shift_vec[i]=0 ;
00414 
00415     // no shift
00416     for (INT i=0; i<alen; i++)
00417     {
00418         if ((position_weights!=NULL) && (position_weights[i]==0.0))
00419             continue ;
00420 
00421         DREAL sumi = 0.0 ;
00422         for (INT j=0; (j<degree) && (i+j<alen); j++)
00423         {
00424             if (avec[i+j]!=bvec[i+j])
00425                 break ;
00426             sumi += weights[j];
00427         }
00428         if (position_weights!=NULL)
00429             sum0 += position_weights[i]*sumi ;
00430         else
00431             sum0 += sumi ;
00432     } ;
00433 
00434     for (INT i=0; i<alen; i++)
00435     {
00436         for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00437         {
00438             if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00439                 continue ;
00440 
00441             DREAL sumi1 = 0.0 ;
00442             // shift in sequence a
00443             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00444             {
00445                 if (avec[i+j+k]!=bvec[i+j])
00446                     break ;
00447                 sumi1 += weights[j];
00448             }
00449             DREAL sumi2 = 0.0 ;
00450             // shift in sequence b
00451             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00452             {
00453                 if (avec[i+j]!=bvec[i+j+k])
00454                     break ;
00455                 sumi2 += weights[j];
00456             }
00457             if (position_weights!=NULL)
00458                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00459             else
00460                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00461         } ;
00462     }
00463 
00464     DREAL result = sum0 ;
00465     for (INT i=0; i<max_shift; i++)
00466         result += max_shift_vec[i]/(2*(i+1)) ;
00467 
00468     return result ;
00469 }
00470 
00471 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(CHAR* avec, INT alen, CHAR* bvec, INT blen) 
00472 {
00473     DREAL max_shift_vec[max_shift];
00474     DREAL sum0=0 ;
00475     for (INT i=0; i<max_shift; i++)
00476         max_shift_vec[i]=0 ;
00477 
00478     // no shift
00479     for (INT i=0; i<alen; i++)
00480     {
00481         if ((position_weights!=NULL) && (position_weights[i]==0.0))
00482             continue ;
00483         DREAL sumi = 0.0 ;
00484         for (INT j=0; (j<degree) && (i+j<alen); j++)
00485         {
00486             if (avec[i+j]!=bvec[i+j])
00487                 break ;
00488             sumi += weights[i*degree+j];
00489         }
00490         if (position_weights!=NULL)
00491             sum0 += position_weights[i]*sumi ;
00492         else
00493             sum0 += sumi ;
00494     } ;
00495 
00496     for (INT i=0; i<alen; i++)
00497     {
00498         for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00499         {
00500             if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00501                 continue ;
00502 
00503             DREAL sumi1 = 0.0 ;
00504             // shift in sequence a
00505             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00506             {
00507                 if (avec[i+j+k]!=bvec[i+j])
00508                     break ;
00509                 sumi1 += weights[i*degree+j];
00510             }
00511             DREAL sumi2 = 0.0 ;
00512             // shift in sequence b
00513             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00514             {
00515                 if (avec[i+j]!=bvec[i+j+k])
00516                     break ;
00517                 sumi2 += weights[i*degree+j];
00518             }
00519             if (position_weights!=NULL)
00520                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00521             else
00522                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00523         } ;
00524     }
00525 
00526     DREAL result = sum0 ;
00527     for (INT i=0; i<max_shift; i++)
00528         result += max_shift_vec[i]/(2*(i+1)) ;
00529 
00530     return result ;
00531 }
00532 
00533 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(CHAR* avec, DREAL* pos_weights_lhs, INT alen, CHAR* bvec, DREAL* pos_weights_rhs, INT blen) 
00534 {
00535     DREAL max_shift_vec[max_shift];
00536     DREAL sum0=0 ;
00537     for (INT i=0; i<max_shift; i++)
00538         max_shift_vec[i]=0 ;
00539 
00540     // no shift
00541     for (INT i=0; i<alen; i++)
00542     {
00543         if ((position_weights!=NULL) && (position_weights[i]==0.0))
00544             continue ;
00545 
00546         DREAL sumi = 0.0 ;
00547         DREAL posweight_lhs = 0.0 ;
00548         DREAL posweight_rhs = 0.0 ;
00549         for (INT j=0; (j<degree) && (i+j<alen); j++)
00550         {
00551             posweight_lhs += pos_weights_lhs[i+j] ;
00552             posweight_rhs += pos_weights_rhs[i+j] ;
00553             
00554             if (avec[i+j]!=bvec[i+j])
00555                 break ;
00556             sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00557         }
00558         if (position_weights!=NULL)
00559             sum0 += position_weights[i]*sumi ;
00560         else
00561             sum0 += sumi ;
00562     } ;
00563 
00564     for (INT i=0; i<alen; i++)
00565     {
00566         for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00567         {
00568             if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00569                 continue ;
00570 
00571             // shift in sequence a  
00572             DREAL sumi1 = 0.0 ;
00573             DREAL posweight_lhs = 0.0 ;
00574             DREAL posweight_rhs = 0.0 ;
00575             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00576             {
00577                 posweight_lhs += pos_weights_lhs[i+j+k] ;
00578                 posweight_rhs += pos_weights_rhs[i+j] ;
00579                 if (avec[i+j+k]!=bvec[i+j])
00580                     break ;
00581                 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00582             }
00583             // shift in sequence b
00584             DREAL sumi2 = 0.0 ;
00585             posweight_lhs = 0.0 ;
00586             posweight_rhs = 0.0 ;
00587             for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00588             {
00589                 posweight_lhs += pos_weights_lhs[i+j] ;
00590                 posweight_rhs += pos_weights_rhs[i+j+k] ;
00591                 if (avec[i+j]!=bvec[i+j+k])
00592                     break ;
00593                 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00594             }
00595             if (position_weights!=NULL)
00596                 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00597             else
00598                 max_shift_vec[k-1] += sumi1 + sumi2 ;
00599         } ;
00600     }
00601 
00602     DREAL result = sum0 ;
00603     for (INT i=0; i<max_shift; i++)
00604         result += max_shift_vec[i]/(2*(i+1)) ;
00605 
00606     return result ;
00607 }
00608 
00609 
00610 DREAL CWeightedDegreePositionStringKernel::compute(INT idx_a, INT idx_b)
00611 {
00612     INT alen, blen;
00613 
00614     CHAR* avec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx_a, alen);
00615     CHAR* bvec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx_b, blen);
00616     // can only deal with strings of same length
00617     ASSERT(alen==blen);
00618     ASSERT(shift_len==alen);
00619 
00620     DREAL result = 0 ;
00621     if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00622     {
00623         ASSERT(max_mismatch==0);
00624         result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs[idx_b*blen], blen) ;
00625     }
00626     else if (max_mismatch > 0)
00627         result = compute_with_mismatch(avec, alen, bvec, blen) ;
00628     else if (length==0)
00629         result = compute_without_mismatch(avec, alen, bvec, blen) ;
00630     else
00631         result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00632 
00633     result/=normalization_const;
00634 
00635     return result ;
00636 }
00637 
00638 
00639 void CWeightedDegreePositionStringKernel::add_example_to_tree(INT idx, DREAL alpha)
00640 {
00641     ASSERT(position_weights_lhs==NULL);
00642     ASSERT(position_weights_rhs==NULL);
00643     ASSERT(alphabet);
00644     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00645 
00646     INT len=0;
00647     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00648     ASSERT(max_mismatch==0);
00649     INT *vec = new INT[len] ;
00650 
00651     for (INT i=0; i<len; i++)
00652         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00653 
00654     if (opt_type==FASTBUTMEMHUNGRY)
00655     {
00656         //TRIES(set_use_compact_terminal_nodes(false)) ;
00657         ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00658     }
00659 
00660     for (INT i=0; i<len; i++)
00661     {
00662         INT max_s=-1;
00663 
00664         if (opt_type==SLOWBUTMEMEFFICIENT)
00665             max_s=0;
00666         else if (opt_type==FASTBUTMEMHUNGRY)
00667             max_s=shift[i];
00668         else {
00669             SG_ERROR( "unknown optimization type\n");
00670         }
00671 
00672         for (INT s=max_s; s>=0; s--)
00673         {
00674             DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00675             TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00676             //fprintf(stderr, "tree=%i, s=%i, example=%i, alpha_pw=%1.2f\n", i, s, idx, alpha_pw) ;
00677 
00678             if ((s==0) || (i+s>=len))
00679                 continue;
00680 
00681             TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00682             //fprintf(stderr, "tree=%i, s=%i, example=%i, alpha_pw=%1.2f\n", i+s, -s, idx, alpha_pw) ;
00683         }
00684     }
00685 
00686     delete[] vec ;
00687     tree_initialized=true ;
00688 }
00689 
00690 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(INT idx, DREAL alpha, INT tree_num) 
00691 {
00692     ASSERT(position_weights_lhs==NULL);
00693     ASSERT(position_weights_rhs==NULL);
00694     ASSERT(alphabet);
00695     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00696 
00697     INT len=0;
00698     CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00699     ASSERT(max_mismatch==0);
00700     INT *vec=new INT[len];
00701     INT max_s=-1;
00702 
00703     if (opt_type==SLOWBUTMEMEFFICIENT)
00704         max_s=0;
00705     else if (opt_type==FASTBUTMEMHUNGRY)
00706     {
00707         ASSERT(!tries.get_use_compact_terminal_nodes());
00708         max_s=shift[tree_num];
00709     }
00710     else {
00711         SG_ERROR( "unknown optimization type\n");
00712     }
00713     for (INT i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+degree+max_shift); i++)
00714         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00715 
00716     for (INT s=max_s; s>=0; s--)
00717     {
00718         DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00719         tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00720         //fprintf(stderr, "tree=%i, s=%i, example=%i, alpha_pw=%1.2f\n", tree_num, s, idx, alpha_pw) ;
00721     } 
00722 
00723     if (opt_type==FASTBUTMEMHUNGRY)
00724     {
00725         for (INT i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00726         {
00727             INT s=tree_num-i;
00728             if ((i+s<len) && (s>=1) && (s<=shift[i]))
00729             {
00730                 DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00731                 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ; 
00732                 //fprintf(stderr, "tree=%i, s=%i, example=%i, alpha_pw=%1.2f\n", tree_num, -s, idx, alpha_pw) ;
00733             }
00734         }
00735     }
00736     delete[] vec ;
00737     tree_initialized=true ;
00738 }
00739 
00740 DREAL CWeightedDegreePositionStringKernel::compute_by_tree(INT idx)
00741 {
00742     ASSERT(position_weights_lhs==NULL);
00743     ASSERT(position_weights_rhs==NULL);
00744     ASSERT(alphabet);
00745     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00746 
00747     DREAL sum=0;
00748     INT len=0;
00749     CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00750     ASSERT(max_mismatch==0);
00751     INT *vec=new INT[len];
00752 
00753     for (INT i=0; i<len; i++)
00754         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00755 
00756     for (INT i=0; i<len; i++)
00757         sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00758 
00759     if (opt_type==SLOWBUTMEMEFFICIENT)
00760     {
00761         for (INT i=0; i<len; i++)
00762         {
00763             for (INT s=1; (s<=shift[i]) && (i+s<len); s++)
00764             {
00765                 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00766                 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00767             }
00768         }
00769     }
00770 
00771     delete[] vec ;
00772 
00773     return sum/normalization_const;
00774 }
00775 
00776 void CWeightedDegreePositionStringKernel::compute_by_tree(INT idx, DREAL* LevelContrib)
00777 {
00778     ASSERT(position_weights_lhs==NULL);
00779     ASSERT(position_weights_rhs==NULL);
00780     ASSERT(alphabet);
00781     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00782 
00783     INT len=0;
00784     CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00785     ASSERT(max_mismatch==0);
00786     INT *vec=new INT[len];
00787 
00788     for (INT i=0; i<len; i++)
00789         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00790 
00791     for (INT i=0; i<len; i++)
00792         tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 1.0/normalization_const, mkl_stepsize, weights, (length!=0)) ;
00793 
00794     if (opt_type==SLOWBUTMEMEFFICIENT)
00795     {
00796         for (INT i=0; i<len; i++)
00797             for (INT k=1; (k<=shift[i]) && (i+k<len); k++)
00798             {
00799                 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib, 1.0/(2*k*normalization_const), mkl_stepsize, weights, (length!=0)) ;
00800                 tries.compute_by_tree_helper(vec, len, i+k, i, i+k, LevelContrib, 1.0/(2*k*normalization_const), mkl_stepsize, weights, (length!=0)) ;
00801             }
00802     }
00803 
00804     delete[] vec ;
00805 }
00806 
00807 DREAL *CWeightedDegreePositionStringKernel::compute_abs_weights(int &len) 
00808 {
00809     return tries.compute_abs_weights(len) ;
00810 }
00811 
00812 bool CWeightedDegreePositionStringKernel::set_shifts(INT* shift_, INT shift_len_)
00813 {
00814     delete[] shift;
00815 
00816     shift_len = shift_len_ ;
00817     shift = new INT[shift_len] ;
00818 
00819     if (shift)
00820     {
00821         max_shift = 0 ;
00822 
00823         for (INT i=0; i<shift_len; i++)
00824         {
00825             shift[i] = shift_[i] ;
00826             if (shift[i]>max_shift)
00827                 max_shift = shift[i] ;
00828         }
00829 
00830         ASSERT(max_shift>=0 && max_shift<=shift_len);
00831     }
00832     
00833     return false;
00834 }
00835 
00836 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00837 {
00838     ASSERT(degree>0);
00839 
00840     delete[] weights;
00841     weights=new DREAL[degree];
00842     if (weights)
00843     {
00844         INT i;
00845         DREAL sum=0;
00846         for (i=0; i<degree; i++)
00847         {
00848             weights[i]=degree-i;
00849             sum+=weights[i];
00850         }
00851         for (i=0; i<degree; i++)
00852             weights[i]/=sum;
00853 
00854         for (i=0; i<degree; i++)
00855         {
00856             for (INT j=1; j<=max_mismatch; j++)
00857             {
00858                 if (j<i+1)
00859                 {
00860                     INT nk=CMath::nchoosek(i+1, j);
00861                     weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00862                 }
00863                 else
00864                     weights[i+j*degree]= 0;
00865             }
00866         }
00867 
00868         return true;
00869     }
00870     else
00871         return false;
00872 }
00873 
00874 bool CWeightedDegreePositionStringKernel::set_weights(DREAL* ws, INT d, INT len)
00875 {
00876     SG_DEBUG( "degree = %i  d=%i\n", degree, d) ;
00877     degree = d ;
00878     length=len;
00879 
00880     if (len <= 0)
00881         len=1;
00882 
00883     delete[] weights;
00884     weights=new DREAL[d*len];
00885 
00886     if (weights)
00887     {
00888         for (int i=0; i<degree*len; i++)
00889             weights[i]=ws[i];
00890         return true;
00891     }
00892     else
00893         return false;
00894 }
00895 
00896 bool CWeightedDegreePositionStringKernel::set_position_weights(DREAL* pws, INT len)
00897 {
00898     fprintf(stderr, "len=%i\n", len) ;
00899 
00900     if (len==0)
00901     {
00902         delete[] position_weights ;
00903         position_weights = NULL ;
00904         tries.set_position_weights(position_weights) ;
00905         return true ;
00906     }
00907     if (seq_length==0)
00908         seq_length = len ;
00909 
00910     if (seq_length!=len) 
00911     {
00912         SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00913         return false ;
00914     }
00915     delete[] position_weights;
00916     position_weights=new DREAL[len];
00917     tries.set_position_weights(position_weights) ;
00918 
00919     if (position_weights)
00920     {
00921         for (int i=0; i<len; i++)
00922             position_weights[i]=pws[i];
00923         return true;
00924     }
00925     else
00926         return false;
00927 }
00928 
00929 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(DREAL* pws, INT len, INT num)
00930 {
00931     fprintf(stderr, "lhs %i %i %i\n", len, num, seq_length) ;
00932     
00933     if (position_weights_rhs==position_weights_lhs)
00934         position_weights_rhs=NULL ;
00935     else
00936         delete_position_weights_rhs() ;
00937 
00938     if (len==0)
00939     {
00940         return delete_position_weights_lhs() ;
00941     }
00942     
00943     if (seq_length!=len) 
00944     {
00945         SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00946         return false ;
00947     }
00948     if (!lhs)
00949     {
00950         SG_ERROR("lhs=NULL\n") ;
00951         return false ;
00952     }
00953     if (lhs->get_num_vectors()!=num)
00954     {
00955         SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num) ;
00956         return false ;
00957     }
00958     
00959     delete[] position_weights_lhs;
00960     position_weights_lhs=new DREAL[len*num];
00961     if (position_weights_lhs)
00962     {
00963         for (int i=0; i<len*num; i++)
00964             position_weights_lhs[i]=pws[i];
00965         return true;
00966     }
00967     else
00968         return false;
00969 }
00970 
00971 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(DREAL* pws, INT len, INT num)
00972 {
00973     fprintf(stderr, "rhs %i %i %i\n", len, num, seq_length) ;
00974     if (len==0)
00975     {
00976         if (position_weights_rhs==position_weights_lhs)
00977         {
00978             position_weights_rhs=NULL ;
00979             return true ;
00980         }
00981         return delete_position_weights_rhs() ;
00982     }
00983 
00984     if (seq_length!=len) 
00985     {
00986         SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00987         return false ;
00988     }
00989     if (!rhs)
00990     {
00991         if (!lhs)
00992         {
00993             SG_ERROR("rhs==0 and lhs=NULL\n") ;
00994             return false ;
00995         }
00996         if (lhs->get_num_vectors()!=num)
00997         {
00998             SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num) ;
00999             return false ;
01000         }
01001     } else
01002         if (rhs->get_num_vectors()!=num)
01003         {
01004             SG_ERROR("rhs->get_num_vectors()=%i, num=%i\n", rhs->get_num_vectors(), num) ;
01005             return false ;
01006         }
01007 
01008 
01009     delete[] position_weights_rhs;
01010     position_weights_rhs=new DREAL[len*num];
01011     if (position_weights_rhs)
01012     {
01013         for (int i=0; i<len*num; i++)
01014             position_weights_rhs[i]=pws[i];
01015         return true;
01016     }
01017     else
01018         return false;
01019 }
01020 
01021 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01022 {
01023     delete[] block_weights;
01024     block_weights=new DREAL[CMath::max(seq_length,degree)];
01025 
01026     if (block_weights)
01027     {
01028         double deg=degree;
01029         INT k;
01030         for (k=0; k<degree ; k++)
01031             block_weights[k]=(-pow(k,3) + (3*deg-3)*pow(k,2) + (9*deg-2)*k + 6*deg) / (3*deg*(deg+1));
01032         for (k=degree; k<seq_length ; k++)
01033             block_weights[k]=(-deg+3*k+4)/3;
01034     }
01035 
01036     return (block_weights!=NULL);
01037 }
01038 
01039 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01040 {
01041     ASSERT(weights);
01042     delete[] block_weights;
01043     block_weights=new DREAL[CMath::max(seq_length,degree)];
01044 
01045     if (block_weights)
01046     {
01047         INT i=0;
01048         block_weights[0]=weights[0];
01049         for (i=1; i<CMath::max(seq_length,degree); i++)
01050             block_weights[i]=0;
01051 
01052         for (i=1; i<CMath::max(seq_length,degree); i++)
01053         {
01054             block_weights[i]=block_weights[i-1];
01055 
01056             DREAL contrib=0;
01057             for (INT j=0; j<CMath::min(degree,i+1); j++)
01058                 contrib+=weights[j];
01059 
01060             block_weights[i]+=contrib;
01061         }
01062     }
01063 
01064     return (block_weights!=NULL);
01065 }
01066 
01067 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01068 {
01069     delete[] block_weights;
01070     block_weights=new DREAL[seq_length];
01071 
01072     if (block_weights)
01073     {
01074         for (int i=1; i<seq_length+1 ; i++)
01075             block_weights[i-1]=1.0/seq_length;
01076     }
01077 
01078     return (block_weights!=NULL);
01079 }
01080 
01081 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01082 {
01083     delete[] block_weights;
01084     block_weights=new DREAL[seq_length];
01085 
01086     if (block_weights)
01087     {
01088         for (int i=1; i<seq_length+1 ; i++)
01089             block_weights[i-1]=degree*i;
01090     }
01091 
01092     return (block_weights!=NULL);
01093 }
01094 
01095 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01096 {
01097     delete[] block_weights;
01098     block_weights=new DREAL[seq_length];
01099 
01100     if (block_weights)
01101     {
01102         for (int i=1; i<degree+1 ; i++)
01103             block_weights[i-1]=((double) i)*i;
01104 
01105         for (int i=degree+1; i<seq_length+1 ; i++)
01106             block_weights[i-1]=i;
01107     }
01108 
01109     return (block_weights!=NULL);
01110 }
01111 
01112 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01113 {
01114     delete[] block_weights;
01115     block_weights=new DREAL[seq_length];
01116 
01117     if (block_weights)
01118     {
01119         for (int i=1; i<degree+1 ; i++)
01120             block_weights[i-1]=((double) i)*i*i;
01121 
01122         for (int i=degree+1; i<seq_length+1 ; i++)
01123             block_weights[i-1]=i;
01124     }
01125 
01126     return (block_weights!=NULL);
01127 }
01128 
01129 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01130 {
01131     delete[] block_weights;
01132     block_weights=new DREAL[seq_length];
01133 
01134     if (block_weights)
01135     {
01136         for (int i=1; i<degree+1 ; i++)
01137             block_weights[i-1]=exp(((double) i/10.0));
01138 
01139         for (int i=degree+1; i<seq_length+1 ; i++)
01140             block_weights[i-1]=i;
01141     }
01142 
01143     return (block_weights!=NULL);
01144 }
01145 
01146 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01147 {
01148     delete[] block_weights;
01149     block_weights=new DREAL[seq_length];
01150 
01151     if (block_weights)
01152     {
01153         for (int i=1; i<degree+1 ; i++)
01154             block_weights[i-1]=pow(log(i),2);
01155 
01156         for (int i=degree+1; i<seq_length+1 ; i++)
01157             block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
01158     }
01159 
01160     return (block_weights!=NULL);
01161 }
01162 
01163 bool CWeightedDegreePositionStringKernel::init_block_weights_external()
01164 {
01165     if (block_weights_external && (seq_length == num_block_weights_external) )
01166     {
01167         delete[] block_weights;
01168         block_weights=new DREAL[seq_length];
01169 
01170         if (block_weights)
01171         {
01172             for (int i=0; i<seq_length; i++)
01173                 block_weights[i]=block_weights_external[i];
01174         }
01175     }
01176     else {
01177       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
01178    }
01179     return (block_weights!=NULL);
01180 }
01181 
01182 bool CWeightedDegreePositionStringKernel::init_block_weights()
01183 {
01184     switch (type)
01185     {
01186         case E_WD:
01187             return init_block_weights_from_wd();
01188         case E_EXTERNAL:
01189             return init_block_weights_from_wd_external();
01190         case E_BLOCK_CONST:
01191             return init_block_weights_const();
01192         case E_BLOCK_LINEAR:
01193             return init_block_weights_linear();
01194         case E_BLOCK_SQPOLY:
01195             return init_block_weights_sqpoly();
01196         case E_BLOCK_CUBICPOLY:
01197             return init_block_weights_cubicpoly();
01198         case E_BLOCK_EXP:
01199             return init_block_weights_exp();
01200         case E_BLOCK_LOG:
01201             return init_block_weights_log();
01202         case E_BLOCK_EXTERNAL:
01203             return init_block_weights_external();
01204         default:
01205             return false;
01206     };
01207 }
01208 
01209 
01210 
01211 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01212 {
01213     S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01214     INT j=params->j;
01215     CWeightedDegreePositionStringKernel* wd=params->kernel;
01216     CTrie<DNATrie>* tries=params->tries;
01217     DREAL* weights=params->weights;
01218     INT length=params->length;
01219     INT max_shift=params->max_shift;
01220     INT* vec=params->vec;
01221     DREAL* result=params->result;
01222     DREAL factor=params->factor;
01223     INT* shift=params->shift;
01224     INT* vec_idx=params->vec_idx;
01225 
01226     for (INT i=params->start; i<params->end; i++)
01227     {
01228         INT len=0;
01229         CStringFeatures<CHAR>* rhs_feat=((CStringFeatures<CHAR>*) wd->get_rhs());
01230         CAlphabet* alpha=wd->alphabet;
01231 
01232         CHAR* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
01233         for (INT k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01234             vec[k]=alpha->remap_to_bin(char_vec[k]);
01235 
01236         SG_UNREF(rhs_feat);
01237 
01238         result[i] += factor*tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0))/wd->normalization_const ;
01239 
01240         if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01241         {
01242             for (INT q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01243             {
01244                 INT s=j-q ;
01245                 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01246                     result[i] += tries->compute_by_tree_helper(vec, len, q, q+s, q, weights, (length!=0))/(2.0*s*wd->normalization_const) ;
01247             }
01248             for (INT s=1; (s<=shift[j]) && (j+s<len); s++)
01249                 result[i] += tries->compute_by_tree_helper(vec, len, j+s, j, j+s, weights, (length!=0))/(2.0*s*wd->normalization_const) ;
01250         }
01251     }
01252 
01253     return NULL;
01254 }
01255 
01256 void CWeightedDegreePositionStringKernel::compute_batch(INT num_vec, INT* vec_idx, DREAL* result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL factor)
01257 {
01258     ASSERT(alphabet);
01259     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01260     ASSERT(position_weights_lhs==NULL);
01261     ASSERT(position_weights_rhs==NULL);
01262     ASSERT(rhs);
01263     ASSERT(num_vec<=rhs->get_num_vectors());
01264     ASSERT(num_vec>0);
01265     ASSERT(vec_idx);
01266     ASSERT(result);
01267     create_empty_tries();
01268 
01269     INT num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01270     ASSERT(num_feat>0);
01271     INT num_threads=parallel.get_num_threads();
01272     ASSERT(num_threads>0);
01273     INT* vec=new INT[num_threads*num_feat];
01274 
01275     if (num_threads < 2)
01276     {
01277 #ifdef WIN32
01278        for (INT j=0; j<num_feat; j++)
01279 #else
01280        CSignal::clear() ;
01281        for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01282 #endif
01283             {
01284                 init_optimization(num_suppvec, IDX, alphas, j);
01285                 S_THREAD_PARAM<DNATrie> params;
01286                 params.vec=vec;
01287                 params.result=result;
01288                 params.weights=weights;
01289                 params.kernel=this;
01290                 params.tries=&tries;
01291                 params.factor=factor;
01292                 params.j=j;
01293                 params.start=0;
01294                 params.end=num_vec;
01295                 params.length=length;
01296                 params.max_shift=max_shift;
01297                 params.shift=shift;
01298                 params.vec_idx=vec_idx;
01299                 compute_batch_helper((void*) &params);
01300 
01301                 SG_PROGRESS(j,0,num_feat);
01302             }
01303     }
01304 #ifndef WIN32
01305     else
01306     {
01307 
01308         CSignal::clear() ;
01309         for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01310         {
01311             init_optimization(num_suppvec, IDX, alphas, j);
01312             pthread_t threads[num_threads-1];
01313             S_THREAD_PARAM<DNATrie> params[num_threads];
01314             INT step= num_vec/num_threads;
01315             INT t;
01316 
01317             for (t=0; t<num_threads-1; t++)
01318             {
01319                 params[t].vec=&vec[num_feat*t];
01320                 params[t].result=result;
01321                 params[t].weights=weights;
01322                 params[t].kernel=this;
01323                 params[t].tries=&tries;
01324                 params[t].factor=factor;
01325                 params[t].j=j;
01326                 params[t].start = t*step;
01327                 params[t].end = (t+1)*step;
01328                 params[t].length=length;
01329                 params[t].max_shift=max_shift;
01330                 params[t].shift=shift;
01331                 params[t].vec_idx=vec_idx;
01332                 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)&params[t]);
01333             }
01334 
01335             params[t].vec=&vec[num_feat*t];
01336             params[t].result=result;
01337             params[t].weights=weights;
01338             params[t].kernel=this;
01339             params[t].tries=&tries;
01340             params[t].factor=factor;
01341             params[t].j=j;
01342             params[t].start=t*step;
01343             params[t].end=num_vec;
01344             params[t].length=length;
01345             params[t].max_shift=max_shift;
01346             params[t].shift=shift;
01347             params[t].vec_idx=vec_idx;
01348             compute_batch_helper((void*) &params[t]);
01349 
01350             for (t=0; t<num_threads-1; t++)
01351                 pthread_join(threads[t], NULL);
01352             SG_PROGRESS(j,0,num_feat);
01353         }
01354     }
01355 #endif
01356 
01357     delete[] vec;
01358 
01359     //really also free memory as this can be huge on testing especially when
01360     //using the combined kernel
01361     create_empty_tries();
01362 }
01363 
01364 DREAL* CWeightedDegreePositionStringKernel::compute_scoring(INT max_degree, INT& num_feat, INT& num_sym, DREAL* result, INT num_suppvec, INT* IDX, DREAL* alphas)
01365 {
01366     ASSERT(position_weights_lhs==NULL);
01367     ASSERT(position_weights_rhs==NULL);
01368 
01369     num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01370     ASSERT(num_feat>0);
01371     ASSERT(alphabet);
01372     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01373 
01374     num_sym=4; //for now works only w/ DNA
01375 
01376     ASSERT(max_degree>0);
01377 
01378     // === variables
01379     INT* nofsKmers=new INT[max_degree];
01380     DREAL** C=new DREAL*[max_degree];
01381     DREAL** L=new DREAL*[max_degree];
01382     DREAL** R=new DREAL*[max_degree];
01383 
01384     INT i;
01385     INT k;
01386 
01387     // --- return table
01388     INT bigtabSize=0;
01389     for (k=0; k<max_degree; ++k )
01390     {
01391         nofsKmers[k]=(INT) pow(num_sym, k+1);
01392         const INT tabSize=nofsKmers[k]*num_feat;
01393         bigtabSize+=tabSize;
01394     }
01395     result=new DREAL[bigtabSize];
01396 
01397     // --- auxilliary tables
01398     INT tabOffs=0;
01399     for( k = 0; k < max_degree; ++k )
01400     {
01401         const INT tabSize = nofsKmers[k] * num_feat;
01402         C[k] = &result[tabOffs];
01403         L[k] = new DREAL[ tabSize ];
01404         R[k] = new DREAL[ tabSize ];
01405         tabOffs+=tabSize;
01406         for(i = 0; i < tabSize; i++ )
01407         {
01408             C[k][i] = 0.0;
01409             L[k][i] = 0.0;
01410             R[k][i] = 0.0;
01411         }
01412     }
01413 
01414     // --- tree parsing info
01415     DREAL* margFactors=new DREAL[degree];
01416 
01417     INT* x = new INT[ degree+1 ];
01418     INT* substrs = new INT[ degree+1 ];
01419     // - fill arrays
01420     margFactors[0] = 1.0;
01421     substrs[0] = 0;
01422     for( k=1; k < degree; ++k ) {
01423         margFactors[k] = 0.25 * margFactors[k-1];
01424         substrs[k] = -1;
01425     }
01426     substrs[degree] = -1;
01427     // - fill struct
01428     struct TreeParseInfo info;
01429     info.num_sym = num_sym;
01430     info.num_feat = num_feat;
01431     info.p = -1;
01432     info.k = -1;
01433     info.nofsKmers = nofsKmers;
01434     info.margFactors = margFactors;
01435     info.x = x;
01436     info.substrs = substrs;
01437     info.y0 = 0;
01438     info.C_k = NULL;
01439     info.L_k = NULL;
01440     info.R_k = NULL;
01441 
01442     // === main loop
01443     i = 0; // total progress
01444     for( k = 0; k < max_degree; ++k )
01445     {
01446         const INT nofKmers = nofsKmers[ k ];
01447         info.C_k = C[k];
01448         info.L_k = L[k];
01449         info.R_k = R[k];
01450 
01451         // --- run over all trees
01452         for(INT p = 0; p < num_feat; ++p )
01453         {
01454             init_optimization( num_suppvec, IDX, alphas, p );
01455             INT tree = p ;
01456             for(INT j = 0; j < degree+1; j++ ) {
01457                 x[j] = -1;
01458             }
01459             tries.traverse( tree, p, info, 0, x, k );
01460             SG_PROGRESS(i++,0,num_feat*max_degree);
01461         }
01462 
01463         // --- add partial overlap scores
01464         if( k > 0 ) {
01465             const INT j = k - 1;
01466             const INT nofJmers = (INT) pow( num_sym, j+1 );
01467             for(INT p = 0; p < num_feat; ++p ) {
01468                 const INT offsetJ = nofJmers * p;
01469                 const INT offsetJ1 = nofJmers * (p+1);
01470                 const INT offsetK = nofKmers * p;
01471                 INT y;
01472                 INT sym;
01473                 for( y = 0; y < nofJmers; ++y ) {
01474                     for( sym = 0; sym < num_sym; ++sym ) {
01475                         const INT y_sym = num_sym*y + sym;
01476                         const INT sym_y = nofJmers*sym + y;
01477                         ASSERT(0<=y_sym && y_sym<nofKmers);
01478                         ASSERT(0<=sym_y && sym_y<nofKmers);
01479                         C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01480                         if( p < num_feat-1 ) {
01481                             C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01482                         }
01483                     }
01484                 }
01485             }
01486         }
01487         //   if( k > 1 )
01488         //     j = k-1
01489         //     for all positions p
01490         //       for all j-mers y
01491         //          for n in {A,C,G,T}
01492         //            C_k[ p, [y,n] ] += L_j[ p, y ]
01493         //            C_k[ p, [n,y] ] += R_j[ p+1, y ]
01494         //          end;
01495         //       end;
01496         //     end;
01497         //   end;
01498     }
01499 
01500     // === return a vector
01501     num_feat=1;
01502     num_sym = bigtabSize;
01503     // --- clean up
01504     delete[] nofsKmers;
01505     delete[] margFactors;
01506     delete[] substrs;
01507     delete[] x;
01508     delete[] C;
01509     for( k = 0; k < max_degree; ++k ) {
01510         delete[] L[k];
01511         delete[] R[k];
01512     }
01513     delete[] L;
01514     delete[] R;
01515     return result;
01516 }
01517 
01518 CHAR* CWeightedDegreePositionStringKernel::compute_consensus(INT &num_feat, INT num_suppvec, INT* IDX, DREAL* alphas)
01519 {
01520     ASSERT(position_weights_lhs==NULL);
01521     ASSERT(position_weights_rhs==NULL);
01522     //only works for order <= 32
01523     ASSERT(degree<=32);
01524     ASSERT(!tries.get_use_compact_terminal_nodes());
01525     num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01526     ASSERT(num_feat>0);
01527     ASSERT(alphabet);
01528     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01529 
01530     //consensus
01531     CHAR* result=new CHAR[num_feat];
01532 
01533     //backtracking and scoring table
01534     INT num_tables=CMath::max(1,num_feat-degree+1);
01535     CDynamicArray<ConsensusEntry>** table=new CDynamicArray<ConsensusEntry>*[num_tables];
01536 
01537     for (INT i=0; i<num_tables; i++)
01538         table[i]=new CDynamicArray<ConsensusEntry>(num_suppvec/10);
01539 
01540     //compute consensus via dynamic programming
01541     for (INT i=0; i<num_tables; i++)
01542     {
01543         bool cumulative=false;
01544 
01545         if (i<num_tables-1)
01546             init_optimization(num_suppvec, IDX, alphas, i);
01547         else
01548         {
01549             init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01550             cumulative=true;
01551         }
01552 
01553         if (i==0)
01554             tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01555         else
01556             tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01557 
01558         SG_PROGRESS(i,0,num_feat);
01559     }
01560 
01561 
01562     //INT n=table[0]->get_num_elements();
01563 
01564     //for (INT i=0; i<n; i++)
01565     //{
01566     //  ConsensusEntry e= table[0]->get_element(i);
01567     //  SG_PRINT("first: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
01568     //}
01569 
01570     //n=table[num_tables-1]->get_num_elements();
01571     //for (INT i=0; i<n; i++)
01572     //{
01573     //  ConsensusEntry e= table[num_tables-1]->get_element(i);
01574     //  SG_PRINT("last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
01575     //}
01576     //n=table[num_tables-2]->get_num_elements();
01577     //for (INT i=0; i<n; i++)
01578     //{
01579     //  ConsensusEntry e= table[num_tables-2]->get_element(i);
01580     //  SG_PRINT("second last: str:0%0llx sc:%f bt:%d\n",e.string,e.score,e.bt);
01581     //}
01582 
01583     const CHAR* acgt="ACGT";
01584 
01585     //backtracking start
01586     INT max_idx=-1;
01587     SHORTREAL max_score=0;
01588     INT num_elements=table[num_tables-1]->get_num_elements();
01589 
01590     for (INT i=0; i<num_elements; i++)
01591     {
01592         DREAL sc=table[num_tables-1]->get_element(i).score;
01593         if (sc>max_score || max_idx==-1)
01594         {
01595             max_idx=i;
01596             max_score=sc;
01597         }
01598     }
01599     ULONG endstr=table[num_tables-1]->get_element(max_idx).string;
01600 
01601     SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01602 
01603     for (INT i=0; i<degree; i++)
01604         result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01605 
01606     if (num_tables>1)
01607     {
01608         for (INT i=num_tables-1; i>=0; i--)
01609         {
01610             //SG_PRINT("max_idx: %d, i:%d\n", max_idx, i);
01611             result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01612             max_idx=table[i]->get_element(max_idx).bt;
01613         }
01614     }
01615 
01616     //for (INT t=0; t<num_tables; t++)
01617     //{
01618     //  n=table[t]->get_num_elements();
01619     //  for (INT i=0; i<n; i++)
01620     //  {
01621     //      ConsensusEntry e= table[t]->get_element(i);
01622     //      SG_PRINT("table[%d,%d]: str:0%0llx sc:%+f bt:%d\n",t,i, e.string,e.score,e.bt);
01623     //  }
01624     //}
01625 
01626     for (INT i=0; i<num_tables; i++)
01627         delete table[i];
01628 
01629     delete[] table;
01630     return result;
01631 }
01632 
01633 
01634 DREAL* CWeightedDegreePositionStringKernel::extract_w( INT max_degree, INT& num_feat, INT& num_sym, DREAL* w_result, INT num_suppvec, INT* IDX, DREAL* alphas )
01635 {
01636   delete_optimization();
01637   use_poim_tries=true;
01638   poim_tries.delete_trees(false);
01639 
01640   // === check
01641   ASSERT(position_weights_lhs==NULL);
01642   ASSERT(position_weights_rhs==NULL);
01643   num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01644   ASSERT(num_feat>0);
01645   ASSERT(alphabet->get_alphabet()==DNA);
01646   ASSERT(max_degree>0);
01647 
01648   // === general variables
01649   static const INT NUM_SYMS = poim_tries.NUM_SYMS;
01650   const INT seqLen = num_feat;
01651   DREAL** subs;
01652   INT i;
01653   INT k;
01654   //INT y;
01655 
01656   // === init tables "subs" for substring scores / POIMs
01657   // --- compute table sizes
01658   INT* offsets;
01659   INT offset;
01660   offsets = new INT[ max_degree ];
01661   offset = 0;
01662   for( k = 0; k < max_degree; ++k ) {
01663     offsets[k] = offset;
01664     const INT nofsKmers = (INT) pow( NUM_SYMS, k+1 );
01665     const INT tabSize = nofsKmers * seqLen;
01666     offset += tabSize;
01667   }
01668   // --- allocate memory
01669   const INT bigTabSize = offset;
01670   w_result=new DREAL[bigTabSize];
01671   for (i=0; i<bigTabSize; ++i)
01672     w_result[i]=0;
01673 
01674   // --- set pointers for tables
01675   subs = new DREAL*[ max_degree ];
01676   ASSERT( subs != NULL );
01677   for( k = 0; k < max_degree; ++k ) {
01678     subs[k] = &w_result[ offsets[k] ];
01679   }
01680   delete[] offsets;
01681 
01682   // === init trees; extract "w"
01683   init_optimization( num_suppvec, IDX, alphas, -1);
01684   poim_tries.POIMs_extract_W( subs, max_degree );
01685 
01686   // === clean; return "subs" as vector
01687   delete[] subs;
01688   num_feat = 1;
01689   num_sym = bigTabSize;
01690   use_poim_tries=false;
01691   poim_tries.delete_trees(false);
01692   return w_result;
01693 }
01694 
01695 DREAL* CWeightedDegreePositionStringKernel::compute_POIM( INT max_degree, INT& num_feat, INT& num_sym, DREAL* poim_result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL* distrib )
01696 {
01697   delete_optimization();
01698   use_poim_tries=true;
01699   poim_tries.delete_trees(false);
01700 
01701   // === check
01702   ASSERT(position_weights_lhs==NULL);
01703   ASSERT(position_weights_rhs==NULL);
01704   num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01705   ASSERT(num_feat>0);
01706   ASSERT(alphabet->get_alphabet()==DNA);
01707   ASSERT(max_degree!=0);
01708   ASSERT(distrib);
01709 
01710   // === general variables
01711   static const INT NUM_SYMS = poim_tries.NUM_SYMS;
01712   const INT seqLen = num_feat;
01713   DREAL** subs;
01714   INT i;
01715   INT k;
01716 
01717   // === DEBUGGING mode
01718   //
01719   // Activated if "max_degree" < 0.
01720   // Allows to output selected partial score.
01721   //
01722   // |max_degree| mod 4
01723   //   0: substring
01724   //   1: superstring
01725   //   2: left overlap
01726   //   3: right overlap
01727   //
01728   const INT debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01729   if( debug ) {
01730     max_degree = abs(max_degree) / 4;
01731     switch( debug ) {
01732     case 1: {
01733       printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01734       break;
01735     }
01736     case 2: {
01737       printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01738       break;
01739     }
01740     case 3: {
01741       printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01742       break;
01743     }
01744     case 4: {
01745       printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01746       break;
01747     }
01748     default: {
01749       printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01750       ASSERT(0);
01751       break;
01752     }
01753     }
01754   }
01755 
01756   // --- compute table sizes
01757   INT* offsets;
01758   INT offset;
01759   offsets = new INT[ max_degree ];
01760   offset = 0;
01761   for( k = 0; k < max_degree; ++k ) {
01762     offsets[k] = offset;
01763     const INT nofsKmers = (INT) pow( NUM_SYMS, k+1 );
01764     const INT tabSize = nofsKmers * seqLen;
01765     offset += tabSize;
01766   }
01767   // --- allocate memory
01768   const INT bigTabSize=offset;
01769   poim_result=new DREAL[bigTabSize];
01770   for (i=0; i<bigTabSize; ++i )
01771     poim_result[i]=0;
01772 
01773   // --- set pointers for tables
01774   subs=new DREAL*[max_degree];
01775   for (k=0; k<max_degree; ++k)
01776     subs[k]=&poim_result[offsets[k]];
01777 
01778   delete[] offsets;
01779 
01780   // === init trees; precalc S, L and R
01781   init_optimization( num_suppvec, IDX, alphas, -1);
01782   poim_tries.POIMs_precalc_SLR( distrib );
01783 
01784   // === compute substring scores
01785   if( debug==0 || debug==1 ) {
01786     poim_tries.POIMs_extract_W( subs, max_degree );
01787     for( k = 1; k < max_degree; ++k ) {
01788       const INT nofKmers2 = ( k > 1 ) ? (INT) pow(NUM_SYMS,k-1) : 0;
01789       const INT nofKmers1 = (INT) pow( NUM_SYMS, k );
01790       const INT nofKmers0 = nofKmers1 * NUM_SYMS;
01791       for( i = 0; i < seqLen; ++i ) {
01792     DREAL* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01793     DREAL* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01794     DREAL* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01795     DREAL* const subs_k0i  = & subs[ k-0 ][ i*nofKmers0 ];
01796     INT y0;
01797     for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01798       const INT y1l = y0 / NUM_SYMS;
01799       const INT y1r = y0 % nofKmers1;
01800       const INT y2 = y1r / NUM_SYMS;
01801       subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01802       if( i < seqLen-1 ) {
01803         subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01804         if( k > 1 ) {
01805           subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01806         }
01807       }
01808     }
01809       }
01810     }
01811   }
01812 
01813   // === compute POIMs
01814   poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01815 
01816   // === clean; return "subs" as vector
01817   delete[] subs;
01818   num_feat = 1;
01819   num_sym = bigTabSize;
01820 
01821   use_poim_tries=false;
01822   poim_tries.delete_trees(false);
01823   
01824   return poim_result;
01825 }
01826 
01827 
01828 void CWeightedDegreePositionStringKernel::prepare_POIM2(DREAL* distrib, INT num_sym, INT num_feat)
01829 {
01830     free(m_poim_distrib);
01831     m_poim_distrib=(DREAL*)malloc(num_sym*num_feat*sizeof(DREAL));
01832     ASSERT(m_poim_distrib);
01833 
01834     memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(DREAL));
01835     m_poim_num_sym=num_sym;
01836     m_poim_num_feat=num_feat;
01837 }
01838 
01839 void CWeightedDegreePositionStringKernel::compute_POIM2(INT max_degree, CSVM* svm)
01840 {
01841     ASSERT(svm);
01842     INT num_suppvec=svm->get_num_support_vectors();
01843     INT* sv_idx=new INT[num_suppvec];
01844     DREAL* sv_weight=new DREAL[num_suppvec];
01845 
01846     for (INT i=0; i<num_suppvec; i++)
01847     {
01848         sv_idx[i]=svm->get_support_vector(i);
01849         sv_weight[i]=svm->get_alpha(i);
01850     }
01851     
01852     if ((max_degree < 1) || (max_degree > 12))
01853     {
01854         //SG_WARNING( "max_degree out of range 1..12 (%d).\n", max_degree);
01855         SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01856         max_degree=1;
01857     }
01858     
01859     int num_feat = m_poim_num_feat ;
01860     int num_sym = m_poim_num_sym ;
01861     free(m_poim) ;
01862 
01863     m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL,  num_suppvec, sv_idx, 
01864                           sv_weight, m_poim_distrib);
01865 
01866     ASSERT(num_feat==1);
01867     m_poim_result_len=num_sym ;
01868     
01869     delete[] sv_weight ;
01870     delete[] sv_idx ;
01871 }
01872 
01873 void CWeightedDegreePositionStringKernel::get_POIM2(DREAL** poim, INT* result_len)
01874 {
01875     *poim=(DREAL*) malloc(m_poim_result_len*sizeof(DREAL));
01876     ASSERT(*poim);
01877     memcpy(*poim, m_poim, m_poim_result_len*sizeof(DREAL)) ;
01878     *result_len=m_poim_result_len ;
01879 }
01880 
01881 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01882 {
01883     free(m_poim) ;
01884     m_poim=NULL ;
01885     free(m_poim_distrib) ;
01886     m_poim_distrib=NULL ;
01887     m_poim_num_sym=0 ;
01888     m_poim_num_sym=0 ;
01889     m_poim_result_len=0 ;
01890 }
01891 

SHOGUN Machine Learning Toolbox - Documentation