CommWordStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "kernel/CommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015 
00016 CCommWordStringKernel::CCommWordStringKernel(
00017     INT size, bool s, ENormalizationType n)
00018 : CStringKernel<WORD>(size), sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00019     initialized(false), dictionary_size(0), dictionary_weights(NULL),
00020     use_sign(s), normalization(n), use_dict_diagonal_optimization(false), dict_diagonal_optimization(NULL)
00021 {
00022     properties |= KP_LINADD;
00023     init_dictionary(1<<(sizeof(WORD)*8));
00024 }
00025 
00026 CCommWordStringKernel::CCommWordStringKernel(
00027     CStringFeatures<WORD>* l, CStringFeatures<WORD>* r,
00028     bool s, ENormalizationType n, INT size)
00029 : CStringKernel<WORD>(size), sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00030     initialized(false), dictionary_size(0), dictionary_weights(NULL),
00031     use_sign(s), normalization(n), use_dict_diagonal_optimization(false), dict_diagonal_optimization(NULL)
00032 {
00033     properties |= KP_LINADD;
00034 
00035     init_dictionary(1<<(sizeof(WORD)*8));
00036     init(l,r);
00037 }
00038 
00039 
00040 bool CCommWordStringKernel::init_dictionary(INT size)
00041 {
00042     dictionary_size= size;
00043     delete[] dictionary_weights;
00044     dictionary_weights=new DREAL[size];
00045     SG_DEBUG( "using dictionary of %d words\n", size);
00046     clear_normal();
00047 
00048     return dictionary_weights!=NULL;
00049 }
00050 
00051 CCommWordStringKernel::~CCommWordStringKernel() 
00052 {
00053     cleanup();
00054 
00055     delete[] dictionary_weights;
00056     delete[] dict_diagonal_optimization ;
00057 }
00058   
00059 void CCommWordStringKernel::remove_lhs() 
00060 { 
00061     delete_optimization();
00062 
00063 #ifdef SVMLIGHT
00064     if (lhs)
00065         cache_reset();
00066 #endif
00067 
00068     if (sqrtdiag_lhs != sqrtdiag_rhs)
00069         delete[] sqrtdiag_rhs;
00070     delete[] sqrtdiag_lhs;
00071 
00072     lhs = NULL ; 
00073     rhs = NULL ; 
00074     initialized = false;
00075     sqrtdiag_lhs = NULL;
00076     sqrtdiag_rhs = NULL;
00077 }
00078 
00079 void CCommWordStringKernel::remove_rhs()
00080 {
00081 #ifdef SVMLIGHT
00082     if (rhs)
00083         cache_reset();
00084 #endif
00085 
00086     if (sqrtdiag_lhs != sqrtdiag_rhs)
00087         delete[] sqrtdiag_rhs;
00088     sqrtdiag_rhs = sqrtdiag_lhs;
00089     rhs = lhs;
00090 }
00091 
00092 bool CCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00093 {
00094     bool result=CStringKernel<WORD>::init(l,r);
00095 
00096 
00097     initialized = false;
00098     INT i;
00099 
00100     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00101         delete[] sqrtdiag_rhs;
00102     sqrtdiag_rhs=NULL;
00103     delete[] sqrtdiag_lhs;
00104     sqrtdiag_lhs=new DREAL[lhs->get_num_vectors()];
00105 
00106     for (i=0; i<lhs->get_num_vectors(); i++)
00107         sqrtdiag_lhs[i]=1;
00108 
00109     if (l==r)
00110         sqrtdiag_rhs=sqrtdiag_lhs;
00111     else
00112     {
00113         sqrtdiag_rhs=new DREAL[rhs->get_num_vectors()];
00114         for (i=0; i<rhs->get_num_vectors(); i++)
00115             sqrtdiag_rhs[i]=1;
00116     }
00117 
00118     this->lhs=(CStringFeatures<WORD>*) l;
00119     this->rhs=(CStringFeatures<WORD>*) l;
00120 
00121     if (use_dict_diagonal_optimization)
00122     {
00123         delete[] dict_diagonal_optimization ;
00124         dict_diagonal_optimization=new INT[INT(((CStringFeatures<WORD>*)l)->get_num_symbols())];
00125         ASSERT(((CStringFeatures<WORD>*)l)->get_num_symbols() == ((CStringFeatures<WORD>*)r)->get_num_symbols()) ;
00126     }
00127 
00128     //compute normalize to 1 values
00129     for (i=0; i<lhs->get_num_vectors(); i++)
00130     {
00131         if (use_dict_diagonal_optimization)
00132             sqrtdiag_lhs[i]=sqrt(compute_diag(i));
00133         else
00134             sqrtdiag_lhs[i]=sqrt(compute_helper(i,i, true));
00135         
00136         //trap divide by zero exception
00137         if (sqrtdiag_lhs[i]==0)
00138             sqrtdiag_lhs[i]=1e-16;
00139     }
00140 
00141     // if lhs is different from rhs (train/test data)
00142     // compute also the normalization for rhs
00143     if (sqrtdiag_lhs!=sqrtdiag_rhs)
00144     {
00145         this->lhs=(CStringFeatures<WORD>*) r;
00146         this->rhs=(CStringFeatures<WORD>*) r;
00147         
00148         //compute normalize to 1 values
00149         for (i=0; i<rhs->get_num_vectors(); i++)
00150         {
00151             if (use_dict_diagonal_optimization)
00152                 sqrtdiag_rhs[i]=sqrt(compute_diag(i));
00153             else
00154                 sqrtdiag_rhs[i]=sqrt(compute_helper(i,i, true));
00155                 
00156             //trap divide by zero exception
00157             if (sqrtdiag_rhs[i]==0)
00158                 sqrtdiag_rhs[i]=1e-16;
00159         }
00160     }
00161 
00162     this->lhs=(CStringFeatures<WORD>*) l;
00163     this->rhs=(CStringFeatures<WORD>*) r;
00164 
00165     initialized = true;
00166     return result;
00167 }
00168 
00169 void CCommWordStringKernel::cleanup()
00170 {
00171     delete_optimization();
00172     clear_normal();
00173 
00174     initialized=false;
00175 
00176     if (sqrtdiag_lhs != sqrtdiag_rhs)
00177         delete[] sqrtdiag_rhs;
00178 
00179     sqrtdiag_rhs=NULL;
00180 
00181     delete[] sqrtdiag_lhs;
00182     sqrtdiag_lhs=NULL;
00183 
00184     CKernel::cleanup();
00185 }
00186 
00187 bool CCommWordStringKernel::load_init(FILE* src)
00188 {
00189     return false;
00190 }
00191 
00192 bool CCommWordStringKernel::save_init(FILE* dest)
00193 {
00194     return false;
00195 }
00196 
00197 DREAL CCommWordStringKernel::compute_diag(INT idx_a)
00198 {
00199     INT alen;
00200     CStringFeatures<WORD>* l = (CStringFeatures<WORD>*) lhs;
00201     CStringFeatures<WORD>* r = (CStringFeatures<WORD>*) rhs;
00202 
00203     WORD* av=l->get_feature_vector(idx_a, alen);
00204 
00205     DREAL result=0.0 ;
00206     ASSERT(l==r);
00207     ASSERT(sizeof(WORD)<=sizeof(DREAL));
00208     ASSERT((1<<(sizeof(WORD)*8)) > alen);
00209 
00210     INT num_symbols=(INT) l->get_num_symbols();
00211     ASSERT(num_symbols<=dictionary_size);
00212 
00213     INT* dic = dict_diagonal_optimization;
00214     memset(dic, 0, num_symbols*sizeof(INT));
00215 
00216     for (INT i=0; i<alen; i++)
00217         dic[av[i]]++;
00218 
00219     if (use_sign)
00220     {
00221         for (INT i=0; i<(INT) l->get_num_symbols(); i++)
00222         {
00223             if (dic[i]!=0)
00224                 result++;
00225         }
00226     }
00227     else
00228     {
00229         for (INT i=0; i<num_symbols; i++)
00230         {
00231             if (dic[i]!=0)
00232                 result+=dic[i]*dic[i];
00233         }
00234     }
00235 
00236     return result;
00237 }
00238 
00239 DREAL CCommWordStringKernel::compute_helper(INT idx_a, INT idx_b, bool do_sort)
00240 {
00241     INT alen, blen;
00242     CStringFeatures<WORD>* l = (CStringFeatures<WORD>*) lhs;
00243     CStringFeatures<WORD>* r = (CStringFeatures<WORD>*) rhs;
00244 
00245     WORD* av=l->get_feature_vector(idx_a, alen);
00246     WORD* bv=r->get_feature_vector(idx_b, blen);
00247 
00248     WORD* avec=av;
00249     WORD* bvec=bv;
00250 
00251     if (do_sort)
00252     {
00253         if (alen>0)
00254         {
00255             avec=new WORD[alen];
00256             memcpy(avec, av, sizeof(WORD)*alen);
00257             CMath::radix_sort(avec, alen);
00258         }
00259         else
00260             avec=NULL;
00261 
00262         if (blen>0)
00263         {
00264             bvec=new WORD[blen];
00265             memcpy(bvec, bv, sizeof(WORD)*blen);
00266             CMath::radix_sort(bvec, blen);
00267         }
00268         else
00269             bvec=NULL;
00270     }
00271     else
00272     {
00273         if ( (l->get_num_preproc() != l->get_num_preprocessed()) ||
00274                 (r->get_num_preproc() != r->get_num_preprocessed()))
00275         {
00276             SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00277                     " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
00278                     r->get_num_preprocessed(), r->get_num_preproc());
00279         }
00280     }
00281 
00282     DREAL result=0;
00283 
00284     INT left_idx=0;
00285     INT right_idx=0;
00286 
00287     if (use_sign)
00288     {
00289         while (left_idx < alen && right_idx < blen)
00290         {
00291             if (avec[left_idx]==bvec[right_idx])
00292             {
00293                 WORD sym=avec[left_idx];
00294 
00295                 while (left_idx< alen && avec[left_idx]==sym)
00296                     left_idx++;
00297 
00298                 while (right_idx< blen && bvec[right_idx]==sym)
00299                     right_idx++;
00300 
00301                 result++;
00302             }
00303             else if (avec[left_idx]<bvec[right_idx])
00304                 left_idx++;
00305             else
00306                 right_idx++;
00307         }
00308     }
00309     else
00310     {
00311         while (left_idx < alen && right_idx < blen)
00312         {
00313             if (avec[left_idx]==bvec[right_idx])
00314             {
00315                 INT old_left_idx=left_idx;
00316                 INT old_right_idx=right_idx;
00317 
00318                 WORD sym=avec[left_idx];
00319 
00320                 while (left_idx< alen && avec[left_idx]==sym)
00321                     left_idx++;
00322 
00323                 while (right_idx< blen && bvec[right_idx]==sym)
00324                     right_idx++;
00325 
00326                 result+=((DREAL) (left_idx-old_left_idx)) * ((DREAL) (right_idx-old_right_idx));
00327             }
00328             else if (avec[left_idx]<bvec[right_idx])
00329                 left_idx++;
00330             else
00331                 right_idx++;
00332         }
00333     }
00334 
00335     if (do_sort)
00336     {
00337         delete[] avec;
00338         delete[] bvec;
00339     }
00340 
00341     if (initialized)
00342     {
00343         switch (normalization)
00344         {
00345             case NO_NORMALIZATION:
00346                 return result;
00347             case SQRT_NORMALIZATION:
00348                 return result/sqrt(sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]);
00349             case FULL_NORMALIZATION:
00350                 return result/(sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]);
00351             case SQRTLEN_NORMALIZATION:
00352                 return result/sqrt(sqrt(alen*blen));
00353             case LEN_NORMALIZATION:
00354                 return result/sqrt(alen*blen);
00355             case SQLEN_NORMALIZATION:
00356                 return result/(alen*blen);
00357             default:
00358                 SG_ERROR( "Unknown Normalization in use!\n");
00359                 return -CMath::INFTY;
00360         }
00361     }
00362     else
00363         return result;
00364 }
00365 
00366 void CCommWordStringKernel::add_to_normal(INT vec_idx, DREAL weight)
00367 {
00368     INT len=-1;
00369     WORD* vec=((CStringFeatures<WORD>*) lhs)->get_feature_vector(vec_idx, len);
00370 
00371     if (len>0)
00372     {
00373         int j, last_j=0;
00374         if (use_sign)
00375         {
00376             for (j=1; j<len; j++)
00377             {
00378                 if (vec[j]==vec[j-1])
00379                     continue;
00380 
00381                 dictionary_weights[(int) vec[j-1]] += normalize_weight(sqrtdiag_lhs, weight, vec_idx, len, normalization);
00382             }
00383 
00384             dictionary_weights[(int) vec[len-1]] += normalize_weight(sqrtdiag_lhs, weight, vec_idx, len, normalization);
00385         }
00386         else
00387         {
00388             for (j=1; j<len; j++)
00389             {
00390                 if (vec[j]==vec[j-1])
00391                     continue;
00392 
00393                 dictionary_weights[(int) vec[j-1]] += normalize_weight(sqrtdiag_lhs, weight*(j-last_j), vec_idx, len, normalization);
00394                 last_j = j;
00395             }
00396 
00397             dictionary_weights[(int) vec[len-1]] += normalize_weight(sqrtdiag_lhs, weight*(len-last_j), vec_idx, len, normalization);
00398         }
00399         set_is_initialized(true);
00400     }
00401 }
00402 
00403 void CCommWordStringKernel::clear_normal()
00404 {
00405     memset(dictionary_weights, 0, dictionary_size*sizeof(DREAL));
00406     set_is_initialized(false);
00407 }
00408 
00409 bool CCommWordStringKernel::init_optimization(INT count, INT *IDX, DREAL * weights) 
00410 {
00411     delete_optimization();
00412 
00413     if (count<=0)
00414     {
00415         set_is_initialized(true);
00416         SG_DEBUG( "empty set of SVs\n");
00417         return true;
00418     }
00419 
00420     SG_DEBUG( "initializing CCommWordStringKernel optimization\n");
00421 
00422     for (int i=0; i<count; i++)
00423     {
00424         if ( (i % (count/10+1)) == 0)
00425             SG_PROGRESS(i, 0, count);
00426 
00427         add_to_normal(IDX[i], weights[i]);
00428     }
00429 
00430     //SG_PRINT( "Done.         \n");
00431     
00432     set_is_initialized(true);
00433     return true;
00434 }
00435 
00436 bool CCommWordStringKernel::delete_optimization() 
00437 {
00438     SG_DEBUG( "deleting CCommWordStringKernel optimization\n");
00439 
00440     clear_normal();
00441     return true;
00442 }
00443 
00444 DREAL CCommWordStringKernel::compute_optimized(INT i) 
00445 { 
00446     if (!get_is_initialized())
00447     {
00448       SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00449         return 0 ; 
00450     }
00451 
00452     DREAL result = 0;
00453     INT len = -1;
00454     WORD* vec=((CStringFeatures<WORD>*) rhs)->get_feature_vector(i, len);
00455 
00456     int j, last_j=0;
00457     if (vec && len>0)
00458     {
00459         if (use_sign)
00460         {
00461             for (j=1; j<len; j++)
00462             {
00463                 if (vec[j]==vec[j-1])
00464                     continue;
00465 
00466                 result += dictionary_weights[(int) vec[j-1]];
00467             }
00468 
00469             result += dictionary_weights[(int) vec[len-1]];
00470         }
00471         else
00472         {
00473             for (j=1; j<len; j++)
00474             {
00475                 if (vec[j]==vec[j-1])
00476                     continue;
00477 
00478                 result += dictionary_weights[(int) vec[j-1]]*(j-last_j);
00479                 last_j = j;
00480             }
00481 
00482             result += dictionary_weights[(int) vec[len-1]]*(len-last_j);
00483         }
00484 
00485         result=normalize_weight(sqrtdiag_rhs, result, i, len, normalization);
00486     }
00487     return result;
00488 }
00489 
00490 DREAL* CCommWordStringKernel::compute_scoring(INT max_degree, INT& num_feat,
00491         INT& num_sym, DREAL* target, INT num_suppvec, INT* IDX, DREAL* alphas,
00492         bool do_init)
00493 {
00494     ASSERT(lhs);
00495     CStringFeatures<WORD>* str=((CStringFeatures<WORD>*) lhs);
00496     num_feat=1;//str->get_max_vector_length();
00497     CAlphabet* alpha=str->get_alphabet();
00498     ASSERT(alpha);
00499     INT num_bits=alpha->get_num_bits();
00500     INT order=str->get_order();
00501     ASSERT(max_degree<=order);
00502     //INT num_words=(INT) str->get_num_symbols();
00503     INT num_words=(INT) str->get_original_num_symbols();
00504     INT offset=0;
00505 
00506     num_sym=0;
00507     
00508     for (INT i=0; i<order; i++)
00509         num_sym+=CMath::pow((INT) num_words,i+1);
00510 
00511     SG_DEBUG("num_words:%d, order:%d, len:%d sz:%d (len*sz:%d)\n", num_words, order,
00512             num_feat, num_sym, num_feat*num_sym);
00513 
00514     if (!target)
00515         target=new DREAL[num_feat*num_sym];
00516     memset(target, 0, num_feat*num_sym*sizeof(DREAL));
00517 
00518     if (do_init)
00519         init_optimization(num_suppvec, IDX, alphas);
00520 
00521     UINT kmer_mask=0;
00522     UINT words=CMath::pow((INT) num_words,(INT) order);
00523 
00524     for (INT o=0; o<max_degree; o++)
00525     {
00526         DREAL* contrib=&target[offset];
00527         offset+=CMath::pow((INT) num_words,(INT) o+1);
00528 
00529         kmer_mask=(kmer_mask<<(num_bits)) | str->get_masked_symbols(0xffff, 1);
00530 
00531         for (INT p=-o; p<order; p++)
00532         {
00533             INT o_sym=0, m_sym=0, il=0,ir=0, jl=0;
00534             UINT imer_mask=kmer_mask;
00535             UINT jmer_mask=kmer_mask;
00536 
00537             if (p<0)
00538             {
00539                 il=-p;
00540                 m_sym=order-o-p-1;
00541                 o_sym=-p;
00542             }
00543             else if (p<order-o)
00544             {
00545                 ir=p;
00546                 m_sym=order-o-1;
00547             }
00548             else
00549             {
00550                 ir=p;
00551                 m_sym=p;
00552                 o_sym=p-order+o+1;
00553                 jl=order-ir;
00554                 imer_mask=(kmer_mask>>(num_bits*o_sym));
00555                 jmer_mask=(kmer_mask>>(num_bits*jl));
00556             }
00557 
00558             DREAL marginalizer=1.0/CMath::pow((INT) num_words,(INT) m_sym);
00559             
00560             for (UINT i=0; i<words; i++)
00561             {
00562                 WORD x= ((i << (num_bits*il)) >> (num_bits*ir)) & imer_mask;
00563 
00564                 if (p>=0 && p<order-o)
00565                 {
00566 //#define DEBUG_COMMSCORING
00567 #ifdef DEBUG_COMMSCORING
00568                     SG_PRINT("o=%d/%d p=%d/%d i=0x%x x=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d marg=%g o_sym:%d m_sym:%d weight(",
00569                             o,order, p,order, i, x, imer_mask, jmer_mask, kmer_mask, il, ir, marginalizer, o_sym, m_sym);
00570 
00571                     SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n", 
00572                             alpha->remap_to_char((x>>(3*num_bits))&0x03), alpha->remap_to_char((x>>(2*num_bits))&0x03),
00573                             alpha->remap_to_char((x>>num_bits)&0x03), alpha->remap_to_char(x&0x03),
00574                             alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
00575                             alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
00576                             dictionary_weights[i]*marginalizer, dictionary_weights[i]);
00577 #endif
00578                     contrib[x]+=dictionary_weights[i]*marginalizer;
00579                 }
00580                 else
00581                 {
00582                     for (UINT j=0; j< (UINT) CMath::pow((INT) num_words, (INT) o_sym); j++)
00583                     {
00584                         UINT c=x | ((j & jmer_mask) << (num_bits*jl));
00585 #ifdef DEBUG_COMMSCORING
00586 
00587                         SG_PRINT("o=%d/%d p=%d/%d i=0x%x j=0x%x x=0x%x c=0x%x imask=%x jmask=%x kmask=%x il=%d ir=%d jl=%d marg=%g o_sym:%d m_sym:%d weight(",
00588                                 o,order, p,order, i, j, x, c, imer_mask, jmer_mask, kmer_mask, il, ir, jl, marginalizer, o_sym, m_sym);
00589                         SG_PRINT("%c%c%c%c/%c%c%c%c)+=%g/%g\n", 
00590                                 alpha->remap_to_char((c>>(3*num_bits))&0x03), alpha->remap_to_char((c>>(2*num_bits))&0x03),
00591                                 alpha->remap_to_char((c>>num_bits)&0x03), alpha->remap_to_char(c&0x03),
00592                                 alpha->remap_to_char((i>>(3*num_bits))&0x03), alpha->remap_to_char((i>>(2*num_bits))&0x03),
00593                                 alpha->remap_to_char((i>>(1*num_bits))&0x03), alpha->remap_to_char(i&0x03),
00594                                 dictionary_weights[i]*marginalizer, dictionary_weights[i]);
00595 #endif
00596                         contrib[c]+=dictionary_weights[i]*marginalizer;
00597                     }
00598                 }
00599             }
00600         }
00601     }
00602 
00603     for (INT i=1; i<num_feat; i++)
00604         memcpy(&target[num_sym*i], target, num_sym*sizeof(DREAL));
00605 
00606     SG_UNREF(alpha);
00607 
00608     return target;
00609 }
00610 
00611 
00612 CHAR* CCommWordStringKernel::compute_consensus(INT &result_len, INT num_suppvec, INT* IDX, DREAL* alphas)
00613 {
00614 
00615     ASSERT(lhs);
00616     ASSERT(IDX);
00617     ASSERT(alphas);
00618 
00619     CStringFeatures<WORD>* str=((CStringFeatures<WORD>*) lhs);
00620     INT num_words=(INT) str->get_num_symbols();
00621     INT num_feat=str->get_max_vector_length();
00622     LONG total_len=((LONG) num_feat) * num_words;
00623     CAlphabet* alpha=((CStringFeatures<WORD>*) lhs)->get_alphabet();
00624     ASSERT(alpha);
00625     INT num_bits=alpha->get_num_bits();
00626     INT order=str->get_order();
00627     INT max_idx=-1;
00628     DREAL max_score=0; 
00629     result_len=num_feat+order-1;
00630 
00631     //init
00632     init_optimization(num_suppvec, IDX, alphas);
00633 
00634     CHAR* result=new CHAR[result_len];
00635     INT* bt=new INT[total_len];
00636     DREAL* score=new DREAL[total_len];
00637 
00638     for (LONG i=0; i<total_len; i++)
00639     {
00640         bt[i]=-1;
00641         score[i]=0;
00642     }
00643 
00644     for (INT t=0; t<num_words; t++)
00645         score[t]=dictionary_weights[t];
00646 
00647     //dynamic program
00648     for (INT i=1; i<num_feat; i++)
00649     {
00650         for (INT t1=0; t1<num_words; t1++)
00651         {
00652             max_idx=-1;
00653             max_score=0; 
00654 
00655             /* ignore weights the svm does not care about 
00656              * (has not seen in training). note that this assumes that zero 
00657              * weights are very unlikely to appear elsewise */
00658 
00659             //if (dictionary_weights[t1]==0.0)
00660                 //continue;
00661 
00662             /* iterate over words t ending on t1 and find the highest scoring
00663              * pair */
00664             WORD suffix=(WORD) t1 >> num_bits;
00665 
00666             for (INT sym=0; sym<str->get_original_num_symbols(); sym++)
00667             {
00668                 WORD t=suffix | sym << (num_bits*(order-1));
00669 
00670                 //if (dictionary_weights[t]==0.0)
00671                 //  continue;
00672 
00673                 DREAL sc=score[num_words*(i-1) + t] + dictionary_weights[t1];
00674                 if (sc > max_score || max_idx==-1)
00675                 {
00676                     max_idx=t;
00677                     max_score=sc;
00678                 }
00679             }
00680             ASSERT(max_idx!=-1);
00681 
00682             score[num_words*i + t1]=max_score;
00683             bt[num_words*i + t1]=max_idx;
00684         }
00685     }
00686 
00687     //backtracking
00688     max_idx=0;
00689     max_score=score[num_words*(num_feat-1) + 0];
00690     for (INT t=1; t<num_words; t++)
00691     {
00692         DREAL sc=score[num_words*(num_feat-1) + t];
00693         if (sc>max_score)
00694         {
00695             max_idx=t;
00696             max_score=sc;
00697         }
00698     }
00699 
00700     SG_PRINT("max_idx:%i, max_score:%f\n", max_idx, max_score);
00701     
00702     for (INT i=result_len-1; i>=num_feat; i--)
00703         result[i]=alpha->remap_to_char( (BYTE) str->get_masked_symbols( (WORD) max_idx >> (num_bits*(result_len-1-i)), 1) );
00704 
00705     for (INT i=num_feat-1; i>=0; i--)
00706     {
00707         result[i]=alpha->remap_to_char( (BYTE) str->get_masked_symbols( (WORD) max_idx >> (num_bits*(order-1)), 1) );
00708         max_idx=bt[num_words*i + max_idx];
00709     }
00710 
00711     delete[] bt;
00712     delete[] score;
00713     SG_UNREF(alpha);
00714     return result;
00715 }

SHOGUN Machine Learning Toolbox - Documentation