00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026
00027 struct S_THREAD_PARAM
00028 {
00029 int32_t* vec;
00030 float64_t* result;
00031 float64_t* weights;
00032 CWeightedDegreeStringKernel* kernel;
00033 CTrie<DNATrie>* tries;
00034 float64_t factor;
00035 int32_t j;
00036 int32_t start;
00037 int32_t end;
00038 int32_t length;
00039 int32_t* vec_idx;
00040 };
00041
00042 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00043 int32_t degree_, EWDKernType type_)
00044 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00045 weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00046 max_mismatch(0), seq_length(0), block_computation(true),
00047 num_block_weights_external(0), block_weights_external(NULL),
00048 block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00049 tree_initialized(false), alphabet(NULL)
00050 {
00051 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00052 lhs=NULL;
00053 rhs=NULL;
00054
00055 if (type!=E_EXTERNAL)
00056 set_wd_weights_by_type(type);
00057
00058 set_normalizer(new CFirstElementKernelNormalizer());
00059 }
00060
00061 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00062 float64_t *weights_, int32_t degree_)
00063 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00064 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00065 max_mismatch(0), seq_length(0), block_computation(true),
00066 num_block_weights_external(0), block_weights_external(NULL),
00067 block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00068 tree_initialized(false), alphabet(NULL)
00069 {
00070 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00071 lhs=NULL;
00072 rhs=NULL;
00073
00074 weights=new float64_t[degree*(1+max_mismatch)];
00075 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00076 weights[i]=weights_[i];
00077 set_normalizer(new CFirstElementKernelNormalizer());
00078 }
00079
00080 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00081 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00082 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00083 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00084 max_mismatch(0), seq_length(0), block_computation(true),
00085 num_block_weights_external(0), block_weights_external(NULL),
00086 block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00087 tree_initialized(false), alphabet(NULL)
00088 {
00089 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00090
00091 set_wd_weights_by_type(type);
00092 set_normalizer(new CFirstElementKernelNormalizer());
00093 init(l, r);
00094 }
00095
00096 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00097 {
00098 cleanup();
00099
00100 delete[] weights;
00101 weights=NULL;
00102
00103 delete[] block_weights;
00104 block_weights=NULL;
00105
00106 delete[] position_weights;
00107 position_weights=NULL;
00108
00109 delete[] weights_buffer;
00110 weights_buffer=NULL;
00111 }
00112
00113
00114 void CWeightedDegreeStringKernel::remove_lhs()
00115 {
00116 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00117 delete_optimization();
00118
00119 if (tries!=NULL)
00120 tries->destroy();
00121
00122 CKernel::remove_lhs();
00123 }
00124
00125 void CWeightedDegreeStringKernel::create_empty_tries()
00126 {
00127 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00128
00129 if (tries!=NULL)
00130 {
00131 tries->destroy() ;
00132 tries->create(seq_length, max_mismatch==0) ;
00133 }
00134 }
00135
00136 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00137 {
00138 int32_t lhs_changed=(lhs!=l);
00139 int32_t rhs_changed=(rhs!=r);
00140
00141 CStringKernel<char>::init(l,r);
00142
00143 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00144 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00145
00146 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00147 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00148
00149 int32_t len=sf_l->get_max_vector_length();
00150 if (lhs_changed && !sf_l->have_same_length(len))
00151 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00152
00153 if (rhs_changed && !sf_r->have_same_length(len))
00154 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00155
00156 delete alphabet;
00157 alphabet=new CAlphabet(sf_l->get_alphabet());
00158 CAlphabet* ralphabet=(sf_r->get_alphabet());
00159 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00160 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00161
00162 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00163 SG_UNREF(ralphabet);
00164
00165 if (tries!=NULL) {
00166 tries->delete_trees(max_mismatch==0);
00167 delete tries;
00168 }
00169 tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00170 create_empty_tries();
00171
00172 init_block_weights();
00173
00174 return init_normalizer();
00175 }
00176
00177 void CWeightedDegreeStringKernel::cleanup()
00178 {
00179 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00180 delete_optimization();
00181
00182 delete[] block_weights;
00183 block_weights=NULL;
00184
00185 if (tries!=NULL)
00186 {
00187 tries->destroy();
00188 delete tries;
00189 tries=NULL;
00190 }
00191
00192 seq_length=0;
00193 tree_initialized = false;
00194
00195 delete alphabet;
00196 alphabet=NULL;
00197
00198 CKernel::cleanup();
00199 }
00200
00201 bool CWeightedDegreeStringKernel::load_init(FILE* src)
00202 {
00203 return false;
00204 }
00205
00206 bool CWeightedDegreeStringKernel::save_init(FILE* dest)
00207 {
00208 return false;
00209 }
00210
00211
00212 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00213 {
00214 if (tree_num<0)
00215 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00216
00217 delete_optimization();
00218
00219 if (tree_num<0)
00220 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00221
00222 for (int32_t i=0; i<count; i++)
00223 {
00224 if (tree_num<0)
00225 {
00226 if ( (i % (count/10+1)) == 0)
00227 SG_PROGRESS(i, 0, count);
00228
00229 if (max_mismatch==0)
00230 add_example_to_tree(IDX[i], alphas[i]) ;
00231 else
00232 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00233
00234
00235 }
00236 else
00237 {
00238 if (max_mismatch==0)
00239 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00240 else
00241 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00242 }
00243 }
00244
00245 if (tree_num<0)
00246 SG_DONE();
00247
00248
00249
00250 set_is_initialized(true) ;
00251 return true ;
00252 }
00253
00254 bool CWeightedDegreeStringKernel::delete_optimization()
00255 {
00256 if (get_is_initialized())
00257 {
00258 if (tries!=NULL)
00259 tries->delete_trees(max_mismatch==0);
00260 set_is_initialized(false);
00261 return true;
00262 }
00263
00264 return false;
00265 }
00266
00267
00268 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00269 char* avec, int32_t alen, char* bvec, int32_t blen)
00270 {
00271 float64_t sum = 0.0;
00272
00273 for (int32_t i=0; i<alen; i++)
00274 {
00275 float64_t sumi = 0.0;
00276 int32_t mismatches=0;
00277
00278 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00279 {
00280 if (avec[i+j]!=bvec[i+j])
00281 {
00282 mismatches++ ;
00283 if (mismatches>max_mismatch)
00284 break ;
00285 } ;
00286 sumi += weights[j+degree*mismatches];
00287 }
00288 if (position_weights!=NULL)
00289 sum+=position_weights[i]*sumi ;
00290 else
00291 sum+=sumi ;
00292 }
00293 return sum ;
00294 }
00295
00296 float64_t CWeightedDegreeStringKernel::compute_using_block(
00297 char* avec, int32_t alen, char* bvec, int32_t blen)
00298 {
00299 ASSERT(alen==blen);
00300
00301 float64_t sum=0;
00302 int32_t match_len=-1;
00303
00304 for (int32_t i=0; i<alen; i++)
00305 {
00306 if (avec[i]==bvec[i])
00307 match_len++;
00308 else
00309 {
00310 if (match_len>=0)
00311 sum+=block_weights[match_len];
00312 match_len=-1;
00313 }
00314 }
00315
00316 if (match_len>=0)
00317 sum+=block_weights[match_len];
00318
00319 return sum;
00320 }
00321
00322 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00323 char* avec, int32_t alen, char* bvec, int32_t blen)
00324 {
00325 float64_t sum = 0.0;
00326
00327 for (int32_t i=0; i<alen; i++)
00328 {
00329 float64_t sumi = 0.0;
00330
00331 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00332 {
00333 if (avec[i+j]!=bvec[i+j])
00334 break ;
00335 sumi += weights[j];
00336 }
00337 if (position_weights!=NULL)
00338 sum+=position_weights[i]*sumi ;
00339 else
00340 sum+=sumi ;
00341 }
00342 return sum ;
00343 }
00344
00345 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00346 char* avec, int32_t alen, char* bvec, int32_t blen)
00347 {
00348 float64_t sum = 0.0;
00349
00350 for (int32_t i=0; i<alen; i++)
00351 {
00352 float64_t sumi=0.0;
00353 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00354 {
00355 if (avec[i+j]!=bvec[i+j])
00356 break;
00357 sumi += weights[i*degree+j];
00358 }
00359 if (position_weights!=NULL)
00360 sum += position_weights[i]*sumi ;
00361 else
00362 sum += sumi ;
00363 }
00364
00365 return sum ;
00366 }
00367
00368
00369 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00370 {
00371 int32_t alen, blen;
00372 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00373 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00374 float64_t result=0;
00375
00376 if (max_mismatch==0 && length==0 && block_computation)
00377 result=compute_using_block(avec, alen, bvec, blen);
00378 else
00379 {
00380 if (max_mismatch>0)
00381 result=compute_with_mismatch(avec, alen, bvec, blen);
00382 else if (length==0)
00383 result=compute_without_mismatch(avec, alen, bvec, blen);
00384 else
00385 result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00386 }
00387
00388 return result;
00389 }
00390
00391
00392 void CWeightedDegreeStringKernel::add_example_to_tree(
00393 int32_t idx, float64_t alpha)
00394 {
00395 ASSERT(alphabet);
00396 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00397
00398 int32_t len=0;
00399 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00400 ASSERT(max_mismatch==0);
00401 int32_t *vec=new int32_t[len];
00402
00403 for (int32_t i=0; i<len; i++)
00404 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00405
00406 if (length == 0 || max_mismatch > 0)
00407 {
00408 for (int32_t i=0; i<len; i++)
00409 {
00410 float64_t alpha_pw=alpha;
00411
00412
00413 if (alpha_pw==0.0)
00414 continue;
00415 ASSERT(tries);
00416 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00417 }
00418 }
00419 else
00420 {
00421 for (int32_t i=0; i<len; i++)
00422 {
00423 float64_t alpha_pw=alpha;
00424
00425
00426 if (alpha_pw==0.0)
00427 continue ;
00428 ASSERT(tries);
00429 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00430 }
00431 }
00432 delete[] vec ;
00433 tree_initialized=true ;
00434 }
00435
00436 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00437 int32_t idx, float64_t alpha, int32_t tree_num)
00438 {
00439 ASSERT(alphabet);
00440 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00441
00442 int32_t len ;
00443 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00444 ASSERT(max_mismatch==0);
00445 int32_t *vec = new int32_t[len] ;
00446
00447 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00448 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00449
00450 ASSERT(tries);
00451 if (alpha!=0.0)
00452 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00453
00454 delete[] vec ;
00455 tree_initialized=true ;
00456 }
00457
00458 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00459 {
00460 ASSERT(tries);
00461 ASSERT(alphabet);
00462 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00463
00464 int32_t len ;
00465 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00466
00467 int32_t *vec = new int32_t[len] ;
00468
00469 for (int32_t i=0; i<len; i++)
00470 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00471
00472 for (int32_t i=0; i<len; i++)
00473 {
00474 if (alpha!=0.0)
00475 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00476 }
00477
00478 delete[] vec ;
00479 tree_initialized=true ;
00480 }
00481
00482 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00483 int32_t idx, float64_t alpha, int32_t tree_num)
00484 {
00485 ASSERT(tries);
00486 ASSERT(alphabet);
00487 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00488
00489 int32_t len=0;
00490 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00491 int32_t *vec=new int32_t[len];
00492
00493 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00494 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00495
00496 if (alpha!=0.0)
00497 {
00498 tries->add_example_to_tree_mismatch_recursion(
00499 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00500 0, 0, max_mismatch, weights);
00501 }
00502
00503 delete[] vec;
00504 tree_initialized=true;
00505 }
00506
00507
00508 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00509 {
00510 ASSERT(alphabet);
00511 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00512
00513 int32_t len=0;
00514 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00515 ASSERT(char_vec && len>0);
00516 int32_t *vec=new int32_t[len];
00517
00518 for (int32_t i=0; i<len; i++)
00519 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00520
00521 float64_t sum=0;
00522 ASSERT(tries);
00523 for (int32_t i=0; i<len; i++)
00524 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00525
00526 delete[] vec;
00527 return normalizer->normalize_rhs(sum, idx);
00528 }
00529
00530 void CWeightedDegreeStringKernel::compute_by_tree(
00531 int32_t idx, float64_t* LevelContrib)
00532 {
00533 ASSERT(alphabet);
00534 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00535
00536 int32_t len ;
00537 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00538
00539 int32_t *vec = new int32_t[len] ;
00540
00541 for (int32_t i=0; i<len; i++)
00542 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00543
00544 ASSERT(tries);
00545 for (int32_t i=0; i<len; i++)
00546 {
00547 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00548 normalizer->normalize_rhs(1.0, idx),
00549 mkl_stepsize, weights, (length!=0));
00550 }
00551
00552 delete[] vec ;
00553 }
00554
00555 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00556 {
00557 ASSERT(tries);
00558 return tries->compute_abs_weights(len);
00559 }
00560
00561 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00562 {
00563 ASSERT(degree>0);
00564 ASSERT(p_type==E_WD);
00565
00566 delete[] weights;
00567 weights=new float64_t[degree];
00568 if (weights)
00569 {
00570 int32_t i;
00571 float64_t sum=0;
00572 for (i=0; i<degree; i++)
00573 {
00574 weights[i]=degree-i;
00575 sum+=weights[i];
00576 }
00577 for (i=0; i<degree; i++)
00578 weights[i]/=sum;
00579
00580 for (i=0; i<degree; i++)
00581 {
00582 for (int32_t j=1; j<=max_mismatch; j++)
00583 {
00584 if (j<i+1)
00585 {
00586 int32_t nk=CMath::nchoosek(i+1, j);
00587 weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00588 }
00589 else
00590 weights[i+j*degree]= 0;
00591 }
00592 }
00593
00594 if (which_degree>=0)
00595 {
00596 ASSERT(which_degree<degree);
00597 for (i=0; i<degree; i++)
00598 {
00599 if (i!=which_degree)
00600 weights[i]=0;
00601 else
00602 weights[i]=1;
00603 }
00604 }
00605 return true;
00606 }
00607 else
00608 return false;
00609 }
00610
00611 bool CWeightedDegreeStringKernel::set_weights(
00612 float64_t* ws, int32_t d, int32_t len)
00613 {
00614 SG_DEBUG("degree = %i d=%i\n", degree, d);
00615 degree=d;
00616 ASSERT(tries);
00617 tries->set_degree(degree);
00618 length=len;
00619
00620 if (length==0) length=1;
00621 int32_t num_weights=degree*(length+max_mismatch);
00622 delete[] weights;
00623 weights=new float64_t[num_weights];
00624 if (weights)
00625 {
00626 for (int32_t i=0; i<num_weights; i++) {
00627 if (ws[i])
00628 weights[i]=ws[i];
00629 }
00630 return true;
00631 }
00632 else
00633 return false;
00634 }
00635
00636 bool CWeightedDegreeStringKernel::set_position_weights(
00637 float64_t* pws, int32_t len)
00638 {
00639 if (len==0)
00640 {
00641 delete[] position_weights;
00642 position_weights=NULL;
00643 ASSERT(tries);
00644 tries->set_position_weights(position_weights);
00645 }
00646
00647 if (seq_length!=len)
00648 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00649
00650 delete[] position_weights;
00651 position_weights=new float64_t[len];
00652 ASSERT(tries);
00653 tries->set_position_weights(position_weights);
00654
00655 if (position_weights)
00656 {
00657 for (int32_t i=0; i<len; i++)
00658 position_weights[i]=pws[i];
00659 return true;
00660 }
00661 else
00662 return false;
00663 }
00664
00665 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00666 {
00667 delete[] block_weights;
00668 block_weights=new float64_t[CMath::max(seq_length,degree)];
00669
00670 if (block_weights)
00671 {
00672 int32_t k;
00673 float64_t d=degree;
00674
00675 for (k=0; k<degree; k++)
00676 block_weights[k]=
00677 (-pow(k, 3)+(3*d-3)*pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00678 for (k=degree; k<seq_length; k++)
00679 block_weights[k]=(-d+3*k+4)/3;
00680 }
00681
00682 return (block_weights!=NULL);
00683 }
00684
00685 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00686 {
00687 ASSERT(weights);
00688 delete[] block_weights;
00689 block_weights=new float64_t[CMath::max(seq_length,degree)];
00690
00691 if (block_weights)
00692 {
00693 int32_t i=0;
00694 block_weights[0]=weights[0];
00695 for (i=1; i<CMath::max(seq_length,degree); i++)
00696 block_weights[i]=0;
00697
00698 for (i=1; i<CMath::max(seq_length,degree); i++)
00699 {
00700 block_weights[i]=block_weights[i-1];
00701
00702 float64_t contrib=0;
00703 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00704 contrib+=weights[j];
00705
00706 block_weights[i]+=contrib;
00707 }
00708 }
00709
00710 return (block_weights!=NULL);
00711 }
00712
00713 bool CWeightedDegreeStringKernel::init_block_weights_const()
00714 {
00715 delete[] block_weights;
00716 block_weights=new float64_t[seq_length];
00717
00718 if (block_weights)
00719 {
00720 for (int32_t i=1; i<seq_length+1 ; i++)
00721 block_weights[i-1]=1.0/seq_length;
00722 }
00723
00724 return (block_weights!=NULL);
00725 }
00726
00727 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00728 {
00729 delete[] block_weights;
00730 block_weights=new float64_t[seq_length];
00731
00732 if (block_weights)
00733 {
00734 for (int32_t i=1; i<seq_length+1 ; i++)
00735 block_weights[i-1]=degree*i;
00736 }
00737
00738 return (block_weights!=NULL);
00739 }
00740
00741 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00742 {
00743 delete[] block_weights;
00744 block_weights=new float64_t[seq_length];
00745
00746 if (block_weights)
00747 {
00748 for (int32_t i=1; i<degree+1 ; i++)
00749 block_weights[i-1]=((float64_t) i)*i;
00750
00751 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00752 block_weights[i-1]=i;
00753 }
00754
00755 return (block_weights!=NULL);
00756 }
00757
00758 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00759 {
00760 delete[] block_weights;
00761 block_weights=new float64_t[seq_length];
00762
00763 if (block_weights)
00764 {
00765 for (int32_t i=1; i<degree+1 ; i++)
00766 block_weights[i-1]=((float64_t) i)*i*i;
00767
00768 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00769 block_weights[i-1]=i;
00770 }
00771
00772 return (block_weights!=NULL);
00773 }
00774
00775 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00776 {
00777 delete[] block_weights;
00778 block_weights=new float64_t[seq_length];
00779
00780 if (block_weights)
00781 {
00782 for (int32_t i=1; i<degree+1 ; i++)
00783 block_weights[i-1]=exp(((float64_t) i/10.0));
00784
00785 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00786 block_weights[i-1]=i;
00787 }
00788
00789 return (block_weights!=NULL);
00790 }
00791
00792 bool CWeightedDegreeStringKernel::init_block_weights_log()
00793 {
00794 delete[] block_weights;
00795 block_weights=new float64_t[seq_length];
00796
00797 if (block_weights)
00798 {
00799 for (int32_t i=1; i<degree+1 ; i++)
00800 block_weights[i-1]=pow(log(i),2);
00801
00802 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00803 block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
00804 }
00805
00806 return (block_weights!=NULL);
00807 }
00808
00809 bool CWeightedDegreeStringKernel::init_block_weights_external()
00810 {
00811 if (block_weights_external && (seq_length == num_block_weights_external) )
00812 {
00813 delete[] block_weights;
00814 block_weights=new float64_t[seq_length];
00815
00816 if (block_weights)
00817 {
00818 for (int32_t i=0; i<seq_length; i++)
00819 block_weights[i]=block_weights_external[i];
00820 }
00821 }
00822 else {
00823 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00824 }
00825 return (block_weights!=NULL);
00826 }
00827
00828 bool CWeightedDegreeStringKernel::init_block_weights()
00829 {
00830 switch (type)
00831 {
00832 case E_WD:
00833 return init_block_weights_from_wd();
00834 case E_EXTERNAL:
00835 return init_block_weights_from_wd_external();
00836 case E_BLOCK_CONST:
00837 return init_block_weights_const();
00838 case E_BLOCK_LINEAR:
00839 return init_block_weights_linear();
00840 case E_BLOCK_SQPOLY:
00841 return init_block_weights_sqpoly();
00842 case E_BLOCK_CUBICPOLY:
00843 return init_block_weights_cubicpoly();
00844 case E_BLOCK_EXP:
00845 return init_block_weights_exp();
00846 case E_BLOCK_LOG:
00847 return init_block_weights_log();
00848 case E_BLOCK_EXTERNAL:
00849 return init_block_weights_external();
00850 default:
00851 return false;
00852 };
00853 }
00854
00855
00856 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00857 {
00858 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00859 int32_t j=params->j;
00860 CWeightedDegreeStringKernel* wd=params->kernel;
00861 CTrie<DNATrie>* tries=params->tries;
00862 float64_t* weights=params->weights;
00863 int32_t length=params->length;
00864 int32_t* vec=params->vec;
00865 float64_t* result=params->result;
00866 float64_t factor=params->factor;
00867 int32_t* vec_idx=params->vec_idx;
00868
00869 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00870 CAlphabet* alpha=wd->alphabet;
00871
00872 for (int32_t i=params->start; i<params->end; i++)
00873 {
00874 int32_t len=0;
00875 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
00876 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00877 vec[k]=alpha->remap_to_bin(char_vec[k]);
00878
00879 ASSERT(tries);
00880
00881 result[i]+=factor*
00882 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00883 }
00884
00885 SG_UNREF(rhs_feat);
00886
00887 return NULL;
00888 }
00889
00890 void CWeightedDegreeStringKernel::compute_batch(
00891 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00892 int32_t* IDX, float64_t* alphas, float64_t factor)
00893 {
00894 ASSERT(tries);
00895 ASSERT(alphabet);
00896 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00897 ASSERT(rhs);
00898 ASSERT(num_vec<=rhs->get_num_vectors());
00899 ASSERT(num_vec>0);
00900 ASSERT(vec_idx);
00901 ASSERT(result);
00902 create_empty_tries();
00903
00904 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00905 ASSERT(num_feat>0);
00906 int32_t num_threads=parallel.get_num_threads();
00907 ASSERT(num_threads>0);
00908 int32_t* vec=new int32_t[num_threads*num_feat];
00909
00910 if (num_threads < 2)
00911 {
00912 #ifdef CYGWIN
00913 for (int32_t j=0; j<num_feat; j++)
00914 #else
00915 CSignal::clear_cancel();
00916 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00917 #endif
00918 {
00919 init_optimization(num_suppvec, IDX, alphas, j);
00920 S_THREAD_PARAM params;
00921 params.vec=vec;
00922 params.result=result;
00923 params.weights=weights;
00924 params.kernel=this;
00925 params.tries=tries;
00926 params.factor=factor;
00927 params.j=j;
00928 params.start=0;
00929 params.end=num_vec;
00930 params.length=length;
00931 params.vec_idx=vec_idx;
00932 compute_batch_helper((void*) ¶ms);
00933
00934 SG_PROGRESS(j,0,num_feat);
00935 }
00936 }
00937 #ifndef WIN32
00938 else
00939 {
00940 CSignal::clear_cancel();
00941 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00942 {
00943 init_optimization(num_suppvec, IDX, alphas, j);
00944 pthread_t threads[num_threads-1];
00945 S_THREAD_PARAM params[num_threads];
00946 int32_t step= num_vec/num_threads;
00947 int32_t t;
00948
00949 for (t=0; t<num_threads-1; t++)
00950 {
00951 params[t].vec=&vec[num_feat*t];
00952 params[t].result=result;
00953 params[t].weights=weights;
00954 params[t].kernel=this;
00955 params[t].tries=tries;
00956 params[t].factor=factor;
00957 params[t].j=j;
00958 params[t].start = t*step;
00959 params[t].end = (t+1)*step;
00960 params[t].length=length;
00961 params[t].vec_idx=vec_idx;
00962 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]);
00963 }
00964 params[t].vec=&vec[num_feat*t];
00965 params[t].result=result;
00966 params[t].weights=weights;
00967 params[t].kernel=this;
00968 params[t].tries=tries;
00969 params[t].factor=factor;
00970 params[t].j=j;
00971 params[t].start=t*step;
00972 params[t].end=num_vec;
00973 params[t].length=length;
00974 params[t].vec_idx=vec_idx;
00975 compute_batch_helper((void*) ¶ms[t]);
00976
00977 for (t=0; t<num_threads-1; t++)
00978 pthread_join(threads[t], NULL);
00979 SG_PROGRESS(j,0,num_feat);
00980 }
00981 }
00982 #endif
00983
00984 delete[] vec;
00985
00986
00987
00988 create_empty_tries();
00989 }
00990
00991 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
00992 {
00993 if (type==E_EXTERNAL && max!=0) {
00994 return false;
00995 }
00996
00997 max_mismatch=max;
00998
00999 if (lhs!=NULL && rhs!=NULL)
01000 return init(lhs, rhs);
01001 else
01002 return true;
01003 }