00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "features/Features.h"
00020 #include "features/StringFeatures.h"
00021
00022 #include "classifier/svm/SVM.h"
00023
00024 #ifndef WIN32
00025 #include <pthread.h>
00026 #endif
00027
00028 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00029
00030 template <class Trie> struct S_THREAD_PARAM
00031 {
00032 INT* vec;
00033 DREAL* result;
00034 DREAL* weights;
00035 CWeightedDegreePositionStringKernel* kernel;
00036 CTrie<Trie>* tries;
00037 DREAL factor;
00038 INT j;
00039 INT start;
00040 INT end;
00041 INT length;
00042 INT max_shift;
00043 INT* shift;
00044 INT* vec_idx;
00045 };
00046
00047 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00048 INT size, INT d, INT mm, bool un, INT mkls)
00049 : CStringKernel<CHAR>(size), weights(NULL), position_weights(NULL),
00050 position_weights_lhs(NULL), position_weights_rhs(NULL),
00051 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00052 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00053 initialized(false), use_normalization(un),
00054 normalization_const(1.0), num_block_weights_external(0),
00055 block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00056 tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00057 m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00058 m_poim_result_len(0), alphabet(NULL)
00059 {
00060 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00061 set_wd_weights();
00062 ASSERT(weights);
00063 }
00064
00065 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00066 INT size, DREAL* w, INT d, INT mm, INT* s, INT sl, bool un, INT mkls)
00067 : CStringKernel<CHAR>(size), weights(NULL), position_weights(NULL),
00068 position_weights_lhs(NULL), position_weights_rhs(NULL),
00069 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00070 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00071 initialized(false), use_normalization(un),
00072 normalization_const(1.0), num_block_weights_external(0),
00073 block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00074 tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00075 m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00076 m_poim_result_len(0), alphabet(NULL)
00077 {
00078 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00079
00080 weights=new DREAL[d*(1+max_mismatch)];
00081 for (INT i=0; i<d*(1+max_mismatch); i++)
00082 weights[i]=w[i];
00083
00084 set_shifts(s, sl);
00085 }
00086
00087 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00088 CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r, INT d)
00089 : CStringKernel<CHAR>(10), weights(NULL), position_weights(NULL),
00090 position_weights_lhs(NULL), position_weights_rhs(NULL),
00091 weights_buffer(NULL), mkl_stepsize(1), degree(d), length(0),
00092 max_mismatch(0), seq_length(0), shift(NULL), shift_len(0),
00093 initialized(false), use_normalization(true),
00094 normalization_const(1.0), num_block_weights_external(0),
00095 block_weights_external(NULL), block_weights(NULL), type(E_EXTERNAL),
00096 tries(d), poim_tries(d), tree_initialized(false), use_poim_tries(false),
00097 m_poim_distrib(NULL), m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0),
00098 m_poim_result_len(0), alphabet(NULL)
00099 {
00100 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00101 set_wd_weights();
00102 ASSERT(weights);
00103
00104 init(l, r);
00105 }
00106
00107
00108 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00109 {
00110 cleanup();
00111 cleanup_POIM2();
00112
00113 delete[] shift;
00114 shift=NULL;
00115
00116 delete[] weights;
00117 weights=NULL ;
00118
00119 delete[] block_weights;
00120 block_weights=NULL;
00121
00122 delete[] position_weights;
00123 position_weights=NULL;
00124
00125 delete[] position_weights_lhs;
00126 position_weights_lhs=NULL;
00127
00128 delete[] position_weights_rhs;
00129 position_weights_rhs=NULL;
00130
00131 delete[] weights_buffer;
00132 weights_buffer=NULL;
00133 }
00134
00135 void CWeightedDegreePositionStringKernel::remove_lhs()
00136 {
00137 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00138 delete_optimization();
00139 initialized = false;
00140
00141 tries.destroy();
00142 poim_tries.destroy();
00143
00144 CKernel::remove_lhs();
00145 }
00146
00147 void CWeightedDegreePositionStringKernel::create_empty_tries()
00148 {
00149 ASSERT(lhs);
00150 seq_length = ((CStringFeatures<CHAR>*) lhs)->get_max_vector_length();
00151
00152 if (opt_type==SLOWBUTMEMEFFICIENT)
00153 {
00154 tries.create(seq_length, true);
00155 poim_tries.create(seq_length, true);
00156 }
00157 else if (opt_type==FASTBUTMEMHUNGRY)
00158 {
00159 tries.create(seq_length, false);
00160 poim_tries.create(seq_length, false);
00161 }
00162 else
00163 SG_ERROR( "unknown optimization type\n");
00164 }
00165
00166 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00167 {
00168 INT lhs_changed = (lhs!=l) ;
00169 INT rhs_changed = (rhs!=r) ;
00170
00171 bool result=CStringKernel<CHAR>::init(l,r);
00172
00173 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00174 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00175
00176 CStringFeatures<CHAR>* sf_l=(CStringFeatures<CHAR>*) l;
00177 CStringFeatures<CHAR>* sf_r=(CStringFeatures<CHAR>*) r;
00178
00179
00180 if (shift_len==0) {
00181 shift_len=sf_l->get_vector_length(0);
00182 INT *shifts=new INT[shift_len];
00183 for (INT i=0; i<shift_len; i++) {
00184 shifts[i]=1;
00185 }
00186 set_shifts(shifts, shift_len);
00187 delete[] shifts;
00188 }
00189
00190
00191 INT len=sf_l->get_max_vector_length();
00192 if (lhs_changed && !sf_l->have_same_length(len))
00193 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00194
00195 if (rhs_changed && !sf_r->have_same_length(len))
00196 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00197
00198 delete alphabet;
00199 alphabet= new CAlphabet(sf_l->get_alphabet());
00200 CAlphabet* ralphabet=sf_r->get_alphabet();
00201 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00202 properties &= ((ULONG) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00203
00204 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00205 SG_UNREF(ralphabet);
00206
00207
00208 if (!initialized)
00209 {
00210 create_empty_tries();
00211 init_block_weights();
00212 }
00213
00214
00215 if (lhs_changed)
00216 {
00217 normalization_const=1.0;
00218 if (use_normalization)
00219 normalization_const=compute(0,0);
00220 }
00221
00222 SG_DEBUG( "use normalization:%d (const:%f)\n", (use_normalization) ? 1 : 0,
00223 normalization_const);
00224
00225 initialized = true;
00226 return result;
00227 }
00228
00229 void CWeightedDegreePositionStringKernel::cleanup()
00230 {
00231 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00232 delete_optimization();
00233
00234 delete[] block_weights;
00235 block_weights=NULL;
00236
00237 tries.destroy();
00238 poim_tries.destroy();
00239
00240 seq_length = 0;
00241 initialized = false;
00242 tree_initialized = false;
00243
00244 delete alphabet;
00245 alphabet=NULL;
00246
00247 CKernel::cleanup();
00248 }
00249
00250 bool CWeightedDegreePositionStringKernel::load_init(FILE* src)
00251 {
00252 return false;
00253 }
00254
00255 bool CWeightedDegreePositionStringKernel::save_init(FILE* dest)
00256 {
00257 return false;
00258 }
00259
00260 bool CWeightedDegreePositionStringKernel::init_optimization(INT p_count, INT * IDX, DREAL * alphas, INT tree_num, INT upto_tree)
00261 {
00262 ASSERT(position_weights_lhs==NULL);
00263 ASSERT(position_weights_rhs==NULL);
00264
00265 if (upto_tree<0)
00266 upto_tree=tree_num;
00267
00268 if (max_mismatch!=0)
00269 {
00270 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00271 return false ;
00272 }
00273
00274 if (tree_num<0)
00275 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00276
00277 delete_optimization();
00278
00279 if (tree_num<0)
00280 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00281
00282 int i=0;
00283 for (i=0; i<p_count; i++)
00284 {
00285 if (tree_num<0)
00286 {
00287 if ( (i % (p_count/10+1)) == 0)
00288 SG_PROGRESS(i,0,p_count);
00289 add_example_to_tree(IDX[i], alphas[i]);
00290 }
00291 else
00292 {
00293 for (INT t=tree_num; t<=upto_tree; t++)
00294 add_example_to_single_tree(IDX[i], alphas[i], t);
00295 }
00296 }
00297
00298 if (tree_num<0)
00299 SG_DONE();
00300
00301 set_is_initialized(true) ;
00302 return true ;
00303 }
00304
00305 bool CWeightedDegreePositionStringKernel::delete_optimization()
00306 {
00307 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00308 {
00309 tries.set_use_compact_terminal_nodes(false) ;
00310 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00311 }
00312
00313 if (get_is_initialized())
00314 {
00315 if (opt_type==SLOWBUTMEMEFFICIENT)
00316 tries.delete_trees(true);
00317 else if (opt_type==FASTBUTMEMHUNGRY)
00318 tries.delete_trees(false);
00319 else {
00320 SG_ERROR( "unknown optimization type\n");
00321 }
00322 set_is_initialized(false);
00323
00324 return true;
00325 }
00326
00327 return false;
00328 }
00329
00330 DREAL CWeightedDegreePositionStringKernel::compute_with_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00331 {
00332 DREAL max_shift_vec[max_shift];
00333 DREAL sum0=0 ;
00334 for (INT i=0; i<max_shift; i++)
00335 max_shift_vec[i]=0 ;
00336
00337
00338 for (INT i=0; i<alen; i++)
00339 {
00340 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00341 continue ;
00342
00343 INT mismatches=0;
00344 DREAL sumi = 0.0 ;
00345 for (INT j=0; (j<degree) && (i+j<alen); j++)
00346 {
00347 if (avec[i+j]!=bvec[i+j])
00348 {
00349 mismatches++ ;
00350 if (mismatches>max_mismatch)
00351 break ;
00352 } ;
00353 sumi += weights[j+degree*mismatches];
00354 }
00355 if (position_weights!=NULL)
00356 sum0 += position_weights[i]*sumi ;
00357 else
00358 sum0 += sumi ;
00359 } ;
00360
00361 for (INT i=0; i<alen; i++)
00362 {
00363 for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00364 {
00365 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00366 continue ;
00367
00368 DREAL sumi1 = 0.0 ;
00369
00370 INT mismatches=0;
00371 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00372 {
00373 if (avec[i+j+k]!=bvec[i+j])
00374 {
00375 mismatches++ ;
00376 if (mismatches>max_mismatch)
00377 break ;
00378 } ;
00379 sumi1 += weights[j+degree*mismatches];
00380 }
00381 DREAL sumi2 = 0.0 ;
00382
00383 mismatches=0;
00384 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00385 {
00386 if (avec[i+j]!=bvec[i+j+k])
00387 {
00388 mismatches++ ;
00389 if (mismatches>max_mismatch)
00390 break ;
00391 } ;
00392 sumi2 += weights[j+degree*mismatches];
00393 }
00394 if (position_weights!=NULL)
00395 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00396 else
00397 max_shift_vec[k-1] += sumi1 + sumi2 ;
00398 } ;
00399 }
00400
00401 DREAL result = sum0 ;
00402 for (INT i=0; i<max_shift; i++)
00403 result += max_shift_vec[i]/(2*(i+1)) ;
00404
00405 return result ;
00406 }
00407
00408 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00409 {
00410 DREAL max_shift_vec[max_shift];
00411 DREAL sum0=0 ;
00412 for (INT i=0; i<max_shift; i++)
00413 max_shift_vec[i]=0 ;
00414
00415
00416 for (INT i=0; i<alen; i++)
00417 {
00418 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00419 continue ;
00420
00421 DREAL sumi = 0.0 ;
00422 for (INT j=0; (j<degree) && (i+j<alen); j++)
00423 {
00424 if (avec[i+j]!=bvec[i+j])
00425 break ;
00426 sumi += weights[j];
00427 }
00428 if (position_weights!=NULL)
00429 sum0 += position_weights[i]*sumi ;
00430 else
00431 sum0 += sumi ;
00432 } ;
00433
00434 for (INT i=0; i<alen; i++)
00435 {
00436 for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00437 {
00438 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00439 continue ;
00440
00441 DREAL sumi1 = 0.0 ;
00442
00443 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00444 {
00445 if (avec[i+j+k]!=bvec[i+j])
00446 break ;
00447 sumi1 += weights[j];
00448 }
00449 DREAL sumi2 = 0.0 ;
00450
00451 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00452 {
00453 if (avec[i+j]!=bvec[i+j+k])
00454 break ;
00455 sumi2 += weights[j];
00456 }
00457 if (position_weights!=NULL)
00458 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00459 else
00460 max_shift_vec[k-1] += sumi1 + sumi2 ;
00461 } ;
00462 }
00463
00464 DREAL result = sum0 ;
00465 for (INT i=0; i<max_shift; i++)
00466 result += max_shift_vec[i]/(2*(i+1)) ;
00467
00468 return result ;
00469 }
00470
00471 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(CHAR* avec, INT alen, CHAR* bvec, INT blen)
00472 {
00473 DREAL max_shift_vec[max_shift];
00474 DREAL sum0=0 ;
00475 for (INT i=0; i<max_shift; i++)
00476 max_shift_vec[i]=0 ;
00477
00478
00479 for (INT i=0; i<alen; i++)
00480 {
00481 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00482 continue ;
00483 DREAL sumi = 0.0 ;
00484 for (INT j=0; (j<degree) && (i+j<alen); j++)
00485 {
00486 if (avec[i+j]!=bvec[i+j])
00487 break ;
00488 sumi += weights[i*degree+j];
00489 }
00490 if (position_weights!=NULL)
00491 sum0 += position_weights[i]*sumi ;
00492 else
00493 sum0 += sumi ;
00494 } ;
00495
00496 for (INT i=0; i<alen; i++)
00497 {
00498 for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00499 {
00500 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00501 continue ;
00502
00503 DREAL sumi1 = 0.0 ;
00504
00505 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00506 {
00507 if (avec[i+j+k]!=bvec[i+j])
00508 break ;
00509 sumi1 += weights[i*degree+j];
00510 }
00511 DREAL sumi2 = 0.0 ;
00512
00513 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00514 {
00515 if (avec[i+j]!=bvec[i+j+k])
00516 break ;
00517 sumi2 += weights[i*degree+j];
00518 }
00519 if (position_weights!=NULL)
00520 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00521 else
00522 max_shift_vec[k-1] += sumi1 + sumi2 ;
00523 } ;
00524 }
00525
00526 DREAL result = sum0 ;
00527 for (INT i=0; i<max_shift; i++)
00528 result += max_shift_vec[i]/(2*(i+1)) ;
00529
00530 return result ;
00531 }
00532
00533 DREAL CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(CHAR* avec, DREAL* pos_weights_lhs, INT alen, CHAR* bvec, DREAL* pos_weights_rhs, INT blen)
00534 {
00535 DREAL max_shift_vec[max_shift];
00536 DREAL sum0=0 ;
00537 for (INT i=0; i<max_shift; i++)
00538 max_shift_vec[i]=0 ;
00539
00540
00541 for (INT i=0; i<alen; i++)
00542 {
00543 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00544 continue ;
00545
00546 DREAL sumi = 0.0 ;
00547 DREAL posweight_lhs = 0.0 ;
00548 DREAL posweight_rhs = 0.0 ;
00549 for (INT j=0; (j<degree) && (i+j<alen); j++)
00550 {
00551 posweight_lhs += pos_weights_lhs[i+j] ;
00552 posweight_rhs += pos_weights_rhs[i+j] ;
00553
00554 if (avec[i+j]!=bvec[i+j])
00555 break ;
00556 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00557 }
00558 if (position_weights!=NULL)
00559 sum0 += position_weights[i]*sumi ;
00560 else
00561 sum0 += sumi ;
00562 } ;
00563
00564 for (INT i=0; i<alen; i++)
00565 {
00566 for (INT k=1; (k<=shift[i]) && (i+k<alen); k++)
00567 {
00568 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00569 continue ;
00570
00571
00572 DREAL sumi1 = 0.0 ;
00573 DREAL posweight_lhs = 0.0 ;
00574 DREAL posweight_rhs = 0.0 ;
00575 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00576 {
00577 posweight_lhs += pos_weights_lhs[i+j+k] ;
00578 posweight_rhs += pos_weights_rhs[i+j] ;
00579 if (avec[i+j+k]!=bvec[i+j])
00580 break ;
00581 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00582 }
00583
00584 DREAL sumi2 = 0.0 ;
00585 posweight_lhs = 0.0 ;
00586 posweight_rhs = 0.0 ;
00587 for (INT j=0; (j<degree) && (i+j+k<alen); j++)
00588 {
00589 posweight_lhs += pos_weights_lhs[i+j] ;
00590 posweight_rhs += pos_weights_rhs[i+j+k] ;
00591 if (avec[i+j]!=bvec[i+j+k])
00592 break ;
00593 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00594 }
00595 if (position_weights!=NULL)
00596 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00597 else
00598 max_shift_vec[k-1] += sumi1 + sumi2 ;
00599 } ;
00600 }
00601
00602 DREAL result = sum0 ;
00603 for (INT i=0; i<max_shift; i++)
00604 result += max_shift_vec[i]/(2*(i+1)) ;
00605
00606 return result ;
00607 }
00608
00609
00610 DREAL CWeightedDegreePositionStringKernel::compute(INT idx_a, INT idx_b)
00611 {
00612 INT alen, blen;
00613
00614 CHAR* avec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx_a, alen);
00615 CHAR* bvec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx_b, blen);
00616
00617 ASSERT(alen==blen);
00618 ASSERT(shift_len==alen);
00619
00620 DREAL result = 0 ;
00621 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00622 {
00623 ASSERT(max_mismatch==0);
00624 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs[idx_b*blen], blen) ;
00625 }
00626 else if (max_mismatch > 0)
00627 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00628 else if (length==0)
00629 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00630 else
00631 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00632
00633 result/=normalization_const;
00634
00635 return result ;
00636 }
00637
00638
00639 void CWeightedDegreePositionStringKernel::add_example_to_tree(INT idx, DREAL alpha)
00640 {
00641 ASSERT(position_weights_lhs==NULL);
00642 ASSERT(position_weights_rhs==NULL);
00643 ASSERT(alphabet);
00644 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00645
00646 INT len=0;
00647 CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00648 ASSERT(max_mismatch==0);
00649 INT *vec = new INT[len] ;
00650
00651 for (INT i=0; i<len; i++)
00652 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00653
00654 if (opt_type==FASTBUTMEMHUNGRY)
00655 {
00656
00657 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00658 }
00659
00660 for (INT i=0; i<len; i++)
00661 {
00662 INT max_s=-1;
00663
00664 if (opt_type==SLOWBUTMEMEFFICIENT)
00665 max_s=0;
00666 else if (opt_type==FASTBUTMEMHUNGRY)
00667 max_s=shift[i];
00668 else {
00669 SG_ERROR( "unknown optimization type\n");
00670 }
00671
00672 for (INT s=max_s; s>=0; s--)
00673 {
00674 DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00675 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00676
00677
00678 if ((s==0) || (i+s>=len))
00679 continue;
00680
00681 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00682
00683 }
00684 }
00685
00686 delete[] vec ;
00687 tree_initialized=true ;
00688 }
00689
00690 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(INT idx, DREAL alpha, INT tree_num)
00691 {
00692 ASSERT(position_weights_lhs==NULL);
00693 ASSERT(position_weights_rhs==NULL);
00694 ASSERT(alphabet);
00695 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00696
00697 INT len=0;
00698 CHAR* char_vec=((CStringFeatures<CHAR>*) lhs)->get_feature_vector(idx, len);
00699 ASSERT(max_mismatch==0);
00700 INT *vec=new INT[len];
00701 INT max_s=-1;
00702
00703 if (opt_type==SLOWBUTMEMEFFICIENT)
00704 max_s=0;
00705 else if (opt_type==FASTBUTMEMHUNGRY)
00706 {
00707 ASSERT(!tries.get_use_compact_terminal_nodes());
00708 max_s=shift[tree_num];
00709 }
00710 else {
00711 SG_ERROR( "unknown optimization type\n");
00712 }
00713 for (INT i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+degree+max_shift); i++)
00714 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00715
00716 for (INT s=max_s; s>=0; s--)
00717 {
00718 DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00719 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00720
00721 }
00722
00723 if (opt_type==FASTBUTMEMHUNGRY)
00724 {
00725 for (INT i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00726 {
00727 INT s=tree_num-i;
00728 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00729 {
00730 DREAL alpha_pw = (s==0) ? (alpha) : (alpha/(2.0*s)) ;
00731 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00732
00733 }
00734 }
00735 }
00736 delete[] vec ;
00737 tree_initialized=true ;
00738 }
00739
00740 DREAL CWeightedDegreePositionStringKernel::compute_by_tree(INT idx)
00741 {
00742 ASSERT(position_weights_lhs==NULL);
00743 ASSERT(position_weights_rhs==NULL);
00744 ASSERT(alphabet);
00745 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00746
00747 DREAL sum=0;
00748 INT len=0;
00749 CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00750 ASSERT(max_mismatch==0);
00751 INT *vec=new INT[len];
00752
00753 for (INT i=0; i<len; i++)
00754 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00755
00756 for (INT i=0; i<len; i++)
00757 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00758
00759 if (opt_type==SLOWBUTMEMEFFICIENT)
00760 {
00761 for (INT i=0; i<len; i++)
00762 {
00763 for (INT s=1; (s<=shift[i]) && (i+s<len); s++)
00764 {
00765 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00766 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00767 }
00768 }
00769 }
00770
00771 delete[] vec ;
00772
00773 return sum/normalization_const;
00774 }
00775
00776 void CWeightedDegreePositionStringKernel::compute_by_tree(INT idx, DREAL* LevelContrib)
00777 {
00778 ASSERT(position_weights_lhs==NULL);
00779 ASSERT(position_weights_rhs==NULL);
00780 ASSERT(alphabet);
00781 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00782
00783 INT len=0;
00784 CHAR* char_vec=((CStringFeatures<CHAR>*) rhs)->get_feature_vector(idx, len);
00785 ASSERT(max_mismatch==0);
00786 INT *vec=new INT[len];
00787
00788 for (INT i=0; i<len; i++)
00789 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00790
00791 for (INT i=0; i<len; i++)
00792 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib, 1.0/normalization_const, mkl_stepsize, weights, (length!=0)) ;
00793
00794 if (opt_type==SLOWBUTMEMEFFICIENT)
00795 {
00796 for (INT i=0; i<len; i++)
00797 for (INT k=1; (k<=shift[i]) && (i+k<len); k++)
00798 {
00799 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib, 1.0/(2*k*normalization_const), mkl_stepsize, weights, (length!=0)) ;
00800 tries.compute_by_tree_helper(vec, len, i+k, i, i+k, LevelContrib, 1.0/(2*k*normalization_const), mkl_stepsize, weights, (length!=0)) ;
00801 }
00802 }
00803
00804 delete[] vec ;
00805 }
00806
00807 DREAL *CWeightedDegreePositionStringKernel::compute_abs_weights(int &len)
00808 {
00809 return tries.compute_abs_weights(len) ;
00810 }
00811
00812 bool CWeightedDegreePositionStringKernel::set_shifts(INT* shift_, INT shift_len_)
00813 {
00814 delete[] shift;
00815
00816 shift_len = shift_len_ ;
00817 shift = new INT[shift_len] ;
00818
00819 if (shift)
00820 {
00821 max_shift = 0 ;
00822
00823 for (INT i=0; i<shift_len; i++)
00824 {
00825 shift[i] = shift_[i] ;
00826 if (shift[i]>max_shift)
00827 max_shift = shift[i] ;
00828 }
00829
00830 ASSERT(max_shift>=0 && max_shift<=shift_len);
00831 }
00832
00833 return false;
00834 }
00835
00836 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00837 {
00838 ASSERT(degree>0);
00839
00840 delete[] weights;
00841 weights=new DREAL[degree];
00842 if (weights)
00843 {
00844 INT i;
00845 DREAL sum=0;
00846 for (i=0; i<degree; i++)
00847 {
00848 weights[i]=degree-i;
00849 sum+=weights[i];
00850 }
00851 for (i=0; i<degree; i++)
00852 weights[i]/=sum;
00853
00854 for (i=0; i<degree; i++)
00855 {
00856 for (INT j=1; j<=max_mismatch; j++)
00857 {
00858 if (j<i+1)
00859 {
00860 INT nk=CMath::nchoosek(i+1, j);
00861 weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00862 }
00863 else
00864 weights[i+j*degree]= 0;
00865 }
00866 }
00867
00868 return true;
00869 }
00870 else
00871 return false;
00872 }
00873
00874 bool CWeightedDegreePositionStringKernel::set_weights(DREAL* ws, INT d, INT len)
00875 {
00876 SG_DEBUG( "degree = %i d=%i\n", degree, d) ;
00877 degree = d ;
00878 length=len;
00879
00880 if (len <= 0)
00881 len=1;
00882
00883 delete[] weights;
00884 weights=new DREAL[d*len];
00885
00886 if (weights)
00887 {
00888 for (int i=0; i<degree*len; i++)
00889 weights[i]=ws[i];
00890 return true;
00891 }
00892 else
00893 return false;
00894 }
00895
00896 bool CWeightedDegreePositionStringKernel::set_position_weights(DREAL* pws, INT len)
00897 {
00898 fprintf(stderr, "len=%i\n", len) ;
00899
00900 if (len==0)
00901 {
00902 delete[] position_weights ;
00903 position_weights = NULL ;
00904 tries.set_position_weights(position_weights) ;
00905 return true ;
00906 }
00907 if (seq_length==0)
00908 seq_length = len ;
00909
00910 if (seq_length!=len)
00911 {
00912 SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00913 return false ;
00914 }
00915 delete[] position_weights;
00916 position_weights=new DREAL[len];
00917 tries.set_position_weights(position_weights) ;
00918
00919 if (position_weights)
00920 {
00921 for (int i=0; i<len; i++)
00922 position_weights[i]=pws[i];
00923 return true;
00924 }
00925 else
00926 return false;
00927 }
00928
00929 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(DREAL* pws, INT len, INT num)
00930 {
00931 fprintf(stderr, "lhs %i %i %i\n", len, num, seq_length) ;
00932
00933 if (position_weights_rhs==position_weights_lhs)
00934 position_weights_rhs=NULL ;
00935 else
00936 delete_position_weights_rhs() ;
00937
00938 if (len==0)
00939 {
00940 return delete_position_weights_lhs() ;
00941 }
00942
00943 if (seq_length!=len)
00944 {
00945 SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00946 return false ;
00947 }
00948 if (!lhs)
00949 {
00950 SG_ERROR("lhs=NULL\n") ;
00951 return false ;
00952 }
00953 if (lhs->get_num_vectors()!=num)
00954 {
00955 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num) ;
00956 return false ;
00957 }
00958
00959 delete[] position_weights_lhs;
00960 position_weights_lhs=new DREAL[len*num];
00961 if (position_weights_lhs)
00962 {
00963 for (int i=0; i<len*num; i++)
00964 position_weights_lhs[i]=pws[i];
00965 return true;
00966 }
00967 else
00968 return false;
00969 }
00970
00971 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(DREAL* pws, INT len, INT num)
00972 {
00973 fprintf(stderr, "rhs %i %i %i\n", len, num, seq_length) ;
00974 if (len==0)
00975 {
00976 if (position_weights_rhs==position_weights_lhs)
00977 {
00978 position_weights_rhs=NULL ;
00979 return true ;
00980 }
00981 return delete_position_weights_rhs() ;
00982 }
00983
00984 if (seq_length!=len)
00985 {
00986 SG_ERROR( "seq_length = %i, position_weights_length=%i\n", seq_length, len) ;
00987 return false ;
00988 }
00989 if (!rhs)
00990 {
00991 if (!lhs)
00992 {
00993 SG_ERROR("rhs==0 and lhs=NULL\n") ;
00994 return false ;
00995 }
00996 if (lhs->get_num_vectors()!=num)
00997 {
00998 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num) ;
00999 return false ;
01000 }
01001 } else
01002 if (rhs->get_num_vectors()!=num)
01003 {
01004 SG_ERROR("rhs->get_num_vectors()=%i, num=%i\n", rhs->get_num_vectors(), num) ;
01005 return false ;
01006 }
01007
01008
01009 delete[] position_weights_rhs;
01010 position_weights_rhs=new DREAL[len*num];
01011 if (position_weights_rhs)
01012 {
01013 for (int i=0; i<len*num; i++)
01014 position_weights_rhs[i]=pws[i];
01015 return true;
01016 }
01017 else
01018 return false;
01019 }
01020
01021 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01022 {
01023 delete[] block_weights;
01024 block_weights=new DREAL[CMath::max(seq_length,degree)];
01025
01026 if (block_weights)
01027 {
01028 double deg=degree;
01029 INT k;
01030 for (k=0; k<degree ; k++)
01031 block_weights[k]=(-pow(k,3) + (3*deg-3)*pow(k,2) + (9*deg-2)*k + 6*deg) / (3*deg*(deg+1));
01032 for (k=degree; k<seq_length ; k++)
01033 block_weights[k]=(-deg+3*k+4)/3;
01034 }
01035
01036 return (block_weights!=NULL);
01037 }
01038
01039 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01040 {
01041 ASSERT(weights);
01042 delete[] block_weights;
01043 block_weights=new DREAL[CMath::max(seq_length,degree)];
01044
01045 if (block_weights)
01046 {
01047 INT i=0;
01048 block_weights[0]=weights[0];
01049 for (i=1; i<CMath::max(seq_length,degree); i++)
01050 block_weights[i]=0;
01051
01052 for (i=1; i<CMath::max(seq_length,degree); i++)
01053 {
01054 block_weights[i]=block_weights[i-1];
01055
01056 DREAL contrib=0;
01057 for (INT j=0; j<CMath::min(degree,i+1); j++)
01058 contrib+=weights[j];
01059
01060 block_weights[i]+=contrib;
01061 }
01062 }
01063
01064 return (block_weights!=NULL);
01065 }
01066
01067 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01068 {
01069 delete[] block_weights;
01070 block_weights=new DREAL[seq_length];
01071
01072 if (block_weights)
01073 {
01074 for (int i=1; i<seq_length+1 ; i++)
01075 block_weights[i-1]=1.0/seq_length;
01076 }
01077
01078 return (block_weights!=NULL);
01079 }
01080
01081 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01082 {
01083 delete[] block_weights;
01084 block_weights=new DREAL[seq_length];
01085
01086 if (block_weights)
01087 {
01088 for (int i=1; i<seq_length+1 ; i++)
01089 block_weights[i-1]=degree*i;
01090 }
01091
01092 return (block_weights!=NULL);
01093 }
01094
01095 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01096 {
01097 delete[] block_weights;
01098 block_weights=new DREAL[seq_length];
01099
01100 if (block_weights)
01101 {
01102 for (int i=1; i<degree+1 ; i++)
01103 block_weights[i-1]=((double) i)*i;
01104
01105 for (int i=degree+1; i<seq_length+1 ; i++)
01106 block_weights[i-1]=i;
01107 }
01108
01109 return (block_weights!=NULL);
01110 }
01111
01112 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01113 {
01114 delete[] block_weights;
01115 block_weights=new DREAL[seq_length];
01116
01117 if (block_weights)
01118 {
01119 for (int i=1; i<degree+1 ; i++)
01120 block_weights[i-1]=((double) i)*i*i;
01121
01122 for (int i=degree+1; i<seq_length+1 ; i++)
01123 block_weights[i-1]=i;
01124 }
01125
01126 return (block_weights!=NULL);
01127 }
01128
01129 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01130 {
01131 delete[] block_weights;
01132 block_weights=new DREAL[seq_length];
01133
01134 if (block_weights)
01135 {
01136 for (int i=1; i<degree+1 ; i++)
01137 block_weights[i-1]=exp(((double) i/10.0));
01138
01139 for (int i=degree+1; i<seq_length+1 ; i++)
01140 block_weights[i-1]=i;
01141 }
01142
01143 return (block_weights!=NULL);
01144 }
01145
01146 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01147 {
01148 delete[] block_weights;
01149 block_weights=new DREAL[seq_length];
01150
01151 if (block_weights)
01152 {
01153 for (int i=1; i<degree+1 ; i++)
01154 block_weights[i-1]=pow(log(i),2);
01155
01156 for (int i=degree+1; i<seq_length+1 ; i++)
01157 block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
01158 }
01159
01160 return (block_weights!=NULL);
01161 }
01162
01163 bool CWeightedDegreePositionStringKernel::init_block_weights_external()
01164 {
01165 if (block_weights_external && (seq_length == num_block_weights_external) )
01166 {
01167 delete[] block_weights;
01168 block_weights=new DREAL[seq_length];
01169
01170 if (block_weights)
01171 {
01172 for (int i=0; i<seq_length; i++)
01173 block_weights[i]=block_weights_external[i];
01174 }
01175 }
01176 else {
01177 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
01178 }
01179 return (block_weights!=NULL);
01180 }
01181
01182 bool CWeightedDegreePositionStringKernel::init_block_weights()
01183 {
01184 switch (type)
01185 {
01186 case E_WD:
01187 return init_block_weights_from_wd();
01188 case E_EXTERNAL:
01189 return init_block_weights_from_wd_external();
01190 case E_BLOCK_CONST:
01191 return init_block_weights_const();
01192 case E_BLOCK_LINEAR:
01193 return init_block_weights_linear();
01194 case E_BLOCK_SQPOLY:
01195 return init_block_weights_sqpoly();
01196 case E_BLOCK_CUBICPOLY:
01197 return init_block_weights_cubicpoly();
01198 case E_BLOCK_EXP:
01199 return init_block_weights_exp();
01200 case E_BLOCK_LOG:
01201 return init_block_weights_log();
01202 case E_BLOCK_EXTERNAL:
01203 return init_block_weights_external();
01204 default:
01205 return false;
01206 };
01207 }
01208
01209
01210
01211 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01212 {
01213 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01214 INT j=params->j;
01215 CWeightedDegreePositionStringKernel* wd=params->kernel;
01216 CTrie<DNATrie>* tries=params->tries;
01217 DREAL* weights=params->weights;
01218 INT length=params->length;
01219 INT max_shift=params->max_shift;
01220 INT* vec=params->vec;
01221 DREAL* result=params->result;
01222 DREAL factor=params->factor;
01223 INT* shift=params->shift;
01224 INT* vec_idx=params->vec_idx;
01225
01226 for (INT i=params->start; i<params->end; i++)
01227 {
01228 INT len=0;
01229 CStringFeatures<CHAR>* rhs_feat=((CStringFeatures<CHAR>*) wd->get_rhs());
01230 CAlphabet* alpha=wd->alphabet;
01231
01232 CHAR* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
01233 for (INT k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01234 vec[k]=alpha->remap_to_bin(char_vec[k]);
01235
01236 SG_UNREF(rhs_feat);
01237
01238 result[i] += factor*tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0))/wd->normalization_const ;
01239
01240 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01241 {
01242 for (INT q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01243 {
01244 INT s=j-q ;
01245 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01246 result[i] += tries->compute_by_tree_helper(vec, len, q, q+s, q, weights, (length!=0))/(2.0*s*wd->normalization_const) ;
01247 }
01248 for (INT s=1; (s<=shift[j]) && (j+s<len); s++)
01249 result[i] += tries->compute_by_tree_helper(vec, len, j+s, j, j+s, weights, (length!=0))/(2.0*s*wd->normalization_const) ;
01250 }
01251 }
01252
01253 return NULL;
01254 }
01255
01256 void CWeightedDegreePositionStringKernel::compute_batch(INT num_vec, INT* vec_idx, DREAL* result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL factor)
01257 {
01258 ASSERT(alphabet);
01259 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01260 ASSERT(position_weights_lhs==NULL);
01261 ASSERT(position_weights_rhs==NULL);
01262 ASSERT(rhs);
01263 ASSERT(num_vec<=rhs->get_num_vectors());
01264 ASSERT(num_vec>0);
01265 ASSERT(vec_idx);
01266 ASSERT(result);
01267 create_empty_tries();
01268
01269 INT num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01270 ASSERT(num_feat>0);
01271 INT num_threads=parallel.get_num_threads();
01272 ASSERT(num_threads>0);
01273 INT* vec=new INT[num_threads*num_feat];
01274
01275 if (num_threads < 2)
01276 {
01277 #ifdef WIN32
01278 for (INT j=0; j<num_feat; j++)
01279 #else
01280 CSignal::clear() ;
01281 for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01282 #endif
01283 {
01284 init_optimization(num_suppvec, IDX, alphas, j);
01285 S_THREAD_PARAM<DNATrie> params;
01286 params.vec=vec;
01287 params.result=result;
01288 params.weights=weights;
01289 params.kernel=this;
01290 params.tries=&tries;
01291 params.factor=factor;
01292 params.j=j;
01293 params.start=0;
01294 params.end=num_vec;
01295 params.length=length;
01296 params.max_shift=max_shift;
01297 params.shift=shift;
01298 params.vec_idx=vec_idx;
01299 compute_batch_helper((void*) ¶ms);
01300
01301 SG_PROGRESS(j,0,num_feat);
01302 }
01303 }
01304 #ifndef WIN32
01305 else
01306 {
01307
01308 CSignal::clear() ;
01309 for (INT j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01310 {
01311 init_optimization(num_suppvec, IDX, alphas, j);
01312 pthread_t threads[num_threads-1];
01313 S_THREAD_PARAM<DNATrie> params[num_threads];
01314 INT step= num_vec/num_threads;
01315 INT t;
01316
01317 for (t=0; t<num_threads-1; t++)
01318 {
01319 params[t].vec=&vec[num_feat*t];
01320 params[t].result=result;
01321 params[t].weights=weights;
01322 params[t].kernel=this;
01323 params[t].tries=&tries;
01324 params[t].factor=factor;
01325 params[t].j=j;
01326 params[t].start = t*step;
01327 params[t].end = (t+1)*step;
01328 params[t].length=length;
01329 params[t].max_shift=max_shift;
01330 params[t].shift=shift;
01331 params[t].vec_idx=vec_idx;
01332 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01333 }
01334
01335 params[t].vec=&vec[num_feat*t];
01336 params[t].result=result;
01337 params[t].weights=weights;
01338 params[t].kernel=this;
01339 params[t].tries=&tries;
01340 params[t].factor=factor;
01341 params[t].j=j;
01342 params[t].start=t*step;
01343 params[t].end=num_vec;
01344 params[t].length=length;
01345 params[t].max_shift=max_shift;
01346 params[t].shift=shift;
01347 params[t].vec_idx=vec_idx;
01348 compute_batch_helper((void*) ¶ms[t]);
01349
01350 for (t=0; t<num_threads-1; t++)
01351 pthread_join(threads[t], NULL);
01352 SG_PROGRESS(j,0,num_feat);
01353 }
01354 }
01355 #endif
01356
01357 delete[] vec;
01358
01359
01360
01361 create_empty_tries();
01362 }
01363
01364 DREAL* CWeightedDegreePositionStringKernel::compute_scoring(INT max_degree, INT& num_feat, INT& num_sym, DREAL* result, INT num_suppvec, INT* IDX, DREAL* alphas)
01365 {
01366 ASSERT(position_weights_lhs==NULL);
01367 ASSERT(position_weights_rhs==NULL);
01368
01369 num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01370 ASSERT(num_feat>0);
01371 ASSERT(alphabet);
01372 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01373
01374 num_sym=4;
01375
01376 ASSERT(max_degree>0);
01377
01378
01379 INT* nofsKmers=new INT[max_degree];
01380 DREAL** C=new DREAL*[max_degree];
01381 DREAL** L=new DREAL*[max_degree];
01382 DREAL** R=new DREAL*[max_degree];
01383
01384 INT i;
01385 INT k;
01386
01387
01388 INT bigtabSize=0;
01389 for (k=0; k<max_degree; ++k )
01390 {
01391 nofsKmers[k]=(INT) pow(num_sym, k+1);
01392 const INT tabSize=nofsKmers[k]*num_feat;
01393 bigtabSize+=tabSize;
01394 }
01395 result=new DREAL[bigtabSize];
01396
01397
01398 INT tabOffs=0;
01399 for( k = 0; k < max_degree; ++k )
01400 {
01401 const INT tabSize = nofsKmers[k] * num_feat;
01402 C[k] = &result[tabOffs];
01403 L[k] = new DREAL[ tabSize ];
01404 R[k] = new DREAL[ tabSize ];
01405 tabOffs+=tabSize;
01406 for(i = 0; i < tabSize; i++ )
01407 {
01408 C[k][i] = 0.0;
01409 L[k][i] = 0.0;
01410 R[k][i] = 0.0;
01411 }
01412 }
01413
01414
01415 DREAL* margFactors=new DREAL[degree];
01416
01417 INT* x = new INT[ degree+1 ];
01418 INT* substrs = new INT[ degree+1 ];
01419
01420 margFactors[0] = 1.0;
01421 substrs[0] = 0;
01422 for( k=1; k < degree; ++k ) {
01423 margFactors[k] = 0.25 * margFactors[k-1];
01424 substrs[k] = -1;
01425 }
01426 substrs[degree] = -1;
01427
01428 struct TreeParseInfo info;
01429 info.num_sym = num_sym;
01430 info.num_feat = num_feat;
01431 info.p = -1;
01432 info.k = -1;
01433 info.nofsKmers = nofsKmers;
01434 info.margFactors = margFactors;
01435 info.x = x;
01436 info.substrs = substrs;
01437 info.y0 = 0;
01438 info.C_k = NULL;
01439 info.L_k = NULL;
01440 info.R_k = NULL;
01441
01442
01443 i = 0;
01444 for( k = 0; k < max_degree; ++k )
01445 {
01446 const INT nofKmers = nofsKmers[ k ];
01447 info.C_k = C[k];
01448 info.L_k = L[k];
01449 info.R_k = R[k];
01450
01451
01452 for(INT p = 0; p < num_feat; ++p )
01453 {
01454 init_optimization( num_suppvec, IDX, alphas, p );
01455 INT tree = p ;
01456 for(INT j = 0; j < degree+1; j++ ) {
01457 x[j] = -1;
01458 }
01459 tries.traverse( tree, p, info, 0, x, k );
01460 SG_PROGRESS(i++,0,num_feat*max_degree);
01461 }
01462
01463
01464 if( k > 0 ) {
01465 const INT j = k - 1;
01466 const INT nofJmers = (INT) pow( num_sym, j+1 );
01467 for(INT p = 0; p < num_feat; ++p ) {
01468 const INT offsetJ = nofJmers * p;
01469 const INT offsetJ1 = nofJmers * (p+1);
01470 const INT offsetK = nofKmers * p;
01471 INT y;
01472 INT sym;
01473 for( y = 0; y < nofJmers; ++y ) {
01474 for( sym = 0; sym < num_sym; ++sym ) {
01475 const INT y_sym = num_sym*y + sym;
01476 const INT sym_y = nofJmers*sym + y;
01477 ASSERT(0<=y_sym && y_sym<nofKmers);
01478 ASSERT(0<=sym_y && sym_y<nofKmers);
01479 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01480 if( p < num_feat-1 ) {
01481 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01482 }
01483 }
01484 }
01485 }
01486 }
01487
01488
01489
01490
01491
01492
01493
01494
01495
01496
01497
01498 }
01499
01500
01501 num_feat=1;
01502 num_sym = bigtabSize;
01503
01504 delete[] nofsKmers;
01505 delete[] margFactors;
01506 delete[] substrs;
01507 delete[] x;
01508 delete[] C;
01509 for( k = 0; k < max_degree; ++k ) {
01510 delete[] L[k];
01511 delete[] R[k];
01512 }
01513 delete[] L;
01514 delete[] R;
01515 return result;
01516 }
01517
01518 CHAR* CWeightedDegreePositionStringKernel::compute_consensus(INT &num_feat, INT num_suppvec, INT* IDX, DREAL* alphas)
01519 {
01520 ASSERT(position_weights_lhs==NULL);
01521 ASSERT(position_weights_rhs==NULL);
01522
01523 ASSERT(degree<=32);
01524 ASSERT(!tries.get_use_compact_terminal_nodes());
01525 num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01526 ASSERT(num_feat>0);
01527 ASSERT(alphabet);
01528 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01529
01530
01531 CHAR* result=new CHAR[num_feat];
01532
01533
01534 INT num_tables=CMath::max(1,num_feat-degree+1);
01535 CDynamicArray<ConsensusEntry>** table=new CDynamicArray<ConsensusEntry>*[num_tables];
01536
01537 for (INT i=0; i<num_tables; i++)
01538 table[i]=new CDynamicArray<ConsensusEntry>(num_suppvec/10);
01539
01540
01541 for (INT i=0; i<num_tables; i++)
01542 {
01543 bool cumulative=false;
01544
01545 if (i<num_tables-1)
01546 init_optimization(num_suppvec, IDX, alphas, i);
01547 else
01548 {
01549 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01550 cumulative=true;
01551 }
01552
01553 if (i==0)
01554 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01555 else
01556 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01557
01558 SG_PROGRESS(i,0,num_feat);
01559 }
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574
01575
01576
01577
01578
01579
01580
01581
01582
01583 const CHAR* acgt="ACGT";
01584
01585
01586 INT max_idx=-1;
01587 SHORTREAL max_score=0;
01588 INT num_elements=table[num_tables-1]->get_num_elements();
01589
01590 for (INT i=0; i<num_elements; i++)
01591 {
01592 DREAL sc=table[num_tables-1]->get_element(i).score;
01593 if (sc>max_score || max_idx==-1)
01594 {
01595 max_idx=i;
01596 max_score=sc;
01597 }
01598 }
01599 ULONG endstr=table[num_tables-1]->get_element(max_idx).string;
01600
01601 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01602
01603 for (INT i=0; i<degree; i++)
01604 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01605
01606 if (num_tables>1)
01607 {
01608 for (INT i=num_tables-1; i>=0; i--)
01609 {
01610
01611 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01612 max_idx=table[i]->get_element(max_idx).bt;
01613 }
01614 }
01615
01616
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626 for (INT i=0; i<num_tables; i++)
01627 delete table[i];
01628
01629 delete[] table;
01630 return result;
01631 }
01632
01633
01634 DREAL* CWeightedDegreePositionStringKernel::extract_w( INT max_degree, INT& num_feat, INT& num_sym, DREAL* w_result, INT num_suppvec, INT* IDX, DREAL* alphas )
01635 {
01636 delete_optimization();
01637 use_poim_tries=true;
01638 poim_tries.delete_trees(false);
01639
01640
01641 ASSERT(position_weights_lhs==NULL);
01642 ASSERT(position_weights_rhs==NULL);
01643 num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01644 ASSERT(num_feat>0);
01645 ASSERT(alphabet->get_alphabet()==DNA);
01646 ASSERT(max_degree>0);
01647
01648
01649 static const INT NUM_SYMS = poim_tries.NUM_SYMS;
01650 const INT seqLen = num_feat;
01651 DREAL** subs;
01652 INT i;
01653 INT k;
01654
01655
01656
01657
01658 INT* offsets;
01659 INT offset;
01660 offsets = new INT[ max_degree ];
01661 offset = 0;
01662 for( k = 0; k < max_degree; ++k ) {
01663 offsets[k] = offset;
01664 const INT nofsKmers = (INT) pow( NUM_SYMS, k+1 );
01665 const INT tabSize = nofsKmers * seqLen;
01666 offset += tabSize;
01667 }
01668
01669 const INT bigTabSize = offset;
01670 w_result=new DREAL[bigTabSize];
01671 for (i=0; i<bigTabSize; ++i)
01672 w_result[i]=0;
01673
01674
01675 subs = new DREAL*[ max_degree ];
01676 ASSERT( subs != NULL );
01677 for( k = 0; k < max_degree; ++k ) {
01678 subs[k] = &w_result[ offsets[k] ];
01679 }
01680 delete[] offsets;
01681
01682
01683 init_optimization( num_suppvec, IDX, alphas, -1);
01684 poim_tries.POIMs_extract_W( subs, max_degree );
01685
01686
01687 delete[] subs;
01688 num_feat = 1;
01689 num_sym = bigTabSize;
01690 use_poim_tries=false;
01691 poim_tries.delete_trees(false);
01692 return w_result;
01693 }
01694
01695 DREAL* CWeightedDegreePositionStringKernel::compute_POIM( INT max_degree, INT& num_feat, INT& num_sym, DREAL* poim_result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL* distrib )
01696 {
01697 delete_optimization();
01698 use_poim_tries=true;
01699 poim_tries.delete_trees(false);
01700
01701
01702 ASSERT(position_weights_lhs==NULL);
01703 ASSERT(position_weights_rhs==NULL);
01704 num_feat=((CStringFeatures<CHAR>*) rhs)->get_max_vector_length();
01705 ASSERT(num_feat>0);
01706 ASSERT(alphabet->get_alphabet()==DNA);
01707 ASSERT(max_degree!=0);
01708 ASSERT(distrib);
01709
01710
01711 static const INT NUM_SYMS = poim_tries.NUM_SYMS;
01712 const INT seqLen = num_feat;
01713 DREAL** subs;
01714 INT i;
01715 INT k;
01716
01717
01718
01719
01720
01721
01722
01723
01724
01725
01726
01727
01728 const INT debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01729 if( debug ) {
01730 max_degree = abs(max_degree) / 4;
01731 switch( debug ) {
01732 case 1: {
01733 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01734 break;
01735 }
01736 case 2: {
01737 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01738 break;
01739 }
01740 case 3: {
01741 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01742 break;
01743 }
01744 case 4: {
01745 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01746 break;
01747 }
01748 default: {
01749 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01750 ASSERT(0);
01751 break;
01752 }
01753 }
01754 }
01755
01756
01757 INT* offsets;
01758 INT offset;
01759 offsets = new INT[ max_degree ];
01760 offset = 0;
01761 for( k = 0; k < max_degree; ++k ) {
01762 offsets[k] = offset;
01763 const INT nofsKmers = (INT) pow( NUM_SYMS, k+1 );
01764 const INT tabSize = nofsKmers * seqLen;
01765 offset += tabSize;
01766 }
01767
01768 const INT bigTabSize=offset;
01769 poim_result=new DREAL[bigTabSize];
01770 for (i=0; i<bigTabSize; ++i )
01771 poim_result[i]=0;
01772
01773
01774 subs=new DREAL*[max_degree];
01775 for (k=0; k<max_degree; ++k)
01776 subs[k]=&poim_result[offsets[k]];
01777
01778 delete[] offsets;
01779
01780
01781 init_optimization( num_suppvec, IDX, alphas, -1);
01782 poim_tries.POIMs_precalc_SLR( distrib );
01783
01784
01785 if( debug==0 || debug==1 ) {
01786 poim_tries.POIMs_extract_W( subs, max_degree );
01787 for( k = 1; k < max_degree; ++k ) {
01788 const INT nofKmers2 = ( k > 1 ) ? (INT) pow(NUM_SYMS,k-1) : 0;
01789 const INT nofKmers1 = (INT) pow( NUM_SYMS, k );
01790 const INT nofKmers0 = nofKmers1 * NUM_SYMS;
01791 for( i = 0; i < seqLen; ++i ) {
01792 DREAL* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01793 DREAL* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01794 DREAL* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01795 DREAL* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01796 INT y0;
01797 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01798 const INT y1l = y0 / NUM_SYMS;
01799 const INT y1r = y0 % nofKmers1;
01800 const INT y2 = y1r / NUM_SYMS;
01801 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01802 if( i < seqLen-1 ) {
01803 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01804 if( k > 1 ) {
01805 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01806 }
01807 }
01808 }
01809 }
01810 }
01811 }
01812
01813
01814 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01815
01816
01817 delete[] subs;
01818 num_feat = 1;
01819 num_sym = bigTabSize;
01820
01821 use_poim_tries=false;
01822 poim_tries.delete_trees(false);
01823
01824 return poim_result;
01825 }
01826
01827
01828 void CWeightedDegreePositionStringKernel::prepare_POIM2(DREAL* distrib, INT num_sym, INT num_feat)
01829 {
01830 free(m_poim_distrib);
01831 m_poim_distrib=(DREAL*)malloc(num_sym*num_feat*sizeof(DREAL));
01832 ASSERT(m_poim_distrib);
01833
01834 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(DREAL));
01835 m_poim_num_sym=num_sym;
01836 m_poim_num_feat=num_feat;
01837 }
01838
01839 void CWeightedDegreePositionStringKernel::compute_POIM2(INT max_degree, CSVM* svm)
01840 {
01841 ASSERT(svm);
01842 INT num_suppvec=svm->get_num_support_vectors();
01843 INT* sv_idx=new INT[num_suppvec];
01844 DREAL* sv_weight=new DREAL[num_suppvec];
01845
01846 for (INT i=0; i<num_suppvec; i++)
01847 {
01848 sv_idx[i]=svm->get_support_vector(i);
01849 sv_weight[i]=svm->get_alpha(i);
01850 }
01851
01852 if ((max_degree < 1) || (max_degree > 12))
01853 {
01854
01855 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01856 max_degree=1;
01857 }
01858
01859 int num_feat = m_poim_num_feat ;
01860 int num_sym = m_poim_num_sym ;
01861 free(m_poim) ;
01862
01863 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01864 sv_weight, m_poim_distrib);
01865
01866 ASSERT(num_feat==1);
01867 m_poim_result_len=num_sym ;
01868
01869 delete[] sv_weight ;
01870 delete[] sv_idx ;
01871 }
01872
01873 void CWeightedDegreePositionStringKernel::get_POIM2(DREAL** poim, INT* result_len)
01874 {
01875 *poim=(DREAL*) malloc(m_poim_result_len*sizeof(DREAL));
01876 ASSERT(*poim);
01877 memcpy(*poim, m_poim, m_poim_result_len*sizeof(DREAL)) ;
01878 *result_len=m_poim_result_len ;
01879 }
01880
01881 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01882 {
01883 free(m_poim) ;
01884 m_poim=NULL ;
01885 free(m_poim_distrib) ;
01886 m_poim_distrib=NULL ;
01887 m_poim_num_sym=0 ;
01888 m_poim_num_sym=0 ;
01889 m_poim_result_len=0 ;
01890 }
01891