00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "kernel/SqrtDiagKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #include "classifier/svm/SVM.h"
00024
00025 #ifndef WIN32
00026 #include <pthread.h>
00027 #endif
00028
00029 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00030
00031 template <class Trie> struct S_THREAD_PARAM
00032 {
00033 int32_t* vec;
00034 float64_t* result;
00035 float64_t* weights;
00036 CWeightedDegreePositionStringKernel* kernel;
00037 CTrie<Trie>* tries;
00038 float64_t factor;
00039 int32_t j;
00040 int32_t start;
00041 int32_t end;
00042 int32_t length;
00043 int32_t max_shift;
00044 int32_t* shift;
00045 int32_t* vec_idx;
00046 };
00047
00048 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00049 int32_t size, int32_t d, int32_t mm, int32_t mkls)
00050 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00051 position_weights_lhs(NULL), position_weights_rhs(NULL),
00052 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00053 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00054 num_block_weights_external(0), block_weights_external(NULL),
00055 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00056 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00057 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00058 alphabet(NULL)
00059 {
00060 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00061 set_wd_weights();
00062 ASSERT(weights);
00063
00064 set_normalizer(new CSqrtDiagKernelNormalizer());
00065 }
00066
00067 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00068 int32_t size, float64_t* w, int32_t d, int32_t mm, int32_t* s, int32_t sl,
00069 int32_t mkls)
00070 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00071 position_weights_lhs(NULL), position_weights_rhs(NULL),
00072 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00073 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00074 num_block_weights_external(0), block_weights_external(NULL),
00075 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00076 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00077 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00078 alphabet(NULL)
00079 {
00080 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00081
00082 weights=new float64_t[d*(1+max_mismatch)];
00083 for (int32_t i=0; i<d*(1+max_mismatch); i++)
00084 weights[i]=w[i];
00085
00086 set_shifts(s, sl);
00087
00088 set_normalizer(new CSqrtDiagKernelNormalizer());
00089 }
00090
00091 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00092 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00093 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00094 position_weights_lhs(NULL), position_weights_rhs(NULL),
00095 weights_buffer(NULL), mkl_stepsize(1), degree(d), length(0),
00096 max_mismatch(0), seq_length(0), shift(NULL), shift_len(0),
00097 num_block_weights_external(0), block_weights_external(NULL),
00098 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00099 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00100 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00101 alphabet(NULL)
00102 {
00103 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00104 set_wd_weights();
00105 ASSERT(weights);
00106 set_normalizer(new CSqrtDiagKernelNormalizer());
00107
00108 init(l, r);
00109 }
00110
00111
00112 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00113 {
00114 cleanup();
00115 cleanup_POIM2();
00116
00117 delete[] shift;
00118 shift=NULL;
00119
00120 delete[] weights;
00121 weights=NULL ;
00122
00123 delete[] block_weights;
00124 block_weights=NULL;
00125
00126 delete[] position_weights;
00127 position_weights=NULL;
00128
00129 delete[] position_weights_lhs;
00130 position_weights_lhs=NULL;
00131
00132 delete[] position_weights_rhs;
00133 position_weights_rhs=NULL;
00134
00135 delete[] weights_buffer;
00136 weights_buffer=NULL;
00137 }
00138
00139 void CWeightedDegreePositionStringKernel::remove_lhs()
00140 {
00141 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00142 delete_optimization();
00143
00144 tries.destroy();
00145 poim_tries.destroy();
00146
00147 CKernel::remove_lhs();
00148 }
00149
00150 void CWeightedDegreePositionStringKernel::create_empty_tries()
00151 {
00152 ASSERT(lhs);
00153 seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
00154
00155 if (opt_type==SLOWBUTMEMEFFICIENT)
00156 {
00157 tries.create(seq_length, true);
00158 poim_tries.create(seq_length, true);
00159 }
00160 else if (opt_type==FASTBUTMEMHUNGRY)
00161 {
00162 tries.create(seq_length, false);
00163 poim_tries.create(seq_length, false);
00164 }
00165 else
00166 SG_ERROR( "unknown optimization type\n");
00167 }
00168
00169 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00170 {
00171 int32_t lhs_changed = (lhs!=l) ;
00172 int32_t rhs_changed = (rhs!=r) ;
00173
00174 CStringKernel<char>::init(l,r);
00175
00176 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00177 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00178
00179 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00180 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00181
00182
00183 if (shift_len==0) {
00184 shift_len=sf_l->get_vector_length(0);
00185 int32_t *shifts=new int32_t[shift_len];
00186 for (int32_t i=0; i<shift_len; i++) {
00187 shifts[i]=1;
00188 }
00189 set_shifts(shifts, shift_len);
00190 delete[] shifts;
00191 }
00192
00193
00194 int32_t len=sf_l->get_max_vector_length();
00195 if (lhs_changed && !sf_l->have_same_length(len))
00196 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00197
00198 if (rhs_changed && !sf_r->have_same_length(len))
00199 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00200
00201 delete alphabet;
00202 alphabet= new CAlphabet(sf_l->get_alphabet());
00203 CAlphabet* ralphabet=sf_r->get_alphabet();
00204 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00205 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00206
00207 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00208 SG_UNREF(ralphabet);
00209
00210
00211 create_empty_tries();
00212 init_block_weights();
00213
00214 return init_normalizer();
00215 }
00216
00217 void CWeightedDegreePositionStringKernel::cleanup()
00218 {
00219 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00220 delete_optimization();
00221
00222 delete[] block_weights;
00223 block_weights=NULL;
00224
00225 tries.destroy();
00226 poim_tries.destroy();
00227
00228 seq_length = 0;
00229 tree_initialized = false;
00230
00231 delete alphabet;
00232 alphabet=NULL;
00233
00234 CKernel::cleanup();
00235 }
00236
00237 bool CWeightedDegreePositionStringKernel::load_init(FILE* src)
00238 {
00239 return false;
00240 }
00241
00242 bool CWeightedDegreePositionStringKernel::save_init(FILE* dest)
00243 {
00244 return false;
00245 }
00246
00247 bool CWeightedDegreePositionStringKernel::init_optimization(
00248 int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
00249 int32_t upto_tree)
00250 {
00251 ASSERT(position_weights_lhs==NULL);
00252 ASSERT(position_weights_rhs==NULL);
00253
00254 if (upto_tree<0)
00255 upto_tree=tree_num;
00256
00257 if (max_mismatch!=0)
00258 {
00259 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00260 return false ;
00261 }
00262
00263 if (tree_num<0)
00264 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00265
00266 delete_optimization();
00267
00268 if (tree_num<0)
00269 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00270
00271 for (int32_t i=0; i<p_count; i++)
00272 {
00273 if (tree_num<0)
00274 {
00275 if ( (i % (p_count/10+1)) == 0)
00276 SG_PROGRESS(i,0,p_count);
00277 add_example_to_tree(IDX[i], alphas[i]);
00278 }
00279 else
00280 {
00281 for (int32_t t=tree_num; t<=upto_tree; t++)
00282 add_example_to_single_tree(IDX[i], alphas[i], t);
00283 }
00284 }
00285
00286 if (tree_num<0)
00287 SG_DONE();
00288
00289 set_is_initialized(true) ;
00290 return true ;
00291 }
00292
00293 bool CWeightedDegreePositionStringKernel::delete_optimization()
00294 {
00295 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00296 {
00297 tries.set_use_compact_terminal_nodes(false) ;
00298 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00299 }
00300
00301 if (get_is_initialized())
00302 {
00303 if (opt_type==SLOWBUTMEMEFFICIENT)
00304 tries.delete_trees(true);
00305 else if (opt_type==FASTBUTMEMHUNGRY)
00306 tries.delete_trees(false);
00307 else {
00308 SG_ERROR( "unknown optimization type\n");
00309 }
00310 set_is_initialized(false);
00311
00312 return true;
00313 }
00314
00315 return false;
00316 }
00317
00318 float64_t CWeightedDegreePositionStringKernel::compute_with_mismatch(
00319 char* avec, int32_t alen, char* bvec, int32_t blen)
00320 {
00321 float64_t max_shift_vec[max_shift];
00322 float64_t sum0=0 ;
00323 for (int32_t i=0; i<max_shift; i++)
00324 max_shift_vec[i]=0 ;
00325
00326
00327 for (int32_t i=0; i<alen; i++)
00328 {
00329 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00330 continue ;
00331
00332 int32_t mismatches=0;
00333 float64_t sumi = 0.0 ;
00334 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00335 {
00336 if (avec[i+j]!=bvec[i+j])
00337 {
00338 mismatches++ ;
00339 if (mismatches>max_mismatch)
00340 break ;
00341 } ;
00342 sumi += weights[j+degree*mismatches];
00343 }
00344 if (position_weights!=NULL)
00345 sum0 += position_weights[i]*sumi ;
00346 else
00347 sum0 += sumi ;
00348 } ;
00349
00350 for (int32_t i=0; i<alen; i++)
00351 {
00352 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00353 {
00354 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00355 continue ;
00356
00357 float64_t sumi1 = 0.0 ;
00358
00359 int32_t mismatches=0;
00360 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00361 {
00362 if (avec[i+j+k]!=bvec[i+j])
00363 {
00364 mismatches++ ;
00365 if (mismatches>max_mismatch)
00366 break ;
00367 } ;
00368 sumi1 += weights[j+degree*mismatches];
00369 }
00370 float64_t sumi2 = 0.0 ;
00371
00372 mismatches=0;
00373 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00374 {
00375 if (avec[i+j]!=bvec[i+j+k])
00376 {
00377 mismatches++ ;
00378 if (mismatches>max_mismatch)
00379 break ;
00380 } ;
00381 sumi2 += weights[j+degree*mismatches];
00382 }
00383 if (position_weights!=NULL)
00384 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00385 else
00386 max_shift_vec[k-1] += sumi1 + sumi2 ;
00387 } ;
00388 }
00389
00390 float64_t result = sum0 ;
00391 for (int32_t i=0; i<max_shift; i++)
00392 result += max_shift_vec[i]/(2*(i+1)) ;
00393
00394 return result ;
00395 }
00396
00397 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch(
00398 char* avec, int32_t alen, char* bvec, int32_t blen)
00399 {
00400 float64_t max_shift_vec[max_shift];
00401 float64_t sum0=0 ;
00402 for (int32_t i=0; i<max_shift; i++)
00403 max_shift_vec[i]=0 ;
00404
00405
00406 for (int32_t i=0; i<alen; i++)
00407 {
00408 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00409 continue ;
00410
00411 float64_t sumi = 0.0 ;
00412 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00413 {
00414 if (avec[i+j]!=bvec[i+j])
00415 break ;
00416 sumi += weights[j];
00417 }
00418 if (position_weights!=NULL)
00419 sum0 += position_weights[i]*sumi ;
00420 else
00421 sum0 += sumi ;
00422 } ;
00423
00424 for (int32_t i=0; i<alen; i++)
00425 {
00426 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00427 {
00428 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00429 continue ;
00430
00431 float64_t sumi1 = 0.0 ;
00432
00433 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00434 {
00435 if (avec[i+j+k]!=bvec[i+j])
00436 break ;
00437 sumi1 += weights[j];
00438 }
00439 float64_t sumi2 = 0.0 ;
00440
00441 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00442 {
00443 if (avec[i+j]!=bvec[i+j+k])
00444 break ;
00445 sumi2 += weights[j];
00446 }
00447 if (position_weights!=NULL)
00448 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00449 else
00450 max_shift_vec[k-1] += sumi1 + sumi2 ;
00451 } ;
00452 }
00453
00454 float64_t result = sum0 ;
00455 for (int32_t i=0; i<max_shift; i++)
00456 result += max_shift_vec[i]/(2*(i+1)) ;
00457
00458 return result ;
00459 }
00460
00461 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(
00462 char* avec, int32_t alen, char* bvec, int32_t blen)
00463 {
00464 float64_t max_shift_vec[max_shift];
00465 float64_t sum0=0 ;
00466 for (int32_t i=0; i<max_shift; i++)
00467 max_shift_vec[i]=0 ;
00468
00469
00470 for (int32_t i=0; i<alen; i++)
00471 {
00472 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00473 continue ;
00474 float64_t sumi = 0.0 ;
00475 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00476 {
00477 if (avec[i+j]!=bvec[i+j])
00478 break ;
00479 sumi += weights[i*degree+j];
00480 }
00481 if (position_weights!=NULL)
00482 sum0 += position_weights[i]*sumi ;
00483 else
00484 sum0 += sumi ;
00485 } ;
00486
00487 for (int32_t i=0; i<alen; i++)
00488 {
00489 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00490 {
00491 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00492 continue ;
00493
00494 float64_t sumi1 = 0.0 ;
00495
00496 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00497 {
00498 if (avec[i+j+k]!=bvec[i+j])
00499 break ;
00500 sumi1 += weights[i*degree+j];
00501 }
00502 float64_t sumi2 = 0.0 ;
00503
00504 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00505 {
00506 if (avec[i+j]!=bvec[i+j+k])
00507 break ;
00508 sumi2 += weights[i*degree+j];
00509 }
00510 if (position_weights!=NULL)
00511 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00512 else
00513 max_shift_vec[k-1] += sumi1 + sumi2 ;
00514 } ;
00515 }
00516
00517 float64_t result = sum0 ;
00518 for (int32_t i=0; i<max_shift; i++)
00519 result += max_shift_vec[i]/(2*(i+1)) ;
00520
00521 return result ;
00522 }
00523
00524 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(
00525 char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
00526 float64_t* pos_weights_rhs, int32_t blen)
00527 {
00528 float64_t max_shift_vec[max_shift];
00529 float64_t sum0=0 ;
00530 for (int32_t i=0; i<max_shift; i++)
00531 max_shift_vec[i]=0 ;
00532
00533
00534 for (int32_t i=0; i<alen; i++)
00535 {
00536 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00537 continue ;
00538
00539 float64_t sumi = 0.0 ;
00540 float64_t posweight_lhs = 0.0 ;
00541 float64_t posweight_rhs = 0.0 ;
00542 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00543 {
00544 posweight_lhs += pos_weights_lhs[i+j] ;
00545 posweight_rhs += pos_weights_rhs[i+j] ;
00546
00547 if (avec[i+j]!=bvec[i+j])
00548 break ;
00549 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00550 }
00551 if (position_weights!=NULL)
00552 sum0 += position_weights[i]*sumi ;
00553 else
00554 sum0 += sumi ;
00555 } ;
00556
00557 for (int32_t i=0; i<alen; i++)
00558 {
00559 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00560 {
00561 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00562 continue ;
00563
00564
00565 float64_t sumi1 = 0.0 ;
00566 float64_t posweight_lhs = 0.0 ;
00567 float64_t posweight_rhs = 0.0 ;
00568 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00569 {
00570 posweight_lhs += pos_weights_lhs[i+j+k] ;
00571 posweight_rhs += pos_weights_rhs[i+j] ;
00572 if (avec[i+j+k]!=bvec[i+j])
00573 break ;
00574 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00575 }
00576
00577 float64_t sumi2 = 0.0 ;
00578 posweight_lhs = 0.0 ;
00579 posweight_rhs = 0.0 ;
00580 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00581 {
00582 posweight_lhs += pos_weights_lhs[i+j] ;
00583 posweight_rhs += pos_weights_rhs[i+j+k] ;
00584 if (avec[i+j]!=bvec[i+j+k])
00585 break ;
00586 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00587 }
00588 if (position_weights!=NULL)
00589 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00590 else
00591 max_shift_vec[k-1] += sumi1 + sumi2 ;
00592 } ;
00593 }
00594
00595 float64_t result = sum0 ;
00596 for (int32_t i=0; i<max_shift; i++)
00597 result += max_shift_vec[i]/(2*(i+1)) ;
00598
00599 return result ;
00600 }
00601
00602
00603 float64_t CWeightedDegreePositionStringKernel::compute(
00604 int32_t idx_a, int32_t idx_b)
00605 {
00606 int32_t alen, blen;
00607
00608 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen);
00609 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen);
00610
00611 ASSERT(alen==blen);
00612 ASSERT(shift_len==alen);
00613
00614 float64_t result = 0 ;
00615 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00616 {
00617 ASSERT(max_mismatch==0);
00618 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs[idx_b*blen], blen) ;
00619 }
00620 else if (max_mismatch > 0)
00621 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00622 else if (length==0)
00623 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00624 else
00625 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00626
00627 return result ;
00628 }
00629
00630
00631 void CWeightedDegreePositionStringKernel::add_example_to_tree(
00632 int32_t idx, float64_t alpha)
00633 {
00634 ASSERT(position_weights_lhs==NULL);
00635 ASSERT(position_weights_rhs==NULL);
00636 ASSERT(alphabet);
00637 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00638
00639 int32_t len=0;
00640 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00641 ASSERT(max_mismatch==0);
00642 int32_t *vec = new int32_t[len] ;
00643
00644 for (int32_t i=0; i<len; i++)
00645 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00646
00647 if (opt_type==FASTBUTMEMHUNGRY)
00648 {
00649
00650 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00651 }
00652
00653 for (int32_t i=0; i<len; i++)
00654 {
00655 int32_t max_s=-1;
00656
00657 if (opt_type==SLOWBUTMEMEFFICIENT)
00658 max_s=0;
00659 else if (opt_type==FASTBUTMEMHUNGRY)
00660 max_s=shift[i];
00661 else {
00662 SG_ERROR( "unknown optimization type\n");
00663 }
00664
00665 for (int32_t s=max_s; s>=0; s--)
00666 {
00667 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00668 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00669
00670
00671 if ((s==0) || (i+s>=len))
00672 continue;
00673
00674 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00675
00676 }
00677 }
00678
00679 delete[] vec ;
00680 tree_initialized=true ;
00681 }
00682
00683 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(
00684 int32_t idx, float64_t alpha, int32_t tree_num)
00685 {
00686 ASSERT(position_weights_lhs==NULL);
00687 ASSERT(position_weights_rhs==NULL);
00688 ASSERT(alphabet);
00689 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00690
00691 int32_t len=0;
00692 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len);
00693 ASSERT(max_mismatch==0);
00694 int32_t *vec=new int32_t[len];
00695 int32_t max_s=-1;
00696
00697 if (opt_type==SLOWBUTMEMEFFICIENT)
00698 max_s=0;
00699 else if (opt_type==FASTBUTMEMHUNGRY)
00700 {
00701 ASSERT(!tries.get_use_compact_terminal_nodes());
00702 max_s=shift[tree_num];
00703 }
00704 else {
00705 SG_ERROR( "unknown optimization type\n");
00706 }
00707 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+degree+max_shift); i++)
00708 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00709
00710 for (int32_t s=max_s; s>=0; s--)
00711 {
00712 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00713 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00714
00715 }
00716
00717 if (opt_type==FASTBUTMEMHUNGRY)
00718 {
00719 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00720 {
00721 int32_t s=tree_num-i;
00722 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00723 {
00724 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00725 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00726
00727 }
00728 }
00729 }
00730 delete[] vec ;
00731 tree_initialized=true ;
00732 }
00733
00734 float64_t CWeightedDegreePositionStringKernel::compute_by_tree(int32_t idx)
00735 {
00736 ASSERT(position_weights_lhs==NULL);
00737 ASSERT(position_weights_rhs==NULL);
00738 ASSERT(alphabet);
00739 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00740
00741 float64_t sum=0;
00742 int32_t len=0;
00743 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00744 ASSERT(max_mismatch==0);
00745 int32_t *vec=new int32_t[len];
00746
00747 for (int32_t i=0; i<len; i++)
00748 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00749
00750 for (int32_t i=0; i<len; i++)
00751 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00752
00753 if (opt_type==SLOWBUTMEMEFFICIENT)
00754 {
00755 for (int32_t i=0; i<len; i++)
00756 {
00757 for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
00758 {
00759 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00760 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00761 }
00762 }
00763 }
00764
00765 delete[] vec ;
00766
00767 return normalizer->normalize_rhs(sum, idx);
00768 }
00769
00770 void CWeightedDegreePositionStringKernel::compute_by_tree(
00771 int32_t idx, float64_t* LevelContrib)
00772 {
00773 ASSERT(position_weights_lhs==NULL);
00774 ASSERT(position_weights_rhs==NULL);
00775 ASSERT(alphabet);
00776 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00777
00778 int32_t len=0;
00779 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len);
00780 ASSERT(max_mismatch==0);
00781 int32_t *vec=new int32_t[len];
00782
00783 for (int32_t i=0; i<len; i++)
00784 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00785
00786 for (int32_t i=0; i<len; i++)
00787 {
00788 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00789 normalizer->normalize_rhs(1.0, idx), mkl_stepsize, weights,
00790 (length!=0));
00791 }
00792
00793 if (opt_type==SLOWBUTMEMEFFICIENT)
00794 {
00795 for (int32_t i=0; i<len; i++)
00796 for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
00797 {
00798 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
00799 normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
00800 weights, (length!=0)) ;
00801 tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
00802 LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
00803 mkl_stepsize, weights, (length!=0)) ;
00804 }
00805 }
00806
00807 delete[] vec ;
00808 }
00809
00810 float64_t* CWeightedDegreePositionStringKernel::compute_abs_weights(
00811 int32_t &len)
00812 {
00813 return tries.compute_abs_weights(len);
00814 }
00815
00816 bool CWeightedDegreePositionStringKernel::set_shifts(
00817 int32_t* shift_, int32_t shift_len_)
00818 {
00819 delete[] shift;
00820
00821 shift_len = shift_len_ ;
00822 shift = new int32_t[shift_len] ;
00823
00824 if (shift)
00825 {
00826 max_shift = 0 ;
00827
00828 for (int32_t i=0; i<shift_len; i++)
00829 {
00830 shift[i] = shift_[i] ;
00831 if (shift[i]>max_shift)
00832 max_shift = shift[i] ;
00833 }
00834
00835 ASSERT(max_shift>=0 && max_shift<=shift_len);
00836 }
00837
00838 return false;
00839 }
00840
00841 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00842 {
00843 ASSERT(degree>0);
00844
00845 delete[] weights;
00846 weights=new float64_t[degree];
00847 if (weights)
00848 {
00849 int32_t i;
00850 float64_t sum=0;
00851 for (i=0; i<degree; i++)
00852 {
00853 weights[i]=degree-i;
00854 sum+=weights[i];
00855 }
00856 for (i=0; i<degree; i++)
00857 weights[i]/=sum;
00858
00859 for (i=0; i<degree; i++)
00860 {
00861 for (int32_t j=1; j<=max_mismatch; j++)
00862 {
00863 if (j<i+1)
00864 {
00865 int32_t nk=CMath::nchoosek(i+1, j);
00866 weights[i+j*degree]=weights[i]/(nk*pow(3,j));
00867 }
00868 else
00869 weights[i+j*degree]= 0;
00870 }
00871 }
00872
00873 return true;
00874 }
00875 else
00876 return false;
00877 }
00878
00879 bool CWeightedDegreePositionStringKernel::set_weights(
00880 float64_t* ws, int32_t d, int32_t len)
00881 {
00882 SG_DEBUG("degree = %i d=%i\n", degree, d);
00883 degree=d;
00884 length=len;
00885
00886 if (len <= 0)
00887 len=1;
00888
00889 delete[] weights;
00890 weights=new float64_t[d*len];
00891
00892 if (weights)
00893 {
00894 for (int32_t i=0; i<degree*len; i++)
00895 weights[i]=ws[i];
00896 return true;
00897 }
00898 else
00899 return false;
00900 }
00901
00902 bool CWeightedDegreePositionStringKernel::set_position_weights(
00903 float64_t* pws, int32_t len)
00904 {
00905
00906
00907 if (len==0)
00908 {
00909 delete[] position_weights;
00910 position_weights = NULL;
00911 tries.set_position_weights(position_weights);
00912 return true;
00913 }
00914 if (seq_length==0)
00915 seq_length=len;
00916
00917 if (seq_length!=len)
00918 {
00919 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00920 return false;
00921 }
00922 delete[] position_weights;
00923 position_weights=new float64_t[len];
00924 tries.set_position_weights(position_weights);
00925
00926 if (position_weights)
00927 {
00928 for (int32_t i=0; i<len; i++)
00929 position_weights[i]=pws[i];
00930 return true;
00931 }
00932 else
00933 return false;
00934 }
00935
00936 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num)
00937 {
00938
00939
00940 if (position_weights_rhs==position_weights_lhs)
00941 position_weights_rhs=NULL;
00942 else
00943 delete_position_weights_rhs();
00944
00945 if (len==0)
00946 {
00947 return delete_position_weights_lhs();
00948 }
00949
00950 if (seq_length!=len)
00951 {
00952 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00953 return false;
00954 }
00955 if (!lhs)
00956 {
00957 SG_ERROR("lhs=NULL\n");
00958 return false ;
00959 }
00960 if (lhs->get_num_vectors()!=num)
00961 {
00962 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
00963 return false;
00964 }
00965
00966 delete[] position_weights_lhs;
00967 position_weights_lhs=new float64_t[len*num];
00968 if (position_weights_lhs)
00969 {
00970 for (int32_t i=0; i<len*num; i++)
00971 position_weights_lhs[i]=pws[i];
00972 return true;
00973 }
00974 else
00975 return false;
00976 }
00977
00978 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(
00979 float64_t* pws, int32_t len, int32_t num)
00980 {
00981
00982 if (len==0)
00983 {
00984 if (position_weights_rhs==position_weights_lhs)
00985 {
00986 position_weights_rhs=NULL;
00987 return true;
00988 }
00989 return delete_position_weights_rhs();
00990 }
00991
00992 if (seq_length!=len)
00993 {
00994 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00995 return false;
00996 }
00997 if (!rhs)
00998 {
00999 if (!lhs)
01000 {
01001 SG_ERROR("rhs==0 and lhs=NULL\n");
01002 return false;
01003 }
01004 if (lhs->get_num_vectors()!=num)
01005 {
01006 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
01007 return false;
01008 }
01009 } else
01010 if (rhs->get_num_vectors()!=num)
01011 {
01012 SG_ERROR("rhs->get_num_vectors()=%i, num=%i\n", rhs->get_num_vectors(), num);
01013 return false;
01014 }
01015
01016 delete[] position_weights_rhs;
01017 position_weights_rhs=new float64_t[len*num];
01018 if (position_weights_rhs)
01019 {
01020 for (int32_t i=0; i<len*num; i++)
01021 position_weights_rhs[i]=pws[i];
01022 return true;
01023 }
01024 else
01025 return false;
01026 }
01027
01028 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01029 {
01030 delete[] block_weights;
01031 block_weights=new float64_t[CMath::max(seq_length,degree)];
01032
01033 if (block_weights)
01034 {
01035 int32_t k;
01036 float64_t d=degree;
01037
01038 for (k=0; k<degree; k++)
01039 block_weights[k]=
01040 (-pow(k, 3)+(3*d-3)*pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
01041 for (k=degree; k<seq_length; k++)
01042 block_weights[k]=(-d+3*k+4)/3;
01043 }
01044
01045 return (block_weights!=NULL);
01046 }
01047
01048 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01049 {
01050 ASSERT(weights);
01051 delete[] block_weights;
01052 block_weights=new float64_t[CMath::max(seq_length,degree)];
01053
01054 if (block_weights)
01055 {
01056 int32_t i=0;
01057 block_weights[0]=weights[0];
01058 for (i=1; i<CMath::max(seq_length,degree); i++)
01059 block_weights[i]=0;
01060
01061 for (i=1; i<CMath::max(seq_length,degree); i++)
01062 {
01063 block_weights[i]=block_weights[i-1];
01064
01065 float64_t contrib=0;
01066 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
01067 contrib+=weights[j];
01068
01069 block_weights[i]+=contrib;
01070 }
01071 }
01072
01073 return (block_weights!=NULL);
01074 }
01075
01076 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01077 {
01078 delete[] block_weights;
01079 block_weights=new float64_t[seq_length];
01080
01081 if (block_weights)
01082 {
01083 for (int32_t i=1; i<seq_length+1 ; i++)
01084 block_weights[i-1]=1.0/seq_length;
01085 }
01086
01087 return (block_weights!=NULL);
01088 }
01089
01090 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01091 {
01092 delete[] block_weights;
01093 block_weights=new float64_t[seq_length];
01094
01095 if (block_weights)
01096 {
01097 for (int32_t i=1; i<seq_length+1 ; i++)
01098 block_weights[i-1]=degree*i;
01099 }
01100
01101 return (block_weights!=NULL);
01102 }
01103
01104 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01105 {
01106 delete[] block_weights;
01107 block_weights=new float64_t[seq_length];
01108
01109 if (block_weights)
01110 {
01111 for (int32_t i=1; i<degree+1 ; i++)
01112 block_weights[i-1]=((float64_t) i)*i;
01113
01114 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01115 block_weights[i-1]=i;
01116 }
01117
01118 return (block_weights!=NULL);
01119 }
01120
01121 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01122 {
01123 delete[] block_weights;
01124 block_weights=new float64_t[seq_length];
01125
01126 if (block_weights)
01127 {
01128 for (int32_t i=1; i<degree+1 ; i++)
01129 block_weights[i-1]=((float64_t) i)*i*i;
01130
01131 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01132 block_weights[i-1]=i;
01133 }
01134
01135 return (block_weights!=NULL);
01136 }
01137
01138 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01139 {
01140 delete[] block_weights;
01141 block_weights=new float64_t[seq_length];
01142
01143 if (block_weights)
01144 {
01145 for (int32_t i=1; i<degree+1 ; i++)
01146 block_weights[i-1]=exp(((float64_t) i/10.0));
01147
01148 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01149 block_weights[i-1]=i;
01150 }
01151
01152 return (block_weights!=NULL);
01153 }
01154
01155 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01156 {
01157 delete[] block_weights;
01158 block_weights=new float64_t[seq_length];
01159
01160 if (block_weights)
01161 {
01162 for (int32_t i=1; i<degree+1 ; i++)
01163 block_weights[i-1]=pow(log(i),2);
01164
01165 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01166 block_weights[i-1]=i-degree+1+pow(log(degree+1),2);
01167 }
01168
01169 return (block_weights!=NULL);
01170 }
01171
01172 bool CWeightedDegreePositionStringKernel::init_block_weights_external()
01173 {
01174 if (block_weights_external && (seq_length == num_block_weights_external) )
01175 {
01176 delete[] block_weights;
01177 block_weights=new float64_t[seq_length];
01178
01179 if (block_weights)
01180 {
01181 for (int32_t i=0; i<seq_length; i++)
01182 block_weights[i]=block_weights_external[i];
01183 }
01184 }
01185 else {
01186 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
01187 }
01188 return (block_weights!=NULL);
01189 }
01190
01191 bool CWeightedDegreePositionStringKernel::init_block_weights()
01192 {
01193 switch (type)
01194 {
01195 case E_WD:
01196 return init_block_weights_from_wd();
01197 case E_EXTERNAL:
01198 return init_block_weights_from_wd_external();
01199 case E_BLOCK_CONST:
01200 return init_block_weights_const();
01201 case E_BLOCK_LINEAR:
01202 return init_block_weights_linear();
01203 case E_BLOCK_SQPOLY:
01204 return init_block_weights_sqpoly();
01205 case E_BLOCK_CUBICPOLY:
01206 return init_block_weights_cubicpoly();
01207 case E_BLOCK_EXP:
01208 return init_block_weights_exp();
01209 case E_BLOCK_LOG:
01210 return init_block_weights_log();
01211 case E_BLOCK_EXTERNAL:
01212 return init_block_weights_external();
01213 default:
01214 return false;
01215 };
01216 }
01217
01218
01219
01220 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01221 {
01222 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01223 int32_t j=params->j;
01224 CWeightedDegreePositionStringKernel* wd=params->kernel;
01225 CTrie<DNATrie>* tries=params->tries;
01226 float64_t* weights=params->weights;
01227 int32_t length=params->length;
01228 int32_t max_shift=params->max_shift;
01229 int32_t* vec=params->vec;
01230 float64_t* result=params->result;
01231 float64_t factor=params->factor;
01232 int32_t* shift=params->shift;
01233 int32_t* vec_idx=params->vec_idx;
01234
01235 for (int32_t i=params->start; i<params->end; i++)
01236 {
01237 int32_t len=0;
01238 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
01239 CAlphabet* alpha=wd->alphabet;
01240
01241 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len);
01242 for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01243 vec[k]=alpha->remap_to_bin(char_vec[k]);
01244
01245 SG_UNREF(rhs_feat);
01246
01247 result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
01248
01249 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01250 {
01251 for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01252 {
01253 int32_t s=j-q ;
01254 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01255 {
01256 result[i] +=
01257 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01258 len, q, q+s, q, weights, (length!=0)),
01259 vec_idx[i])/(2.0*s);
01260 }
01261 }
01262
01263 for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
01264 {
01265 result[i] +=
01266 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01267 len, j+s, j, j+s, weights, (length!=0)),
01268 vec_idx[i])/(2.0*s);
01269 }
01270 }
01271 }
01272
01273 return NULL;
01274 }
01275
01276 void CWeightedDegreePositionStringKernel::compute_batch(
01277 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
01278 int32_t* IDX, float64_t* alphas, float64_t factor)
01279 {
01280 ASSERT(alphabet);
01281 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01282 ASSERT(position_weights_lhs==NULL);
01283 ASSERT(position_weights_rhs==NULL);
01284 ASSERT(rhs);
01285 ASSERT(num_vec<=rhs->get_num_vectors());
01286 ASSERT(num_vec>0);
01287 ASSERT(vec_idx);
01288 ASSERT(result);
01289 create_empty_tries();
01290
01291 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01292 ASSERT(num_feat>0);
01293 int32_t num_threads=parallel.get_num_threads();
01294 ASSERT(num_threads>0);
01295 int32_t* vec=new int32_t[num_threads*num_feat];
01296
01297 if (num_threads < 2)
01298 {
01299 #ifdef WIN32
01300 for (int32_t j=0; j<num_feat; j++)
01301 #else
01302 CSignal::clear_cancel();
01303 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01304 #endif
01305 {
01306 init_optimization(num_suppvec, IDX, alphas, j);
01307 S_THREAD_PARAM<DNATrie> params;
01308 params.vec=vec;
01309 params.result=result;
01310 params.weights=weights;
01311 params.kernel=this;
01312 params.tries=&tries;
01313 params.factor=factor;
01314 params.j=j;
01315 params.start=0;
01316 params.end=num_vec;
01317 params.length=length;
01318 params.max_shift=max_shift;
01319 params.shift=shift;
01320 params.vec_idx=vec_idx;
01321 compute_batch_helper((void*) ¶ms);
01322
01323 SG_PROGRESS(j,0,num_feat);
01324 }
01325 }
01326 #ifndef WIN32
01327 else
01328 {
01329
01330 CSignal::clear_cancel();
01331 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01332 {
01333 init_optimization(num_suppvec, IDX, alphas, j);
01334 pthread_t threads[num_threads-1];
01335 S_THREAD_PARAM<DNATrie> params[num_threads];
01336 int32_t step= num_vec/num_threads;
01337 int32_t t;
01338
01339 for (t=0; t<num_threads-1; t++)
01340 {
01341 params[t].vec=&vec[num_feat*t];
01342 params[t].result=result;
01343 params[t].weights=weights;
01344 params[t].kernel=this;
01345 params[t].tries=&tries;
01346 params[t].factor=factor;
01347 params[t].j=j;
01348 params[t].start = t*step;
01349 params[t].end = (t+1)*step;
01350 params[t].length=length;
01351 params[t].max_shift=max_shift;
01352 params[t].shift=shift;
01353 params[t].vec_idx=vec_idx;
01354 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01355 }
01356
01357 params[t].vec=&vec[num_feat*t];
01358 params[t].result=result;
01359 params[t].weights=weights;
01360 params[t].kernel=this;
01361 params[t].tries=&tries;
01362 params[t].factor=factor;
01363 params[t].j=j;
01364 params[t].start=t*step;
01365 params[t].end=num_vec;
01366 params[t].length=length;
01367 params[t].max_shift=max_shift;
01368 params[t].shift=shift;
01369 params[t].vec_idx=vec_idx;
01370 compute_batch_helper((void*) ¶ms[t]);
01371
01372 for (t=0; t<num_threads-1; t++)
01373 pthread_join(threads[t], NULL);
01374 SG_PROGRESS(j,0,num_feat);
01375 }
01376 }
01377 #endif
01378
01379 delete[] vec;
01380
01381
01382
01383 create_empty_tries();
01384 }
01385
01386 float64_t* CWeightedDegreePositionStringKernel::compute_scoring(
01387 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
01388 int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01389 {
01390 ASSERT(position_weights_lhs==NULL);
01391 ASSERT(position_weights_rhs==NULL);
01392
01393 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01394 ASSERT(num_feat>0);
01395 ASSERT(alphabet);
01396 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01397
01398 num_sym=4;
01399
01400 ASSERT(max_degree>0);
01401
01402
01403 int32_t* nofsKmers=new int32_t[max_degree];
01404 float64_t** C=new float64_t*[max_degree];
01405 float64_t** L=new float64_t*[max_degree];
01406 float64_t** R=new float64_t*[max_degree];
01407
01408 int32_t i;
01409 int32_t k;
01410
01411
01412 int32_t bigtabSize=0;
01413 for (k=0; k<max_degree; ++k )
01414 {
01415 nofsKmers[k]=(int32_t) pow(num_sym, k+1);
01416 const int32_t tabSize=nofsKmers[k]*num_feat;
01417 bigtabSize+=tabSize;
01418 }
01419 result=new float64_t[bigtabSize];
01420
01421
01422 int32_t tabOffs=0;
01423 for( k = 0; k < max_degree; ++k )
01424 {
01425 const int32_t tabSize = nofsKmers[k] * num_feat;
01426 C[k] = &result[tabOffs];
01427 L[k] = new float64_t[ tabSize ];
01428 R[k] = new float64_t[ tabSize ];
01429 tabOffs+=tabSize;
01430 for(i = 0; i < tabSize; i++ )
01431 {
01432 C[k][i] = 0.0;
01433 L[k][i] = 0.0;
01434 R[k][i] = 0.0;
01435 }
01436 }
01437
01438
01439 float64_t* margFactors=new float64_t[degree];
01440
01441 int32_t* x = new int32_t[ degree+1 ];
01442 int32_t* substrs = new int32_t[ degree+1 ];
01443
01444 margFactors[0] = 1.0;
01445 substrs[0] = 0;
01446 for( k=1; k < degree; ++k ) {
01447 margFactors[k] = 0.25 * margFactors[k-1];
01448 substrs[k] = -1;
01449 }
01450 substrs[degree] = -1;
01451
01452 struct TreeParseInfo info;
01453 info.num_sym = num_sym;
01454 info.num_feat = num_feat;
01455 info.p = -1;
01456 info.k = -1;
01457 info.nofsKmers = nofsKmers;
01458 info.margFactors = margFactors;
01459 info.x = x;
01460 info.substrs = substrs;
01461 info.y0 = 0;
01462 info.C_k = NULL;
01463 info.L_k = NULL;
01464 info.R_k = NULL;
01465
01466
01467 i = 0;
01468 for( k = 0; k < max_degree; ++k )
01469 {
01470 const int32_t nofKmers = nofsKmers[ k ];
01471 info.C_k = C[k];
01472 info.L_k = L[k];
01473 info.R_k = R[k];
01474
01475
01476 for(int32_t p = 0; p < num_feat; ++p )
01477 {
01478 init_optimization( num_suppvec, IDX, alphas, p );
01479 int32_t tree = p ;
01480 for(int32_t j = 0; j < degree+1; j++ ) {
01481 x[j] = -1;
01482 }
01483 tries.traverse( tree, p, info, 0, x, k );
01484 SG_PROGRESS(i++,0,num_feat*max_degree);
01485 }
01486
01487
01488 if( k > 0 ) {
01489 const int32_t j = k - 1;
01490 const int32_t nofJmers = (int32_t) pow( num_sym, j+1 );
01491 for(int32_t p = 0; p < num_feat; ++p ) {
01492 const int32_t offsetJ = nofJmers * p;
01493 const int32_t offsetJ1 = nofJmers * (p+1);
01494 const int32_t offsetK = nofKmers * p;
01495 int32_t y;
01496 int32_t sym;
01497 for( y = 0; y < nofJmers; ++y ) {
01498 for( sym = 0; sym < num_sym; ++sym ) {
01499 const int32_t y_sym = num_sym*y + sym;
01500 const int32_t sym_y = nofJmers*sym + y;
01501 ASSERT(0<=y_sym && y_sym<nofKmers);
01502 ASSERT(0<=sym_y && sym_y<nofKmers);
01503 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01504 if( p < num_feat-1 ) {
01505 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01506 }
01507 }
01508 }
01509 }
01510 }
01511
01512
01513
01514
01515
01516
01517
01518
01519
01520
01521
01522 }
01523
01524
01525 num_feat=1;
01526 num_sym = bigtabSize;
01527
01528 delete[] nofsKmers;
01529 delete[] margFactors;
01530 delete[] substrs;
01531 delete[] x;
01532 delete[] C;
01533 for( k = 0; k < max_degree; ++k ) {
01534 delete[] L[k];
01535 delete[] R[k];
01536 }
01537 delete[] L;
01538 delete[] R;
01539 return result;
01540 }
01541
01542 char* CWeightedDegreePositionStringKernel::compute_consensus(
01543 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01544 {
01545 ASSERT(position_weights_lhs==NULL);
01546 ASSERT(position_weights_rhs==NULL);
01547
01548 ASSERT(degree<=32);
01549 ASSERT(!tries.get_use_compact_terminal_nodes());
01550 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01551 ASSERT(num_feat>0);
01552 ASSERT(alphabet);
01553 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01554
01555
01556 char* result=new char[num_feat];
01557
01558
01559 int32_t num_tables=CMath::max(1,num_feat-degree+1);
01560 CDynamicArray<ConsensusEntry>** table=new CDynamicArray<ConsensusEntry>*[num_tables];
01561
01562 for (int32_t i=0; i<num_tables; i++)
01563 table[i]=new CDynamicArray<ConsensusEntry>(num_suppvec/10);
01564
01565
01566 for (int32_t i=0; i<num_tables; i++)
01567 {
01568 bool cumulative=false;
01569
01570 if (i<num_tables-1)
01571 init_optimization(num_suppvec, IDX, alphas, i);
01572 else
01573 {
01574 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01575 cumulative=true;
01576 }
01577
01578 if (i==0)
01579 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01580 else
01581 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01582
01583 SG_PROGRESS(i,0,num_feat);
01584 }
01585
01586
01587
01588
01589
01590
01591
01592
01593
01594
01595
01596
01597
01598
01599
01600
01601
01602
01603
01604
01605
01606
01607
01608 const char* acgt="ACGT";
01609
01610
01611 int32_t max_idx=-1;
01612 float32_t max_score=0;
01613 int32_t num_elements=table[num_tables-1]->get_num_elements();
01614
01615 for (int32_t i=0; i<num_elements; i++)
01616 {
01617 float64_t sc=table[num_tables-1]->get_element(i).score;
01618 if (sc>max_score || max_idx==-1)
01619 {
01620 max_idx=i;
01621 max_score=sc;
01622 }
01623 }
01624 uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
01625
01626 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01627
01628 for (int32_t i=0; i<degree; i++)
01629 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01630
01631 if (num_tables>1)
01632 {
01633 for (int32_t i=num_tables-1; i>=0; i--)
01634 {
01635
01636 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01637 max_idx=table[i]->get_element(max_idx).bt;
01638 }
01639 }
01640
01641
01642
01643
01644
01645
01646
01647
01648
01649
01650
01651 for (int32_t i=0; i<num_tables; i++)
01652 delete table[i];
01653
01654 delete[] table;
01655 return result;
01656 }
01657
01658
01659 float64_t* CWeightedDegreePositionStringKernel::extract_w(
01660 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01661 float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01662 {
01663 delete_optimization();
01664 use_poim_tries=true;
01665 poim_tries.delete_trees(false);
01666
01667
01668 ASSERT(position_weights_lhs==NULL);
01669 ASSERT(position_weights_rhs==NULL);
01670 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01671 ASSERT(num_feat>0);
01672 ASSERT(alphabet->get_alphabet()==DNA);
01673 ASSERT(max_degree>0);
01674
01675
01676 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01677 const int32_t seqLen = num_feat;
01678 float64_t** subs;
01679 int32_t i;
01680 int32_t k;
01681
01682
01683
01684
01685 int32_t* offsets;
01686 int32_t offset;
01687 offsets = new int32_t[ max_degree ];
01688 offset = 0;
01689 for( k = 0; k < max_degree; ++k ) {
01690 offsets[k] = offset;
01691 const int32_t nofsKmers = (int32_t) pow( NUM_SYMS, k+1 );
01692 const int32_t tabSize = nofsKmers * seqLen;
01693 offset += tabSize;
01694 }
01695
01696 const int32_t bigTabSize = offset;
01697 w_result=new float64_t[bigTabSize];
01698 for (i=0; i<bigTabSize; ++i)
01699 w_result[i]=0;
01700
01701
01702 subs = new float64_t*[ max_degree ];
01703 ASSERT( subs != NULL );
01704 for( k = 0; k < max_degree; ++k ) {
01705 subs[k] = &w_result[ offsets[k] ];
01706 }
01707 delete[] offsets;
01708
01709
01710 init_optimization( num_suppvec, IDX, alphas, -1);
01711 poim_tries.POIMs_extract_W( subs, max_degree );
01712
01713
01714 delete[] subs;
01715 num_feat = 1;
01716 num_sym = bigTabSize;
01717 use_poim_tries=false;
01718 poim_tries.delete_trees(false);
01719 return w_result;
01720 }
01721
01722 float64_t* CWeightedDegreePositionStringKernel::compute_POIM(
01723 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01724 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
01725 float64_t* alphas, float64_t* distrib )
01726 {
01727 delete_optimization();
01728 use_poim_tries=true;
01729 poim_tries.delete_trees(false);
01730
01731
01732 ASSERT(position_weights_lhs==NULL);
01733 ASSERT(position_weights_rhs==NULL);
01734 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01735 ASSERT(num_feat>0);
01736 ASSERT(alphabet->get_alphabet()==DNA);
01737 ASSERT(max_degree!=0);
01738 ASSERT(distrib);
01739
01740
01741 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01742 const int32_t seqLen = num_feat;
01743 float64_t** subs;
01744 int32_t i;
01745 int32_t k;
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758 const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01759 if( debug ) {
01760 max_degree = abs(max_degree) / 4;
01761 switch( debug ) {
01762 case 1: {
01763 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01764 break;
01765 }
01766 case 2: {
01767 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01768 break;
01769 }
01770 case 3: {
01771 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01772 break;
01773 }
01774 case 4: {
01775 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01776 break;
01777 }
01778 default: {
01779 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01780 ASSERT(0);
01781 break;
01782 }
01783 }
01784 }
01785
01786
01787 int32_t* offsets;
01788 int32_t offset;
01789 offsets = new int32_t[ max_degree ];
01790 offset = 0;
01791 for( k = 0; k < max_degree; ++k ) {
01792 offsets[k] = offset;
01793 const int32_t nofsKmers = (int32_t) pow( NUM_SYMS, k+1 );
01794 const int32_t tabSize = nofsKmers * seqLen;
01795 offset += tabSize;
01796 }
01797
01798 const int32_t bigTabSize=offset;
01799 poim_result=new float64_t[bigTabSize];
01800 for (i=0; i<bigTabSize; ++i )
01801 poim_result[i]=0;
01802
01803
01804 subs=new float64_t*[max_degree];
01805 for (k=0; k<max_degree; ++k)
01806 subs[k]=&poim_result[offsets[k]];
01807
01808 delete[] offsets;
01809
01810
01811 init_optimization( num_suppvec, IDX, alphas, -1);
01812 poim_tries.POIMs_precalc_SLR( distrib );
01813
01814
01815 if( debug==0 || debug==1 ) {
01816 poim_tries.POIMs_extract_W( subs, max_degree );
01817 for( k = 1; k < max_degree; ++k ) {
01818 const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) pow(NUM_SYMS,k-1) : 0;
01819 const int32_t nofKmers1 = (int32_t) pow( NUM_SYMS, k );
01820 const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
01821 for( i = 0; i < seqLen; ++i ) {
01822 float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01823 float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01824 float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01825 float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01826 int32_t y0;
01827 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01828 const int32_t y1l = y0 / NUM_SYMS;
01829 const int32_t y1r = y0 % nofKmers1;
01830 const int32_t y2 = y1r / NUM_SYMS;
01831 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01832 if( i < seqLen-1 ) {
01833 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01834 if( k > 1 ) {
01835 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01836 }
01837 }
01838 }
01839 }
01840 }
01841 }
01842
01843
01844 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01845
01846
01847 delete[] subs;
01848 num_feat = 1;
01849 num_sym = bigTabSize;
01850
01851 use_poim_tries=false;
01852 poim_tries.delete_trees(false);
01853
01854 return poim_result;
01855 }
01856
01857
01858 void CWeightedDegreePositionStringKernel::prepare_POIM2(
01859 float64_t* distrib, int32_t num_sym, int32_t num_feat)
01860 {
01861 free(m_poim_distrib);
01862 m_poim_distrib=(float64_t*)malloc(num_sym*num_feat*sizeof(float64_t));
01863 ASSERT(m_poim_distrib);
01864
01865 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(float64_t));
01866 m_poim_num_sym=num_sym;
01867 m_poim_num_feat=num_feat;
01868 }
01869
01870 void CWeightedDegreePositionStringKernel::compute_POIM2(
01871 int32_t max_degree, CSVM* svm)
01872 {
01873 ASSERT(svm);
01874 int32_t num_suppvec=svm->get_num_support_vectors();
01875 int32_t* sv_idx=new int32_t[num_suppvec];
01876 float64_t* sv_weight=new float64_t[num_suppvec];
01877
01878 for (int32_t i=0; i<num_suppvec; i++)
01879 {
01880 sv_idx[i]=svm->get_support_vector(i);
01881 sv_weight[i]=svm->get_alpha(i);
01882 }
01883
01884 if ((max_degree < 1) || (max_degree > 12))
01885 {
01886
01887 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01888 max_degree=1;
01889 }
01890
01891 int32_t num_feat = m_poim_num_feat;
01892 int32_t num_sym = m_poim_num_sym;
01893 free(m_poim);
01894
01895 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01896 sv_weight, m_poim_distrib);
01897
01898 ASSERT(num_feat==1);
01899 m_poim_result_len=num_sym;
01900
01901 delete[] sv_weight;
01902 delete[] sv_idx;
01903 }
01904
01905 void CWeightedDegreePositionStringKernel::get_POIM2(
01906 float64_t** poim, int32_t* result_len)
01907 {
01908 *poim=(float64_t*) malloc(m_poim_result_len*sizeof(float64_t));
01909 ASSERT(*poim);
01910 memcpy(*poim, m_poim, m_poim_result_len*sizeof(float64_t)) ;
01911 *result_len=m_poim_result_len ;
01912 }
01913
01914 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01915 {
01916 free(m_poim) ;
01917 m_poim=NULL ;
01918 free(m_poim_distrib) ;
01919 m_poim_distrib=NULL ;
01920 m_poim_num_sym=0 ;
01921 m_poim_num_sym=0 ;
01922 m_poim_result_len=0 ;
01923 }
01924