00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "structure/DynProg.h"
00013 #include "lib/Mathematics.h"
00014 #include "lib/io.h"
00015 #include "lib/config.h"
00016 #include "features/StringFeatures.h"
00017 #include "features/CharFeatures.h"
00018 #include "features/Alphabet.h"
00019 #include "structure/Plif.h"
00020 #include "lib/Array.h"
00021 #include "lib/Array2.h"
00022 #include "lib/Array3.h"
00023
00024 #include <stdlib.h>
00025 #include <stdio.h>
00026 #include <time.h>
00027 #include <ctype.h>
00028
00029 template void CDynProg::best_path_trans<1,true,false>(const DREAL *seq, INT seq_len, const INT *pos,
00030 const INT *orf_info, CPlifBase **PLif_matrix,
00031 CPlifBase **Plif_state_signals, INT max_num_signals,
00032 INT genestr_num,
00033 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00034 bool use_orf) ;
00035 template void CDynProg::best_path_trans<2,true,false>(const DREAL *seq, INT seq_len, const INT *pos,
00036 const INT *orf_info, CPlifBase **PLif_matrix,
00037 CPlifBase **Plif_state_signals, INT max_num_signals,
00038 INT genestr_num,
00039 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00040 bool use_orf) ;
00041
00042 template void CDynProg::best_path_trans<1,false,false>(const DREAL *seq, INT seq_len, const INT *pos,
00043 const INT *orf_info, CPlifBase **PLif_matrix,
00044 CPlifBase **Plif_state_signals, INT max_num_signals,
00045 INT genestr_num,
00046 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00047 bool use_orf) ;
00048 template void CDynProg::best_path_trans<2,false,false>(const DREAL *seq, INT seq_len, const INT *pos,
00049 const INT *orf_info, CPlifBase **PLif_matrix,
00050 CPlifBase **Plif_state_signals, INT max_num_signals,
00051 INT genestr_num,
00052 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00053 bool use_orf) ;
00054
00055
00056 #ifdef SUNOS
00057 extern "C" int finite(double);
00058 #endif
00059
00060
00061
00062
00063
00064
00065 static INT word_degree_default[4]={3,4,5,6} ;
00066 static INT cum_num_words_default[5]={0,64,320,1344,5440} ;
00067 static INT num_words_default[4]= {64,256,1024,4096} ;
00068 static INT mod_words_default[32] = {1,1,1,1,1,1,1,1,
00069 1,1,1,1,1,1,1,1,
00070 0,0,0,0,0,0,0,0,
00071 0,0,0,0,0,0,0,0} ;
00072 static bool sign_words_default[16] = {true,true,true,true,true,true,true,true,
00073 false,false,false,false,false,false,false,false} ;
00074 static INT string_words_default[16] = {0,0,0,0,0,0,0,0,
00075 1,1,1,1,1,1,1,1} ;
00076
00077 CDynProg::CDynProg(INT p_num_svms )
00078 : CSGObject(), transition_matrix_a_id(1,1), transition_matrix_a(1,1),
00079 transition_matrix_a_deriv(1,1), initial_state_distribution_p(1),
00080 initial_state_distribution_p_deriv(1), end_state_distribution_q(1),
00081 end_state_distribution_q_deriv(1), dict_weights(1,1),
00082 dict_weights_array(dict_weights.get_array()),
00083
00084
00085 num_degrees(4),
00086 num_svms(8),
00087 num_strings(1),
00088 word_degree(word_degree_default, num_degrees, true, true),
00089 cum_num_words(cum_num_words_default, num_degrees+1, true, true),
00090 cum_num_words_array(cum_num_words.get_array()),
00091 num_words(num_words_default, num_degrees, true, true),
00092 num_words_array(num_words.get_array()),
00093 mod_words(mod_words_default, num_svms, 2, true, true),
00094 mod_words_array(mod_words.get_array()),
00095 sign_words(sign_words_default, num_svms, true, true),
00096 sign_words_array(sign_words.get_array()),
00097 string_words(string_words_default, num_svms, true, true),
00098 string_words_array(string_words.get_array()),
00099
00100
00101
00102 svm_pos_start(num_degrees),
00103 num_unique_words(num_degrees),
00104 svm_arrays_clean(true),
00105
00106
00107 num_svms_single(1),
00108 word_degree_single(1),
00109 num_words_single(4),
00110 word_used_single(num_words_single),
00111 svm_value_unnormalized_single(num_svms_single),
00112 num_unique_words_single(0),
00113
00114 max_a_id(0), m_seq(1,1,1), m_pos(1), m_orf_info(1,2),
00115 m_segment_sum_weights(1,1), m_plif_list(1),
00116 m_PEN(1,1), m_PEN_state_signals(1,1),
00117 m_genestr(1,1), m_dict_weights(1,1), m_segment_loss(1,1,2),
00118 m_segment_ids(1),
00119 m_segment_mask(1),
00120 m_scores(1), m_states(1,1), m_positions(1,1), m_genestr_stop(1),
00121 m_precomputed_svm_values(1,1),
00122 m_precomputed_tiling_values(1,1)
00123 {
00124 trans_list_forward = NULL ;
00125 trans_list_forward_cnt = NULL ;
00126 trans_list_forward_val = NULL ;
00127 trans_list_forward_id = NULL ;
00128 trans_list_len = 0 ;
00129
00130 mem_initialized = true ;
00131
00132 this->N=1;
00133 m_step=0;
00134
00135 m_raw_intensities = NULL;
00136 m_probe_pos = NULL;
00137 m_use_tiling=false;
00138 m_genestr_len = 0;
00139 #ifdef ARRAY_STATISTICS
00140 word_degree.set_name("word_degree");
00141 #endif
00142 }
00143
00144 CDynProg::~CDynProg()
00145 {
00146 if (trans_list_forward_cnt)
00147 delete[] trans_list_forward_cnt;
00148 if (trans_list_forward)
00149 {
00150 for (INT i=0; i<trans_list_len; i++)
00151 {
00152 if (trans_list_forward[i])
00153 delete[] trans_list_forward[i];
00154 }
00155 delete[] trans_list_forward;
00156 }
00157 if (trans_list_forward_val)
00158 {
00159 for (INT i=0; i<trans_list_len; i++)
00160 {
00161 if (trans_list_forward_val[i])
00162 delete[] trans_list_forward_val[i];
00163 }
00164 delete[] trans_list_forward_val;
00165 }
00166 if (trans_list_forward_id)
00167 {
00168 for (INT i=0; i<trans_list_len; i++)
00169 {
00170 if (trans_list_forward_id[i])
00171 delete[] trans_list_forward_id[i];
00172 }
00173 delete[] trans_list_forward_id;
00174 }
00175
00176
00177
00178
00179
00180 }
00181
00183 void CDynProg::set_genestr_len(INT genestr_len)
00184 {
00185 m_genestr_len = genestr_len;
00186 }
00187 INT CDynProg::get_num_svms()
00188 {
00189 return num_svms;
00190 }
00191 void CDynProg::precompute_stop_codons(const CHAR* sequence, INT length)
00192 {
00193 m_genestr_stop.resize_array(length) ;
00194 m_genestr_stop.zero() ;
00195 m_genestr_stop.set_name("genestr_stop") ;
00196 {
00197 for (INT i=0; i<length-2; i++)
00198 if ((sequence[i]=='t' || sequence[i]=='T') &&
00199 (((sequence[i+1]=='a' || sequence[i+1]=='A') &&
00200 (sequence[i+2]=='a' || sequence[i+2]=='g' || sequence[i+2]=='A' || sequence[i+2]=='G')) ||
00201 ((sequence[i+1]=='g'||sequence[i+1]=='G') && (sequence[i+2]=='a' || sequence[i+2]=='A') )))
00202 m_genestr_stop.element(i)=true ;
00203 else
00204 m_genestr_stop.element(i)=false ;
00205 m_genestr_stop.element(length-1)=false ;
00206 m_genestr_stop.element(length-1)=false ;
00207 }
00208 }
00209 void CDynProg::set_num_states(INT p_N)
00210 {
00211 N=p_N ;
00212
00213 transition_matrix_a_id.resize_array(N,N) ;
00214 transition_matrix_a.resize_array(N,N) ;
00215 transition_matrix_a_deriv.resize_array(N,N) ;
00216 initial_state_distribution_p.resize_array(N) ;
00217 initial_state_distribution_p_deriv.resize_array(N) ;
00218 end_state_distribution_q.resize_array(N);
00219 end_state_distribution_q_deriv.resize_array(N) ;
00220
00221 m_orf_info.resize_array(N,2) ;
00222 m_PEN.resize_array(N,N) ;
00223 m_PEN_state_signals.resize_array(N,1) ;
00224 }
00225 INT CDynProg::get_num_states()
00226 {
00227 return N;
00228 }
00229 void CDynProg::init_tiling_data(DREAL* probe_pos, DREAL* intensities, const INT num_probes, const INT seq_len)
00230 {
00231 m_probe_pos = new INT[num_probes];
00232 for (INT i=0;i<num_probes;i++)
00233 m_probe_pos[i] = (INT) probe_pos[i];
00234 m_raw_intensities = intensities;
00235 m_num_probes = num_probes;
00236 m_precomputed_tiling_values.resize_array(num_svms,seq_len);
00237 m_use_tiling=true;
00238
00239
00240 }
00241 void CDynProg::init_content_svm_value_array(const INT seq_len)
00242 {
00243 m_precomputed_svm_values.resize_array(num_svms,seq_len);
00244 }
00245 void CDynProg::precompute_tiling_plifs(CPlif** PEN, const INT num_penalties, const INT seq_len, const INT* pos)
00246 {
00247
00248
00249 INT tiling_plif_ids[num_svms];
00250 INT num = 0;
00251 for (INT i=0; i<num_penalties; i++)
00252 {
00253 CPlif * plif = PEN[i];
00254 if (plif->get_use_svm()>num_svms)
00255 {
00256 tiling_plif_ids[num] = i;
00257 num++;
00258 }
00259 }
00260 ASSERT(num==num_svms)
00261
00262 DREAL tiling_plif[num_svms];
00263 DREAL svm_value[2*num_svms];
00264 for (INT i=0; i<num_svms; i++)
00265 tiling_plif[i]=0.0;
00266
00267 INT* p_tiling_pos = m_probe_pos;
00268 DREAL* p_tiling_data = m_raw_intensities;
00269 for (INT pos_idx=0;pos_idx<seq_len;pos_idx++)
00270 {
00271
00272
00273 while (*p_tiling_pos<pos[pos_idx])
00274 {
00275
00276 for (INT i=0; i<num_svms; i++)
00277 {
00278 svm_value[num_svms+i]=*p_tiling_data;
00279 CPlif * plif = PEN[tiling_plif_ids[i]];
00280 plif->set_do_calc(true);
00281 tiling_plif[i]+=plif->lookup_penalty(0,svm_value);
00282
00283 plif->set_do_calc(false);
00284
00285 }
00286 p_tiling_data++;
00287 p_tiling_pos++;
00288 }
00289 for (INT i=0; i<num_svms; i++)
00290 m_precomputed_tiling_values.set_element(tiling_plif[i],i,pos_idx);
00291 }
00292
00293
00294
00295
00296 }
00297 void CDynProg::create_word_string(const CHAR* genestr, INT genestr_num, INT genestr_len, WORD*** wordstr)
00298 {
00299 for (INT k=0; k<genestr_num; k++)
00300 {
00301 wordstr[k]=new WORD*[num_degrees] ;
00302 for (INT j=0; j<num_degrees; j++)
00303 {
00304 wordstr[k][j]=NULL ;
00305 {
00306 wordstr[k][j]=new WORD[genestr_len] ;
00307 for (INT i=0; i<genestr_len; i++)
00308 switch (genestr[i])
00309 {
00310 case 'A':
00311 case 'a': wordstr[k][j][i]=0 ; break ;
00312 case 'C':
00313 case 'c': wordstr[k][j][i]=1 ; break ;
00314 case 'G':
00315 case 'g': wordstr[k][j][i]=2 ; break ;
00316 case 'T':
00317 case 't': wordstr[k][j][i]=3 ; break ;
00318 default: ASSERT(0) ;
00319 }
00320 translate_from_single_order(wordstr[k][j], genestr_len, word_degree[j]-1, word_degree[j]) ;
00321 }
00322 }
00323 }
00324 precompute_stop_codons(genestr, genestr_len);
00325 }
00326 void CDynProg::precompute_content_values(WORD*** wordstr, const INT *pos,const INT seq_len, const INT genestr_len,DREAL *dictionary_weights,INT dict_len)
00327 {
00328
00329
00330
00331 dict_weights.set_array(dictionary_weights, dict_len, num_svms, false, false) ;
00332 dict_weights_array=dict_weights.get_array() ;
00333
00334 for (INT p=0 ; p<seq_len-1 ; p++)
00335 {
00336 INT from_pos = pos[p];
00337 INT to_pos = pos[p+1];
00338 DREAL my_svm_values_unnormalized[num_svms] ;
00339
00340
00341 ASSERT(from_pos<=genestr_len)
00342 ASSERT(to_pos<=genestr_len)
00343
00344 for (INT s=0; s<num_svms; s++)
00345 {
00346 my_svm_values_unnormalized[s]=0.0;
00347 }
00348 for (INT i=from_pos; i<to_pos;i++)
00349 {
00350 for (INT j=0; j<num_degrees; j++)
00351 {
00352 WORD word = wordstr[0][j][i] ;
00353 for (INT s=0; s<num_svms; s++)
00354 {
00355 if (s==4 && i%3!=0)
00356 continue;
00357 else if (s==5 && i%3!=1)
00358 continue;
00359 else if (s==6 && i%3!=2)
00360 continue;
00361 my_svm_values_unnormalized[s] += dict_weights_array[(word+cum_num_words_array[j])+s*cum_num_words_array[num_degrees]] ;
00362 }
00363 }
00364 }
00365 for (INT s=0; s<num_svms; s++)
00366 {
00367 DREAL prev = m_precomputed_svm_values.get_element(s,p);
00368 m_precomputed_svm_values.set_element(prev + my_svm_values_unnormalized[s],s,p+1);
00369 ASSERT(prev>-1e20);
00370 }
00371 }
00372 for (INT j=0; j<num_degrees; j++)
00373 delete[] wordstr[0][j] ;
00374 delete[] wordstr[0] ;
00375 }
00376 void CDynProg::set_p_vector(DREAL *p, INT p_N)
00377 {
00378 ASSERT(p_N==N);
00379
00380
00381
00382 initial_state_distribution_p.set_array(p, p_N, true, true) ;
00383 }
00384
00385 void CDynProg::set_q_vector(DREAL *q, INT q_N)
00386 {
00387 ASSERT(q_N==N);
00388 end_state_distribution_q.set_array(q, q_N, true, true) ;
00389 }
00390
00391 void CDynProg::set_a(DREAL *a, INT p_M, INT p_N)
00392 {
00393 ASSERT(p_N==N);
00394 ASSERT(p_M==p_N);
00395 transition_matrix_a.set_array(a, p_N, p_N, true, true) ;
00396 transition_matrix_a_deriv.resize_array(p_N, p_N) ;
00397 }
00398
00399 void CDynProg::set_a_id(INT *a, INT p_M, INT p_N)
00400 {
00401 ASSERT(p_N==N);
00402 ASSERT(p_M==p_N);
00403 transition_matrix_a_id.set_array(a, p_N, p_N, true, true) ;
00404 max_a_id = 0 ;
00405 for (INT i=0; i<p_N; i++)
00406 for (INT j=0; j<p_N; j++)
00407 max_a_id = CMath::max(max_a_id, transition_matrix_a_id.element(i,j)) ;
00408 }
00409
00410 void CDynProg::set_a_trans_matrix(DREAL *a_trans, INT num_trans, INT p_N)
00411 {
00412 if (!((p_N==3) || (p_N==4)))
00413 SG_ERROR("!((p_N==3) || (p_N==4)), p_N: %i\n",p_N);
00414
00415 delete[] trans_list_forward ;
00416 delete[] trans_list_forward_cnt ;
00417 delete[] trans_list_forward_val ;
00418 delete[] trans_list_forward_id ;
00419
00420 trans_list_forward = NULL ;
00421 trans_list_forward_cnt = NULL ;
00422 trans_list_forward_val = NULL ;
00423 trans_list_len = 0 ;
00424
00425 transition_matrix_a.zero() ;
00426 transition_matrix_a_id.zero() ;
00427
00428 mem_initialized = true ;
00429
00430 trans_list_forward_cnt=NULL ;
00431 trans_list_len = N ;
00432 trans_list_forward = new T_STATES*[N] ;
00433 trans_list_forward_cnt = new T_STATES[N] ;
00434 trans_list_forward_val = new DREAL*[N] ;
00435 trans_list_forward_id = new INT*[N] ;
00436
00437 INT start_idx=0;
00438 for (INT j=0; j<N; j++)
00439 {
00440 INT old_start_idx=start_idx;
00441
00442 while (start_idx<num_trans && a_trans[start_idx+num_trans]==j)
00443 {
00444 start_idx++;
00445
00446 if (start_idx>1 && start_idx<num_trans)
00447 ASSERT(a_trans[start_idx+num_trans-1] <= a_trans[start_idx+num_trans]);
00448 }
00449
00450 if (start_idx>1 && start_idx<num_trans)
00451 ASSERT(a_trans[start_idx+num_trans-1] <= a_trans[start_idx+num_trans]);
00452
00453 INT len=start_idx-old_start_idx;
00454 ASSERT(len>=0);
00455
00456 trans_list_forward_cnt[j] = 0 ;
00457
00458 if (len>0)
00459 {
00460 trans_list_forward[j] = new T_STATES[len] ;
00461 trans_list_forward_val[j] = new DREAL[len] ;
00462 trans_list_forward_id[j] = new INT[len] ;
00463 }
00464 else
00465 {
00466 trans_list_forward[j] = NULL;
00467 trans_list_forward_val[j] = NULL;
00468 trans_list_forward_id[j] = NULL;
00469 }
00470 }
00471
00472 for (INT i=0; i<num_trans; i++)
00473 {
00474 INT from_state = (INT)a_trans[i] ;
00475 INT to_state = (INT)a_trans[i+num_trans] ;
00476 DREAL val = a_trans[i+num_trans*2] ;
00477 INT id = 0 ;
00478 if (p_N==4)
00479 id = (INT)a_trans[i+num_trans*3] ;
00480
00481
00482 ASSERT(to_state>=0 && to_state<N);
00483 ASSERT(from_state>=0 && from_state<N);
00484
00485 trans_list_forward[to_state][trans_list_forward_cnt[to_state]]=from_state ;
00486 trans_list_forward_val[to_state][trans_list_forward_cnt[to_state]]=val ;
00487 trans_list_forward_id[to_state][trans_list_forward_cnt[to_state]]=id ;
00488 trans_list_forward_cnt[to_state]++ ;
00489 transition_matrix_a.element(from_state, to_state) = val ;
00490 transition_matrix_a_id.element(from_state, to_state) = id ;
00491
00492 } ;
00493
00494 max_a_id = 0 ;
00495 for (INT i=0; i<N; i++)
00496 for (INT j=0; j<N; j++)
00497 {
00498
00499
00500 max_a_id = CMath::max(max_a_id, transition_matrix_a_id.element(i,j)) ;
00501 }
00502
00503 }
00504
00505 void CDynProg::init_svm_arrays(INT p_num_degrees, INT p_num_svms)
00506 {
00507 svm_arrays_clean=false ;
00508
00509 word_degree.resize_array(num_degrees) ;
00510
00511 cum_num_words.resize_array(num_degrees+1) ;
00512 cum_num_words_array=cum_num_words.get_array() ;
00513
00514 num_words.resize_array(num_degrees) ;
00515 num_words_array=num_words.get_array() ;
00516
00517
00518 svm_pos_start.resize_array(num_degrees) ;
00519 num_unique_words.resize_array(num_degrees) ;
00520 }
00521
00522
00523 void CDynProg::init_word_degree_array(INT * p_word_degree_array, INT num_elem)
00524 {
00525 svm_arrays_clean=false ;
00526
00527 word_degree.resize_array(num_degrees) ;
00528 ASSERT(num_degrees==num_elem);
00529
00530
00531 for (INT i=0; i<num_degrees; i++)
00532 word_degree[i]=p_word_degree_array[i] ;
00533
00534 }
00535
00536 void CDynProg::init_cum_num_words_array(INT * p_cum_num_words_array, INT num_elem)
00537 {
00538 svm_arrays_clean=false ;
00539
00540 cum_num_words.resize_array(num_degrees+1) ;
00541 cum_num_words_array=cum_num_words.get_array() ;
00542 ASSERT(num_degrees+1==num_elem);
00543
00544 for (INT i=0; i<num_degrees+1; i++)
00545 cum_num_words[i]=p_cum_num_words_array[i] ;
00546 }
00547
00548 void CDynProg::init_num_words_array(INT * p_num_words_array, INT num_elem)
00549 {
00550 svm_arrays_clean=false ;
00551
00552 num_words.resize_array(num_degrees) ;
00553 num_words_array=num_words.get_array() ;
00554 ASSERT(num_degrees==num_elem);
00555
00556 for (INT i=0; i<num_degrees; i++)
00557 num_words[i]=p_num_words_array[i] ;
00558
00559
00560
00561 }
00562
00563 void CDynProg::init_mod_words_array(INT * mod_words_input, INT num_elem, INT num_columns)
00564 {
00565 INT* p_mod_words_array=new INT[num_columns*num_elem] ;
00566 for (INT i=0; i<num_columns; i++)
00567 for (INT j=0; j<num_elem; j++)
00568 p_mod_words_array[i*j] = mod_words_input[i*num_elem+j];
00569 svm_arrays_clean=false ;
00570
00571 ASSERT(num_svms==num_elem);
00572 ASSERT(num_columns==2);
00573
00574 mod_words.set_array(p_mod_words_array, num_elem, 2, true, true) ;
00575 mod_words_array = mod_words.get_array() ;
00576
00577
00578
00579
00580
00581 }
00582
00583 void CDynProg::init_sign_words_array(bool* p_sign_words_array, INT num_elem)
00584 {
00585 svm_arrays_clean=false ;
00586
00587 ASSERT(num_svms==num_elem);
00588
00589 sign_words.set_array(p_sign_words_array, num_elem, true, true) ;
00590 sign_words_array = sign_words.get_array() ;
00591 }
00592
00593 void CDynProg::init_string_words_array(INT* p_string_words_array, INT num_elem)
00594 {
00595 svm_arrays_clean=false ;
00596
00597 ASSERT(num_svms==num_elem);
00598
00599 string_words.set_array(p_string_words_array, num_elem, true, true) ;
00600 string_words_array = string_words.get_array() ;
00601 }
00602
00603 bool CDynProg::check_svm_arrays()
00604 {
00605
00606 if ((word_degree.get_dim1()==num_degrees) &&
00607 (cum_num_words.get_dim1()==num_degrees+1) &&
00608 (num_words.get_dim1()==num_degrees) &&
00609
00610
00611
00612
00613
00614 (svm_pos_start.get_dim1()==num_degrees) &&
00615 (num_unique_words.get_dim1()==num_degrees) &&
00616 (mod_words.get_dim1()==num_svms) &&
00617 (mod_words.get_dim2()==2) &&
00618 (sign_words.get_dim1()==num_svms) &&
00619 (string_words.get_dim1()==num_svms))
00620 {
00621 svm_arrays_clean=true ;
00622 return true ;
00623 }
00624 else
00625 {
00626 if ((num_unique_words.get_dim1()==num_degrees) &&
00627 (mod_words.get_dim1()==num_svms) &&
00628 (mod_words.get_dim2()==2) &&
00629 (sign_words.get_dim1()==num_svms) &&
00630 (string_words.get_dim1()==num_svms))
00631 fprintf(stderr, "OK\n") ;
00632 else
00633 fprintf(stderr, "not OK\n") ;
00634
00635 if (!(word_degree.get_dim1()==num_degrees))
00636 SG_WARNING("SVM array: word_degree.get_dim1()!=num_degrees") ;
00637 if (!(cum_num_words.get_dim1()==num_degrees+1))
00638 SG_WARNING("SVM array: cum_num_words.get_dim1()!=num_degrees+1") ;
00639 if (!(num_words.get_dim1()==num_degrees))
00640 SG_WARNING("SVM array: num_words.get_dim1()==num_degrees") ;
00641 if (!(svm_pos_start.get_dim1()==num_degrees))
00642 SG_WARNING("SVM array: svm_pos_start.get_dim1()!=num_degrees") ;
00643 if (!(num_unique_words.get_dim1()==num_degrees))
00644 SG_WARNING("SVM array: num_unique_words.get_dim1()!=num_degrees") ;
00645 if (!(mod_words.get_dim1()==num_svms))
00646 SG_WARNING("SVM array: mod_words.get_dim1()!=num_svms") ;
00647 if (!(mod_words.get_dim2()==2))
00648 SG_WARNING("SVM array: mod_words.get_dim2()!=2") ;
00649 if (!(sign_words.get_dim1()==num_svms))
00650 SG_WARNING("SVM array: sign_words.get_dim1()!=num_svms") ;
00651 if (!(string_words.get_dim1()==num_svms))
00652 SG_WARNING("SVM array: string_words.get_dim1()!=num_svms") ;
00653
00654 svm_arrays_clean=false ;
00655 return false ;
00656 }
00657 }
00658
00659 void CDynProg::best_path_set_seq(DREAL *seq, INT p_N, INT seq_len)
00660 {
00661 if (!svm_arrays_clean)
00662 {
00663 SG_ERROR( "SVM arrays not clean") ;
00664 return ;
00665 } ;
00666
00667 ASSERT(p_N==N);
00668 ASSERT(initial_state_distribution_p.get_dim1()==N);
00669 ASSERT(end_state_distribution_q.get_dim1()==N);
00670
00671 m_seq.set_array(seq, N, seq_len, 1, true, true) ;
00672 this->N=N ;
00673
00674 m_call=3 ;
00675 m_step=2 ;
00676 }
00677
00678 void CDynProg::best_path_set_seq3d(DREAL *seq, INT p_N, INT seq_len, INT max_num_signals)
00679 {
00680 if (!svm_arrays_clean)
00681 {
00682 SG_ERROR( "SVM arrays not clean") ;
00683 return ;
00684 } ;
00685
00686 ASSERT(p_N==N);
00687 ASSERT(initial_state_distribution_p.get_dim1()==N);
00688 ASSERT(end_state_distribution_q.get_dim1()==N);
00689
00690 m_seq.set_array(seq, N, seq_len, max_num_signals, true, true) ;
00691 this->N=N ;
00692
00693 m_call=3 ;
00694 m_step=2 ;
00695 }
00696
00697 void CDynProg::best_path_set_pos(INT *pos, INT seq_len)
00698 {
00699 if (m_step!=2)
00700 SG_ERROR( "please call best_path_set_seq first\n") ;
00701
00702 if (seq_len!=m_seq.get_dim2())
00703 SG_ERROR( "pos size does not match previous info %i!=%i\n", seq_len, m_seq.get_dim2()) ;
00704
00705 m_pos.set_array(pos, seq_len, true, true) ;
00706
00707 m_step=3 ;
00708 }
00709
00710 void CDynProg::best_path_set_orf_info(INT *orf_info, INT m, INT n)
00711 {
00712 if (m_step!=3)
00713 SG_ERROR( "please call best_path_set_pos first\n") ;
00714
00715 if (m!=N)
00716 SG_ERROR( "orf_info size does not match previous info %i!=%i\n", m, N) ;
00717 if (n!=2)
00718 SG_ERROR( "orf_info size incorrect %i!=2\n", n) ;
00719 m_orf_info.set_array(orf_info, m, n, true, true) ;
00720
00721 m_call=1 ;
00722 m_step=4 ;
00723 }
00724
00725 void CDynProg::best_path_set_segment_sum_weights(DREAL *segment_sum_weights, INT num_states, INT seq_len)
00726 {
00727 if (m_step!=3)
00728 SG_ERROR( "please call best_path_set_pos first\n") ;
00729
00730 if (num_states!=N)
00731 SG_ERROR( "segment_sum_weights size does not match previous info %i!=%i\n", num_states, N) ;
00732 if (seq_len!=m_pos.get_dim1())
00733 SG_ERROR( "segment_sum_weights size incorrect %i!=%i\n", seq_len, m_pos.get_dim1()) ;
00734
00735 m_segment_sum_weights.set_array(segment_sum_weights, num_states, seq_len, true, true) ;
00736
00737 m_call=2 ;
00738 m_step=4 ;
00739 }
00740
00741 void CDynProg::best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs)
00742 {
00743 ASSERT(plifs);
00744 CPlifBase** plif_list=plifs->get_array();
00745 INT num_plif=plifs->get_num_elements();
00746
00747 if (m_step!=4)
00748 SG_ERROR( "please call best_path_set_orf_info or best_path_segment_sum_weights first\n") ;
00749
00750 m_plif_list.set_array(plif_list, num_plif, true, true) ;
00751
00752 m_step=5 ;
00753 }
00754
00755 void CDynProg::best_path_set_plif_id_matrix(INT *plif_id_matrix, INT m, INT n)
00756 {
00757 if (m_step!=5)
00758 SG_ERROR( "please call best_path_set_plif_list first\n") ;
00759
00760 if ((m!=N) || (n!=N))
00761 SG_ERROR( "plif_id_matrix size does not match previous info %i!=%i or %i!=%i\n", m, N, n, N) ;
00762
00763 CArray2<INT> id_matrix(plif_id_matrix, N, N, false, false) ;
00764 #ifdef DYNPROG_DEBUG
00765 id_matrix.set_name("id_matrix");
00766 id_matrix.display_array();
00767 #endif //DYNPROG_DEBUG
00768 m_PEN.resize_array(N, N) ;
00769 for (INT i=0; i<N; i++)
00770 for (INT j=0; j<N; j++)
00771 if (id_matrix.element(i,j)>=0)
00772 m_PEN.element(i,j)=m_plif_list[id_matrix.element(i,j)] ;
00773 else
00774 m_PEN.element(i,j)=NULL ;
00775
00776 m_step=6 ;
00777 }
00778
00779 void CDynProg::best_path_set_plif_state_signal_matrix(INT *plif_id_matrix, INT m, INT max_num_signals)
00780 {
00781 if (m_step!=6)
00782 SG_ERROR( "please call best_path_set_plif_id_matrix first\n") ;
00783
00784 if (m!=N)
00785 SG_ERROR( "plif_state_signal_matrix size does not match previous info %i!=%i\n", m, N) ;
00786
00787 if (m_seq.get_dim3() != max_num_signals)
00788 SG_ERROR( "size(plif_state_signal_matrix,2) does not match with size(m_seq,3): %i!=%i\nSorry, Soeren... interface changed\n", m_seq.get_dim3(), max_num_signals) ;
00789
00790 CArray2<INT> id_matrix(plif_id_matrix, N, max_num_signals, false, false) ;
00791 m_PEN_state_signals.resize_array(N, max_num_signals) ;
00792 for (INT i=0; i<N; i++)
00793 {
00794 for (INT j=0; j<max_num_signals; j++)
00795 {
00796 if (id_matrix.element(i,j)>=0)
00797 m_PEN_state_signals.element(i,j)=m_plif_list[id_matrix.element(i,j)] ;
00798 else
00799 m_PEN_state_signals.element(i,j)=NULL ;
00800 }
00801 }
00802
00803 m_step=6 ;
00804 }
00805
00806 void CDynProg::best_path_set_genestr(CHAR* genestr, INT genestr_len, INT genestr_num)
00807 {
00808 if (m_step!=6)
00809 SG_ERROR( "please call best_path_set_plif_id_matrix first\n") ;
00810
00811 ASSERT(genestr);
00812 ASSERT(genestr_len>0);
00813 ASSERT(genestr_num>0);
00814
00815 m_genestr.set_array(genestr, genestr_len, genestr_num, true, true) ;
00816
00817 m_step=7 ;
00818 }
00819
00820 void CDynProg::best_path_set_my_state_seq(INT* my_state_seq, INT seq_len)
00821 {
00822 ASSERT(my_state_seq && seq_len>0);
00823 m_my_state_seq.resize_array(seq_len);
00824 for (INT i=0; i<seq_len; i++)
00825 m_my_state_seq[i]=my_state_seq[i];
00826 }
00827
00828 void CDynProg::best_path_set_my_pos_seq(INT* my_pos_seq, INT seq_len)
00829 {
00830 ASSERT(my_pos_seq && seq_len>0);
00831 m_my_pos_seq.resize_array(seq_len);
00832 for (INT i=0; i<seq_len; i++)
00833 m_my_pos_seq[i]=my_pos_seq[i];
00834 }
00835
00836 void CDynProg::best_path_set_dict_weights(DREAL* dictionary_weights, INT dict_len, INT n)
00837 {
00838 if (m_step!=7)
00839 SG_ERROR( "please call best_path_set_genestr first\n") ;
00840
00841 if (num_svms!=n)
00842 SG_ERROR( "dict_weights array does not match num_svms=%i!=%i\n", num_svms, n) ;
00843
00844 m_dict_weights.set_array(dictionary_weights, dict_len, num_svms, true, true) ;
00845
00846
00847 m_segment_loss.resize_array(max_a_id+1, max_a_id+1, 2) ;
00848 m_segment_loss.zero() ;
00849 m_segment_ids.resize_array(m_seq.get_dim2()) ;
00850 m_segment_mask.resize_array(m_seq.get_dim2()) ;
00851 m_segment_ids.zero() ;
00852 m_segment_mask.zero() ;
00853
00854 m_step=8 ;
00855 }
00856
00857 void CDynProg::best_path_set_segment_loss(DREAL* segment_loss, INT m, INT n)
00858 {
00859
00860 if (2*m!=n)
00861 SG_ERROR( "segment_loss should be 2 x quadratic matrix: %i!=%i\n", 2*m, n) ;
00862
00863 if (m!=max_a_id+1)
00864 SG_ERROR( "segment_loss size should match max_a_id: %i!=%i\n", m, max_a_id+1) ;
00865
00866 m_segment_loss.set_array(segment_loss, m, n/2, 2, true, true) ;
00867
00868
00869
00870 }
00871
00872 void CDynProg::best_path_set_segment_ids_mask(INT* segment_ids, DREAL* segment_mask, INT m)
00873 {
00874 INT max_id = 0;
00875 for (INT i=1;i<m;i++)
00876 max_id = CMath::max(max_id,segment_ids[i]);
00877
00878 m_segment_ids.set_array(segment_ids, m, false, true) ;
00879 m_segment_ids.set_name("m_segment_ids");
00880 m_segment_mask.set_array(segment_mask, m, false, true) ;
00881 m_segment_mask.set_name("m_segment_mask");
00882 }
00883
00884
00885 void CDynProg::best_path_call(INT nbest, bool use_orf)
00886 {
00887 if (m_step!=8)
00888 SG_ERROR( "please call best_path_set_dict_weights first\n") ;
00889 if (m_call!=1)
00890 SG_ERROR( "please call best_path_set_orf_info first\n") ;
00891 ASSERT(N==m_seq.get_dim1()) ;
00892 ASSERT(m_seq.get_dim2()==m_pos.get_dim1()) ;
00893
00894 m_scores.resize_array(nbest) ;
00895 m_states.resize_array(nbest, m_seq.get_dim2()) ;
00896 m_positions.resize_array(nbest, m_seq.get_dim2()) ;
00897
00898 m_call=1 ;
00899
00900 ASSERT(nbest==1||nbest==2) ;
00901 ASSERT(m_genestr.get_dim2()==1) ;
00902 SG_ERROR("is this used???\n");
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
00918
00919
00920 m_step=9 ;
00921 }
00922
00923 void CDynProg::best_path_deriv_call()
00924 {
00925
00926
00927
00928
00929 ASSERT(N==m_seq.get_dim1()) ;
00930 ASSERT(m_seq.get_dim2()==m_pos.get_dim1()) ;
00931
00932 m_call=5 ;
00933
00934 m_my_scores.resize_array(m_my_state_seq.get_array_size()) ;
00935 m_my_losses.resize_array(m_my_state_seq.get_array_size()) ;
00936
00937 best_path_trans_deriv(m_my_state_seq.get_array(), m_my_pos_seq.get_array(),
00938 m_my_scores.get_array(), m_my_losses.get_array(), m_my_state_seq.get_array_size(),
00939 m_seq.get_array(), m_seq.get_dim2(), m_pos.get_array(),
00940 m_PEN.get_array(), m_PEN_state_signals.get_array(), m_PEN_state_signals.get_dim2(), m_genestr.get_dim2()) ;
00941
00942 m_step=12 ;
00943 }
00944
00945
00946 void CDynProg::best_path_2struct_call(INT nbest)
00947 {
00948 if (m_step!=8)
00949 SG_ERROR( "please call best_path_set_orf_dict_weights first\n") ;
00950 if (m_call!=2)
00951 SG_ERROR( "please call best_path_set_segment_sum_weights first\n") ;
00952 ASSERT(N==m_seq.get_dim1()) ;
00953 ASSERT(m_seq.get_dim2()==m_pos.get_dim1()) ;
00954
00955 m_scores.resize_array(nbest) ;
00956 m_states.resize_array(nbest, m_seq.get_dim2()) ;
00957 m_positions.resize_array(nbest, m_seq.get_dim2()) ;
00958
00959 m_call=2 ;
00960
00961 best_path_2struct(m_seq.get_array(), m_seq.get_dim2(), m_pos.get_array(),
00962 m_PEN.get_array(),
00963 m_genestr.get_array(), m_genestr.get_dim1(),
00964 nbest,
00965 m_scores.get_array(), m_states.get_array(), m_positions.get_array(),
00966 m_dict_weights.get_array(), m_dict_weights.get_dim1()*m_dict_weights.get_dim2(),
00967 m_segment_sum_weights.get_array()) ;
00968
00969 m_step=9 ;
00970 }
00971
00972 void CDynProg::best_path_simple_call(INT nbest)
00973 {
00974 if (m_step!=2)
00975 SG_ERROR( "please call best_path_set_seq first\n") ;
00976 if (m_call!=3)
00977 SG_ERROR( "please call best_path_set_seq first\n") ;
00978 ASSERT(N==m_seq.get_dim1()) ;
00979
00980 m_scores.resize_array(nbest) ;
00981 m_states.resize_array(nbest, m_seq.get_dim2()) ;
00982
00983 m_call=3 ;
00984
00985 best_path_trans_simple(m_seq.get_array(), m_seq.get_dim2(),
00986 nbest,
00987 m_scores.get_array(), m_states.get_array()) ;
00988
00989 m_step=9 ;
00990 }
00991
00992 void CDynProg::best_path_deriv_call(INT nbest)
00993 {
00994 if (!svm_arrays_clean)
00995 {
00996 SG_ERROR( "SVM arrays not clean") ;
00997 return ;
00998 } ;
00999
01000
01001 }
01002
01003
01004 void CDynProg::best_path_get_scores(DREAL **scores, INT *m)
01005 {
01006 if (m_step!=9 && m_step!=12)
01007 SG_ERROR( "please call best_path*_call first\n") ;
01008
01009 if (m_step==9)
01010 {
01011 *scores=m_scores.get_array() ;
01012 *m=m_scores.get_dim1() ;
01013 } else
01014 {
01015 *scores=m_my_scores.get_array() ;
01016 *m=m_my_scores.get_dim1() ;
01017 }
01018
01019 m_step=10 ;
01020 }
01021
01022 void CDynProg::best_path_get_states(INT **states, INT *m, INT *n)
01023 {
01024 if (m_step!=10)
01025 SG_ERROR( "please call best_path_get_score first\n") ;
01026
01027 *states=m_states.get_array() ;
01028 *m=m_states.get_dim1() ;
01029 *n=m_states.get_dim2() ;
01030
01031 m_step=11 ;
01032 }
01033
01034 void CDynProg::best_path_get_positions(INT **positions, INT *m, INT *n)
01035 {
01036 if (m_step!=11)
01037 SG_ERROR( "please call best_path_get_positions first\n") ;
01038 if (m_call==3)
01039 SG_ERROR( "no position information for best_path_simple\n") ;
01040
01041 *positions=m_positions.get_array() ;
01042 *m=m_positions.get_dim1() ;
01043 *n=m_positions.get_dim2() ;
01044 }
01045
01046 void CDynProg::best_path_get_losses(DREAL** losses, INT* seq_len)
01047 {
01048 ASSERT(losses && seq_len);
01049 *losses=m_my_losses.get_array();
01050 *seq_len=m_my_losses.get_dim1();
01051 }
01052
01053
01055
01056 DREAL CDynProg::best_path_no_b(INT max_iter, INT &best_iter, INT *my_path)
01057 {
01058 CArray2<T_STATES> psi(max_iter, N) ;
01059 CArray<DREAL>* delta = new CArray<DREAL>(N) ;
01060 CArray<DREAL>* delta_new = new CArray<DREAL>(N) ;
01061
01062 {
01063 for (INT i=0; i<N; i++)
01064 {
01065 delta->element(i) = get_p(i) ;
01066 psi.element(0, i)= 0 ;
01067 }
01068 }
01069
01070 DREAL best_iter_prob = CMath::ALMOST_NEG_INFTY ;
01071 best_iter = 0 ;
01072
01073
01074 for (INT t=1; t<max_iter; t++)
01075 {
01076 CArray<DREAL>* dummy;
01077 INT NN=N ;
01078 for (INT j=0; j<NN; j++)
01079 {
01080 DREAL maxj = delta->element(0) + transition_matrix_a.element(0,j);
01081 INT argmax=0;
01082
01083 for (INT i=1; i<NN; i++)
01084 {
01085 DREAL temp = delta->element(i) + transition_matrix_a.element(i,j);
01086
01087 if (temp>maxj)
01088 {
01089 maxj=temp;
01090 argmax=i;
01091 }
01092 }
01093 delta_new->element(j)=maxj ;
01094 psi.element(t, j)=argmax ;
01095 }
01096
01097 dummy=delta;
01098 delta=delta_new;
01099 delta_new=dummy;
01100
01101 {
01102 DREAL maxj=delta->element(0)+get_q(0);
01103 INT argmax=0;
01104
01105 for (INT i=1; i<N; i++)
01106 {
01107 DREAL temp=delta->element(i)+get_q(i);
01108
01109 if (temp>maxj)
01110 {
01111 maxj=temp;
01112 argmax=i;
01113 }
01114 }
01115
01116
01117 if (maxj>best_iter_prob)
01118 {
01119 my_path[t]=argmax;
01120 best_iter=t ;
01121 best_iter_prob = maxj ;
01122 } ;
01123 } ;
01124 }
01125
01126
01127 {
01128 for (INT t = best_iter; t>0; t--)
01129 {
01130 my_path[t-1]=psi.element(t, my_path[t]);
01131 }
01132 }
01133
01134 delete delta ;
01135 delete delta_new ;
01136
01137 return best_iter_prob ;
01138 }
01139
01140 void CDynProg::best_path_no_b_trans(INT max_iter, INT &max_best_iter, short int nbest, DREAL *prob_nbest, INT *my_paths)
01141 {
01142
01143 CArray3<T_STATES> psi(max_iter, N, nbest) ;
01144 CArray3<short int> ktable(max_iter, N, nbest) ;
01145 CArray2<short int> ktable_ends(max_iter, nbest) ;
01146
01147 CArray<DREAL> tempvv(nbest*N) ;
01148 CArray<INT> tempii(nbest*N) ;
01149
01150 CArray2<T_STATES> path_ends(max_iter, nbest) ;
01151 CArray2<DREAL> *delta=new CArray2<DREAL>(N, nbest) ;
01152 CArray2<DREAL> *delta_new=new CArray2<DREAL>(N, nbest) ;
01153 CArray2<DREAL> delta_end(max_iter, nbest) ;
01154
01155 CArray2<INT> paths(max_iter, nbest) ;
01156 paths.set_array(my_paths, max_iter, nbest, false, false) ;
01157
01158 {
01159 for (T_STATES i=0; i<N; i++)
01160 {
01161 delta->element(i,0) = get_p(i) ;
01162 for (short int k=1; k<nbest; k++)
01163 {
01164 delta->element(i,k)=-CMath::INFTY ;
01165 ktable.element(0,i,k)=0 ;
01166 }
01167 }
01168 }
01169
01170
01171 for (INT t=1; t<max_iter; t++)
01172 {
01173 CArray2<DREAL>* dummy=NULL;
01174
01175 for (T_STATES j=0; j<N; j++)
01176 {
01177 const T_STATES num_elem = trans_list_forward_cnt[j] ;
01178 const T_STATES *elem_list = trans_list_forward[j] ;
01179 const DREAL *elem_val = trans_list_forward_val[j] ;
01180
01181 INT list_len=0 ;
01182 for (short int diff=0; diff<nbest; diff++)
01183 {
01184 for (INT i=0; i<num_elem; i++)
01185 {
01186 T_STATES ii = elem_list[i] ;
01187
01188 tempvv.element(list_len) = -(delta->element(ii,diff) + elem_val[i]) ;
01189 tempii.element(list_len) = diff*N + ii ;
01190 list_len++ ;
01191 }
01192 }
01193 CMath::qsort_index(tempvv.get_array(), tempii.get_array(), list_len) ;
01194
01195 for (short int k=0; k<nbest; k++)
01196 {
01197 if (k<list_len)
01198 {
01199 delta_new->element(j,k) = -tempvv[k] ;
01200 psi.element(t,j,k) = (tempii[k]%N) ;
01201 ktable.element(t,j,k) = (tempii[k]-(tempii[k]%N))/N ;
01202 }
01203 else
01204 {
01205 delta_new->element(j,k) = -CMath::INFTY ;
01206 psi.element(t,j,k) = 0 ;
01207 ktable.element(t,j,k) = 0 ;
01208 }
01209 }
01210 }
01211
01212 dummy=delta;
01213 delta=delta_new;
01214 delta_new=dummy;
01215
01216 {
01217 INT list_len = 0 ;
01218 for (short int diff=0; diff<nbest; diff++)
01219 {
01220 for (T_STATES i=0; i<N; i++)
01221 {
01222 tempvv.element(list_len) = -(delta->element(i,diff)+get_q(i));
01223 tempii.element(list_len) = diff*N + i ;
01224 list_len++ ;
01225 }
01226 }
01227 CMath::qsort_index(tempvv.get_array(), tempii.get_array(), list_len) ;
01228
01229 for (short int k=0; k<nbest; k++)
01230 {
01231 delta_end.element(t-1,k) = -tempvv[k] ;
01232 path_ends.element(t-1,k) = (tempii[k]%N) ;
01233 ktable_ends.element(t-1,k) = (tempii[k]-(tempii[k]%N))/N ;
01234 }
01235 }
01236 }
01237
01238 {
01239 max_best_iter=0 ;
01240
01241 CArray<DREAL> sort_delta_end(max_iter*nbest) ;
01242 CArray<short int> sort_k(max_iter*nbest) ;
01243 CArray<INT> sort_t(max_iter*nbest) ;
01244 CArray<INT> sort_idx(max_iter*nbest) ;
01245
01246 INT i=0 ;
01247 for (INT iter=0; iter<max_iter-1; iter++)
01248 for (short int k=0; k<nbest; k++)
01249 {
01250 sort_delta_end[i]=-delta_end.element(iter,k) ;
01251 sort_k[i]=k ;
01252 sort_t[i]=iter+1 ;
01253 sort_idx[i]=i ;
01254 i++ ;
01255 }
01256
01257 CMath::qsort_index(sort_delta_end.get_array(), sort_idx.get_array(), (max_iter-1)*nbest) ;
01258
01259 for (short int n=0; n<nbest; n++)
01260 {
01261 short int k=sort_k[sort_idx[n]] ;
01262 INT iter=sort_t[sort_idx[n]] ;
01263 prob_nbest[n]=-sort_delta_end[n] ;
01264
01265 if (iter>max_best_iter)
01266 max_best_iter=iter ;
01267
01268 ASSERT(k<nbest) ;
01269 ASSERT(iter<max_iter) ;
01270
01271 paths.element(iter,n) = path_ends.element(iter-1, k) ;
01272 short int q = ktable_ends.element(iter-1, k) ;
01273
01274 for (INT t = iter; t>0; t--)
01275 {
01276 paths.element(t-1,n)=psi.element(t, paths.element(t,n), q);
01277 q = ktable.element(t, paths.element(t,n), q) ;
01278 }
01279 }
01280 }
01281
01282 delete delta ;
01283 delete delta_new ;
01284 }
01285
01286
01287 void CDynProg::translate_from_single_order(WORD* obs, INT sequence_length,
01288 INT start, INT order,
01289 INT max_val)
01290 {
01291 INT i,j;
01292 WORD value=0;
01293
01294 for (i=sequence_length-1; i>= ((int) order)-1; i--)
01295 {
01296 value=0;
01297 for (j=i; j>=i-((int) order)+1; j--)
01298 value= (value >> max_val) | (obs[j] << (max_val * (order-1)));
01299
01300 obs[i]= (WORD) value;
01301 }
01302
01303 for (i=order-2;i>=0;i--)
01304 {
01305 value=0;
01306 for (j=i; j>=i-order+1; j--)
01307 {
01308 value= (value >> max_val);
01309 if (j>=0)
01310 value|=obs[j] << (max_val * (order-1));
01311 }
01312 obs[i]=value;
01313
01314 }
01315 if (start>0)
01316 for (i=start; i<sequence_length; i++)
01317 obs[i-start]=obs[i];
01318 }
01319
01320 void CDynProg::reset_svm_value(INT pos, INT & last_svm_pos, DREAL * svm_value)
01321 {
01322 for (int i=0; i<num_words_single; i++)
01323 word_used_single[i]=false ;
01324 for (INT s=0; s<num_svms; s++)
01325 svm_value_unnormalized_single[s] = 0 ;
01326 for (INT s=0; s<num_svms; s++)
01327 svm_value[s] = 0 ;
01328 last_svm_pos = pos - 6+1 ;
01329 num_unique_words_single=0 ;
01330 }
01331
01332 void CDynProg::extend_svm_value(WORD* wordstr, INT pos, INT &last_svm_pos, DREAL* svm_value)
01333 {
01334 bool did_something = false ;
01335 for (int i=last_svm_pos-1; (i>=pos) && (i>=0); i--)
01336 {
01337 if (wordstr[i]>=num_words_single)
01338 SG_DEBUG( "wordstr[%i]=%i\n", i, wordstr[i]) ;
01339
01340 if (!word_used_single[wordstr[i]])
01341 {
01342 for (INT s=0; s<num_svms_single; s++)
01343 svm_value_unnormalized_single[s]+=dict_weights.element(wordstr[i],s) ;
01344
01345 word_used_single[wordstr[i]]=true ;
01346 num_unique_words_single++ ;
01347 did_something=true ;
01348 }
01349 } ;
01350 if (num_unique_words_single>0)
01351 {
01352 last_svm_pos=pos ;
01353 if (did_something)
01354 for (INT s=0; s<num_svms; s++)
01355 svm_value[s]= svm_value_unnormalized_single[s]/sqrt((double)num_unique_words_single) ;
01356 }
01357 else
01358 {
01359
01360 for (INT s=0; s<num_svms; s++)
01361 svm_value[s]=0 ;
01362 }
01363
01364 }
01365
01366
01367 void CDynProg::reset_segment_sum_value(INT num_states, INT pos, INT & last_segment_sum_pos, DREAL * segment_sum_value)
01368 {
01369 for (INT s=0; s<num_states; s++)
01370 segment_sum_value[s] = 0 ;
01371 last_segment_sum_pos = pos ;
01372
01373 }
01374
01375 void CDynProg::extend_segment_sum_value(DREAL *segment_sum_weights, INT seqlen, INT num_states,
01376 INT pos, INT &last_segment_sum_pos, DREAL* segment_sum_value)
01377 {
01378 for (int i=last_segment_sum_pos-1; (i>=pos) && (i>=0); i--)
01379 {
01380 for (INT s=0; s<num_states; s++)
01381 segment_sum_value[s] += segment_sum_weights[i*num_states+s] ;
01382 } ;
01383
01384 last_segment_sum_pos = pos ;
01385 }
01386
01387
01388 void CDynProg::best_path_2struct(const DREAL *seq_array, INT seq_len, const INT *pos,
01389 CPlifBase **Plif_matrix,
01390 const char *genestr, INT genestr_len,
01391 short int nbest,
01392 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
01393 DREAL *dictionary_weights, INT dict_len, DREAL *segment_sum_weights)
01394 {
01395 const INT default_look_back = 100 ;
01396 INT max_look_back = default_look_back ;
01397 bool use_svm = false ;
01398 ASSERT(dict_len==num_svms*num_words_single) ;
01399 dict_weights.set_array(dictionary_weights, dict_len, num_svms, false, false) ;
01400 dict_weights_array=dict_weights.get_array() ;
01401
01402 CArray2<CPlifBase*> PEN(Plif_matrix, N, N, false) ;
01403 CArray2<DREAL> seq((DREAL *)seq_array, N, seq_len, false) ;
01404
01405 DREAL svm_value[num_svms] ;
01406 DREAL segment_sum_value[N] ;
01407
01408 {
01409 for (INT s=0; s<num_svms; s++)
01410 svm_value[s]=0 ;
01411 }
01412
01413 {
01414 for (INT i=0; i<N; i++)
01415 for (INT j=0; j<N; j++)
01416 {
01417 CPlifBase *penij=PEN.element(i,j) ;
01418 if (penij==NULL)
01419 continue ;
01420 if (penij->get_max_value()>max_look_back)
01421 max_look_back=(INT) CMath::ceil(penij->get_max_value());
01422 if (penij->uses_svm_values())
01423 use_svm=true ;
01424 }
01425 }
01426 max_look_back = CMath::min(genestr_len, max_look_back) ;
01427
01428
01429
01430 const INT look_back_buflen = (max_look_back+1)*nbest*N ;
01431
01432 const DREAL mem_use = (DREAL)(seq_len*N*nbest*(sizeof(T_STATES)+sizeof(short int)+sizeof(INT))+
01433 look_back_buflen*(2*sizeof(DREAL)+sizeof(INT))+
01434 seq_len*(sizeof(T_STATES)+sizeof(INT))+
01435 genestr_len*sizeof(bool))/(1024*1024)
01436 ;
01437 bool is_big = (mem_use>200) || (seq_len>5000) ;
01438
01439 if (is_big)
01440 {
01441 SG_DEBUG("calling best_path_2struct: seq_len=%i, N=%i, lookback=%i nbest=%i\n",
01442 seq_len, N, max_look_back, nbest) ;
01443 SG_DEBUG("allocating %1.2fMB of memory\n",
01444 mem_use) ;
01445 }
01446 ASSERT(nbest<32000) ;
01447
01448 CArray3<DREAL> delta(max_look_back+1, N, nbest) ;
01449 CArray3<T_STATES> psi(seq_len,N,nbest) ;
01450 CArray3<short int> ktable(seq_len,N,nbest) ;
01451 CArray3<INT> ptable(seq_len,N,nbest) ;
01452
01453 CArray<DREAL> delta_end(nbest) ;
01454 CArray<T_STATES> path_ends(nbest) ;
01455 CArray<short int> ktable_end(nbest) ;
01456
01457 CArray<DREAL> tempvv(look_back_buflen) ;
01458 CArray<INT> tempii(look_back_buflen) ;
01459
01460 CArray<T_STATES> state_seq(seq_len) ;
01461 CArray<INT> pos_seq(seq_len) ;
01462
01463
01464 WORD* wordstr=NULL ;
01465 if (use_svm)
01466 {
01467 ASSERT(dictionary_weights!=NULL) ;
01468 wordstr=new WORD[genestr_len] ;
01469 for (INT i=0; i<genestr_len; i++)
01470 switch (genestr[i])
01471 {
01472 case 'A':
01473 case 'a': wordstr[i]=0 ; break ;
01474 case 'C':
01475 case 'c': wordstr[i]=1 ; break ;
01476 case 'G':
01477 case 'g': wordstr[i]=2 ; break ;
01478 case 'T':
01479 case 't': wordstr[i]=3 ; break ;
01480 default: ASSERT(0) ;
01481 }
01482 translate_from_single_order(wordstr, genestr_len, word_degree_single-1, word_degree_single) ;
01483 }
01484
01485
01486 {
01487 for (T_STATES i=0; i<N; i++)
01488 {
01489 delta.element(0,i,0) = get_p(i) + seq.element(i,0) ;
01490 psi.element(0,i,0) = 0 ;
01491 ktable.element(0,i,0) = 0 ;
01492 ptable.element(0,i,0) = 0 ;
01493 for (short int k=1; k<nbest; k++)
01494 {
01495 delta.element(0,i,k) = -CMath::INFTY ;
01496 psi.element(0,i,0) = 0 ;
01497 ktable.element(0,i,k) = 0 ;
01498 ptable.element(0,i,k) = 0 ;
01499 }
01500 }
01501 }
01502
01503
01504 for (INT t=1; t<seq_len; t++)
01505 {
01506 if (is_big && t%(seq_len/10000)==1)
01507 SG_PROGRESS(t, 0, seq_len);
01508
01509
01510 for (T_STATES j=0; j<N; j++)
01511 {
01512 if (seq.element(j,t)<-1e20)
01513 {
01514 for (short int k=0; k<nbest; k++)
01515 {
01516 delta.element(t%max_look_back,j,k) = seq.element(j,t) ;
01517 psi.element(t,j,k) = 0 ;
01518 ktable.element(t,j,k) = 0 ;
01519 ptable.element(t,j,k) = 0 ;
01520 }
01521 }
01522 else
01523 {
01524 const T_STATES num_elem = trans_list_forward_cnt[j] ;
01525 const T_STATES *elem_list = trans_list_forward[j] ;
01526 const DREAL *elem_val = trans_list_forward_val[j] ;
01527
01528 INT list_len=0 ;
01529 for (INT i=0; i<num_elem; i++)
01530 {
01531 T_STATES ii = elem_list[i] ;
01532
01533
01534 const CPlifBase * penalty = PEN.element(j,ii) ;
01535 INT look_back = default_look_back ;
01536 if (penalty!=NULL)
01537 look_back=(INT) (CMath::ceil(penalty->get_max_value()));
01538
01539 INT last_svm_pos ;
01540 if (use_svm)
01541 reset_svm_value(pos[t], last_svm_pos, svm_value) ;
01542
01543 INT last_segment_sum_pos ;
01544 reset_segment_sum_value(N, pos[t], last_segment_sum_pos, segment_sum_value) ;
01545
01546 for (INT ts=t-1; ts>=0 && pos[t]-pos[ts]<=look_back; ts--)
01547 {
01548 if (use_svm)
01549 extend_svm_value(wordstr, pos[ts], last_svm_pos, svm_value) ;
01550
01551 extend_segment_sum_value(segment_sum_weights, seq_len, N, pos[ts], last_segment_sum_pos, segment_sum_value) ;
01552
01553 DREAL pen_val = 0.0 ;
01554 if (penalty)
01555 pen_val=penalty->lookup_penalty(pos[t]-pos[ts], svm_value) + segment_sum_value[j] ;
01556 for (short int diff=0; diff<nbest; diff++)
01557 {
01558 DREAL val = delta.element(ts%max_look_back,ii,diff) + elem_val[i] ;
01559 val += pen_val ;
01560
01561 tempvv[list_len] = -val ;
01562 tempii[list_len] = ii + diff*N + ts*N*nbest;
01563
01564 list_len++ ;
01565 }
01566 }
01567 }
01568 CMath::nmin<INT>(tempvv.get_array(), tempii.get_array(), list_len, nbest) ;
01569
01570 for (short int k=0; k<nbest; k++)
01571 {
01572 if (k<list_len)
01573 {
01574 delta.element(t%max_look_back,j,k) = -tempvv[k] + seq.element(j,t);
01575 psi.element(t,j,k) = (tempii[k]%N) ;
01576 ktable.element(t,j,k) = (tempii[k]%(N*nbest)-psi.element(t,j,k))/N ;
01577 ptable.element(t,j,k) = (tempii[k]-(tempii[k]%(N*nbest)))/(N*nbest) ;
01578 }
01579 else
01580 {
01581 delta.element(t%max_look_back,j,k) = -CMath::INFTY ;
01582 psi.element(t,j,k) = 0 ;
01583 ktable.element(t,j,k) = 0 ;
01584 ptable.element(t,j,k) = 0 ;
01585 }
01586 }
01587 }
01588 }
01589 }
01590
01591 {
01592 INT list_len = 0 ;
01593 for (short int diff=0; diff<nbest; diff++)
01594 {
01595 for (T_STATES i=0; i<N; i++)
01596 {
01597 tempvv[list_len] = -(delta.element((seq_len-1)%max_look_back,i,diff)+get_q(i)) ;
01598 tempii[list_len] = i + diff*N ;
01599 list_len++ ;
01600 }
01601 }
01602
01603 CMath::nmin(tempvv.get_array(), tempii.get_array(), list_len, nbest) ;
01604
01605 for (short int k=0; k<nbest; k++)
01606 {
01607 delta_end.element(k) = -tempvv[k] ;
01608 path_ends.element(k) = (tempii[k]%N) ;
01609 ktable_end.element(k) = (tempii[k]-path_ends.element(k))/N ;
01610 }
01611 }
01612
01613 {
01614 for (short int k=0; k<nbest; k++)
01615 {
01616 prob_nbest[k]= delta_end.element(k) ;
01617
01618 INT i = 0 ;
01619 state_seq[i] = path_ends.element(k) ;
01620 short int q = ktable_end.element(k) ;
01621 pos_seq[i] = seq_len-1 ;
01622
01623 while (pos_seq[i]>0)
01624 {
01625
01626 state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
01627 pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
01628 q = ktable.element(pos_seq[i], state_seq[i], q) ;
01629 i++ ;
01630 }
01631
01632 INT num_states = i+1 ;
01633 for (i=0; i<num_states;i++)
01634 {
01635 my_state_seq[i+k*(seq_len+1)] = state_seq[num_states-i-1] ;
01636 my_pos_seq[i+k*(seq_len+1)] = pos_seq[num_states-i-1] ;
01637 }
01638 my_state_seq[num_states+k*(seq_len+1)]=-1 ;
01639 my_pos_seq[num_states+k*(seq_len+1)]=-1 ;
01640 }
01641 }
01642 if (is_big)
01643 SG_PRINT( "DONE. \n") ;
01644 }
01645
01646
01647
01648
01649
01650
01651
01652
01653
01654
01655
01656
01657
01658
01659
01660
01661
01662
01663
01664
01665
01666
01667
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698
01699
01700 void CDynProg::init_segment_loss(struct segment_loss_struct & loss, INT seqlen, INT howmuchlookback)
01701 {
01702 #ifdef DYNPROG_TIMING
01703 MyTime.start() ;
01704 #endif
01705 INT clear_size = CMath::min(howmuchlookback,seqlen) ;
01706
01707 if (!loss.num_segment_id)
01708 {
01709 loss.segments_changed = new INT[seqlen] ;
01710 loss.num_segment_id = new DREAL[(max_a_id+1)*seqlen] ;
01711 loss.length_segment_id = new INT[(max_a_id+1)*seqlen] ;
01712
01713 clear_size = seqlen ;
01714 }
01715
01716 for (INT j=0; j<clear_size; j++)
01717 {
01718 loss.segments_changed[j]=0 ;
01719 for (INT i=0; i<max_a_id+1; i++)
01720 {
01721 loss.num_segment_id[i*seqlen+j] = 0;
01722 loss.length_segment_id[i*seqlen+j] = 0;
01723 }
01724 }
01725
01726 loss.maxlookback = howmuchlookback ;
01727 loss.seqlen = seqlen;
01728
01729 #ifdef DYNPROG_TIMING
01730 MyTime.stop() ;
01731 segment_init_time += MyTime.time_diff_sec() ;
01732 #endif
01733 }
01734
01735 void CDynProg::clear_segment_loss(struct segment_loss_struct & loss)
01736 {
01737 #ifdef DYNPROG_TIMING
01738 MyTime.start() ;
01739 #endif
01740
01741 if (loss.num_segment_id != NULL)
01742 {
01743 delete[] loss.segments_changed ;
01744 delete[] loss.num_segment_id ;
01745 delete[] loss.length_segment_id ;
01746 loss.segments_changed = NULL ;
01747 loss.num_segment_id = NULL ;
01748 loss.length_segment_id = NULL ;
01749 }
01750 #ifdef DYNPROG_TIMING
01751 MyTime.stop() ;
01752 segment_clean_time += MyTime.time_diff_sec() ;
01753 #endif
01754 }
01755
01756 DREAL CDynProg::extend_segment_loss(struct segment_loss_struct & loss, const INT * pos_array, INT segment_id, INT pos, INT & last_pos, DREAL &last_value)
01757 {
01758 #ifdef DYNPROG_TIMING
01759 MyTime.start() ;
01760 #endif
01761
01762 if (pos==last_pos)
01763 return last_value ;
01764 ASSERT(pos<last_pos) ;
01765
01766 last_pos-- ;
01767 bool changed = false ;
01768 while (last_pos>=pos)
01769 {
01770 if (loss.segments_changed[last_pos])
01771 {
01772 changed=true ;
01773 break ;
01774 }
01775 last_pos-- ;
01776 }
01777 if (last_pos<pos)
01778 last_pos = pos ;
01779
01780 if (!changed)
01781 {
01782 ASSERT(last_pos>=0) ;
01783 ASSERT(last_pos<loss.seqlen) ;
01784 DREAL length_contrib = (pos_array[last_pos]-pos_array[pos])*m_segment_loss.element(m_segment_ids.element(pos), segment_id, 1) ;
01785 DREAL ret = last_value + length_contrib ;
01786 last_pos = pos ;
01787 return ret ;
01788 }
01789
01790 CArray2<DREAL> num_segment_id(loss.num_segment_id, loss.seqlen, max_a_id+1, false, false) ;
01791 CArray2<INT> length_segment_id(loss.length_segment_id, loss.seqlen, max_a_id+1, false, false) ;
01792 DREAL ret = 0.0 ;
01793 for (INT i=0; i<max_a_id+1; i++)
01794 {
01795
01796
01797 if (num_segment_id.element(pos, i)!=0)
01798 {
01799 ret += num_segment_id.element(pos, i)*m_segment_loss.element(i, segment_id, 0) ;
01800
01801
01802
01803
01804
01805
01806
01807 }
01808 if (length_segment_id.element(pos, i)!=0)
01809 ret += length_segment_id.element(pos, i)*m_segment_loss.element(i, segment_id, 1) ;
01810 }
01811 last_pos = pos ;
01812 last_value = ret ;
01813
01814 #ifdef DYNPROG_TIMING
01815 MyTime.stop() ;
01816 segment_extend_time += MyTime.time_diff_sec() ;
01817 #endif
01818 return ret ;
01819 }
01820
01821 void CDynProg::find_segment_loss_till_pos(const INT * pos, INT t_end, CArray<INT>& segment_ids, CArray<DREAL>& segment_mask, struct segment_loss_struct & loss)
01822 {
01823 #ifdef DYNPROG_TIMING
01824 MyTime.start() ;
01825 #endif
01826 CArray2<DREAL> num_segment_id(loss.num_segment_id, loss.seqlen, max_a_id+1, false, false) ;
01827 CArray2<INT> length_segment_id(loss.length_segment_id, loss.seqlen, max_a_id+1, false, false) ;
01828
01829 for (INT i=0; i<max_a_id+1; i++)
01830 {
01831 num_segment_id.element(t_end, i) = 0 ;
01832 length_segment_id.element(t_end, i) = 0 ;
01833 }
01834 INT wobble_pos_segment_id_switch = 0 ;
01835 INT last_segment_id = -1 ;
01836 INT ts = t_end-1 ;
01837 while ((ts>=0) && (pos[t_end] - pos[ts] <= loss.maxlookback))
01838 {
01839 INT cur_segment_id = segment_ids.element(ts) ;
01840
01841 bool wobble_pos = (CMath::abs(segment_mask.element(ts))<1e-7) && (wobble_pos_segment_id_switch==0) ;
01842 if (!(cur_segment_id<=max_a_id))
01843 SG_ERROR("(cur_segment_id<=max_a_id), cur_segment_id:%i max_a_id:%i\n",cur_segment_id,max_a_id);
01844 ASSERT(cur_segment_id>=0) ;
01845
01846 for (INT i=0; i<max_a_id+1; i++)
01847 {
01848 num_segment_id.element(ts, i) = num_segment_id.element(ts+1, i) ;
01849 length_segment_id.element(ts, i) = length_segment_id.element(ts+1, i) ;
01850 }
01851
01852 if (cur_segment_id!=last_segment_id)
01853 {
01854 if (wobble_pos)
01855 {
01856
01857 wobble_pos_segment_id_switch++ ;
01858
01859 }
01860 else
01861 {
01862
01863 loss.segments_changed[ts] = true ;
01864 num_segment_id.element(ts, cur_segment_id) += segment_mask.element(ts);
01865 length_segment_id.element(ts, cur_segment_id) += (INT)((pos[ts+1]-pos[ts])*segment_mask.element(ts));
01866 wobble_pos_segment_id_switch = 0 ;
01867 }
01868 last_segment_id = cur_segment_id ;
01869 }
01870 else
01871 if (!wobble_pos)
01872 length_segment_id.element(ts, cur_segment_id) += pos[ts+1] - pos[ts] ;
01873
01874 ts--;
01875 }
01876 #ifdef DYNPROG_TIMING
01877 MyTime.stop() ;
01878 segment_pos_time += MyTime.time_diff_sec() ;
01879 #endif
01880 }
01881
01882 void CDynProg::init_svm_values(struct svm_values_struct & svs, INT start_pos, INT seqlen, INT maxlookback)
01883 {
01884 #ifdef DYNPROG_TIMING
01885 MyTime.start() ;
01886 #endif
01887
01888
01889
01890
01891
01892
01893
01894
01895 INT clear_size = CMath::min(maxlookback,seqlen) ;
01896
01897 if (!svs.svm_values)
01898 {
01899 svs.svm_values = new DREAL[seqlen*num_svms] ;
01900 svs.num_unique_words = new INT*[num_degrees] ;
01901 svs.svm_values_unnormalized = new DREAL*[num_degrees] ;
01902 svs.word_used = new bool**[num_degrees] ;
01903 for (INT j=0; j<num_degrees; j++)
01904 {
01905 svs.word_used[j] = new bool*[num_svms] ;
01906 for (INT s=0; s<num_svms; s++)
01907 svs.word_used[j][s] = new bool[num_words_array[j]] ;
01908 }
01909 for (INT j=0; j<num_degrees; j++)
01910 {
01911 svs.svm_values_unnormalized[j] = new DREAL[num_svms] ;
01912 svs.num_unique_words[j] = new INT[num_svms] ;
01913 }
01914 svs.start_pos = new INT[num_svms] ;
01915 clear_size = seqlen ;
01916 }
01917
01918
01919
01920 memset(svs.svm_values, 0, clear_size*num_svms*sizeof(DREAL)) ;
01921
01922 for (INT j=0; j<num_degrees; j++)
01923 {
01924
01925
01926 memset(svs.svm_values_unnormalized[j], 0, num_svms*sizeof(DREAL)) ;
01927
01928
01929
01930 memset(svs.num_unique_words[j], 0, num_svms*sizeof(INT)) ;
01931 }
01932
01933 for (INT j=0; j<num_degrees; j++)
01934 for (INT s=0; s<num_svms; s++)
01935 {
01936
01937
01938 memset(svs.word_used[j][s], 0, num_words_array[j]*sizeof(bool)) ;
01939 }
01940
01941 for (INT s=0; s<num_svms; s++)
01942 svs.start_pos[s] = start_pos - mod_words.element(s,1) ;
01943
01944 svs.maxlookback = maxlookback ;
01945 svs.seqlen = seqlen ;
01946
01947 #ifdef DYNPROG_TIMING
01948 MyTime.stop() ;
01949 svm_init_time += MyTime.time_diff_sec() ;
01950 #endif
01951 }
01952
01953 void CDynProg::clear_svm_values(struct svm_values_struct & svs)
01954 {
01955 #ifdef DYNPROG_TIMING
01956 MyTime.start() ;
01957 #endif
01958 if (NULL != svs.svm_values)
01959 {
01960 for (INT j=0; j<num_degrees; j++)
01961 {
01962 for (INT s=0; s<num_svms; s++)
01963 delete[] svs.word_used[j][s] ;
01964 delete[] svs.word_used[j];
01965 }
01966 delete[] svs.word_used;
01967
01968 for (INT j=0; j<num_degrees; j++)
01969 delete[] svs.svm_values_unnormalized[j] ;
01970 for (INT j=0; j<num_degrees; j++)
01971 delete[] svs.num_unique_words[j] ;
01972
01973 delete[] svs.svm_values_unnormalized;
01974 delete[] svs.svm_values;
01975 delete[] svs.num_unique_words ;
01976
01977 svs.word_used=NULL ;
01978 svs.svm_values=NULL ;
01979 svs.svm_values_unnormalized=NULL ;
01980 }
01981
01982 #ifdef DYNPROG_TIMING
01983 MyTime.stop() ;
01984 svm_clean_time += MyTime.time_diff_sec() ;
01985 #endif
01986 }
01987
01988
01989 void CDynProg::find_svm_values_till_pos(WORD*** wordstr, const INT *pos, INT t_end, struct svm_values_struct &svs)
01990 {
01991 #ifdef DYNPROG_TIMING
01992 MyTime.start() ;
01993 #endif
01994
01995
01996
01997
01998
01999
02000
02001
02002
02003
02004
02005
02006
02007
02008
02009
02010 for (INT j=0; j<num_degrees; j++)
02011
02012 {
02013 INT plen = 1;
02014 INT ts = t_end-1;
02015 INT offset;
02016
02017 INT posprev = pos[t_end]-word_degree[j]+1;
02018 INT poscurrent = pos[ts];
02019
02020
02021
02022 if (poscurrent<0)
02023 poscurrent = 0;
02024 DREAL * my_svm_values_unnormalized = svs.svm_values_unnormalized[j] ;
02025 INT * my_num_unique_words = svs.num_unique_words[j] ;
02026 bool ** my_word_used = svs.word_used[j] ;
02027
02028 INT len = pos[t_end] - poscurrent;
02029 while ((ts>=0) && (len <= svs.maxlookback))
02030 {
02031 for (int i=posprev-1 ; (i>=poscurrent) && (i>=0) ; i--)
02032 {
02033
02034
02035 WORD word = wordstr[string_words_array[0]][j][i] ;
02036 INT last_string = string_words_array[0] ;
02037 for (INT s=0; s<num_svms; s++)
02038 {
02039
02040
02041 if (last_string != string_words_array[s])
02042 {
02043 last_string = string_words_array[s] ;
02044 word = wordstr[last_string][j][i] ;
02045 }
02046
02047
02048 if (sign_words_array[s] && my_word_used[s][word])
02049 continue ;
02050
02051
02052
02053 {
02054 my_svm_values_unnormalized[s] += dict_weights_array[(word+cum_num_words_array[j])+s*cum_num_words_array[num_degrees]] ;
02055
02056 my_num_unique_words[s]++ ;
02057 if (sign_words_array[s])
02058 my_word_used[s][word]=true ;
02059 }
02060 }
02061 }
02062 offset = plen*num_svms ;
02063 for (INT s=0; s<num_svms; s++)
02064 {
02065 double normalization_factor = 1.0;
02066 if (my_num_unique_words[s] > 0)
02067 {
02068 if (sign_words_array[s])
02069 normalization_factor = sqrt((double)my_num_unique_words[s]);
02070 else
02071 normalization_factor = (double)my_num_unique_words[s];
02072 }
02073
02074 if (j==0)
02075 svs.svm_values[offset+s]=0 ;
02076 svs.svm_values[offset+s] += my_svm_values_unnormalized[s] / normalization_factor;
02077 }
02078
02079 if (posprev > poscurrent)
02080 posprev = poscurrent;
02081
02082 ts--;
02083 plen++;
02084
02085 if (ts>=0)
02086 {
02087 poscurrent=pos[ts];
02088 if (poscurrent<0)
02089 poscurrent = 0;
02090 len = pos[t_end] - poscurrent;
02091 }
02092 }
02093 }
02094
02095 #ifdef DYNPROG_TIMING
02096 MyTime.stop() ;
02097 svm_pos_time += MyTime.time_diff_sec() ;
02098 #endif
02099 }
02100
02101
02102 void CDynProg::find_svm_values_till_pos(WORD** wordstr, const INT *pos, INT t_end, struct svm_values_struct &svs)
02103 {
02104 #ifdef DYNPROG_TIMING
02105 MyTime.start() ;
02106 #endif
02107
02108
02109
02110
02111
02112
02113
02114
02115
02116
02117
02118
02119
02120
02121
02122 for (INT j=0; j<num_degrees; j++)
02123
02124 {
02125 INT plen = 1;
02126 INT ts = t_end-1;
02127 INT offset;
02128
02129 INT posprev = pos[t_end]-word_degree[j]+1;
02130 INT poscurrent = pos[ts];
02131
02132
02133
02134 if (poscurrent<0)
02135 poscurrent = 0;
02136 DREAL * my_svm_values_unnormalized = svs.svm_values_unnormalized[j] ;
02137 INT * my_num_unique_words = svs.num_unique_words[j] ;
02138 bool ** my_word_used = svs.word_used[j] ;
02139
02140 INT len = pos[t_end] - poscurrent;
02141 while ((ts>=0) && (len <= svs.maxlookback))
02142 {
02143 for (int i=posprev-1 ; (i>=poscurrent) && (i>=0) ; i--)
02144 {
02145
02146
02147 WORD word = wordstr[j][i] ;
02148 for (INT s=0; s<num_svms; s++)
02149 {
02150
02151
02152 if (sign_words_array[s] && my_word_used[s][word])
02153 continue ;
02154
02155
02156
02157 {
02158 my_svm_values_unnormalized[s] += dict_weights_array[(word+cum_num_words_array[j])+s*cum_num_words_array[num_degrees]] ;
02159
02160 my_num_unique_words[s]++ ;
02161 if (sign_words_array[s])
02162 my_word_used[s][word]=true ;
02163 }
02164 }
02165 }
02166 offset = plen*num_svms ;
02167 for (INT s=0; s<num_svms; s++)
02168 {
02169 double normalization_factor = 1.0;
02170 if (my_num_unique_words[s] > 0)
02171 {
02172 if (sign_words_array[s])
02173 normalization_factor = sqrt((double)my_num_unique_words[s]);
02174 else
02175 normalization_factor = (double)my_num_unique_words[s];
02176 }
02177
02178 if (j==0)
02179 svs.svm_values[offset+s]=0 ;
02180 svs.svm_values[offset+s] += my_svm_values_unnormalized[s] / normalization_factor;
02181 }
02182
02183 if (posprev > poscurrent)
02184 posprev = poscurrent;
02185
02186 ts--;
02187 plen++;
02188
02189 if (ts>=0)
02190 {
02191 poscurrent=pos[ts];
02192 if (poscurrent<0)
02193 poscurrent = 0;
02194 len = pos[t_end] - poscurrent;
02195 }
02196 }
02197 }
02198
02199 #ifdef DYNPROG_TIMING
02200 MyTime.stop() ;
02201 svm_pos_time += MyTime.time_diff_sec() ;
02202 #endif
02203 }
02204
02205 bool CDynProg::extend_orf(INT orf_from, INT orf_to, INT start, INT &last_pos, INT to)
02206 {
02207 #ifdef DYNPROG_TIMING
02208 MyTime.start() ;
02209 #endif
02210
02211 if (start<0)
02212 start=0 ;
02213 if (to<0)
02214 to=0 ;
02215
02216 INT orf_target = orf_to-orf_from ;
02217 if (orf_target<0) orf_target+=3 ;
02218
02219 INT pos ;
02220 if (last_pos==to)
02221 pos = to-orf_to-3 ;
02222 else
02223 pos=last_pos ;
02224
02225 if (pos<0)
02226 return true ;
02227
02228 for (; pos>=start; pos-=3)
02229 if (m_genestr_stop[pos])
02230 return false ;
02231
02232 last_pos = CMath::min(pos+3,to-orf_to-3) ;
02233
02234 #ifdef DYNPROG_TIMING
02235 MyTime.stop() ;
02236 orf_time += MyTime.time_diff_sec() ;
02237 #endif
02238 return true ;
02239 }
02240
02241 template <short int nbest, bool with_loss, bool with_multiple_sequences>
02242 void CDynProg::best_path_trans(const DREAL *seq_array, INT seq_len, const INT *pos,
02243 const INT *orf_info_array, CPlifBase **Plif_matrix,
02244 CPlifBase **Plif_state_signals, INT max_num_signals, INT genestr_num,
02245 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq, bool use_orf)
02246 {
02247 #ifdef DYNPROG_TIMING
02248 segment_init_time = 0.0 ;
02249 segment_pos_time = 0.0 ;
02250 segment_extend_time = 0.0 ;
02251 segment_clean_time = 0.0 ;
02252 orf_time = 0.0 ;
02253 svm_init_time = 0.0 ;
02254 svm_pos_time = 0.0 ;
02255 svm_clean_time = 0.0 ;
02256 MyTime2.start() ;
02257 #endif
02258
02259
02260 if (!svm_arrays_clean)
02261 {
02262 SG_ERROR( "SVM arrays not clean") ;
02263 return ;
02264 }
02265
02266 #ifdef DYNPROG_DEBUG
02267 transition_matrix_a.set_name("transition_matrix");
02268 transition_matrix_a.display_array();
02269 mod_words.display_array() ;
02270 sign_words.display_array() ;
02271 string_words.display_array() ;
02272 fprintf(stderr, "use_orf = %i\n", use_orf) ;
02273 #endif
02274
02275 const INT default_look_back = 30000 ;
02276 INT max_look_back = default_look_back ;
02277 bool use_svm = false ;
02278
02279
02280
02281
02282 SG_PRINT("N:%i, seq_len:%i, max_num_signals:%i\n",N, seq_len, max_num_signals) ;
02283
02284
02285
02286
02287
02288 CArray2<CPlifBase*> PEN(Plif_matrix, N, N, false, true) ;
02289 PEN.set_name("PEN");
02290
02291 CArray2<CPlifBase*> PEN_state_signals(Plif_state_signals, N, max_num_signals, false, true) ;
02292 PEN_state_signals.set_name("state_signals");
02293
02294 CArray3<DREAL> seq_input(seq_array, N, seq_len, max_num_signals) ;
02295 seq_input.set_name("seq_input") ;
02296
02297 CArray2<DREAL> seq(N, seq_len) ;
02298 seq.set_name("seq") ;
02299 seq.zero() ;
02300
02301 CArray2<INT> orf_info(orf_info_array, N, 2) ;
02302 orf_info.set_name("orf_info") ;
02303
02304
02305
02306 DREAL svm_value[2*num_svms] ;
02307 {
02308 for (INT s=0; s<2*num_svms; s++)
02309 svm_value[s]=0 ;
02310 }
02311
02312 {
02313
02314 for (INT i=0; i<N; i++)
02315 for (INT j=0; j<seq_len; j++)
02316 seq.element(i,j) = 0 ;
02317
02318 for (INT i=0; i<N; i++)
02319 for (INT j=0; j<seq_len; j++)
02320 for (INT k=0; k<max_num_signals; k++)
02321 {
02322 if ((PEN_state_signals.element(i,k)==NULL) && (k==0))
02323 {
02324
02325 seq.element(i,j) = seq_input.element(i,j,k) ;
02326 break ;
02327 }
02328 if (PEN_state_signals.element(i,k)!=NULL)
02329 {
02330
02331 if (finite(seq_input.element(i,j,k)))
02332 seq.element(i,j) += PEN_state_signals.element(i,k)->lookup_penalty(seq_input.element(i,j,k), svm_value) ;
02333 else
02334
02335 seq.element(i,j) = seq_input.element(i, j, k) ;
02336 }
02337 else
02338 break ;
02339 }
02340 }
02341
02342 {
02343 for (INT i=0; i<N; i++)
02344 for (INT j=0; j<N; j++)
02345 {
02346 CPlifBase *penij=PEN.element(i,j) ;
02347 if (penij==NULL)
02348 continue ;
02349 if (penij->get_max_value()>max_look_back)
02350 {
02351 SG_DEBUG( "%d %d -> value: %f\n", i,j,penij->get_max_value());
02352 max_look_back=(INT) (CMath::ceil(penij->get_max_value()));
02353 }
02354 if (penij->uses_svm_values())
02355 use_svm=true ;
02356 }
02357 }
02358
02359
02360 max_look_back = CMath::min(m_genestr_len, max_look_back) ;
02361 SG_DEBUG("use_svm=%i\n", use_svm) ;
02362
02363 SG_DEBUG("maxlook: %d N: %d nbest: %d \n", max_look_back, N, nbest);
02364 const INT look_back_buflen = (max_look_back*N+1)*nbest ;
02365 SG_DEBUG("look_back_buflen=%i\n", look_back_buflen) ;
02366 const DREAL mem_use = (DREAL)(seq_len*N*nbest*(sizeof(T_STATES)+sizeof(short int)+sizeof(INT))+
02367 look_back_buflen*(2*sizeof(DREAL)+sizeof(INT))+
02368 seq_len*(sizeof(T_STATES)+sizeof(INT))+
02369 m_genestr_len*sizeof(bool))/(1024*1024);
02370
02371 bool is_big = (mem_use>200) || (seq_len>5000) ;
02372
02373 if (is_big)
02374 {
02375 SG_DEBUG("calling best_path_trans: seq_len=%i, N=%i, lookback=%i nbest=%i\n",
02376 seq_len, N, max_look_back, nbest) ;
02377 SG_DEBUG("allocating %1.2fMB of memory\n",
02378 mem_use) ;
02379 }
02380 ASSERT(nbest<32000) ;
02381
02382
02383
02384 CArray3<DREAL> delta(seq_len, N, nbest) ;
02385 delta.set_name("delta");
02386 DREAL* delta_array = delta.get_array() ;
02387
02388
02389 CArray3<T_STATES> psi(seq_len, N, nbest) ;
02390 psi.set_name("psi");
02391
02392
02393 CArray3<short int> ktable(seq_len, N, nbest) ;
02394 ktable.set_name("ktable");
02395
02396
02397 CArray3<INT> ptable(seq_len, N, nbest) ;
02398 ptable.set_name("ptable");
02399
02400
02401 CArray<DREAL> delta_end(nbest) ;
02402 delta_end.set_name("delta_end");
02403
02404
02405 CArray<T_STATES> path_ends(nbest) ;
02406 path_ends.set_name("path_ends");
02407
02408
02409 CArray<short int> ktable_end(nbest) ;
02410 ktable_end.set_name("ktable_end");
02411
02412
02413 DREAL * fixedtempvv=new DREAL[look_back_buflen] ;
02414 memset(fixedtempvv, 0, look_back_buflen*sizeof(DREAL)) ;
02415 INT * fixedtempii=new INT[look_back_buflen] ;
02416 memset(fixedtempii, 0, look_back_buflen*sizeof(INT)) ;
02417
02418 CArray<DREAL> oldtempvv(look_back_buflen) ;
02419 oldtempvv.set_name("oldtempvv");
02420 CArray<DREAL> oldtempvv2(look_back_buflen) ;
02421 oldtempvv2.set_name("oldtempvv2");
02422
02423
02424
02425 CArray<INT> oldtempii(look_back_buflen) ;
02426 oldtempii.set_name("oldtempii");
02427 CArray<INT> oldtempii2(look_back_buflen) ;
02428 oldtempii2.set_name("oldtempii2");
02429
02430
02431 CArray<T_STATES> state_seq(seq_len) ;
02432 state_seq.set_name("state_seq");
02433
02434
02435 CArray<INT> pos_seq(seq_len) ;
02436 pos_seq.set_name("pos_seq");
02437
02438
02439
02440
02441 word_degree.set_name("word_degree") ;
02442 cum_num_words.set_name("cum_num_words") ;
02443 num_words.set_name("num_words") ;
02444
02445
02446 svm_pos_start.set_name("svm_pos_start") ;
02447 num_unique_words.set_name("num_unique_words") ;
02448
02449 PEN.set_name("PEN") ;
02450 seq.set_name("seq") ;
02451 orf_info.set_name("orf_info") ;
02452
02453 delta.set_name("delta") ;
02454 psi.set_name("psi") ;
02455 ktable.set_name("ktable") ;
02456 ptable.set_name("ptable") ;
02457 delta_end.set_name("delta_end") ;
02458 path_ends.set_name("path_ends") ;
02459 ktable_end.set_name("ktable_end") ;
02460
02461 #ifdef USE_TMP_ARRAYCLASS
02462 fixedtempvv.set_name("fixedtempvv") ;
02463 fixedtempii.set_name("fixedtempvv") ;
02464 #endif
02465
02466 oldtempvv.set_name("oldtempvv") ;
02467 oldtempvv2.set_name("oldtempvv2") ;
02468 oldtempii.set_name("oldtempii") ;
02469 oldtempii2.set_name("oldtempii2") ;
02470
02471
02473
02474 #ifdef DYNPROG_DEBUG
02475 state_seq.display_size() ;
02476 pos_seq.display_size() ;
02477
02478 dict_weights.display_size() ;
02479 word_degree.display_array() ;
02480 cum_num_words.display_array() ;
02481 num_words.display_array() ;
02482
02483
02484 svm_pos_start.display_array() ;
02485 num_unique_words.display_array() ;
02486
02487 PEN.display_size() ;
02488 PEN_state_signals.display_size() ;
02489 seq.display_size() ;
02490 orf_info.display_size() ;
02491
02492
02493 delta.display_size() ;
02494 psi.display_size() ;
02495 ktable.display_size() ;
02496 ptable.display_size() ;
02497 delta_end.display_size() ;
02498 path_ends.display_size() ;
02499 ktable_end.display_size() ;
02500
02501 #ifdef USE_TMP_ARRAYCLASS
02502 fixedtempvv.display_size() ;
02503 fixedtempii.display_size() ;
02504 #endif
02505
02506
02507
02508
02509 state_seq.display_size() ;
02510 pos_seq.display_size() ;
02511
02512 CArray<INT>pp = CArray<INT>(pos, seq_len) ;
02513 pp.set_name("pp");
02514 pp.display_array() ;
02515
02516
02517
02518
02519 #endif //DYNPROG_DEBUG
02520
02522
02523
02524
02525 {
02526 for (INT s=0; s<num_svms; s++)
02527 ASSERT(string_words_array[s]<genestr_num) ;
02528 }
02529
02530
02531
02532
02533
02534 {
02535
02536 for (T_STATES i=0; i<N; i++)
02537 {
02538
02539 delta.element(delta_array, 0, i, 0, seq_len, N) = get_p(i) + seq.element(i,0) ;
02540 psi.element(0,i,0) = 0 ;
02541 if (nbest>1)
02542 ktable.element(0,i,0) = 0 ;
02543 ptable.element(0,i,0) = 0 ;
02544 for (short int k=1; k<nbest; k++)
02545 {
02546 INT dim1, dim2, dim3 ;
02547 delta.get_array_size(dim1, dim2, dim3) ;
02548
02549
02550 delta.element(delta_array, 0, i, k, seq_len, N) = -CMath::INFTY ;
02551 psi.element(0,i,0) = 0 ;
02552 if (nbest>1)
02553 ktable.element(0,i,k) = 0 ;
02554 ptable.element(0,i,k) = 0 ;
02555 }
02556
02557
02558
02559
02560
02561
02562
02563
02564
02565
02566
02567
02568
02569
02570
02571
02572
02573
02574 }
02575 }
02576
02577 struct svm_values_struct svs;
02578 svs.num_unique_words = NULL;
02579 svs.svm_values = NULL;
02580 svs.svm_values_unnormalized = NULL;
02581 svs.word_used = NULL;
02582
02583 struct segment_loss_struct loss;
02584 loss.segments_changed = NULL;
02585 loss.num_segment_id = NULL;
02586
02587 SG_DEBUG("START_RECURSION \n\n");
02588
02589
02590 for (INT t=1; t<seq_len; t++)
02591 {
02592 if (is_big && t%(1+(seq_len/1000))==1)
02593 SG_PROGRESS(t, 0, seq_len);
02594
02595
02596
02597
02598
02599
02600
02601
02602 if (with_loss)
02603 {
02604 init_segment_loss(loss, seq_len, max_look_back);
02605 find_segment_loss_till_pos(pos, t, m_segment_ids, m_segment_mask, loss);
02606 }
02607
02608 for (T_STATES j=0; j<N; j++)
02609 {
02610 if (seq.element(j,t)<=-1e20)
02611 {
02612 for (short int k=0; k<nbest; k++)
02613 {
02614 delta.element(delta_array, t, j, k, seq_len, N) = seq.element(j,t) ;
02615 psi.element(t,j,k) = 0 ;
02616 if (nbest>1)
02617 ktable.element(t,j,k) = 0 ;
02618 ptable.element(t,j,k) = 0 ;
02619 }
02620 }
02621 else
02622 {
02623 const T_STATES num_elem = trans_list_forward_cnt[j] ;
02624 const T_STATES *elem_list = trans_list_forward[j] ;
02625 const DREAL *elem_val = trans_list_forward_val[j] ;
02626 const INT *elem_id = trans_list_forward_id[j] ;
02627
02628 INT fixed_list_len = 0 ;
02629 DREAL fixedtempvv_ = CMath::INFTY ;
02630 INT fixedtempii_ = 0 ;
02631
02632
02633 for (INT i=0; i<num_elem; i++)
02634 {
02635 T_STATES ii = elem_list[i] ;
02636
02637 const CPlifBase * penalty = PEN.element(j,ii) ;
02638 INT look_back = default_look_back ;
02639 {
02640 CPlifBase *pen = (CPlifBase*) penalty ;
02641 if (pen!=NULL)
02642 look_back=(INT) (CMath::ceil(pen->get_max_value()));
02643 ASSERT(look_back<1e6);
02644 }
02645
02646
02647
02648
02649
02650
02651 INT orf_from = orf_info.element(ii,0) ;
02652 INT orf_to = orf_info.element(j,1) ;
02653 if((orf_from!=-1)!=(orf_to!=-1))
02654 SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i]) ;
02655 ASSERT((orf_from!=-1)==(orf_to!=-1)) ;
02656
02657 INT orf_target = -1 ;
02658 if (orf_from!=-1)
02659 {
02660 orf_target=orf_to-orf_from ;
02661 if (orf_target<0)
02662 orf_target+=3 ;
02663 ASSERT(orf_target>=0 && orf_target<3) ;
02664 }
02665
02666 INT orf_last_pos = pos[t] ;
02667 INT loss_last_pos = t ;
02668 DREAL last_loss = 0.0 ;
02669
02670 for (INT ts=t-1; ts>=0 && pos[t]-pos[ts]<=look_back; ts--)
02671 {
02672 bool ok ;
02673
02674
02675
02676
02677
02678
02679
02680
02681
02682 if (orf_target==-1)
02683 ok=true ;
02684 else if (pos[ts]!=-1 && (pos[t]-pos[ts])%3==orf_target)
02685 ok=(!use_orf) || extend_orf(orf_from, orf_to, pos[ts], orf_last_pos, pos[t]) ;
02686 else
02687 ok=false ;
02688
02689 if (ok)
02690 {
02691
02692 DREAL segment_loss = 0.0 ;
02693 if (with_loss)
02694 segment_loss = extend_segment_loss(loss, pos, elem_id[i], ts, loss_last_pos, last_loss) ;
02695
02697
02699 INT frame = orf_info.element(ii,0);
02700 lookup_content_svm_values(ts, t, pos[ts], pos[t], svm_value, frame);
02701 if (m_use_tiling)
02702 lookup_tiling_plif_values(ts, t, pos[t]-pos[ts], svm_value);
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713 DREAL pen_val = 0.0 ;
02714 if (penalty)
02715 pen_val = penalty->lookup_penalty(pos[t]-pos[ts], svm_value) ;
02716 if (nbest==1)
02717 {
02718 DREAL val = elem_val[i] + pen_val ;
02719 if (with_loss)
02720 val += segment_loss ;
02721
02722 DREAL mval = -(val + delta.element(delta_array, ts, ii, 0, seq_len, N)) ;
02723 if (mval<fixedtempvv_)
02724 {
02725 fixedtempvv_ = mval ;
02726 fixedtempii_ = ii + ts*N;
02727 fixed_list_len = 1 ;
02728 }
02729 }
02730 else
02731 {
02732 for (short int diff=0; diff<nbest; diff++)
02733 {
02734 DREAL val = elem_val[i] ;
02735 val += pen_val ;
02736 if (with_loss)
02737 val += segment_loss ;
02738
02739 DREAL mval = -(val + delta.element(delta_array, ts, ii, diff, seq_len, N)) ;
02740
02741
02742
02743
02744
02745 if ((fixed_list_len < nbest) || ((0==fixed_list_len) || (mval < fixedtempvv[fixed_list_len-1])))
02746 {
02747 if ( (fixed_list_len<nbest) && ((0==fixed_list_len) || (mval>fixedtempvv[fixed_list_len-1])) )
02748 {
02749 fixedtempvv[fixed_list_len] = mval ;
02750 fixedtempii[fixed_list_len] = ii + diff*N + ts*N*nbest;
02751 fixed_list_len++ ;
02752 }
02753 else
02754 {
02755 int addhere = fixed_list_len;
02756 while ((addhere > 0) && (mval < fixedtempvv[addhere-1]))
02757 addhere--;
02758
02759
02760 for (int jj=fixed_list_len-1; jj>addhere; jj--)
02761 {
02762 fixedtempvv[jj] = fixedtempvv[jj-1];
02763 fixedtempii[jj] = fixedtempii[jj-1];
02764 }
02765
02766 fixedtempvv[addhere] = mval;
02767 fixedtempii[addhere] = ii + diff*N + ts*N*nbest;
02768
02769 if (fixed_list_len < nbest)
02770 fixed_list_len++;
02771 }
02772 }
02773 }
02774 }
02775 }
02776 }
02777 }
02778
02779
02780 int numEnt = fixed_list_len;
02781
02782 double minusscore;
02783 long int fromtjk;
02784
02785 for (short int k=0; k<nbest; k++)
02786 {
02787 if (k<numEnt)
02788 {
02789 if (nbest==1)
02790 {
02791 minusscore = fixedtempvv_ ;
02792 fromtjk = fixedtempii_ ;
02793 }
02794 else
02795 {
02796 minusscore = fixedtempvv[k];
02797 fromtjk = fixedtempii[k];
02798 }
02799
02800 delta.element(delta_array, t, j, k, seq_len, N) = -minusscore + seq.element(j,t);
02801 psi.element(t,j,k) = (fromtjk%N) ;
02802 if (nbest>1)
02803 ktable.element(t,j,k) = (fromtjk%(N*nbest)-psi.element(t,j,k))/N ;
02804 ptable.element(t,j,k) = (fromtjk-(fromtjk%(N*nbest)))/(N*nbest) ;
02805 }
02806 else
02807 {
02808 delta.element(delta_array, t, j, k, seq_len, N) = -CMath::INFTY ;
02809 psi.element(t,j,k) = 0 ;
02810 if (nbest>1)
02811 ktable.element(t,j,k) = 0 ;
02812 ptable.element(t,j,k) = 0 ;
02813 }
02814 }
02815 }
02816 }
02817 }
02818
02819 clear_svm_values(svs);
02820 if (with_loss)
02821 clear_segment_loss(loss);
02822
02823 {
02824 INT list_len = 0 ;
02825 for (short int diff=0; diff<nbest; diff++)
02826 {
02827 for (T_STATES i=0; i<N; i++)
02828 {
02829 oldtempvv[list_len] = -(delta.element(delta_array, (seq_len-1), i, diff, seq_len, N)+get_q(i)) ;
02830 oldtempii[list_len] = i + diff*N ;
02831 list_len++ ;
02832 }
02833 }
02834
02835 CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
02836
02837 for (short int k=0; k<nbest; k++)
02838 {
02839 delta_end.element(k) = -oldtempvv[k] ;
02840 path_ends.element(k) = (oldtempii[k]%N) ;
02841 if (nbest>1)
02842 ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/N ;
02843 }
02844 }
02845
02846 {
02847 for (short int k=0; k<nbest; k++)
02848 {
02849 prob_nbest[k]= delta_end.element(k) ;
02850
02851 INT i = 0 ;
02852 state_seq[i] = path_ends.element(k) ;
02853 short int q = 0 ;
02854 if (nbest>1)
02855 q=ktable_end.element(k) ;
02856 pos_seq[i] = seq_len-1 ;
02857
02858 while (pos_seq[i]>0)
02859 {
02860
02861 state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
02862 pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
02863 if (nbest>1)
02864 q = ktable.element(pos_seq[i], state_seq[i], q) ;
02865 i++ ;
02866 }
02867
02868 INT num_states = i+1 ;
02869 for (i=0; i<num_states;i++)
02870 {
02871 my_state_seq[i+k*seq_len] = state_seq[num_states-i-1] ;
02872 my_pos_seq[i+k*seq_len] = pos_seq[num_states-i-1] ;
02873 }
02874 my_state_seq[num_states+k*seq_len]=-1 ;
02875 my_pos_seq[num_states+k*seq_len]=-1 ;
02876 }
02877 }
02878
02879 if (is_big)
02880 SG_PRINT( "DONE. \n") ;
02881
02882
02883 #ifdef DYNPROG_TIMING
02884 MyTime2.stop() ;
02885
02886 if (is_big)
02887 SG_PRINT("Timing: orf=%1.2f s \n Segment_init=%1.2f s Segment_pos=%1.2f s Segment_extend=%1.2f s Segment_clean=%1.2f s\nsvm_init=%1.2f s svm_pos=%1.2f svm_clean=%1.2f\n total=%1.2f\n", orf_time, segment_init_time, segment_pos_time, segment_extend_time, segment_clean_time, svm_init_time, svm_pos_time, svm_clean_time, MyTime2.time_diff_sec()) ;
02888 #endif
02889
02890 delete[] fixedtempvv ;
02891 delete[] fixedtempii ;
02892 }
02893
02894 void CDynProg::best_path_trans_deriv(INT *my_state_seq, INT *my_pos_seq, DREAL *my_scores, DREAL* my_losses,INT my_seq_len,
02895 const DREAL *seq_array, INT seq_len, const INT *pos, CPlifBase **Plif_matrix,
02896 CPlifBase **Plif_state_signals, INT max_num_signals,INT genestr_num)
02897 {
02898 if (!svm_arrays_clean)
02899 {
02900 SG_ERROR( "SVM arrays not clean") ;
02901 return ;
02902 } ;
02903
02904
02905
02906
02907
02908 bool use_svm = false ;
02909
02910
02911
02912
02913 CArray2<CPlifBase*> PEN(Plif_matrix, N, N, false, false) ;
02914 CArray2<CPlifBase*> PEN_state_signals(Plif_state_signals, N, max_num_signals, false, false) ;
02915 CArray3<DREAL> seq_input(seq_array, N, seq_len, max_num_signals) ;
02916
02917 {
02918 for (INT i=0; i<N; i++)
02919 for (INT j=0; j<N; j++)
02920 {
02921 CPlifBase *penij=PEN.element(i,j) ;
02922 if (penij==NULL)
02923 continue ;
02924
02925 if (penij->uses_svm_values())
02926 use_svm=true ;
02927 penij->penalty_clear_derivative() ;
02928 }
02929 for (INT i=0; i<N; i++)
02930 for (INT j=0; j<max_num_signals; j++)
02931 {
02932 CPlifBase *penij=PEN_state_signals.element(i,j) ;
02933 if (penij==NULL)
02934 continue ;
02935 if (penij->uses_svm_values())
02936 use_svm=true ;
02937 penij->penalty_clear_derivative() ;
02938 }
02939 }
02940
02941 {
02942 for (INT i=0; i<N; i++)
02943 {
02944 initial_state_distribution_p_deriv.element(i)=0 ;
02945 end_state_distribution_q_deriv.element(i)=0 ;
02946 for (INT j=0; j<N; j++)
02947 transition_matrix_a_deriv.element(i,j)=0 ;
02948 }
02949 }
02950
02951 {
02952 for (INT i=0; i<my_seq_len; i++)
02953 {
02954 my_scores[i]=0.0 ;
02955 my_losses[i]=0.0 ;
02956 }
02957 }
02958
02959
02960
02961
02962
02963
02964 {
02965 DREAL svm_value[2*num_svms] ;
02966 for (INT s=0; s<2*num_svms; s++)
02967 svm_value[s]=0 ;
02968
02969 for (INT i=0; i<my_seq_len; i++)
02970 {
02971 my_scores[i]=0.0 ;
02972 my_losses[i]=0.0 ;
02973 }
02974
02975
02976 DREAL total_score = 0.0 ;
02977 DREAL total_loss = 0.0 ;
02978
02979
02980 ASSERT(my_state_seq[0]>=0) ;
02981 initial_state_distribution_p_deriv.element(my_state_seq[0])++ ;
02982 my_scores[0] += initial_state_distribution_p.element(my_state_seq[0]) ;
02983
02984 ASSERT(my_state_seq[my_seq_len-1]>=0) ;
02985 end_state_distribution_q_deriv.element(my_state_seq[my_seq_len-1])++ ;
02986 my_scores[my_seq_len-1] += end_state_distribution_q.element(my_state_seq[my_seq_len-1]);
02987
02988
02989 total_score += my_scores[0] + my_scores[my_seq_len-1] ;
02990
02991 struct svm_values_struct svs;
02992 svs.num_unique_words = NULL;
02993 svs.svm_values = NULL;
02994 svs.svm_values_unnormalized = NULL;
02995 svs.word_used = NULL;
02996
02997
02998 struct segment_loss_struct loss;
02999 loss.segments_changed = NULL;
03000 loss.num_segment_id = NULL;
03001
03002 for (INT i=0; i<my_seq_len-1; i++)
03003 {
03004 if (my_state_seq[i+1]==-1)
03005 break ;
03006 INT from_state = my_state_seq[i] ;
03007 INT to_state = my_state_seq[i+1] ;
03008 INT from_pos = my_pos_seq[i] ;
03009 INT to_pos = my_pos_seq[i+1] ;
03010
03011
03012 init_segment_loss(loss, seq_len, pos[to_pos]-pos[from_pos]+10);
03013 find_segment_loss_till_pos(pos, to_pos,m_segment_ids, m_segment_mask, loss);
03014
03015 INT loss_last_pos = to_pos ;
03016 DREAL last_loss = 0.0 ;
03017 INT elem_id = transition_matrix_a_id.element(from_state, to_state) ;
03018 my_losses[i] = extend_segment_loss(loss, pos, elem_id, from_pos, loss_last_pos, last_loss) ;
03019 #ifdef DYNPROG_DEBUG
03020 io.set_loglevel(M_DEBUG) ;
03021 SG_DEBUG( "%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state) ;
03022 #endif
03023
03024 transition_matrix_a_deriv.element(from_state, to_state)++ ;
03025 my_scores[i] += transition_matrix_a.element(from_state, to_state) ;
03026
03027 #ifdef DYNPROG_DEBUG
03028 SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
03029 #endif
03030
03031
03032
03033
03034
03035 if (use_svm)
03036 {
03037
03038 INT frame = m_orf_info.element(from_state,0);
03039 lookup_content_svm_values(from_pos, to_pos, pos[from_pos],pos[to_pos], svm_value, frame);
03040 if (m_use_tiling)
03041 lookup_tiling_plif_values(from_pos, to_pos, pos[to_pos]-pos[from_pos], svm_value);
03042
03043 #ifdef DYNPROG_DEBUG
03044 SG_DEBUG( "svm[%i]: %f\n", ss, svm_value[ss]) ;
03045 #endif
03046 }
03047
03048 if (PEN.element(to_state, from_state)!=NULL)
03049 {
03050 DREAL nscore = PEN.element(to_state, from_state)->lookup_penalty(pos[to_pos]-pos[from_pos], svm_value) ;
03051 my_scores[i] += nscore ;
03052
03053 if (m_use_tiling)
03054 for (INT s=0;s<num_svms;s++)
03055 svm_value[num_svms+s]=-CMath::INFTY;
03056
03057 #ifdef DYNPROG_DEBUG
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069 #endif
03070 PEN.element(to_state, from_state)->penalty_add_derivative(pos[to_pos]-pos[from_pos], svm_value) ;
03071 if (m_use_tiling)
03072 {
03073 for (INT s=0;s<num_svms;s++)
03074 svm_value[s]=-CMath::INFTY;
03075 DREAL intensities[m_num_probes];
03076 INT num_intensities = raw_intensities_interval_query(pos[from_pos], pos[to_pos],intensities);
03077
03078 for (INT k=0;k<num_intensities;k++)
03079 {
03080 for (INT j=0;j<num_svms;j++)
03081 svm_value[num_svms+j]=intensities[k];
03082 PEN.element(to_state, from_state)->penalty_add_derivative(-CMath::INFTY, svm_value) ;
03083
03084 }
03085 }
03086 }
03087 #ifdef DYNPROG_DEBUG
03088 SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
03089 #endif
03090
03091
03092 for (INT k=0; k<max_num_signals; k++)
03093 {
03094 if ((PEN_state_signals.element(to_state,k)==NULL)&&(k==0))
03095 {
03096 #ifdef DYNPROG_DEBUG
03097 SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i score=%1.2f (no signal plif)\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k)) ;
03098 #endif
03099 my_scores[i] += seq_input.element(to_state, to_pos, k) ;
03100
03101 break ;
03102 }
03103 if (PEN_state_signals.element(to_state, k)!=NULL)
03104 {
03105 DREAL nscore = PEN_state_signals.element(to_state,k)->lookup_penalty(seq_input.element(to_state, to_pos, k), svm_value) ;
03106 my_scores[i] += nscore ;
03107
03108
03109
03110
03111
03112
03113
03114 #ifdef DYNPROG_DEBUG
03115 SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k) ;
03116 #endif
03117 PEN_state_signals.element(to_state,k)->penalty_add_derivative(seq_input.element(to_state, to_pos, k), svm_value) ;
03118 } else
03119 break ;
03120 }
03121
03122
03123
03124
03125 total_score += my_scores[i] ;
03126 total_loss += my_losses[i] ;
03127
03128 }
03129
03130
03131 SG_PRINT( "total loss = %f \n", total_loss) ;
03132
03133 clear_svm_values(svs);
03134 clear_segment_loss(loss);
03135 }
03136
03137
03138 }
03139
03140
03141 void CDynProg::best_path_trans_simple(const DREAL *seq_array, INT seq_len, short int nbest,
03142 DREAL *prob_nbest, INT *my_state_seq)
03143 {
03144 if (!svm_arrays_clean)
03145 {
03146 SG_ERROR( "SVM arrays not clean") ;
03147 return ;
03148 } ;
03149
03150 INT max_look_back = 2 ;
03151 const INT look_back_buflen = max_look_back*nbest*N ;
03152 ASSERT(nbest<32000) ;
03153
03154 CArray2<DREAL> seq((DREAL *)seq_array, N, seq_len, false) ;
03155
03156 CArray3<DREAL> delta(max_look_back, N, nbest) ;
03157 CArray3<T_STATES> psi(seq_len, N, nbest) ;
03158 CArray3<short int> ktable(seq_len,N,nbest) ;
03159 CArray3<INT> ptable(seq_len,N,nbest) ;
03160
03161 CArray<DREAL> delta_end(nbest) ;
03162 CArray<T_STATES> path_ends(nbest) ;
03163 CArray<short int> ktable_end(nbest) ;
03164
03165 CArray<DREAL> oldtempvv(look_back_buflen) ;
03166 CArray<INT> oldtempii(look_back_buflen) ;
03167
03168 CArray<T_STATES> state_seq(seq_len) ;
03169 CArray<INT> pos_seq(seq_len) ;
03170
03171 {
03172
03173 for (T_STATES i=0; i<N; i++)
03174 {
03175 delta.element(0,i,0) = get_p(i) + seq.element(i,0) ;
03176 psi.element(0,i,0) = 0 ;
03177 ktable.element(0,i,0) = 0 ;
03178 ptable.element(0,i,0) = 0 ;
03179 for (short int k=1; k<nbest; k++)
03180 {
03181 delta.element(0,i,k) = -CMath::INFTY ;
03182 psi.element(0,i,0) = 0 ;
03183 ktable.element(0,i,k) = 0 ;
03184 ptable.element(0,i,k) = 0 ;
03185 }
03186 }
03187 }
03188
03189
03190 for (INT t=1; t<seq_len; t++)
03191 {
03192 for (T_STATES j=0; j<N; j++)
03193 {
03194 if (seq.element(j,t)<-1e20)
03195 {
03196 for (short int k=0; k<nbest; k++)
03197 {
03198 delta.element(t%max_look_back,j,k) = seq.element(j,t) ;
03199 psi.element(t,j,k) = 0 ;
03200 ktable.element(t,j,k) = 0 ;
03201 ptable.element(t,j,k) = 0 ;
03202 }
03203 }
03204 else
03205 {
03206 const T_STATES num_elem = trans_list_forward_cnt[j] ;
03207 const T_STATES *elem_list = trans_list_forward[j] ;
03208 const DREAL *elem_val = trans_list_forward_val[j] ;
03209
03210 INT old_list_len = 0 ;
03211
03212 for (INT i=0; i<num_elem; i++)
03213 {
03214 T_STATES ii = elem_list[i] ;
03215
03216 INT ts=t-1;
03217 if (ts>=0)
03218 {
03219 bool ok=true ;
03220
03221 if (ok)
03222 {
03223
03224
03225 for (short int diff=0; diff<nbest; diff++)
03226 {
03227 DREAL val = delta.element(ts%max_look_back,ii,diff) + elem_val[i] ;
03228 DREAL mval = -val;
03229
03230 oldtempvv[old_list_len] = mval ;
03231 oldtempii[old_list_len] = ii + diff*N + ts*N*nbest;
03232 old_list_len++ ;
03233 }
03234 }
03235 }
03236 }
03237
03238 CMath::nmin<INT>(oldtempvv.get_array(), oldtempii.get_array(), old_list_len, nbest) ;
03239
03240 int numEnt = 0;
03241 numEnt = old_list_len;
03242
03243 double minusscore;
03244 long int fromtjk;
03245
03246 for (short int k=0; k<nbest; k++)
03247 {
03248 if (k<numEnt)
03249 {
03250 minusscore = oldtempvv[k];
03251 fromtjk = oldtempii[k];
03252
03253 delta.element(t%max_look_back,j,k) = -minusscore + seq.element(j,t);
03254 psi.element(t,j,k) = (fromtjk%N) ;
03255 ktable.element(t,j,k) = (fromtjk%(N*nbest)-psi.element(t,j,k))/N ;
03256 ptable.element(t,j,k) = (fromtjk-(fromtjk%(N*nbest)))/(N*nbest) ;
03257 }
03258 else
03259 {
03260 delta.element(t%max_look_back,j,k) = -CMath::INFTY ;
03261 psi.element(t,j,k) = 0 ;
03262 ktable.element(t,j,k) = 0 ;
03263 ptable.element(t,j,k) = 0 ;
03264 }
03265 }
03266
03267 }
03268 }
03269 }
03270
03271
03272 {
03273 INT list_len = 0 ;
03274 for (short int diff=0; diff<nbest; diff++)
03275 {
03276 for (T_STATES i=0; i<N; i++)
03277 {
03278 oldtempvv[list_len] = -(delta.element((seq_len-1)%max_look_back,i,diff)+get_q(i)) ;
03279 oldtempii[list_len] = i + diff*N ;
03280 list_len++ ;
03281 }
03282 }
03283
03284 CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
03285
03286 for (short int k=0; k<nbest; k++)
03287 {
03288 delta_end.element(k) = -oldtempvv[k] ;
03289 path_ends.element(k) = (oldtempii[k]%N) ;
03290 ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/N ;
03291 }
03292 }
03293
03294 {
03295 for (short int k=0; k<nbest; k++)
03296 {
03297 prob_nbest[k]= delta_end.element(k) ;
03298
03299 INT i = 0 ;
03300 state_seq[i] = path_ends.element(k) ;
03301 short int q = ktable_end.element(k) ;
03302 pos_seq[i] = seq_len-1 ;
03303
03304 while (pos_seq[i]>0)
03305 {
03306
03307 state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
03308 pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
03309 q = ktable.element(pos_seq[i], state_seq[i], q) ;
03310 i++ ;
03311 }
03312
03313 INT num_states = i+1 ;
03314 for (i=0; i<num_states;i++)
03315 {
03316 my_state_seq[i+k*seq_len] = state_seq[num_states-i-1] ;
03317 }
03318
03319 }
03320
03321 }
03322 }
03323