00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef __CDYNPROG_H__
00013 #define __CDYNPROG_H__
00014
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 #include "base/SGObject.h"
00018 #include "lib/io.h"
00019 #include "lib/config.h"
00020 #include "structure/PlifBase.h"
00021 #include "structure/Plif.h"
00022 #include "features/StringFeatures.h"
00023 #include "distributions/Distribution.h"
00024 #include "lib/DynamicArray.h"
00025 #include "lib/Array.h"
00026 #include "lib/Array2.h"
00027 #include "lib/Array3.h"
00028 #include "lib/Time.h"
00029
00030 #include <stdio.h>
00031
00032
00033
00034 #ifdef USE_BIGSTATES
00035 typedef WORD T_STATES ;
00036 #else
00037 typedef BYTE T_STATES ;
00038 #endif
00039 typedef T_STATES* P_STATES ;
00040
00045 class CDynProg : public CSGObject
00046 {
00047 private:
00048
00049 T_STATES trans_list_len;
00050 T_STATES **trans_list_forward;
00051 T_STATES *trans_list_forward_cnt;
00052 DREAL **trans_list_forward_val;
00053 INT **trans_list_forward_id;
00054 bool mem_initialized;
00055
00056 #ifdef DYNPROG_TIMING
00057 CTime MyTime;
00058 CTime MyTime2;
00059
00060 DREAL segment_init_time;
00061 DREAL segment_pos_time;
00062 DREAL segment_clean_time;
00063 DREAL segment_extend_time;
00064 DREAL orf_time;
00065 DREAL content_time;
00066 DREAL content_penalty_time;
00067 DREAL svm_init_time;
00068 DREAL svm_pos_time;
00069 DREAL svm_clean_time;
00070 #endif
00071
00072 public:
00077 CDynProg(INT p_num_svms=8);
00078 ~CDynProg();
00079
00088 DREAL best_path_no_b(INT max_iter, INT & best_iter, INT *my_path);
00089
00098 void best_path_no_b_trans(INT max_iter, INT & max_best_iter, SHORT nbest, DREAL *prob_nbest, INT *my_paths);
00099
00100
00106 void set_num_states(INT p_N);
00107
00109 INT get_num_states();
00110
00112 INT get_num_svms();
00113
00119 void init_content_svm_value_array(const INT seq_len);
00120
00129 void init_tiling_data(DREAL* probe_pos, DREAL* intensities, const INT num_probes, const INT seq_len);
00130
00138 void precompute_tiling_plifs(CPlif** PEN, const INT num_penalties, const INT seq_len, const INT* pos);
00139
00145 void set_p_vector(DREAL* p, INT N);
00146
00152 void set_q_vector(DREAL* q, INT N);
00153
00160 void set_a(DREAL* a, INT M, INT N);
00161
00168 void set_a_id(INT *a, INT M, INT N);
00169
00176 void set_a_trans_matrix(DREAL *a_trans, INT num_trans, INT N);
00177
00178
00184 void init_svm_arrays(INT p_num_degrees, INT p_num_svms);
00185
00191 void init_word_degree_array(INT * p_word_degree_array, INT num_elem);
00192
00198 void init_cum_num_words_array(INT * p_cum_num_words_array, INT num_elem);
00199
00205 void init_num_words_array(INT * p_num_words_array, INT num_elem);
00206
00213 void init_mod_words_array(INT * p_mod_words_array, INT num_elem, INT num_columns);
00214
00220 void init_sign_words_array(bool * p_sign_words_array, INT num_elem);
00221
00227 void init_string_words_array(INT * p_string_words_array, INT num_elem);
00228
00234 bool check_svm_arrays();
00235
00236
00243 void best_path_set_seq(DREAL *seq, INT N, INT seq_len);
00244
00252 void best_path_set_seq3d(DREAL *seq, INT p_N, INT seq_len, INT max_num_signals);
00253
00259 void best_path_set_pos(INT *pos, INT seq_len);
00260
00268 void best_path_set_orf_info(INT *orf_info, INT m, INT n);
00269
00277 void best_path_set_segment_sum_weights(DREAL *segment_sum_weights, INT num_states, INT seq_len);
00278
00283 void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00284
00291 void best_path_set_plif_id_matrix(INT *plif_id_matrix, INT m, INT n);
00292
00299 void best_path_set_plif_state_signal_matrix(INT *plif_id_matrix, INT m, INT n);
00300
00307 void best_path_set_genestr(CHAR* genestr, INT genestr_len, INT genestr_num);
00308
00309
00315 void best_path_set_my_state_seq(INT* my_state_seq, INT seq_len);
00316
00322 void best_path_set_my_pos_seq(INT* my_pos_seq, INT seq_len);
00323
00329 inline void best_path_set_single_genestr(CHAR* genestr, INT genestr_len)
00330 {
00331 SG_DEBUG("genestrpy: %d", genestr_len);
00332 best_path_set_genestr(genestr, genestr_len, 1);
00333 }
00334
00341 void best_path_set_dict_weights(DREAL* dictionary_weights, INT dict_len, INT n);
00342
00349 void best_path_set_segment_loss(DREAL * segment_loss, INT num_segment_id1, INT num_segment_id2);
00350
00357 void best_path_set_segment_ids_mask(INT* segment_ids, DREAL* segment_mask, INT m);
00358
00359
00365 void best_path_call(INT nbest, bool use_orf);
00366
00368 void best_path_deriv_call();
00369
00374 void best_path_2struct_call(INT nbest);
00375
00380 void best_path_simple_call(INT nbest);
00381
00386 void best_path_deriv_call(INT nbest);
00387
00388
00394 void best_path_get_scores(DREAL **scores, INT *n);
00395
00402 void best_path_get_states(INT **states, INT *m, INT *n);
00403
00410 void best_path_get_positions(INT **positions, INT *m, INT *n);
00411
00412
00418 void best_path_get_losses(DREAL** my_losses, INT* seq_len);
00419
00421
00437 template <short int nbest, bool with_loss, bool with_multiple_sequences>
00438 void best_path_trans(const DREAL *seq, INT seq_len, const INT *pos,
00439 const INT *orf_info, CPlifBase **PLif_matrix,
00440 CPlifBase **Plif_state_signals, INT max_num_signals,
00441 INT genestr_num,
00442 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00443 bool use_orf);
00444
00460 void best_path_trans_deriv(INT *my_state_seq, INT *my_pos_seq, DREAL *my_scores, DREAL* my_losses, INT my_seq_len,
00461 const DREAL *seq_array, INT seq_len, const INT *pos, CPlifBase **Plif_matrix,
00462 CPlifBase **Plif_state_signals, INT max_num_signals, INT genestr_num);
00463
00480 void best_path_2struct(const DREAL *seq, INT seq_len, const INT *pos,
00481 CPlifBase **Plif_matrix,
00482 const char *genestr, INT genestr_len,
00483 SHORT nbest,
00484 DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00485 DREAL *dictionary_weights, INT dict_len, DREAL *segment_sum_weights);
00486
00495 void best_path_trans_simple(const DREAL *seq, INT seq_len, SHORT nbest,
00496 DREAL *prob_nbest, INT *my_state_seq);
00497
00498
00499
00501 inline T_STATES get_N() const
00502 {
00503 return N ;
00504 }
00505
00510 inline void set_q(T_STATES offset, DREAL value)
00511 {
00512 end_state_distribution_q[offset]=value;
00513 }
00514
00519 inline void set_p(T_STATES offset, DREAL value)
00520 {
00521 initial_state_distribution_p[offset]=value;
00522 }
00523
00530 inline void set_a(T_STATES line_, T_STATES column, DREAL value)
00531 {
00532 transition_matrix_a.element(line_,column)=value;
00533 }
00534
00540 inline DREAL get_q(T_STATES offset) const
00541 {
00542 return end_state_distribution_q[offset];
00543 }
00544
00550 inline DREAL get_q_deriv(T_STATES offset) const
00551 {
00552 return end_state_distribution_q_deriv[offset];
00553 }
00554
00560 inline DREAL get_p(T_STATES offset) const
00561 {
00562 return initial_state_distribution_p[offset];
00563 }
00564
00570 inline DREAL get_p_deriv(T_STATES offset) const
00571 {
00572 return initial_state_distribution_p_deriv[offset];
00573 }
00574
00585 void precompute_content_values(WORD*** wordstr, const INT *pos,
00586 const INT num_cand_pos, const INT genestr_len,
00587 DREAL *dictionary_weights, INT dict_len);
00588
00597 void create_word_string(const CHAR* genestr, INT genestr_num, INT genestr_len, WORD*** wordstr);
00598
00604 void precompute_stop_codons(const CHAR* genestr, INT genestr_len);
00605
00611 void set_genestr_len(INT genestr_len);
00612
00619 inline DREAL get_a(T_STATES line_, T_STATES column) const
00620 {
00621 return transition_matrix_a.element(line_,column);
00622 }
00623
00630 inline DREAL get_a_deriv(T_STATES line_, T_STATES column) const
00631 {
00632 return transition_matrix_a_deriv.element(line_,column);
00633 }
00635 protected:
00636
00637
00638
00648 inline void lookup_content_svm_values(const INT from_state,
00649 const INT to_state, const INT from_pos, const INT to_pos,
00650 DREAL* svm_values, INT frame);
00651
00659 inline void lookup_tiling_plif_values(const INT from_state,
00660 const INT to_state, const INT len, DREAL* svm_values);
00661
00666 inline INT find_frame(const INT from_state);
00667
00675 inline INT raw_intensities_interval_query(
00676 const INT from_pos, const INT to_pos, DREAL* intensities);
00677
00686 void translate_from_single_order(WORD* obs, INT sequence_length, INT start,
00687 INT order, INT max_val=2);
00688
00695 void reset_svm_value(INT pos, INT & last_svm_pos, DREAL * svm_value);
00696
00704 void extend_svm_value(WORD* wordstr, INT pos, INT &last_svm_pos,
00705 DREAL* svm_value);
00706
00714 void reset_segment_sum_value(INT num_states, INT pos,
00715 INT & last_segment_sum_pos, DREAL * segment_sum_value);
00716
00726 void extend_segment_sum_value(DREAL *segment_sum_weights, INT seqlen,
00727 INT num_states, INT pos, INT &last_segment_sum_pos,
00728 DREAL* segment_sum_value);
00729
00731 struct svm_values_struct
00732 {
00734 INT maxlookback;
00736 INT seqlen;
00737
00739 INT* start_pos;
00741 DREAL ** svm_values_unnormalized;
00743 DREAL * svm_values;
00745 bool *** word_used;
00747 INT **num_unique_words;
00748 };
00749
00750
00751
00759 void init_svm_values(struct svm_values_struct & svs, INT start_pos,
00760 INT seqlen, INT howmuchlookback);
00761
00766 void clear_svm_values(struct svm_values_struct & svs);
00767
00775 void find_svm_values_till_pos(WORD*** wordstr, const INT *pos, INT t_end,
00776 struct svm_values_struct &svs);
00777
00785 void find_svm_values_till_pos(WORD** wordstr, const INT *pos, INT t_end,
00786 struct svm_values_struct &svs);
00787
00796 void update_svm_values_till_pos(WORD*** wordstr, const INT *pos, INT t_end,
00797 INT prev_t_end, struct svm_values_struct &svs);
00798
00807 bool extend_orf(INT orf_from, INT orf_to, INT start, INT &last_pos, INT to);
00808
00810 struct segment_loss_struct
00811 {
00813 INT maxlookback;
00815 INT seqlen;
00817 INT *segments_changed;
00819 DREAL *num_segment_id;
00821 INT *length_segment_id ;
00822 };
00823
00830 void init_segment_loss(struct segment_loss_struct & loss, INT seqlen,
00831 INT howmuchlookback);
00832
00837 void clear_segment_loss(struct segment_loss_struct & loss);
00838
00849 DREAL extend_segment_loss(struct segment_loss_struct & loss,
00850 const INT * pos_array, INT segment_id, INT pos, INT& last_pos,
00851 DREAL &last_value);
00852
00861 void find_segment_loss_till_pos(const INT * pos, INT t_end,
00862 CArray<INT>& segment_ids, CArray<DREAL>& segment_mask,
00863 struct segment_loss_struct& loss);
00864
00865
00870
00871 INT N;
00872
00874 CArray2<INT> transition_matrix_a_id;
00875 CArray2<DREAL> transition_matrix_a;
00876 CArray2<DREAL> transition_matrix_a_deriv;
00877
00879 CArray<DREAL> initial_state_distribution_p;
00880 CArray<DREAL> initial_state_distribution_p_deriv;
00881
00883 CArray<DREAL> end_state_distribution_q;
00884 CArray<DREAL> end_state_distribution_q_deriv;
00885
00887
00889 CArray2<DREAL> dict_weights;
00891 DREAL * dict_weights_array;
00892
00894 INT num_degrees;
00896 INT num_svms;
00898 INT num_strings;
00899
00901 CArray<INT> word_degree;
00903 CArray<INT> cum_num_words;
00905 INT * cum_num_words_array;
00907 CArray<INT> num_words;
00909 INT * num_words_array;
00911 CArray2<INT> mod_words;
00913 INT * mod_words_array;
00915 CArray<bool> sign_words;
00917 bool * sign_words_array;
00919 CArray<INT> string_words;
00921 INT * string_words_array;
00922
00923
00924
00925
00927 CArray<INT> svm_pos_start;
00929 CArray<INT> num_unique_words;
00931 bool svm_arrays_clean;
00932
00934 INT num_svms_single;
00936 INT word_degree_single;
00938 INT cum_num_words_single;
00940 INT num_words_single;
00941
00943 CArray<bool> word_used_single;
00945 CArray<DREAL> svm_value_unnormalized_single;
00947 INT num_unique_words_single;
00948
00950 INT max_a_id;
00951
00952
00954 INT m_step;
00956 INT m_call;
00957
00958
00960 CArray3<DREAL> m_seq;
00962 CArray<INT> m_pos;
00964 CArray2<INT> m_orf_info;
00966 CArray2<DREAL> m_segment_sum_weights;
00968 CArray<CPlifBase*> m_plif_list;
00970 CArray2<CPlifBase*> m_PEN;
00972 CArray2<CPlifBase*> m_PEN_state_signals;
00974 CArray2<CHAR> m_genestr;
00976 CArray2<DREAL> m_dict_weights;
00978 CArray3<DREAL> m_segment_loss;
00980 CArray<INT> m_segment_ids;
00982 CArray<DREAL> m_segment_mask;
00984 CArray<INT> m_my_state_seq;
00986 CArray<INT> m_my_pos_seq;
00988 CArray<DREAL> m_my_scores;
00990 CArray<DREAL> m_my_losses;
00991
00992
00994 CArray<DREAL> m_scores;
00996 CArray2<INT> m_states;
00998 CArray2<INT> m_positions;
00999
01003 CArray<bool> m_genestr_stop;
01004
01009 CArray2<DREAL> m_precomputed_svm_values;
01010
01012 CArray2<DREAL> m_precomputed_tiling_values;
01013
01015 DREAL* m_raw_intensities;
01017 INT* m_probe_pos;
01019 INT m_num_probes;
01021 bool m_use_tiling;
01023 INT m_genestr_len;
01024 };
01025
01026 inline INT CDynProg::raw_intensities_interval_query(const INT from_pos, const INT to_pos, DREAL* intensities)
01027 {
01028 ASSERT(from_pos<to_pos);
01029
01030 INT num_intensities = 0;
01031 INT* p_tiling_pos = m_probe_pos;
01032 DREAL* p_tiling_data = m_raw_intensities;
01033 INT last_pos;
01034 INT num = 0;
01035 while (*p_tiling_pos<to_pos)
01036 {
01037 if (*p_tiling_pos>=from_pos)
01038 {
01039 intensities[num_intensities] = *p_tiling_data;
01040 num_intensities++;
01041
01042 }
01043 num++;
01044 if (num>=m_num_probes)
01045 break;
01046 last_pos = *p_tiling_pos;
01047 p_tiling_pos++;
01048 p_tiling_data++;
01049 ASSERT(last_pos<*p_tiling_pos);
01050 }
01051 return num_intensities;
01052 }
01053 inline void CDynProg::lookup_content_svm_values(const INT from_state, const INT to_state, const INT from_pos, const INT to_pos, DREAL* svm_values, INT frame)
01054 {
01055
01056
01057
01058 for (INT i=0;i<4;i++)
01059 {
01060 DREAL to_val = m_precomputed_svm_values.get_element(i, to_state);
01061 DREAL from_val = m_precomputed_svm_values.get_element(i,from_state);
01062 svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01063 }
01064
01065
01066
01067
01068
01069
01070
01071
01072
01073
01074
01075
01076 }
01077 inline void CDynProg::lookup_tiling_plif_values(const INT from_state, const INT to_state, const INT len, DREAL* svm_values)
01078 {
01079 ASSERT(from_state<to_state);
01080 ASSERT(len>0);
01081 for (INT i=num_svms;i<2*num_svms;i++)
01082 {
01083
01084 svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state));
01085
01086 }
01087 }
01088 #endif