00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef __CDYNPROG_H__
00013 #define __CDYNPROG_H__
00014
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 #include "base/SGObject.h"
00018 #include "lib/io.h"
00019 #include "lib/config.h"
00020 #include "structure/PlifBase.h"
00021 #include "structure/Plif.h"
00022 #include "features/StringFeatures.h"
00023 #include "distributions/Distribution.h"
00024 #include "lib/DynamicArray.h"
00025 #include "lib/Array.h"
00026 #include "lib/Array2.h"
00027 #include "lib/Array3.h"
00028 #include "lib/Time.h"
00029
00030 #include <stdio.h>
00031
00032
00033
00034 #ifdef USE_BIGSTATES
00035 typedef uint16_t T_STATES ;
00036 #else
00037 typedef uint8_t T_STATES ;
00038 #endif
00039 typedef T_STATES* P_STATES ;
00040
00045 class CDynProg : public CSGObject
00046 {
00047 private:
00048
00049 T_STATES trans_list_len;
00050 T_STATES **trans_list_forward;
00051 T_STATES *trans_list_forward_cnt;
00052 float64_t **trans_list_forward_val;
00053 int32_t **trans_list_forward_id;
00054 bool mem_initialized;
00055
00056 #ifdef DYNPROG_TIMING
00057 CTime MyTime;
00058 CTime MyTime2;
00059
00060 float64_t segment_init_time;
00061 float64_t segment_pos_time;
00062 float64_t segment_clean_time;
00063 float64_t segment_extend_time;
00064 float64_t orf_time;
00065 float64_t content_time;
00066 float64_t content_penalty_time;
00067 float64_t svm_init_time;
00068 float64_t svm_pos_time;
00069 float64_t svm_clean_time;
00070 #endif
00071
00072 public:
00077 CDynProg(int32_t p_num_svms=8);
00078 ~CDynProg();
00079
00088 float64_t best_path_no_b(
00089 int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00090
00099 void best_path_no_b_trans(
00100 int32_t max_iter, int32_t & max_best_iter, int16_t nbest,
00101 float64_t *prob_nbest, int32_t *my_paths);
00102
00103
00109 void set_num_states(int32_t p_N);
00110
00112 int32_t get_num_states();
00113
00115 int32_t get_num_svms();
00116
00122 void init_content_svm_value_array(const int32_t seq_len);
00123
00132 void init_tiling_data(
00133 int32_t* probe_pos, float64_t* intensities, const int32_t num_probes,
00134 const int32_t seq_len);
00135
00144 void precompute_tiling_plifs(
00145 CPlif** PEN, const int32_t* tiling_plif_ids,
00146 const int32_t num_tiling_plifs, const int32_t seq_len,
00147 const int32_t* pos);
00148
00154 void set_p_vector(float64_t* p, int32_t N);
00155
00161 void set_q_vector(float64_t* q, int32_t N);
00162
00169 void set_a(float64_t* a, int32_t M, int32_t N);
00170
00177 void set_a_id(int32_t *a, int32_t M, int32_t N);
00178
00185 void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00186
00187
00193 void init_svm_arrays(int32_t p_num_degrees, int32_t p_num_svms);
00194
00200 void init_word_degree_array(int32_t * p_word_degree_array, int32_t num_elem);
00201
00207 void init_cum_num_words_array(
00208 int32_t * p_cum_num_words_array, int32_t num_elem);
00209
00215 void init_num_words_array(int32_t * p_num_words_array, int32_t num_elem);
00216
00223 void init_mod_words_array(
00224 int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00225
00231 void init_sign_words_array(bool * p_sign_words_array, int32_t num_elem);
00232
00238 void init_string_words_array(
00239 int32_t * p_string_words_array, int32_t num_elem);
00240
00246 bool check_svm_arrays();
00247
00248
00255 void best_path_set_seq(float64_t *seq, int32_t N, int32_t seq_len);
00256
00264 void best_path_set_seq3d(
00265 float64_t *seq, int32_t p_N, int32_t seq_len, int32_t max_num_signals);
00266
00272 void best_path_set_pos(int32_t *pos, int32_t seq_len);
00273
00281 void best_path_set_orf_info(int32_t *orf_info, int32_t m, int32_t n);
00282
00290 void best_path_set_segment_sum_weights(
00291 float64_t *segment_sum_weights, int32_t num_states, int32_t seq_len);
00292
00297 void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00298
00305 void best_path_set_plif_id_matrix(
00306 int32_t *plif_id_matrix, int32_t m, int32_t n);
00307
00314 void best_path_set_plif_state_signal_matrix(
00315 int32_t *plif_id_matrix, int32_t m, int32_t n);
00316
00323 void best_path_set_genestr(
00324 char* genestr, int32_t genestr_len, int32_t genestr_num);
00325
00326
00332 void best_path_set_my_state_seq(int32_t* my_state_seq, int32_t seq_len);
00333
00339 void best_path_set_my_pos_seq(int32_t* my_pos_seq, int32_t seq_len);
00340
00346 inline void best_path_set_single_genestr(char* genestr, int32_t genestr_len)
00347 {
00348 SG_DEBUG("genestrpy: %d", genestr_len);
00349 best_path_set_genestr(genestr, genestr_len, 1);
00350 }
00351
00358 void best_path_set_dict_weights(
00359 float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00360
00367 void best_path_set_segment_loss(
00368 float64_t * segment_loss, int32_t num_segment_id1,
00369 int32_t num_segment_id2);
00370
00377 void best_path_set_segment_ids_mask(
00378 int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00379
00380
00386 void best_path_call(int32_t nbest, bool use_orf);
00387
00389 void best_path_deriv_call();
00390
00395 void best_path_2struct_call(int32_t nbest);
00396
00401 void best_path_simple_call(int32_t nbest);
00402
00407 void best_path_deriv_call(int32_t nbest);
00408
00409
00415 void best_path_get_scores(float64_t **scores, int32_t *n);
00416
00423 void best_path_get_states(int32_t **states, int32_t *m, int32_t *n);
00424
00431 void best_path_get_positions(int32_t **positions, int32_t *m, int32_t *n);
00432
00433
00439 void best_path_get_losses(float64_t** my_losses, int32_t* seq_len);
00440
00442
00458 template <int16_t nbest, bool with_loss, bool with_multiple_sequences>
00459 void best_path_trans(
00460 const float64_t *seq, int32_t seq_len, const int32_t *pos,
00461 const int32_t *orf_info, CPlifBase **PLif_matrix,
00462 CPlifBase **Plif_state_signals, int32_t max_num_signals,
00463 int32_t genestr_num, float64_t *prob_nbest, int32_t *my_state_seq,
00464 int32_t *my_pos_seq, bool use_orf);
00465
00481 void best_path_trans_deriv(
00482 int32_t *my_state_seq, int32_t *my_pos_seq, float64_t *my_scores,
00483 float64_t* my_losses, int32_t my_seq_len, const float64_t *seq_array,
00484 int32_t seq_len, const int32_t *pos, CPlifBase **Plif_matrix,
00485 CPlifBase **Plif_state_signals, int32_t max_num_signals,
00486 int32_t genestr_num);
00487
00504 void best_path_2struct(
00505 const float64_t *seq, int32_t seq_len, const int32_t *pos,
00506 CPlifBase **Plif_matrix, const char *genestr, int32_t genestr_len,
00507 int16_t nbest, float64_t *prob_nbest, int32_t *my_state_seq,
00508 int32_t *my_pos_seq, float64_t *dictionary_weights, int32_t dict_len,
00509 float64_t *segment_sum_weights);
00510
00519 void best_path_trans_simple(
00520 const float64_t *seq, int32_t seq_len, int16_t nbest,
00521 float64_t *prob_nbest, int32_t *my_state_seq);
00522
00523
00524
00526 inline T_STATES get_N() const
00527 {
00528 return N ;
00529 }
00530
00535 inline void set_q(T_STATES offset, float64_t value)
00536 {
00537 end_state_distribution_q[offset]=value;
00538 }
00539
00544 inline void set_p(T_STATES offset, float64_t value)
00545 {
00546 initial_state_distribution_p[offset]=value;
00547 }
00548
00555 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00556 {
00557 transition_matrix_a.element(line_,column)=value;
00558 }
00559
00565 inline float64_t get_q(T_STATES offset) const
00566 {
00567 return end_state_distribution_q[offset];
00568 }
00569
00575 inline float64_t get_q_deriv(T_STATES offset) const
00576 {
00577 return end_state_distribution_q_deriv[offset];
00578 }
00579
00585 inline float64_t get_p(T_STATES offset) const
00586 {
00587 return initial_state_distribution_p[offset];
00588 }
00589
00595 inline float64_t get_p_deriv(T_STATES offset) const
00596 {
00597 return initial_state_distribution_p_deriv[offset];
00598 }
00599
00610 void precompute_content_values(
00611 uint16_t*** wordstr, const int32_t *pos, const int32_t num_cand_pos,
00612 const int32_t genestr_len, float64_t *dictionary_weights,
00613 int32_t dict_len);
00614
00623 void create_word_string(
00624 const char* genestr, int32_t genestr_num, int32_t genestr_len,
00625 uint16_t*** wordstr);
00626
00632 void precompute_stop_codons(const char* genestr, int32_t genestr_len);
00633
00639 void set_genestr_len(int32_t genestr_len);
00640
00647 inline float64_t get_a(T_STATES line_, T_STATES column) const
00648 {
00649 return transition_matrix_a.element(line_,column);
00650 }
00651
00658 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00659 {
00660 return transition_matrix_a_deriv.element(line_,column);
00661 }
00663 protected:
00664
00665
00666
00676 inline void lookup_content_svm_values(
00677 const int32_t from_state, const int32_t to_state,
00678 const int32_t from_pos, const int32_t to_pos, float64_t* svm_values,
00679 int32_t frame);
00680
00688 inline void lookup_tiling_plif_values(
00689 const int32_t from_state, const int32_t to_state, const int32_t len,
00690 float64_t* svm_values);
00691
00696 inline int32_t find_frame(const int32_t from_state);
00697
00705 inline int32_t raw_intensities_interval_query(
00706 const int32_t from_pos, const int32_t to_pos, float64_t* intensities);
00707
00716 void translate_from_single_order(
00717 uint16_t* obs, int32_t sequence_length, int32_t start, int32_t order,
00718 int32_t max_val=2);
00719
00726 void reset_svm_value(
00727 int32_t pos, int32_t & last_svm_pos, float64_t * svm_value);
00728
00736 void extend_svm_value(
00737 uint16_t* wordstr, int32_t pos, int32_t &last_svm_pos,
00738 float64_t* svm_value);
00739
00747 void reset_segment_sum_value(
00748 int32_t num_states, int32_t pos, int32_t & last_segment_sum_pos,
00749 float64_t * segment_sum_value);
00750
00760 void extend_segment_sum_value(
00761 float64_t *segment_sum_weights, int32_t seqlen, int32_t num_states,
00762 int32_t pos, int32_t &last_segment_sum_pos,
00763 float64_t* segment_sum_value);
00764
00766 struct svm_values_struct
00767 {
00769 int32_t maxlookback;
00771 int32_t seqlen;
00772
00774 int32_t* start_pos;
00776 float64_t ** svm_values_unnormalized;
00778 float64_t * svm_values;
00780 bool *** word_used;
00782 int32_t **num_unique_words;
00783 };
00784
00785
00786
00794 void init_svm_values(
00795 struct svm_values_struct & svs, int32_t start_pos, int32_t seqlen,
00796 int32_t howmuchlookback);
00797
00802 void clear_svm_values(struct svm_values_struct & svs);
00803
00811 void find_svm_values_till_pos(
00812 uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00813 struct svm_values_struct &svs);
00814
00822 void find_svm_values_till_pos(
00823 uint16_t** wordstr, const int32_t *pos, int32_t t_end,
00824 struct svm_values_struct &svs);
00825
00834 void update_svm_values_till_pos(
00835 uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00836 int32_t prev_t_end, struct svm_values_struct &svs);
00837
00846 bool extend_orf(
00847 int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
00848 int32_t to);
00849
00851 struct segment_loss_struct
00852 {
00854 int32_t maxlookback;
00856 int32_t seqlen;
00858 int32_t *segments_changed;
00860 float64_t *num_segment_id;
00862 int32_t *length_segment_id ;
00863 };
00864
00871 void init_segment_loss(
00872 struct segment_loss_struct & loss, int32_t seqlen,
00873 int32_t howmuchlookback);
00874
00879 void clear_segment_loss(struct segment_loss_struct & loss);
00880
00891 float64_t extend_segment_loss(
00892 struct segment_loss_struct & loss, const int32_t * pos_array,
00893 int32_t segment_id, int32_t pos, int32_t& last_pos,
00894 float64_t &last_value);
00895
00904 void find_segment_loss_till_pos(
00905 const int32_t * pos, int32_t t_end, CArray<int32_t>& segment_ids,
00906 CArray<float64_t>& segment_mask, struct segment_loss_struct& loss);
00907
00908
00913
00914 int32_t N;
00915
00917 CArray2<int32_t> transition_matrix_a_id;
00918 CArray2<float64_t> transition_matrix_a;
00919 CArray2<float64_t> transition_matrix_a_deriv;
00920
00922 CArray<float64_t> initial_state_distribution_p;
00923 CArray<float64_t> initial_state_distribution_p_deriv;
00924
00926 CArray<float64_t> end_state_distribution_q;
00927 CArray<float64_t> end_state_distribution_q_deriv;
00928
00930
00932 CArray2<float64_t> dict_weights;
00934 float64_t * dict_weights_array;
00935
00937 int32_t num_degrees;
00939 int32_t num_svms;
00941 int32_t num_strings;
00942
00944 CArray<int32_t> word_degree;
00946 CArray<int32_t> cum_num_words;
00948 int32_t * cum_num_words_array;
00950 CArray<int32_t> num_words;
00952 int32_t * num_words_array;
00954 CArray2<int32_t> mod_words;
00956 int32_t * mod_words_array;
00958 CArray<bool> sign_words;
00960 bool * sign_words_array;
00962 CArray<int32_t> string_words;
00964 int32_t * string_words_array;
00965
00966
00967
00968
00970 CArray<int32_t> svm_pos_start;
00972 CArray<int32_t> num_unique_words;
00974 bool svm_arrays_clean;
00975
00977 int32_t num_svms_single;
00979 int32_t word_degree_single;
00981 int32_t cum_num_words_single;
00983 int32_t num_words_single;
00984
00986 CArray<bool> word_used_single;
00988 CArray<float64_t> svm_value_unnormalized_single;
00990 int32_t num_unique_words_single;
00991
00993 int32_t max_a_id;
00994
00995
00997 int32_t m_step;
00999 int32_t m_call;
01000
01001
01003 CArray3<float64_t> m_seq;
01005 CArray<int32_t> m_pos;
01007 CArray2<int32_t> m_orf_info;
01009 CArray2<float64_t> m_segment_sum_weights;
01011 CArray<CPlifBase*> m_plif_list;
01013 CArray2<CPlifBase*> m_PEN;
01015 CArray2<CPlifBase*> m_PEN_state_signals;
01017 CArray2<char> m_genestr;
01019 CArray2<float64_t> m_dict_weights;
01021 CArray3<float64_t> m_segment_loss;
01023 CArray<int32_t> m_segment_ids;
01025 CArray<float64_t> m_segment_mask;
01027 CArray<int32_t> m_my_state_seq;
01029 CArray<int32_t> m_my_pos_seq;
01031 CArray<float64_t> m_my_scores;
01033 CArray<float64_t> m_my_losses;
01034
01035
01037 CArray<float64_t> m_scores;
01039 CArray2<int32_t> m_states;
01041 CArray2<int32_t> m_positions;
01042
01046 CArray<bool> m_genestr_stop;
01047
01052 CArray2<float64_t> m_precomputed_svm_values;
01053
01055 CArray2<float64_t> m_precomputed_tiling_values;
01056
01058 float64_t* m_raw_intensities;
01060 int32_t* m_probe_pos;
01062 int32_t m_num_probes;
01064 bool m_use_tiling;
01066 int32_t m_genestr_len;
01067 };
01068
01069 inline int32_t CDynProg::raw_intensities_interval_query(
01070 const int32_t from_pos, const int32_t to_pos, float64_t* intensities)
01071 {
01072 ASSERT(from_pos<to_pos);
01073
01074 int32_t num_intensities = 0;
01075 int32_t* p_tiling_pos = m_probe_pos;
01076 float64_t* p_tiling_data = m_raw_intensities;
01077 int32_t last_pos;
01078 int32_t num = 0;
01079 while (*p_tiling_pos<to_pos)
01080 {
01081 if (*p_tiling_pos>=from_pos)
01082 {
01083 intensities[num_intensities] = *p_tiling_data;
01084 num_intensities++;
01085
01086 }
01087 num++;
01088 if (num>=m_num_probes)
01089 break;
01090 last_pos = *p_tiling_pos;
01091 p_tiling_pos++;
01092 p_tiling_data++;
01093 ASSERT(last_pos<*p_tiling_pos);
01094 }
01095 return num_intensities;
01096 }
01097
01098 inline void CDynProg::lookup_content_svm_values(
01099 const int32_t from_state, const int32_t to_state, const int32_t from_pos,
01100 const int32_t to_pos, float64_t* svm_values, int32_t frame)
01101 {
01102
01103
01104
01105 for (int32_t i=0;i<4;i++)
01106 {
01107 float64_t to_val = m_precomputed_svm_values.get_element(i, to_state);
01108 float64_t from_val = m_precomputed_svm_values.get_element(i,from_state);
01109 svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01110 }
01111
01112 if (frame!=-1)
01113 {
01114 svm_values[4] = 1e10;
01115 svm_values[5] = 1e10;
01116 svm_values[6] = 1e10;
01117 int32_t global_frame = from_pos%3;
01118 int32_t row = ((global_frame+frame)%3)+4;
01119
01120 float64_t to_val = m_precomputed_svm_values.get_element(row, to_state);
01121 float64_t from_val = m_precomputed_svm_values.get_element(row,from_state);
01122 svm_values[frame+4] = (to_val-from_val)/(to_pos-from_pos);
01123 }
01124 }
01125
01126 inline void CDynProg::lookup_tiling_plif_values(
01127 const int32_t from_state, const int32_t to_state, const int32_t len,
01128 float64_t* svm_values)
01129 {
01130 ASSERT(from_state<to_state);
01131 ASSERT(len>0);
01132 for (int32_t i=num_svms;i<2*num_svms;i++)
01133 {
01134
01135 svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state));
01136
01137 }
01138 }
01139 #endif