DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef __CDYNPROG_H__
00013 #define __CDYNPROG_H__
00014 
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 #include "base/SGObject.h"
00018 #include "lib/io.h"
00019 #include "lib/config.h"
00020 #include "structure/PlifBase.h"
00021 #include "structure/Plif.h"
00022 #include "features/StringFeatures.h"
00023 #include "distributions/Distribution.h"
00024 #include "lib/DynamicArray.h"
00025 #include "lib/Array.h"
00026 #include "lib/Array2.h"
00027 #include "lib/Array3.h"
00028 #include "lib/Time.h"
00029 
00030 #include <stdio.h>
00031 
00032 //#define DYNPROG_TIMING
00033 
00034 #ifdef USE_BIGSTATES
00035 typedef WORD T_STATES ;
00036 #else
00037 typedef BYTE T_STATES ;
00038 #endif
00039 typedef T_STATES* P_STATES ;
00040 
00045 class CDynProg : public CSGObject
00046 {
00047 private:
00048 
00049     T_STATES trans_list_len;
00050     T_STATES **trans_list_forward;
00051     T_STATES *trans_list_forward_cnt;
00052     DREAL **trans_list_forward_val;
00053     INT **trans_list_forward_id;
00054     bool mem_initialized;
00055 
00056 #ifdef DYNPROG_TIMING
00057     CTime MyTime;
00058     CTime MyTime2;
00059     
00060     DREAL segment_init_time;
00061     DREAL segment_pos_time;
00062     DREAL segment_clean_time;
00063     DREAL segment_extend_time;
00064     DREAL orf_time;
00065     DREAL content_time;
00066     DREAL content_penalty_time;
00067     DREAL svm_init_time;
00068     DREAL svm_pos_time;
00069     DREAL svm_clean_time;
00070 #endif
00071     
00072 public:
00077     CDynProg(INT p_num_svms=8);
00078     ~CDynProg();
00079 
00088     DREAL best_path_no_b(INT max_iter, INT & best_iter, INT *my_path);
00089 
00098     void best_path_no_b_trans(INT max_iter, INT & max_best_iter, SHORT nbest, DREAL *prob_nbest, INT *my_paths);
00099     
00100     // model related functions
00106     void set_num_states(INT p_N);
00107 
00109     INT get_num_states();
00110 
00112     INT get_num_svms();
00113 
00119     void init_content_svm_value_array(const INT seq_len);
00120 
00129     void init_tiling_data(DREAL* probe_pos, DREAL* intensities, const INT num_probes, const INT seq_len);
00130 
00138     void precompute_tiling_plifs(CPlif** PEN, const INT num_penalties, const INT seq_len, const INT* pos);  
00139 
00145     void set_p_vector(DREAL* p, INT N);
00146 
00152     void set_q_vector(DREAL* q, INT N);
00153     
00160     void set_a(DREAL* a, INT M, INT N);
00161     
00168     void set_a_id(INT *a, INT M, INT N);
00169     
00176     void set_a_trans_matrix(DREAL *a_trans, INT num_trans, INT N);
00177 
00178     // content svm related setup functions
00184     void init_svm_arrays(INT p_num_degrees, INT p_num_svms);
00185 
00191     void init_word_degree_array(INT * p_word_degree_array, INT num_elem);
00192 
00198     void init_cum_num_words_array(INT * p_cum_num_words_array, INT num_elem);
00199 
00205     void init_num_words_array(INT * p_num_words_array, INT num_elem);
00206 
00213     void init_mod_words_array(INT * p_mod_words_array, INT num_elem, INT num_columns);
00214 
00220     void init_sign_words_array(bool * p_sign_words_array, INT num_elem);
00221 
00227     void init_string_words_array(INT * p_string_words_array, INT num_elem);
00228 
00234     bool check_svm_arrays();
00235 
00236     // best_path_trans preparation functions
00243     void best_path_set_seq(DREAL *seq, INT N, INT seq_len);
00244 
00252     void best_path_set_seq3d(DREAL *seq, INT p_N, INT seq_len, INT max_num_signals);
00253 
00259     void best_path_set_pos(INT *pos, INT seq_len);
00260 
00268     void best_path_set_orf_info(INT *orf_info, INT m, INT n);
00269 
00277     void best_path_set_segment_sum_weights(DREAL *segment_sum_weights, INT num_states, INT seq_len);
00278 
00283     void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00284 
00291     void best_path_set_plif_id_matrix(INT *plif_id_matrix, INT m, INT n);
00292 
00299     void best_path_set_plif_state_signal_matrix(INT *plif_id_matrix, INT m, INT n);
00300 
00307     void best_path_set_genestr(CHAR* genestr, INT genestr_len, INT genestr_num);
00308 
00309     // additional best_path_trans_deriv functions
00315     void best_path_set_my_state_seq(INT* my_state_seq, INT seq_len);
00316 
00322     void best_path_set_my_pos_seq(INT* my_pos_seq, INT seq_len);
00323 
00329     inline void best_path_set_single_genestr(CHAR* genestr, INT genestr_len)
00330     {
00331         SG_DEBUG("genestrpy: %d", genestr_len);
00332         best_path_set_genestr(genestr, genestr_len, 1);
00333     }
00334 
00341     void best_path_set_dict_weights(DREAL* dictionary_weights, INT dict_len, INT n);
00342 
00349     void best_path_set_segment_loss(DREAL * segment_loss, INT num_segment_id1, INT num_segment_id2);
00350 
00357     void best_path_set_segment_ids_mask(INT* segment_ids, DREAL* segment_mask, INT m);
00358 
00359     // best_path functions
00365     void best_path_call(INT nbest, bool use_orf);
00366 
00368     void best_path_deriv_call();
00369 
00374     void best_path_2struct_call(INT nbest);
00375 
00380     void best_path_simple_call(INT nbest);
00381 
00386     void best_path_deriv_call(INT nbest);
00387     
00388     // best_path result retrieval functions
00394     void best_path_get_scores(DREAL **scores, INT *n);
00395 
00402     void best_path_get_states(INT **states, INT *m, INT *n);
00403 
00410     void best_path_get_positions(INT **positions, INT *m, INT *n);
00411 
00412     //best_path_trans_deriv result retrieval functions
00418     void best_path_get_losses(DREAL** my_losses, INT* seq_len);
00419 
00421 
00437     template <short int nbest, bool with_loss, bool with_multiple_sequences>
00438     void best_path_trans(const DREAL *seq, INT seq_len, const INT *pos,
00439                          const INT *orf_info, CPlifBase **PLif_matrix,
00440                          CPlifBase **Plif_state_signals, INT max_num_signals,
00441                          INT genestr_num,
00442                          DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00443                          bool use_orf);
00444 
00460     void best_path_trans_deriv(INT *my_state_seq, INT *my_pos_seq, DREAL *my_scores, DREAL* my_losses, INT my_seq_len,
00461                     const DREAL *seq_array, INT seq_len, const INT *pos, CPlifBase **Plif_matrix,
00462                     CPlifBase **Plif_state_signals, INT max_num_signals, INT genestr_num);
00463     
00480     void best_path_2struct(const DREAL *seq, INT seq_len, const INT *pos,
00481                            CPlifBase **Plif_matrix,
00482                            const char *genestr, INT genestr_len,
00483                            SHORT nbest,
00484                            DREAL *prob_nbest, INT *my_state_seq, INT *my_pos_seq,
00485                            DREAL *dictionary_weights, INT dict_len, DREAL *segment_sum_weights);
00486 
00495     void best_path_trans_simple(const DREAL *seq, INT seq_len, SHORT nbest,
00496                                 DREAL *prob_nbest, INT *my_state_seq);
00497 
00498 
00499 
00501     inline T_STATES get_N() const
00502       {
00503         return N ;
00504       }
00505     
00510     inline void set_q(T_STATES offset, DREAL value)
00511     {
00512         end_state_distribution_q[offset]=value;
00513     }
00514 
00519     inline void set_p(T_STATES offset, DREAL value)
00520     {
00521         initial_state_distribution_p[offset]=value;
00522     }
00523 
00530     inline void set_a(T_STATES line_, T_STATES column, DREAL value)
00531     {
00532       transition_matrix_a.element(line_,column)=value; // look also best_path!
00533     }
00534 
00540     inline DREAL get_q(T_STATES offset) const
00541     {
00542         return end_state_distribution_q[offset];
00543     }
00544 
00550     inline DREAL get_q_deriv(T_STATES offset) const
00551     {
00552         return end_state_distribution_q_deriv[offset];
00553     }
00554 
00560     inline DREAL get_p(T_STATES offset) const
00561     {
00562         return initial_state_distribution_p[offset];
00563     }
00564 
00570     inline DREAL get_p_deriv(T_STATES offset) const
00571     {
00572         return initial_state_distribution_p_deriv[offset];
00573     }
00574     
00585     void precompute_content_values(WORD*** wordstr, const INT *pos,
00586         const INT num_cand_pos, const INT genestr_len,
00587         DREAL *dictionary_weights, INT dict_len);
00588 
00597     void create_word_string(const CHAR* genestr, INT genestr_num, INT genestr_len, WORD*** wordstr);
00598 
00604     void precompute_stop_codons(const CHAR* genestr, INT genestr_len);
00605 
00611     void set_genestr_len(INT genestr_len);
00612 
00619     inline DREAL get_a(T_STATES line_, T_STATES column) const
00620     {
00621       return transition_matrix_a.element(line_,column); // look also best_path()!
00622     }
00623 
00630     inline DREAL get_a_deriv(T_STATES line_, T_STATES column) const
00631     {
00632       return transition_matrix_a_deriv.element(line_,column); // look also best_path()!
00633     }
00635 protected:
00636 
00637     /* helper functions */
00638 
00648     inline void lookup_content_svm_values(const INT from_state,
00649         const INT to_state, const INT from_pos, const INT to_pos,
00650         DREAL* svm_values, INT frame);
00651 
00659     inline void lookup_tiling_plif_values(const INT from_state,
00660         const INT to_state, const INT len, DREAL* svm_values);
00661 
00666     inline INT find_frame(const INT from_state);
00667 
00675     inline INT raw_intensities_interval_query(
00676         const INT from_pos, const INT to_pos, DREAL* intensities);
00677 
00686     void translate_from_single_order(WORD* obs, INT sequence_length, INT start,
00687         INT order, INT max_val=2);
00688 
00695     void reset_svm_value(INT pos, INT & last_svm_pos, DREAL * svm_value);
00696 
00704     void extend_svm_value(WORD* wordstr, INT pos, INT &last_svm_pos,
00705         DREAL* svm_value);
00706 
00714     void reset_segment_sum_value(INT num_states, INT pos,
00715         INT & last_segment_sum_pos, DREAL * segment_sum_value);
00716 
00726     void extend_segment_sum_value(DREAL *segment_sum_weights, INT seqlen,
00727         INT num_states, INT pos, INT &last_segment_sum_pos,
00728         DREAL* segment_sum_value);
00729 
00731     struct svm_values_struct
00732     {
00734         INT maxlookback;
00736         INT seqlen;
00737 
00739         INT* start_pos;
00741         DREAL ** svm_values_unnormalized;
00743         DREAL * svm_values;
00745         bool *** word_used;
00747         INT **num_unique_words;
00748     };
00749 
00750     //void reset_svm_values(INT pos, INT * last_svm_pos, DREAL * svm_value) ;
00751     //void extend_svm_values(WORD** wordstr, INT pos, INT *last_svm_pos, DREAL* svm_value) ;
00759     void init_svm_values(struct svm_values_struct & svs, INT start_pos,
00760         INT seqlen, INT howmuchlookback);
00761 
00766     void clear_svm_values(struct svm_values_struct & svs);
00767 
00775     void find_svm_values_till_pos(WORD*** wordstr, const INT *pos, INT t_end,
00776         struct svm_values_struct &svs);
00777 
00785     void find_svm_values_till_pos(WORD** wordstr, const INT *pos, INT t_end,
00786         struct svm_values_struct &svs);
00787 
00796     void update_svm_values_till_pos(WORD*** wordstr, const INT *pos, INT t_end,
00797         INT prev_t_end, struct svm_values_struct &svs);
00798 
00807     bool extend_orf(INT orf_from, INT orf_to, INT start, INT &last_pos, INT to);
00808 
00810     struct segment_loss_struct
00811     {
00813         INT maxlookback;
00815         INT seqlen;
00817         INT *segments_changed;
00819         DREAL *num_segment_id;
00821         INT *length_segment_id ;
00822     };
00823 
00830     void init_segment_loss(struct segment_loss_struct & loss, INT seqlen,
00831         INT howmuchlookback);
00832 
00837     void clear_segment_loss(struct segment_loss_struct & loss);
00838 
00849     DREAL extend_segment_loss(struct segment_loss_struct & loss,
00850         const INT * pos_array, INT segment_id, INT pos, INT& last_pos,
00851         DREAL &last_value);
00852 
00861     void find_segment_loss_till_pos(const INT * pos, INT t_end,
00862         CArray<INT>& segment_ids, CArray<DREAL>& segment_mask,
00863         struct segment_loss_struct& loss);
00864 
00865     
00870 
00871     INT N;
00872 
00874     CArray2<INT> transition_matrix_a_id;
00875     CArray2<DREAL> transition_matrix_a;
00876     CArray2<DREAL> transition_matrix_a_deriv;
00877 
00879     CArray<DREAL> initial_state_distribution_p;
00880     CArray<DREAL> initial_state_distribution_p_deriv;
00881 
00883     CArray<DREAL> end_state_distribution_q;
00884     CArray<DREAL> end_state_distribution_q_deriv;
00885 
00887     
00889     CArray2<DREAL> dict_weights;
00891     DREAL * dict_weights_array;
00892 
00894     INT num_degrees;
00896     INT num_svms;
00898     INT num_strings;
00899     
00901     CArray<INT> word_degree;
00903     CArray<INT> cum_num_words;
00905     INT * cum_num_words_array;
00907     CArray<INT> num_words;
00909     INT * num_words_array;
00911     CArray2<INT> mod_words;
00913     INT * mod_words_array;
00915     CArray<bool> sign_words;
00917     bool * sign_words_array;
00919     CArray<INT> string_words;
00921     INT * string_words_array;
00922 
00923 //  CArray3<INT> word_used ;
00924 //  INT *word_used_array ;
00925 //  CArray2<DREAL> svm_values_unnormalized ;
00927     CArray<INT> svm_pos_start;
00929     CArray<INT> num_unique_words;
00931     bool svm_arrays_clean;
00932 
00934     INT num_svms_single;
00936     INT word_degree_single;
00938     INT cum_num_words_single;
00940     INT num_words_single;
00941 
00943     CArray<bool> word_used_single;
00945     CArray<DREAL> svm_value_unnormalized_single;
00947     INT num_unique_words_single;
00948 
00950     INT max_a_id;
00951     
00952     // control info
00954     INT m_step;
00956     INT m_call;
00957 
00958     // input arguments
00960     CArray3<DREAL> m_seq;
00962     CArray<INT> m_pos;
00964     CArray2<INT> m_orf_info;
00966     CArray2<DREAL> m_segment_sum_weights;
00968     CArray<CPlifBase*> m_plif_list;
00970     CArray2<CPlifBase*> m_PEN;
00972     CArray2<CPlifBase*> m_PEN_state_signals;
00974     CArray2<CHAR> m_genestr;
00976     CArray2<DREAL> m_dict_weights;
00978     CArray3<DREAL> m_segment_loss;
00980     CArray<INT> m_segment_ids;
00982     CArray<DREAL> m_segment_mask;
00984     CArray<INT> m_my_state_seq;
00986     CArray<INT> m_my_pos_seq;
00988     CArray<DREAL> m_my_scores;
00990     CArray<DREAL> m_my_losses;
00991 
00992     // output arguments
00994     CArray<DREAL> m_scores;
00996     CArray2<INT> m_states;
00998     CArray2<INT> m_positions;
00999 
01003     CArray<bool> m_genestr_stop;
01004 
01009     CArray2<DREAL> m_precomputed_svm_values;
01010 
01012     CArray2<DREAL> m_precomputed_tiling_values;
01013 
01015     DREAL* m_raw_intensities;
01017     INT* m_probe_pos;
01019     INT m_num_probes;
01021     bool m_use_tiling;
01023     INT m_genestr_len;
01024 };
01025 
01026 inline INT CDynProg::raw_intensities_interval_query(const INT from_pos, const INT to_pos, DREAL* intensities)
01027 {
01028     ASSERT(from_pos<to_pos);
01029     //SG_PRINT("m_num_probes:%i, m_raw_intensities[1]:%f, m_probe_pos[1]:%i \n",m_num_probes, m_raw_intensities[10], m_probe_pos[10]);
01030     INT num_intensities = 0;
01031     INT* p_tiling_pos  = m_probe_pos;
01032     DREAL* p_tiling_data = m_raw_intensities;
01033     INT last_pos;
01034     INT num = 0;
01035     while (*p_tiling_pos<to_pos)
01036     {
01037         if (*p_tiling_pos>=from_pos)
01038         {
01039             intensities[num_intensities] = *p_tiling_data;
01040             num_intensities++;
01041             //SG_PRINT("*p_tiling_data:%f, *p_tiling_pos:%i\n",*p_tiling_data,*p_tiling_pos);
01042         }
01043         num++;
01044         if (num>=m_num_probes)
01045             break;
01046         last_pos = *p_tiling_pos;
01047         p_tiling_pos++;
01048         p_tiling_data++;
01049         ASSERT(last_pos<*p_tiling_pos);
01050     }
01051     return num_intensities;
01052 }
01053 inline void CDynProg::lookup_content_svm_values(const INT from_state, const INT to_state, const INT from_pos, const INT to_pos, DREAL* svm_values, INT frame)
01054 {
01055 //  ASSERT(from_state<to_state);
01056 //  if (!(from_pos<to_pos))
01057 //      SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos);
01058     for (INT i=0;i<4;i++)
01059     {
01060         DREAL to_val   = m_precomputed_svm_values.get_element(i,  to_state);
01061         DREAL from_val = m_precomputed_svm_values.get_element(i,from_state);
01062         svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01063     }
01064     // find the correct row with precomputed 
01065     /*if (frame!=-1)
01066     {
01067         svm_values[4] = 1e10;
01068         svm_values[5] = 1e10;
01069         svm_values[6] = 1e10;
01070         INT global_frame = from_pos%3;
01071             INT row = ((global_frame+frame)%3)+4;
01072         DREAL to_val   = m_precomputed_svm_values.get_element(row,  to_state);
01073         DREAL from_val = m_precomputed_svm_values.get_element(row,from_state);
01074         svm_values[frame+4] = (to_val-from_val)/(to_pos-from_pos);
01075     }*/
01076 }
01077 inline void CDynProg::lookup_tiling_plif_values(const INT from_state, const INT to_state, const INT len, DREAL* svm_values)
01078 {
01079     ASSERT(from_state<to_state);
01080     ASSERT(len>0);
01081     for (INT i=num_svms;i<2*num_svms;i++)
01082     {
01083         //svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state))/len;
01084         svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state));
01085         //svm_values[i]=0.0;
01086     }
01087 }
01088 #endif

SHOGUN Machine Learning Toolbox - Documentation