DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef __CDYNPROG_H__
00013 #define __CDYNPROG_H__
00014 
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 #include "base/SGObject.h"
00018 #include "lib/io.h"
00019 #include "lib/config.h"
00020 #include "structure/PlifBase.h"
00021 #include "structure/Plif.h"
00022 #include "features/StringFeatures.h"
00023 #include "distributions/Distribution.h"
00024 #include "lib/DynamicArray.h"
00025 #include "lib/Array.h"
00026 #include "lib/Array2.h"
00027 #include "lib/Array3.h"
00028 #include "lib/Time.h"
00029 
00030 #include <stdio.h>
00031 
00032 //#define DYNPROG_TIMING
00033 
00034 #ifdef USE_BIGSTATES
00035 typedef uint16_t T_STATES ;
00036 #else
00037 typedef uint8_t T_STATES ;
00038 #endif
00039 typedef T_STATES* P_STATES ;
00040 
00045 class CDynProg : public CSGObject
00046 {
00047 private:
00048 
00049     T_STATES trans_list_len;
00050     T_STATES **trans_list_forward;
00051     T_STATES *trans_list_forward_cnt;
00052     float64_t **trans_list_forward_val;
00053     int32_t **trans_list_forward_id;
00054     bool mem_initialized;
00055 
00056 #ifdef DYNPROG_TIMING
00057     CTime MyTime;
00058     CTime MyTime2;
00059     
00060     float64_t segment_init_time;
00061     float64_t segment_pos_time;
00062     float64_t segment_clean_time;
00063     float64_t segment_extend_time;
00064     float64_t orf_time;
00065     float64_t content_time;
00066     float64_t content_penalty_time;
00067     float64_t svm_init_time;
00068     float64_t svm_pos_time;
00069     float64_t svm_clean_time;
00070 #endif
00071     
00072 public:
00077     CDynProg(int32_t p_num_svms=8);
00078     ~CDynProg();
00079 
00088     float64_t best_path_no_b(
00089         int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00090 
00099     void best_path_no_b_trans(
00100         int32_t max_iter, int32_t & max_best_iter, int16_t nbest,
00101         float64_t *prob_nbest, int32_t *my_paths);
00102     
00103     // model related functions
00109     void set_num_states(int32_t p_N);
00110 
00112     int32_t get_num_states();
00113 
00115     int32_t get_num_svms();
00116 
00122     void init_content_svm_value_array(const int32_t seq_len);
00123 
00132     void init_tiling_data(
00133         int32_t* probe_pos, float64_t* intensities, const int32_t num_probes,
00134         const int32_t seq_len);
00135 
00144     void precompute_tiling_plifs(
00145         CPlif** PEN, const int32_t* tiling_plif_ids,
00146         const int32_t num_tiling_plifs, const int32_t seq_len,
00147         const int32_t* pos);
00148 
00154     void set_p_vector(float64_t* p, int32_t N);
00155 
00161     void set_q_vector(float64_t* q, int32_t N);
00162     
00169     void set_a(float64_t* a, int32_t M, int32_t N);
00170     
00177     void set_a_id(int32_t *a, int32_t M, int32_t N);
00178     
00185     void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00186 
00187     // content svm related setup functions
00193     void init_svm_arrays(int32_t p_num_degrees, int32_t p_num_svms);
00194 
00200     void init_word_degree_array(int32_t * p_word_degree_array, int32_t num_elem);
00201 
00207     void init_cum_num_words_array(
00208         int32_t * p_cum_num_words_array, int32_t num_elem);
00209 
00215     void init_num_words_array(int32_t * p_num_words_array, int32_t num_elem);
00216 
00223     void init_mod_words_array(
00224         int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00225 
00231     void init_sign_words_array(bool * p_sign_words_array, int32_t num_elem);
00232 
00238     void init_string_words_array(
00239         int32_t * p_string_words_array, int32_t num_elem);
00240 
00246     bool check_svm_arrays();
00247 
00248     // best_path_trans preparation functions
00255     void best_path_set_seq(float64_t *seq, int32_t N, int32_t seq_len);
00256 
00264     void best_path_set_seq3d(
00265         float64_t *seq, int32_t p_N, int32_t seq_len, int32_t max_num_signals);
00266 
00272     void best_path_set_pos(int32_t *pos, int32_t seq_len);
00273 
00281     void best_path_set_orf_info(int32_t *orf_info, int32_t m, int32_t n);
00282 
00290     void best_path_set_segment_sum_weights(
00291         float64_t *segment_sum_weights, int32_t num_states, int32_t seq_len);
00292 
00297     void best_path_set_plif_list(CDynamicArray<CPlifBase*>* plifs);
00298 
00305     void best_path_set_plif_id_matrix(
00306         int32_t *plif_id_matrix, int32_t m, int32_t n);
00307 
00314     void best_path_set_plif_state_signal_matrix(
00315         int32_t *plif_id_matrix, int32_t m, int32_t n);
00316 
00323     void best_path_set_genestr(
00324         char* genestr, int32_t genestr_len, int32_t genestr_num);
00325 
00326     // additional best_path_trans_deriv functions
00332     void best_path_set_my_state_seq(int32_t* my_state_seq, int32_t seq_len);
00333 
00339     void best_path_set_my_pos_seq(int32_t* my_pos_seq, int32_t seq_len);
00340 
00346     inline void best_path_set_single_genestr(char* genestr, int32_t genestr_len)
00347     {
00348         SG_DEBUG("genestrpy: %d", genestr_len);
00349         best_path_set_genestr(genestr, genestr_len, 1);
00350     }
00351 
00358     void best_path_set_dict_weights(
00359         float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00360 
00367     void best_path_set_segment_loss(
00368         float64_t * segment_loss, int32_t num_segment_id1,
00369         int32_t num_segment_id2);
00370 
00377     void best_path_set_segment_ids_mask(
00378         int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00379 
00380     // best_path functions
00386     void best_path_call(int32_t nbest, bool use_orf);
00387 
00389     void best_path_deriv_call();
00390 
00395     void best_path_2struct_call(int32_t nbest);
00396 
00401     void best_path_simple_call(int32_t nbest);
00402 
00407     void best_path_deriv_call(int32_t nbest);
00408     
00409     // best_path result retrieval functions
00415     void best_path_get_scores(float64_t **scores, int32_t *n);
00416 
00423     void best_path_get_states(int32_t **states, int32_t *m, int32_t *n);
00424 
00431     void best_path_get_positions(int32_t **positions, int32_t *m, int32_t *n);
00432 
00433     //best_path_trans_deriv result retrieval functions
00439     void best_path_get_losses(float64_t** my_losses, int32_t* seq_len);
00440 
00442 
00458     template <int16_t nbest, bool with_loss, bool with_multiple_sequences>
00459     void best_path_trans(
00460         const float64_t *seq, int32_t seq_len, const int32_t *pos,
00461         const int32_t *orf_info, CPlifBase **PLif_matrix,
00462         CPlifBase **Plif_state_signals, int32_t max_num_signals,
00463         int32_t genestr_num, float64_t *prob_nbest, int32_t *my_state_seq,
00464         int32_t *my_pos_seq, bool use_orf);
00465 
00481     void best_path_trans_deriv(
00482         int32_t *my_state_seq, int32_t *my_pos_seq, float64_t *my_scores,
00483         float64_t* my_losses, int32_t my_seq_len, const float64_t *seq_array,
00484         int32_t seq_len, const int32_t *pos, CPlifBase **Plif_matrix,
00485         CPlifBase **Plif_state_signals, int32_t max_num_signals,
00486         int32_t genestr_num);
00487     
00504     void best_path_2struct(
00505         const float64_t *seq, int32_t seq_len, const int32_t *pos,
00506         CPlifBase **Plif_matrix, const char *genestr, int32_t genestr_len,
00507         int16_t nbest, float64_t *prob_nbest, int32_t *my_state_seq,
00508         int32_t *my_pos_seq, float64_t *dictionary_weights, int32_t dict_len,
00509         float64_t *segment_sum_weights);
00510 
00519     void best_path_trans_simple(
00520         const float64_t *seq, int32_t seq_len, int16_t nbest,
00521         float64_t *prob_nbest, int32_t *my_state_seq);
00522 
00523 
00524 
00526     inline T_STATES get_N() const
00527       {
00528         return N ;
00529       }
00530     
00535     inline void set_q(T_STATES offset, float64_t value)
00536     {
00537         end_state_distribution_q[offset]=value;
00538     }
00539 
00544     inline void set_p(T_STATES offset, float64_t value)
00545     {
00546         initial_state_distribution_p[offset]=value;
00547     }
00548 
00555     inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00556     {
00557       transition_matrix_a.element(line_,column)=value; // look also best_path!
00558     }
00559 
00565     inline float64_t get_q(T_STATES offset) const
00566     {
00567         return end_state_distribution_q[offset];
00568     }
00569 
00575     inline float64_t get_q_deriv(T_STATES offset) const
00576     {
00577         return end_state_distribution_q_deriv[offset];
00578     }
00579 
00585     inline float64_t get_p(T_STATES offset) const
00586     {
00587         return initial_state_distribution_p[offset];
00588     }
00589 
00595     inline float64_t get_p_deriv(T_STATES offset) const
00596     {
00597         return initial_state_distribution_p_deriv[offset];
00598     }
00599     
00610     void precompute_content_values(
00611         uint16_t*** wordstr, const int32_t *pos, const int32_t num_cand_pos,
00612         const int32_t genestr_len, float64_t *dictionary_weights,
00613         int32_t dict_len);
00614 
00623     void create_word_string(
00624         const char* genestr, int32_t genestr_num, int32_t genestr_len,
00625         uint16_t*** wordstr);
00626 
00632     void precompute_stop_codons(const char* genestr, int32_t genestr_len);
00633 
00639     void set_genestr_len(int32_t genestr_len);
00640 
00647     inline float64_t get_a(T_STATES line_, T_STATES column) const
00648     {
00649       return transition_matrix_a.element(line_,column); // look also best_path()!
00650     }
00651 
00658     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00659     {
00660       return transition_matrix_a_deriv.element(line_,column); // look also best_path()!
00661     }
00663 protected:
00664 
00665     /* helper functions */
00666 
00676     inline void lookup_content_svm_values(
00677         const int32_t from_state, const int32_t to_state,
00678         const int32_t from_pos, const int32_t to_pos, float64_t* svm_values,
00679         int32_t frame);
00680 
00688     inline void lookup_tiling_plif_values(
00689         const int32_t from_state, const int32_t to_state, const int32_t len,
00690         float64_t* svm_values);
00691 
00696     inline int32_t find_frame(const int32_t from_state);
00697 
00705     inline int32_t raw_intensities_interval_query(
00706         const int32_t from_pos, const int32_t to_pos, float64_t* intensities);
00707 
00716     void translate_from_single_order(
00717         uint16_t* obs, int32_t sequence_length, int32_t start, int32_t order,
00718         int32_t max_val=2);
00719 
00726     void reset_svm_value(
00727         int32_t pos, int32_t & last_svm_pos, float64_t * svm_value);
00728 
00736     void extend_svm_value(
00737         uint16_t* wordstr, int32_t pos, int32_t &last_svm_pos,
00738         float64_t* svm_value);
00739 
00747     void reset_segment_sum_value(
00748         int32_t num_states, int32_t pos, int32_t & last_segment_sum_pos,
00749         float64_t * segment_sum_value);
00750 
00760     void extend_segment_sum_value(
00761         float64_t *segment_sum_weights, int32_t seqlen, int32_t num_states,
00762         int32_t pos, int32_t &last_segment_sum_pos,
00763         float64_t* segment_sum_value);
00764 
00766     struct svm_values_struct
00767     {
00769         int32_t maxlookback;
00771         int32_t seqlen;
00772 
00774         int32_t* start_pos;
00776         float64_t ** svm_values_unnormalized;
00778         float64_t * svm_values;
00780         bool *** word_used;
00782         int32_t **num_unique_words;
00783     };
00784 
00785     //void reset_svm_values(int32_t pos, int32_t * last_svm_pos, float64_t * svm_value) ;
00786     //void extend_svm_values(uint16_t** wordstr, int32_t pos, int32_t *last_svm_pos, float64_t* svm_value) ;
00794     void init_svm_values(
00795         struct svm_values_struct & svs, int32_t start_pos, int32_t seqlen,
00796         int32_t howmuchlookback);
00797 
00802     void clear_svm_values(struct svm_values_struct & svs);
00803 
00811     void find_svm_values_till_pos(
00812         uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00813         struct svm_values_struct &svs);
00814 
00822     void find_svm_values_till_pos(
00823         uint16_t** wordstr, const int32_t *pos, int32_t t_end,
00824         struct svm_values_struct &svs);
00825 
00834     void update_svm_values_till_pos(
00835         uint16_t*** wordstr, const int32_t *pos, int32_t t_end,
00836         int32_t prev_t_end, struct svm_values_struct &svs);
00837 
00846     bool extend_orf(
00847         int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
00848         int32_t to);
00849 
00851     struct segment_loss_struct
00852     {
00854         int32_t maxlookback;
00856         int32_t seqlen;
00858         int32_t *segments_changed;
00860         float64_t *num_segment_id;
00862         int32_t *length_segment_id ;
00863     };
00864 
00871     void init_segment_loss(
00872         struct segment_loss_struct & loss, int32_t seqlen,
00873         int32_t howmuchlookback);
00874 
00879     void clear_segment_loss(struct segment_loss_struct & loss);
00880 
00891     float64_t extend_segment_loss(
00892         struct segment_loss_struct & loss, const int32_t * pos_array,
00893         int32_t segment_id, int32_t pos, int32_t& last_pos,
00894         float64_t &last_value);
00895 
00904     void find_segment_loss_till_pos(
00905         const int32_t * pos, int32_t t_end, CArray<int32_t>& segment_ids,
00906         CArray<float64_t>& segment_mask, struct segment_loss_struct& loss);
00907 
00908     
00913 
00914     int32_t N;
00915 
00917     CArray2<int32_t> transition_matrix_a_id;
00918     CArray2<float64_t> transition_matrix_a;
00919     CArray2<float64_t> transition_matrix_a_deriv;
00920 
00922     CArray<float64_t> initial_state_distribution_p;
00923     CArray<float64_t> initial_state_distribution_p_deriv;
00924 
00926     CArray<float64_t> end_state_distribution_q;
00927     CArray<float64_t> end_state_distribution_q_deriv;
00928 
00930     
00932     CArray2<float64_t> dict_weights;
00934     float64_t * dict_weights_array;
00935 
00937     int32_t num_degrees;
00939     int32_t num_svms;
00941     int32_t num_strings;
00942     
00944     CArray<int32_t> word_degree;
00946     CArray<int32_t> cum_num_words;
00948     int32_t * cum_num_words_array;
00950     CArray<int32_t> num_words;
00952     int32_t * num_words_array;
00954     CArray2<int32_t> mod_words;
00956     int32_t * mod_words_array;
00958     CArray<bool> sign_words;
00960     bool * sign_words_array;
00962     CArray<int32_t> string_words;
00964     int32_t * string_words_array;
00965 
00966 //  CArray3<int32_t> word_used ;
00967 //  int32_t *word_used_array ;
00968 //  CArray2<float64_t> svm_values_unnormalized ;
00970     CArray<int32_t> svm_pos_start;
00972     CArray<int32_t> num_unique_words;
00974     bool svm_arrays_clean;
00975 
00977     int32_t num_svms_single;
00979     int32_t word_degree_single;
00981     int32_t cum_num_words_single;
00983     int32_t num_words_single;
00984 
00986     CArray<bool> word_used_single;
00988     CArray<float64_t> svm_value_unnormalized_single;
00990     int32_t num_unique_words_single;
00991 
00993     int32_t max_a_id;
00994     
00995     // control info
00997     int32_t m_step;
00999     int32_t m_call;
01000 
01001     // input arguments
01003     CArray3<float64_t> m_seq;
01005     CArray<int32_t> m_pos;
01007     CArray2<int32_t> m_orf_info;
01009     CArray2<float64_t> m_segment_sum_weights;
01011     CArray<CPlifBase*> m_plif_list;
01013     CArray2<CPlifBase*> m_PEN;
01015     CArray2<CPlifBase*> m_PEN_state_signals;
01017     CArray2<char> m_genestr;
01019     CArray2<float64_t> m_dict_weights;
01021     CArray3<float64_t> m_segment_loss;
01023     CArray<int32_t> m_segment_ids;
01025     CArray<float64_t> m_segment_mask;
01027     CArray<int32_t> m_my_state_seq;
01029     CArray<int32_t> m_my_pos_seq;
01031     CArray<float64_t> m_my_scores;
01033     CArray<float64_t> m_my_losses;
01034 
01035     // output arguments
01037     CArray<float64_t> m_scores;
01039     CArray2<int32_t> m_states;
01041     CArray2<int32_t> m_positions;
01042 
01046     CArray<bool> m_genestr_stop;
01047 
01052     CArray2<float64_t> m_precomputed_svm_values;
01053 
01055     CArray2<float64_t> m_precomputed_tiling_values;
01056 
01058     float64_t* m_raw_intensities;
01060     int32_t* m_probe_pos;
01062     int32_t m_num_probes;
01064     bool m_use_tiling;
01066     int32_t m_genestr_len;
01067 };
01068 
01069 inline int32_t CDynProg::raw_intensities_interval_query(
01070     const int32_t from_pos, const int32_t to_pos, float64_t* intensities)
01071 {
01072     ASSERT(from_pos<to_pos);
01073     //SG_PRINT("m_num_probes:%i, m_raw_intensities[1]:%f, m_probe_pos[1]:%i \n",m_num_probes, m_raw_intensities[10], m_probe_pos[10]);
01074     int32_t num_intensities = 0;
01075     int32_t* p_tiling_pos  = m_probe_pos;
01076     float64_t* p_tiling_data = m_raw_intensities;
01077     int32_t last_pos;
01078     int32_t num = 0;
01079     while (*p_tiling_pos<to_pos)
01080     {
01081         if (*p_tiling_pos>=from_pos)
01082         {
01083             intensities[num_intensities] = *p_tiling_data;
01084             num_intensities++;
01085             //SG_PRINT("*p_tiling_data:%f, *p_tiling_pos:%i\n",*p_tiling_data,*p_tiling_pos);
01086         }
01087         num++;
01088         if (num>=m_num_probes)
01089             break;
01090         last_pos = *p_tiling_pos;
01091         p_tiling_pos++;
01092         p_tiling_data++;
01093         ASSERT(last_pos<*p_tiling_pos);
01094     }
01095     return num_intensities;
01096 }
01097 
01098 inline void CDynProg::lookup_content_svm_values(
01099     const int32_t from_state, const int32_t to_state, const int32_t from_pos,
01100     const int32_t to_pos, float64_t* svm_values, int32_t frame)
01101 {
01102 //  ASSERT(from_state<to_state);
01103 //  if (!(from_pos<to_pos))
01104 //      SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos);
01105     for (int32_t i=0;i<4;i++)
01106     {
01107         float64_t to_val   = m_precomputed_svm_values.get_element(i,  to_state);
01108         float64_t from_val = m_precomputed_svm_values.get_element(i,from_state);
01109         svm_values[i]=(to_val-from_val)/(to_pos-from_pos);
01110     }
01111     // find the correct row with precomputed 
01112     if (frame!=-1)
01113     {
01114         svm_values[4] = 1e10;
01115         svm_values[5] = 1e10;
01116         svm_values[6] = 1e10;
01117         int32_t global_frame = from_pos%3;
01118             int32_t row = ((global_frame+frame)%3)+4;
01119         //SG_PRINT("global_frame:%i row:%i frame:%i \n", global_frame, row, frame);
01120         float64_t to_val   = m_precomputed_svm_values.get_element(row,  to_state);
01121         float64_t from_val = m_precomputed_svm_values.get_element(row,from_state);
01122         svm_values[frame+4] = (to_val-from_val)/(to_pos-from_pos);
01123     }
01124 }
01125 
01126 inline void CDynProg::lookup_tiling_plif_values(
01127     const int32_t from_state, const int32_t to_state, const int32_t len,
01128     float64_t* svm_values)
01129 {
01130     ASSERT(from_state<to_state);
01131     ASSERT(len>0);
01132     for (int32_t i=num_svms;i<2*num_svms;i++)
01133     {
01134         //svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state))/len;
01135         svm_values[i]=(m_precomputed_tiling_values.get_element(i-num_svms,to_state)-m_precomputed_tiling_values.get_element(i-num_svms,from_state));
01136         //svm_values[i]=0.0;
01137     }
01138 }
01139 #endif

SHOGUN Machine Learning Toolbox - Documentation