WeightedDegreePositionStringKernel.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00014 
00015 #include "lib/common.h"
00016 #include "kernel/StringKernel.h"
00017 #include "lib/Trie.h"
00018 
00019 class CSVM ;
00020 
00022 class CWeightedDegreePositionStringKernel: public CStringKernel<CHAR>
00023 {
00024     public:
00033         CWeightedDegreePositionStringKernel(INT size, INT degree,
00034             INT max_mismatch=0, bool use_norm=true, INT mkl_stepsize=1);
00035 
00047         CWeightedDegreePositionStringKernel(INT size, DREAL* weights,
00048             INT degree, INT max_mismatch, INT* shift, INT shift_len,
00049             bool use_norm=true, INT mkl_stepsize=1);
00050 
00057         CWeightedDegreePositionStringKernel(
00058             CStringFeatures<CHAR>* l, CStringFeatures<CHAR>* r,
00059             INT degree);
00060 
00061         virtual ~CWeightedDegreePositionStringKernel();
00062 
00069         virtual bool init(CFeatures* l, CFeatures* r);
00070 
00072         virtual void cleanup();
00073 
00079         bool load_init(FILE* src);
00080 
00086         bool save_init(FILE* dest);
00087 
00092         virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREEPOS; }
00093 
00098         virtual const CHAR* get_name() { return "WeightedDegreePos" ; } ;
00099 
00107         inline virtual bool init_optimization(INT p_count, INT *IDX, DREAL * alphas)
00108         { 
00109             return init_optimization(p_count, IDX, alphas, -1);
00110         }
00111 
00123         virtual bool init_optimization(INT count, INT *IDX, DREAL * alphas,
00124             INT tree_num, INT upto_tree=-1);
00125 
00130         virtual bool delete_optimization();
00131 
00137         inline virtual DREAL compute_optimized(INT idx)
00138         { 
00139             ASSERT(get_is_initialized());
00140             ASSERT(alphabet);
00141             ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00142             return compute_by_tree(idx);
00143         }
00144 
00149         static void* compute_batch_helper(void* p);
00150 
00161         virtual void compute_batch(INT num_vec, INT* vec_idx, DREAL* target,
00162             INT num_suppvec, INT* IDX, DREAL* alphas, DREAL factor=1.0);
00163 
00167         inline virtual void clear_normal()
00168         {
00169             if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00170             {
00171                 tries.set_use_compact_terminal_nodes(false) ;
00172                 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00173             }
00174 
00175             if (get_is_initialized())
00176             {
00177                 if (opt_type==SLOWBUTMEMEFFICIENT)
00178                     tries.delete_trees(true); 
00179                 else if (opt_type==FASTBUTMEMHUNGRY)
00180                     tries.delete_trees(false);  // still buggy
00181                 else
00182                     SG_ERROR( "unknown optimization type\n");
00183 
00184                 set_is_initialized(false);
00185             }
00186         }
00187 
00193         inline virtual void add_to_normal(INT idx, DREAL weight)
00194         {
00195             add_example_to_tree(idx, weight);
00196             set_is_initialized(true);
00197         }
00198 
00203         inline virtual INT get_num_subkernels()
00204         {
00205             if (position_weights!=NULL)
00206                 return (INT) ceil(1.0*seq_length/mkl_stepsize) ;
00207             if (length==0)
00208                 return (INT) ceil(1.0*get_degree()/mkl_stepsize);
00209             return (INT) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00210         }
00211 
00217         inline void compute_by_subkernel(INT idx, DREAL * subkernel_contrib)
00218         { 
00219             if (get_is_initialized())
00220             {
00221                 compute_by_tree(idx, subkernel_contrib);
00222                 return ;
00223             }
00224 
00225             SG_ERROR( "CWeightedDegreePositionStringKernel optimization not initialized\n") ;
00226         }
00227 
00233         inline const DREAL* get_subkernel_weights(INT& num_weights)
00234         {
00235             num_weights = get_num_subkernels() ;
00236 
00237             delete[] weights_buffer ;
00238             weights_buffer = new DREAL[num_weights] ;
00239 
00240             if (position_weights!=NULL)
00241                 for (INT i=0; i<num_weights; i++)
00242                     weights_buffer[i] = position_weights[i*mkl_stepsize] ;
00243             else
00244                 for (INT i=0; i<num_weights; i++)
00245                     weights_buffer[i] = weights[i*mkl_stepsize] ;
00246 
00247             return weights_buffer ;
00248         }
00249 
00255         inline void set_subkernel_weights(DREAL* weights2, INT num_weights2)
00256         {
00257             INT num_weights = get_num_subkernels() ;
00258             if (num_weights!=num_weights2)
00259                 SG_ERROR( "number of weights do not match\n") ;
00260 
00261             if (position_weights!=NULL)
00262                 for (INT i=0; i<num_weights; i++)
00263                     for (INT j=0; j<mkl_stepsize; j++)
00264                     {
00265                         if (i*mkl_stepsize+j<seq_length)
00266                             position_weights[i*mkl_stepsize+j] = weights2[i] ;
00267                     }
00268             else if (length==0)
00269             {
00270                 for (INT i=0; i<num_weights; i++)
00271                     for (INT j=0; j<mkl_stepsize; j++)
00272                         if (i*mkl_stepsize+j<get_degree())
00273                             weights[i*mkl_stepsize+j] = weights2[i] ;
00274             }
00275             else
00276             {
00277                 for (INT i=0; i<num_weights; i++)
00278                     for (INT j=0; j<mkl_stepsize; j++)
00279                         if (i*mkl_stepsize+j<get_degree()*length)
00280                             weights[i*mkl_stepsize+j] = weights2[i] ;
00281             }
00282         }
00283 
00284         // other kernel tree operations
00290         DREAL *compute_abs_weights(INT & len);
00291 
00296         bool is_tree_initialized() { return tree_initialized; }
00297 
00302         inline INT get_max_mismatch() { return max_mismatch; }
00303 
00308         inline INT get_degree() { return degree; }
00309 
00314         inline DREAL get_normalization_const() { return normalization_const; }
00315 
00321         inline DREAL *get_degree_weights(INT& d, INT& len)
00322         {
00323             d=degree;
00324             len=length;
00325             return weights;
00326         }
00327 
00333         inline DREAL *get_weights(INT& num_weights)
00334         {
00335             if (position_weights!=NULL)
00336             {
00337                 num_weights = seq_length ;
00338                 return position_weights ;
00339             }
00340             if (length==0)
00341                 num_weights = degree ;
00342             else
00343                 num_weights = degree*length ;
00344             return weights;
00345         }
00346 
00352         inline DREAL *get_position_weights(INT& len)
00353         {
00354             len=seq_length;
00355             return position_weights;
00356         }
00357 
00363         bool set_shifts(INT* shifts, INT len);
00364 
00371         virtual bool set_weights(DREAL* weights, INT d, INT len=0);
00372 
00377         virtual bool set_wd_weights();
00378 
00385         virtual bool set_position_weights(DREAL* position_weights, INT len=0);
00386 
00394         bool set_position_weights_lhs(DREAL* pws, INT len, INT num);
00395 
00403         bool set_position_weights_rhs(DREAL* pws, INT len, INT num);
00404 
00409         bool init_block_weights();
00410 
00415         bool init_block_weights_from_wd();
00416 
00421         bool init_block_weights_from_wd_external();
00422 
00427         bool init_block_weights_const();
00428 
00433         bool init_block_weights_linear();
00434 
00439         bool init_block_weights_sqpoly();
00440 
00445         bool init_block_weights_cubicpoly();
00446 
00451         bool init_block_weights_exp();
00452 
00457         bool init_block_weights_log();
00458 
00463         bool init_block_weights_external();
00464 
00469         bool delete_position_weights() { delete[] position_weights ; position_weights=NULL ; return true ; } ;
00470 
00475         bool delete_position_weights_lhs() { delete[] position_weights_lhs ; position_weights_lhs=NULL ; return true ; } ;
00476 
00481         bool delete_position_weights_rhs() { delete[] position_weights_rhs ; position_weights_rhs=NULL ; return true ; } ;
00482 
00487         inline bool get_use_normalization() { return use_normalization; }
00488 
00494         virtual DREAL compute_by_tree(INT idx);
00495 
00501         virtual void compute_by_tree(INT idx, DREAL* LevelContrib);
00502 
00515         DREAL* compute_scoring(INT max_degree, INT& num_feat, INT& num_sym,
00516             DREAL* target, INT num_suppvec, INT* IDX, DREAL* weights);
00517 
00526         CHAR* compute_consensus(INT &num_feat, INT num_suppvec, INT* IDX,
00527             DREAL* alphas);
00528 
00540         DREAL* extract_w( INT max_degree, INT& num_feat, INT& num_sym,
00541             DREAL* w_result, INT num_suppvec, INT* IDX, DREAL* alphas);
00542 
00555         DREAL* compute_POIM( INT max_degree, INT& num_feat, INT& num_sym,
00556             DREAL* poim_result, INT num_suppvec, INT* IDX, DREAL* alphas, DREAL* distrib );
00557 
00564         void prepare_POIM2(DREAL* distrib, INT num_sym, INT num_feat);      
00565 
00572         void compute_POIM2(INT max_degree, CSVM* svm);
00573 
00579         void get_POIM2(DREAL** poim, INT* result_len);
00580 
00582         void cleanup_POIM2();
00583         
00584     protected:
00586         void create_empty_tries();
00587 
00593         virtual void add_example_to_tree(INT idx, DREAL weight);
00594 
00601         void add_example_to_single_tree(INT idx, DREAL weight, INT tree_num);
00602 
00611         virtual DREAL compute(INT idx_a, INT idx_b);
00612 
00621         DREAL compute_with_mismatch(CHAR* avec, INT alen,
00622             CHAR* bvec, INT blen);
00623 
00632         DREAL compute_without_mismatch(CHAR* avec, INT alen,
00633             CHAR* bvec, INT blen);
00634 
00643         DREAL compute_without_mismatch_matrix(CHAR* avec, INT alen,
00644             CHAR* bvec, INT blen);
00645 
00656         DREAL compute_without_mismatch_position_weights(
00657             CHAR* avec, DREAL *posweights_lhs, INT alen,
00658             CHAR* bvec, DREAL *posweights_rhs, INT blen);
00659 
00661         virtual void remove_lhs();
00662 
00663     protected:
00665         DREAL* weights;
00667         DREAL* position_weights;
00669         DREAL* position_weights_lhs;
00671         DREAL* position_weights_rhs;
00673         bool* position_mask;
00674 
00676         DREAL* weights_buffer;
00678         INT mkl_stepsize;
00679 
00681         INT degree;
00683         INT length;
00684 
00686         INT max_mismatch;
00688         INT seq_length;
00689 
00691         INT *shift;
00693         INT shift_len;
00695         INT max_shift;
00696 
00698         bool initialized;
00700         bool use_normalization;
00702         bool block_computation;
00703 
00705         DREAL normalization_const;
00706 
00708         INT num_block_weights_external;
00710         DREAL* block_weights_external;
00711 
00713         DREAL* block_weights;
00715         EWDKernType type;
00717         INT which_degree;
00718 
00720         CTrie<DNATrie> tries;
00722         CTrie<POIMTrie> poim_tries;
00723 
00725         bool tree_initialized;
00727         bool use_poim_tries;
00728 
00730         DREAL* m_poim_distrib;
00732         DREAL* m_poim;
00733 
00735         INT m_poim_num_sym;
00737         INT m_poim_num_feat;
00739         INT m_poim_result_len;
00740 
00742         CAlphabet* alphabet;
00743 };
00744 #endif /* _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___ */

SHOGUN Machine Learning Toolbox - Documentation