WordFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "features/WordFeatures.h"
00013 #include "features/CharFeatures.h"
00014 #include "lib/File.h"
00015 
00016 CWordFeatures::CWordFeatures(INT size, INT num_sym)
00017 : CSimpleFeatures<WORD>(size), num_symbols(num_sym),
00018     original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00019 {
00020 }
00021 
00022 CWordFeatures::CWordFeatures(const CWordFeatures & orig)
00023 : CSimpleFeatures<WORD>(orig)
00024 {
00025 }
00026 
00027 CWordFeatures::CWordFeatures(CHAR* fname, INT num_sym)
00028 : CSimpleFeatures<WORD>(fname), num_symbols(num_sym),
00029     original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00030 {
00031 }
00032 
00033 CWordFeatures::~CWordFeatures()
00034 {
00035     delete[] symbol_mask_table;
00036 }
00037 
00038 bool CWordFeatures::obtain_from_char_features(CCharFeatures* cf, INT start, INT p_order, INT gap)
00039 {
00040     ASSERT(cf);
00041 
00042     this->order=p_order;
00043     delete[] symbol_mask_table;
00044     symbol_mask_table=new WORD[256];
00045 
00046     num_vectors=cf->get_num_vectors();
00047     num_features=cf->get_num_features();
00048 
00049     CAlphabet* alpha=cf->get_alphabet();
00050     ASSERT(alpha);
00051 
00052     INT len=num_vectors*num_features;
00053     delete[] feature_matrix;
00054     feature_matrix=new WORD[len];
00055     INT num_cf_feat=0;
00056     INT num_cf_vec=0;
00057     CHAR* fm=cf->get_feature_matrix(num_cf_feat, num_cf_vec);
00058 
00059     ASSERT(num_cf_vec==num_vectors);
00060     ASSERT(num_cf_feat==num_features);
00061 
00062     INT max_val=0;
00063     for (INT i=0; i<len; i++)
00064     {
00065         feature_matrix[i]=(WORD) alpha->remap_to_bin(fm[i]);
00066         max_val=CMath::max((INT) feature_matrix[i],max_val);
00067     }
00068 
00069     original_num_symbols=max_val+1;
00070     
00071     INT* hist = new int[max_val+1] ;
00072     for (INT i=0; i<=max_val; i++)
00073       hist[i]=0 ;
00074 
00075     for (INT i=0; i<len; i++)
00076     {
00077         feature_matrix[i]=(WORD) alpha->remap_to_bin(fm[i]);
00078         hist[feature_matrix[i]]++ ;
00079     }
00080     for (INT i=0; i<=max_val; i++)
00081       if (hist[i]>0)
00082         SG_DEBUG( "symbol: %i  number of occurence: %i\n", i, hist[i]) ;
00083 
00084     delete[] hist;
00085 
00086     //number of bits the maximum value in feature matrix requires to get stored
00087     max_val= (int) ceil(log((double) max_val+1)/log((double) 2));
00088     num_symbols=1<<(max_val*p_order);
00089 
00090     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %d\n", max_val, p_order, num_symbols);
00091 
00092     if (num_symbols>(1<<(sizeof(WORD)*8)))
00093     {
00094       SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00095         return false;
00096     }
00097 
00098     for (INT line=0; line<num_vectors; line++)
00099         translate_from_single_order(&feature_matrix[line*num_features], num_features, start+gap, p_order+gap, max_val, gap);
00100 
00101     if (start+gap!=0)
00102     {
00103         // condensing feature matrix ... 
00104         ASSERT(start+gap>=0);
00105         for (INT line=0; line<num_vectors; line++)
00106             for (INT j=0; j<num_features-start-gap; j++)
00107                 feature_matrix[line*(num_features-(start+gap))+j]=feature_matrix[line*num_features+j] ;
00108         num_features=num_features-(start+gap) ;
00109     }
00110     
00111     for (INT i=0; i<256; i++)
00112         symbol_mask_table[i]=0;
00113 
00114     WORD mask=0;
00115     for (INT i=0; i<max_val; i++)
00116         mask=(mask<<1) | 1;
00117 
00118     for (INT i=0; i<256; i++)
00119     {
00120         BYTE bits=(BYTE) i;
00121         symbol_mask_table[i]=0;
00122 
00123         for (INT j=0; j<8; j++)
00124         {
00125             if (bits & 1)
00126                 symbol_mask_table[i]|=mask<<(max_val*j);
00127 
00128             bits>>=1;
00129         }
00130     }
00131 
00132     return true;
00133 }
00134 
00135 void CWordFeatures::translate_from_single_order(WORD* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
00136 {
00137     ASSERT(gap>=0);
00138     
00139     const INT start_gap = (p_order - gap)/2 ;
00140     const INT end_gap = start_gap + gap ;
00141     
00142     INT i,j;
00143     WORD value=0;
00144 
00145     // almost all positions
00146     for (i=sequence_length-1; i>= ((int) p_order)-1; i--)   //convert interval of size T
00147     {
00148         value=0;
00149         for (j=i; j>=i-((int) p_order)+1; j--)
00150         {
00151             if (i-j<start_gap)
00152             {
00153                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00154             }
00155             else if (i-j>=end_gap)
00156             {
00157                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00158             }
00159         }
00160         obs[i]= (WORD) value;
00161     }
00162 
00163     // the remaining `order` positions
00164     for (i=p_order-2;i>=0;i--)
00165     {
00166         value=0;
00167         for (j=i; j>=i-p_order+1; j--)
00168         {
00169             if (i-j<start_gap)
00170             {
00171                 value= (value >> max_val);
00172                 if (j>=0)
00173                     value|=obs[j] << (max_val * (p_order-1-gap));
00174             }
00175             else if (i-j>=end_gap)
00176             {
00177                 value= (value >> max_val);
00178                 if (j>=0)
00179                     value|=obs[j] << (max_val * (p_order-1-gap));
00180             }           
00181         }
00182         obs[i]=value;
00183     }
00184 
00185     // shifting
00186     for (i=start; i<sequence_length; i++)   
00187         obs[i-start]=obs[i];
00188 }
00189 
00190 bool CWordFeatures::load(CHAR* fname)
00191 {
00192     return false;
00193 }
00194 
00195 bool CWordFeatures::save(CHAR* fname)
00196 {
00197     INT len;
00198     bool free;
00199     WORD* fv;
00200 
00201     CFile f(fname, 'w', F_WORD);
00202 
00203     for (INT i=0; i< (INT) num_vectors && f.is_ok(); i++)
00204     {
00205         if (!(i % (num_vectors/10+1)))
00206             SG_PRINT( "%02d%%.", (int) (100.0*i/num_vectors));
00207         else if (!(i % (num_vectors/200+1)))
00208             SG_PRINT( ".");
00209 
00210         fv=get_feature_vector(i, len, free);
00211         f.save_word_data(fv, len);
00212         free_feature_vector(fv, i, free) ;
00213     }
00214 
00215     if (f.is_ok())
00216         SG_INFO( "%d vectors with %d features each successfully written (filesize: %ld)\n", num_vectors, num_features, num_vectors*num_features*sizeof(WORD));
00217 
00218     return true;
00219 }
00220 
00221 /* 
00222 XT=['ATTTTTTAA';'ATTTTTTAA']' ;
00223 sg('send_command', 'loglevel ALL') ;
00224 sg('set_features', 'TRAIN', XT)
00225 sg('send_command', 'convert TRAIN SIMPLE CHAR SIMPLE WORD DNA 3 2 0') ;
00226 */

SHOGUN Machine Learning Toolbox - Documentation