WordFeatures.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "features/WordFeatures.h"
00013 #include "features/CharFeatures.h"
00014 #include "lib/File.h"
00015
00016 CWordFeatures::CWordFeatures(INT size, INT num_sym)
00017 : CSimpleFeatures<WORD>(size), num_symbols(num_sym),
00018 original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00019 {
00020 }
00021
00022 CWordFeatures::CWordFeatures(const CWordFeatures & orig)
00023 : CSimpleFeatures<WORD>(orig)
00024 {
00025 }
00026
00027 CWordFeatures::CWordFeatures(CHAR* fname, INT num_sym)
00028 : CSimpleFeatures<WORD>(fname), num_symbols(num_sym),
00029 original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00030 {
00031 }
00032
00033 CWordFeatures::~CWordFeatures()
00034 {
00035 delete[] symbol_mask_table;
00036 }
00037
00038 bool CWordFeatures::obtain_from_char_features(CCharFeatures* cf, INT start, INT p_order, INT gap)
00039 {
00040 ASSERT(cf);
00041
00042 this->order=p_order;
00043 delete[] symbol_mask_table;
00044 symbol_mask_table=new WORD[256];
00045
00046 num_vectors=cf->get_num_vectors();
00047 num_features=cf->get_num_features();
00048
00049 CAlphabet* alpha=cf->get_alphabet();
00050 ASSERT(alpha);
00051
00052 INT len=num_vectors*num_features;
00053 delete[] feature_matrix;
00054 feature_matrix=new WORD[len];
00055 INT num_cf_feat=0;
00056 INT num_cf_vec=0;
00057 CHAR* fm=cf->get_feature_matrix(num_cf_feat, num_cf_vec);
00058
00059 ASSERT(num_cf_vec==num_vectors);
00060 ASSERT(num_cf_feat==num_features);
00061
00062 INT max_val=0;
00063 for (INT i=0; i<len; i++)
00064 {
00065 feature_matrix[i]=(WORD) alpha->remap_to_bin(fm[i]);
00066 max_val=CMath::max((INT) feature_matrix[i],max_val);
00067 }
00068
00069 original_num_symbols=max_val+1;
00070
00071 INT* hist = new int[max_val+1] ;
00072 for (INT i=0; i<=max_val; i++)
00073 hist[i]=0 ;
00074
00075 for (INT i=0; i<len; i++)
00076 {
00077 feature_matrix[i]=(WORD) alpha->remap_to_bin(fm[i]);
00078 hist[feature_matrix[i]]++ ;
00079 }
00080 for (INT i=0; i<=max_val; i++)
00081 if (hist[i]>0)
00082 SG_DEBUG( "symbol: %i number of occurence: %i\n", i, hist[i]) ;
00083
00084 delete[] hist;
00085
00086
00087 max_val= (int) ceil(log((double) max_val+1)/log((double) 2));
00088 num_symbols=1<<(max_val*p_order);
00089
00090 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %d\n", max_val, p_order, num_symbols);
00091
00092 if (num_symbols>(1<<(sizeof(WORD)*8)))
00093 {
00094 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00095 return false;
00096 }
00097
00098 for (INT line=0; line<num_vectors; line++)
00099 translate_from_single_order(&feature_matrix[line*num_features], num_features, start+gap, p_order+gap, max_val, gap);
00100
00101 if (start+gap!=0)
00102 {
00103
00104 ASSERT(start+gap>=0);
00105 for (INT line=0; line<num_vectors; line++)
00106 for (INT j=0; j<num_features-start-gap; j++)
00107 feature_matrix[line*(num_features-(start+gap))+j]=feature_matrix[line*num_features+j] ;
00108 num_features=num_features-(start+gap) ;
00109 }
00110
00111 for (INT i=0; i<256; i++)
00112 symbol_mask_table[i]=0;
00113
00114 WORD mask=0;
00115 for (INT i=0; i<max_val; i++)
00116 mask=(mask<<1) | 1;
00117
00118 for (INT i=0; i<256; i++)
00119 {
00120 BYTE bits=(BYTE) i;
00121 symbol_mask_table[i]=0;
00122
00123 for (INT j=0; j<8; j++)
00124 {
00125 if (bits & 1)
00126 symbol_mask_table[i]|=mask<<(max_val*j);
00127
00128 bits>>=1;
00129 }
00130 }
00131
00132 return true;
00133 }
00134
00135 void CWordFeatures::translate_from_single_order(WORD* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
00136 {
00137 ASSERT(gap>=0);
00138
00139 const INT start_gap = (p_order - gap)/2 ;
00140 const INT end_gap = start_gap + gap ;
00141
00142 INT i,j;
00143 WORD value=0;
00144
00145
00146 for (i=sequence_length-1; i>= ((int) p_order)-1; i--)
00147 {
00148 value=0;
00149 for (j=i; j>=i-((int) p_order)+1; j--)
00150 {
00151 if (i-j<start_gap)
00152 {
00153 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00154 }
00155 else if (i-j>=end_gap)
00156 {
00157 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00158 }
00159 }
00160 obs[i]= (WORD) value;
00161 }
00162
00163
00164 for (i=p_order-2;i>=0;i--)
00165 {
00166 value=0;
00167 for (j=i; j>=i-p_order+1; j--)
00168 {
00169 if (i-j<start_gap)
00170 {
00171 value= (value >> max_val);
00172 if (j>=0)
00173 value|=obs[j] << (max_val * (p_order-1-gap));
00174 }
00175 else if (i-j>=end_gap)
00176 {
00177 value= (value >> max_val);
00178 if (j>=0)
00179 value|=obs[j] << (max_val * (p_order-1-gap));
00180 }
00181 }
00182 obs[i]=value;
00183 }
00184
00185
00186 for (i=start; i<sequence_length; i++)
00187 obs[i-start]=obs[i];
00188 }
00189
00190 bool CWordFeatures::load(CHAR* fname)
00191 {
00192 return false;
00193 }
00194
00195 bool CWordFeatures::save(CHAR* fname)
00196 {
00197 INT len;
00198 bool free;
00199 WORD* fv;
00200
00201 CFile f(fname, 'w', F_WORD);
00202
00203 for (INT i=0; i< (INT) num_vectors && f.is_ok(); i++)
00204 {
00205 if (!(i % (num_vectors/10+1)))
00206 SG_PRINT( "%02d%%.", (int) (100.0*i/num_vectors));
00207 else if (!(i % (num_vectors/200+1)))
00208 SG_PRINT( ".");
00209
00210 fv=get_feature_vector(i, len, free);
00211 f.save_word_data(fv, len);
00212 free_feature_vector(fv, i, free) ;
00213 }
00214
00215 if (f.is_ok())
00216 SG_INFO( "%d vectors with %d features each successfully written (filesize: %ld)\n", num_vectors, num_features, num_vectors*num_features*sizeof(WORD));
00217
00218 return true;
00219 }
00220
00221
00222
00223
00224
00225
00226