WordFeatures.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "features/WordFeatures.h"
00013 #include "features/CharFeatures.h"
00014 #include "lib/File.h"
00015
00016 CWordFeatures::CWordFeatures(int32_t size, int32_t num_sym)
00017 : CSimpleFeatures<uint16_t>(size), num_symbols(num_sym),
00018 original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00019 {
00020 }
00021
00022 CWordFeatures::CWordFeatures(const CWordFeatures & orig)
00023 : CSimpleFeatures<uint16_t>(orig)
00024 {
00025 }
00026
00027 CWordFeatures::CWordFeatures(char* fname, int32_t num_sym)
00028 : CSimpleFeatures<uint16_t>(fname), num_symbols(num_sym),
00029 original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00030 {
00031 }
00032
00033 CWordFeatures::~CWordFeatures()
00034 {
00035 delete[] symbol_mask_table;
00036 }
00037
00038 bool CWordFeatures::obtain_from_char_features(
00039 CCharFeatures* cf, int32_t start, int32_t p_order, int32_t gap)
00040 {
00041 ASSERT(cf);
00042
00043 this->order=p_order;
00044 delete[] symbol_mask_table;
00045 symbol_mask_table=new uint16_t[256];
00046
00047 num_vectors=cf->get_num_vectors();
00048 num_features=cf->get_num_features();
00049
00050 CAlphabet* alpha=cf->get_alphabet();
00051 ASSERT(alpha);
00052
00053 int32_t len=num_vectors*num_features;
00054 delete[] feature_matrix;
00055 feature_matrix=new uint16_t[len];
00056 int32_t num_cf_feat=0;
00057 int32_t num_cf_vec=0;
00058 char* fm=cf->get_feature_matrix(num_cf_feat, num_cf_vec);
00059
00060 ASSERT(num_cf_vec==num_vectors);
00061 ASSERT(num_cf_feat==num_features);
00062
00063 int32_t max_val=0;
00064 for (int32_t i=0; i<len; i++)
00065 {
00066 feature_matrix[i]=(uint16_t) alpha->remap_to_bin(fm[i]);
00067 max_val=CMath::max((int32_t) feature_matrix[i],max_val);
00068 }
00069
00070 original_num_symbols=max_val+1;
00071
00072 int32_t* hist = new int[max_val+1] ;
00073 for (int32_t i=0; i<=max_val; i++)
00074 hist[i]=0 ;
00075
00076 for (int32_t i=0; i<len; i++)
00077 {
00078 feature_matrix[i]=(uint16_t) alpha->remap_to_bin(fm[i]);
00079 hist[feature_matrix[i]]++ ;
00080 }
00081 for (int32_t i=0; i<=max_val; i++)
00082 if (hist[i]>0)
00083 SG_DEBUG( "symbol: %i number of occurence: %i\n", i, hist[i]) ;
00084
00085 delete[] hist;
00086
00087
00088 max_val= (int32_t) ceil(log((float64_t) max_val+1)/log((float64_t) 2));
00089 num_symbols=1<<(max_val*p_order);
00090
00091 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %d\n", max_val, p_order, num_symbols);
00092
00093 if (num_symbols>(1<<(sizeof(uint16_t)*8)))
00094 {
00095 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00096 return false;
00097 }
00098
00099 for (int32_t line=0; line<num_vectors; line++)
00100 translate_from_single_order(&feature_matrix[line*num_features], num_features, start+gap, p_order+gap, max_val, gap);
00101
00102 if (start+gap!=0)
00103 {
00104
00105 ASSERT(start+gap>=0);
00106 for (int32_t line=0; line<num_vectors; line++)
00107 for (int32_t j=0; j<num_features-start-gap; j++)
00108 feature_matrix[line*(num_features-(start+gap))+j]=feature_matrix[line*num_features+j] ;
00109 num_features=num_features-(start+gap) ;
00110 }
00111
00112 for (int32_t i=0; i<256; i++)
00113 symbol_mask_table[i]=0;
00114
00115 uint16_t mask=0;
00116 for (int32_t i=0; i<max_val; i++)
00117 mask=(mask<<1) | 1;
00118
00119 for (int32_t i=0; i<256; i++)
00120 {
00121 uint8_t bits=(uint8_t) i;
00122 symbol_mask_table[i]=0;
00123
00124 for (int32_t j=0; j<8; j++)
00125 {
00126 if (bits & 1)
00127 symbol_mask_table[i]|=mask<<(max_val*j);
00128
00129 bits>>=1;
00130 }
00131 }
00132
00133 return true;
00134 }
00135
00136 void CWordFeatures::translate_from_single_order(
00137 uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order,
00138 int32_t max_val, int32_t gap)
00139 {
00140 ASSERT(gap>=0);
00141
00142 const int32_t start_gap = (p_order - gap)/2;
00143 const int32_t end_gap = start_gap + gap;
00144 int32_t i,j;
00145 uint16_t value=0;
00146
00147
00148 for (i=sequence_length-1; i>=p_order-1; i--)
00149 {
00150 value=0;
00151 for (j=i; j>=i-p_order+1; j--)
00152 {
00153 if (i-j<start_gap)
00154 {
00155 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00156 }
00157 else if (i-j>=end_gap)
00158 {
00159 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00160 }
00161 }
00162 obs[i]=value;
00163 }
00164
00165
00166 for (i=p_order-2;i>=0;i--)
00167 {
00168 value=0;
00169 for (j=i; j>=i-p_order+1; j--)
00170 {
00171 if (i-j<start_gap)
00172 {
00173 value= (value >> max_val);
00174 if (j>=0)
00175 value|=obs[j] << (max_val * (p_order-1-gap));
00176 }
00177 else if (i-j>=end_gap)
00178 {
00179 value= (value >> max_val);
00180 if (j>=0)
00181 value|=obs[j] << (max_val * (p_order-1-gap));
00182 }
00183 }
00184 obs[i]=value;
00185 }
00186
00187
00188 for (i=start; i<sequence_length; i++)
00189 obs[i-start]=obs[i];
00190 }
00191
00192 bool CWordFeatures::load(char* fname)
00193 {
00194 return false;
00195 }
00196
00197 bool CWordFeatures::save(char* fname)
00198 {
00199 int32_t len;
00200 bool free;
00201 uint16_t* fv;
00202
00203 CFile f(fname, 'w', F_WORD);
00204
00205 for (int32_t i=0; i< (int32_t) num_vectors && f.is_ok(); i++)
00206 {
00207 if (!(i % (num_vectors/10+1)))
00208 SG_PRINT( "%02d%%.", (int) (100.0*i/num_vectors));
00209 else if (!(i % (num_vectors/200+1)))
00210 SG_PRINT( ".");
00211
00212 fv=get_feature_vector(i, len, free);
00213 f.save_word_data(fv, len);
00214 free_feature_vector(fv, i, free) ;
00215 }
00216
00217 if (f.is_ok())
00218 SG_INFO( "%d vectors with %d features each successfully written (filesize: %ld)\n", num_vectors, num_features, num_vectors*num_features*sizeof(uint16_t));
00219
00220 return true;
00221 }
00222
00223
00224
00225
00226
00227
00228