WordFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "features/WordFeatures.h"
00013 #include "features/CharFeatures.h"
00014 #include "lib/File.h"
00015 
00016 CWordFeatures::CWordFeatures(int32_t size, int32_t num_sym)
00017 : CSimpleFeatures<uint16_t>(size), num_symbols(num_sym),
00018     original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00019 {
00020 }
00021 
00022 CWordFeatures::CWordFeatures(const CWordFeatures & orig)
00023 : CSimpleFeatures<uint16_t>(orig)
00024 {
00025 }
00026 
00027 CWordFeatures::CWordFeatures(char* fname, int32_t num_sym)
00028 : CSimpleFeatures<uint16_t>(fname), num_symbols(num_sym),
00029     original_num_symbols(num_sym), order(0), symbol_mask_table(NULL)
00030 {
00031 }
00032 
00033 CWordFeatures::~CWordFeatures()
00034 {
00035     delete[] symbol_mask_table;
00036 }
00037 
00038 bool CWordFeatures::obtain_from_char_features(
00039     CCharFeatures* cf, int32_t start, int32_t p_order, int32_t gap)
00040 {
00041     ASSERT(cf);
00042 
00043     this->order=p_order;
00044     delete[] symbol_mask_table;
00045     symbol_mask_table=new uint16_t[256];
00046 
00047     num_vectors=cf->get_num_vectors();
00048     num_features=cf->get_num_features();
00049 
00050     CAlphabet* alpha=cf->get_alphabet();
00051     ASSERT(alpha);
00052 
00053     int32_t len=num_vectors*num_features;
00054     delete[] feature_matrix;
00055     feature_matrix=new uint16_t[len];
00056     int32_t num_cf_feat=0;
00057     int32_t num_cf_vec=0;
00058     char* fm=cf->get_feature_matrix(num_cf_feat, num_cf_vec);
00059 
00060     ASSERT(num_cf_vec==num_vectors);
00061     ASSERT(num_cf_feat==num_features);
00062 
00063     int32_t max_val=0;
00064     for (int32_t i=0; i<len; i++)
00065     {
00066         feature_matrix[i]=(uint16_t) alpha->remap_to_bin(fm[i]);
00067         max_val=CMath::max((int32_t) feature_matrix[i],max_val);
00068     }
00069 
00070     original_num_symbols=max_val+1;
00071     
00072     int32_t* hist = new int[max_val+1] ;
00073     for (int32_t i=0; i<=max_val; i++)
00074       hist[i]=0 ;
00075 
00076     for (int32_t i=0; i<len; i++)
00077     {
00078         feature_matrix[i]=(uint16_t) alpha->remap_to_bin(fm[i]);
00079         hist[feature_matrix[i]]++ ;
00080     }
00081     for (int32_t i=0; i<=max_val; i++)
00082       if (hist[i]>0)
00083         SG_DEBUG( "symbol: %i  number of occurence: %i\n", i, hist[i]) ;
00084 
00085     delete[] hist;
00086 
00087     //number of bits the maximum value in feature matrix requires to get stored
00088     max_val= (int32_t) ceil(log((float64_t) max_val+1)/log((float64_t) 2));
00089     num_symbols=1<<(max_val*p_order);
00090 
00091     SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %d\n", max_val, p_order, num_symbols);
00092 
00093     if (num_symbols>(1<<(sizeof(uint16_t)*8)))
00094     {
00095       SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00096         return false;
00097     }
00098 
00099     for (int32_t line=0; line<num_vectors; line++)
00100         translate_from_single_order(&feature_matrix[line*num_features], num_features, start+gap, p_order+gap, max_val, gap);
00101 
00102     if (start+gap!=0)
00103     {
00104         // condensing feature matrix ... 
00105         ASSERT(start+gap>=0);
00106         for (int32_t line=0; line<num_vectors; line++)
00107             for (int32_t j=0; j<num_features-start-gap; j++)
00108                 feature_matrix[line*(num_features-(start+gap))+j]=feature_matrix[line*num_features+j] ;
00109         num_features=num_features-(start+gap) ;
00110     }
00111     
00112     for (int32_t i=0; i<256; i++)
00113         symbol_mask_table[i]=0;
00114 
00115     uint16_t mask=0;
00116     for (int32_t i=0; i<max_val; i++)
00117         mask=(mask<<1) | 1;
00118 
00119     for (int32_t i=0; i<256; i++)
00120     {
00121         uint8_t bits=(uint8_t) i;
00122         symbol_mask_table[i]=0;
00123 
00124         for (int32_t j=0; j<8; j++)
00125         {
00126             if (bits & 1)
00127                 symbol_mask_table[i]|=mask<<(max_val*j);
00128 
00129             bits>>=1;
00130         }
00131     }
00132 
00133     return true;
00134 }
00135 
00136 void CWordFeatures::translate_from_single_order(
00137     uint16_t* obs, int32_t sequence_length, int32_t start, int32_t p_order,
00138     int32_t max_val, int32_t gap)
00139 {
00140     ASSERT(gap>=0);
00141     
00142     const int32_t start_gap = (p_order - gap)/2;
00143     const int32_t end_gap = start_gap + gap;
00144     int32_t i,j;
00145     uint16_t value=0;
00146 
00147     // almost all positions
00148     for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00149     {
00150         value=0;
00151         for (j=i; j>=i-p_order+1; j--)
00152         {
00153             if (i-j<start_gap)
00154             {
00155                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00156             }
00157             else if (i-j>=end_gap)
00158             {
00159                 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00160             }
00161         }
00162         obs[i]=value;
00163     }
00164 
00165     // the remaining `order` positions
00166     for (i=p_order-2;i>=0;i--)
00167     {
00168         value=0;
00169         for (j=i; j>=i-p_order+1; j--)
00170         {
00171             if (i-j<start_gap)
00172             {
00173                 value= (value >> max_val);
00174                 if (j>=0)
00175                     value|=obs[j] << (max_val * (p_order-1-gap));
00176             }
00177             else if (i-j>=end_gap)
00178             {
00179                 value= (value >> max_val);
00180                 if (j>=0)
00181                     value|=obs[j] << (max_val * (p_order-1-gap));
00182             }           
00183         }
00184         obs[i]=value;
00185     }
00186 
00187     // shifting
00188     for (i=start; i<sequence_length; i++)   
00189         obs[i-start]=obs[i];
00190 }
00191 
00192 bool CWordFeatures::load(char* fname)
00193 {
00194     return false;
00195 }
00196 
00197 bool CWordFeatures::save(char* fname)
00198 {
00199     int32_t len;
00200     bool free;
00201     uint16_t* fv;
00202 
00203     CFile f(fname, 'w', F_WORD);
00204 
00205     for (int32_t i=0; i< (int32_t) num_vectors && f.is_ok(); i++)
00206     {
00207         if (!(i % (num_vectors/10+1)))
00208             SG_PRINT( "%02d%%.", (int) (100.0*i/num_vectors));
00209         else if (!(i % (num_vectors/200+1)))
00210             SG_PRINT( ".");
00211 
00212         fv=get_feature_vector(i, len, free);
00213         f.save_word_data(fv, len);
00214         free_feature_vector(fv, i, free) ;
00215     }
00216 
00217     if (f.is_ok())
00218         SG_INFO( "%d vectors with %d features each successfully written (filesize: %ld)\n", num_vectors, num_features, num_vectors*num_features*sizeof(uint16_t));
00219 
00220     return true;
00221 }
00222 
00223 /* 
00224 XT=['ATTTTTTAA';'ATTTTTTAA']' ;
00225 sg('send_command', 'loglevel ALL') ;
00226 sg('set_features', 'TRAIN', XT)
00227 sg('send_command', 'convert TRAIN SIMPLE CHAR SIMPLE WORD DNA 3 2 0') ;
00228 */

SHOGUN Machine Learning Toolbox - Documentation