StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 
00016 #include "preproc/PreProc.h"
00017 #include "preproc/StringPreProc.h"
00018 #include "features/Features.h"
00019 #include "features/CharFeatures.h"
00020 #include "features/Alphabet.h"
00021 #include "lib/common.h"
00022 #include "lib/io.h"
00023 #include "lib/DynamicArray.h"
00024 #include "lib/File.h"
00025 #include "lib/Mathematics.h"
00026 
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033 
00034 class CFile;
00035 
00036 template <class ST> class CStringPreProc;
00037 
/** A string of arbitrary element type together with its length.
 *
 * The buffer is not zero terminated; length is the number of elements.
 */
template <class T> struct T_STRING
{
    /** pointer to the first element of the string */
    T* string;
    /** number of elements in string */
    int32_t length;
};

/** Create a zero-terminated char copy of a T_STRING.
 *
 * Every element is converted to char individually, so the result has
 * exactly str.length characters for any element type T.
 *
 * @param str string to copy; a NULL buffer or a non-positive length is
 *            treated as the empty string
 * @return newly allocated (new char[]) zero-terminated copy; the caller
 *         has to release it with delete[]
 */
template <class T> char* get_zero_terminated_string_copy(T_STRING<T> str)
{
    const int32_t num=(str.string && str.length>0) ? str.length : 0;
    char* copy=new char[num+1];

    // Convert element by element: copying `num` raw bytes would only pick up
    // a prefix of the storage whenever sizeof(T) is larger than one byte.
    for (int32_t i=0; i<num; i++)
        copy[i]=(char) str.string[i];

    copy[num]='\0';
    return copy;
}
00055 
00068 template <class ST> class CStringFeatures : public CFeatures
00069 {
00070     public:
00075         CStringFeatures(EAlphabet alpha)
00076         : CFeatures(0), num_vectors(0), features(NULL),
00077             single_string(NULL),length_of_single_string(0),
00078             max_string_length(0), order(0), selected_vector(0),
00079             symbol_mask_table(NULL)
00080         {
00081             alphabet=new CAlphabet(alpha);
00082             SG_REF(alphabet);
00083             num_symbols=alphabet->get_num_symbols();
00084             original_num_symbols=num_symbols;
00085         }
00086 
00091         CStringFeatures(CAlphabet* alpha)
00092         : CFeatures(0), num_vectors(0), features(NULL),
00093             single_string(NULL),length_of_single_string(0),
00094             max_string_length(0), order(0), selected_vector(0),
00095             symbol_mask_table(NULL)
00096     {
00097         ASSERT(alpha);
00098         alphabet=new CAlphabet(alpha);
00099         num_symbols=alphabet->get_num_symbols();
00100         original_num_symbols=num_symbols;
00101     }
00102 
00104         CStringFeatures(const CStringFeatures & orig)
00105         : CFeatures(orig), num_vectors(orig.num_vectors),
00106             single_string(orig.single_string),
00107             length_of_single_string(orig.length_of_single_string),
00108             max_string_length(orig.max_string_length),
00109             num_symbols(orig.num_symbols),
00110             original_num_symbols(orig.original_num_symbols),
00111             order(orig.order), selected_vector(orig.selected_vector)
00112         {
00113             ASSERT(orig.single_string == NULL); //not implemented
00114 
00115             alphabet=new CAlphabet(orig.alphabet);
00116             SG_REF(alphabet);
00117 
00118             if (orig.features)
00119             {
00120                 features=new T_STRING<ST>[orig.num_vectors];
00121 
00122                 for (int32_t i=0; i<num_vectors; i++)
00123                 {
00124                     features[i].string=new ST[orig.features[i].length];
00125                     ASSERT(features[i].string);
00126                     features[i].length=orig.features[i].length;
00127                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length); 
00128                 }
00129             }
00130 
00131             if (orig.symbol_mask_table)
00132             {
00133                 symbol_mask_table=new ST[256];
00134                 for (int32_t i=0; i<256; i++)
00135                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00136             }
00137         }
00138 
00144         CStringFeatures(char* fname, EAlphabet alpha=DNA)
00145         : CFeatures(fname), num_vectors(0),
00146             features(NULL), single_string(NULL),
00147             length_of_single_string(0), max_string_length(0),
00148             order(0), selected_vector(0), symbol_mask_table(NULL)
00149         {
00150             alphabet=new CAlphabet(alpha);
00151             SG_REF(alphabet);
00152             num_symbols=alphabet->get_num_symbols();
00153             original_num_symbols=num_symbols;
00154             load(fname);
00155         }
00156 
00157         virtual ~CStringFeatures()
00158         {
00159             cleanup();
00160 
00161 #ifdef HAVE_SWIG
00162             SG_UNREF(alphabet);
00163 #else
00164             delete alphabet;
00165 #endif
00166         }
00167 
00169         void cleanup()
00170         {
00171             if (single_string)
00172             {
00173                 delete[] single_string;
00174                 single_string=NULL;
00175             }
00176             else
00177             {
00178                 for (int32_t i=0; i<num_vectors; i++)
00179                 {
00180                     delete[] features[i].string;
00181                     features[i].length=0;
00182                 }
00183             }
00184             num_vectors=0;
00185             delete[] features;
00186 
00187             delete[] symbol_mask_table;
00188             alphabet->clear_histogram();
00189         }
00190 
00195         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00196 
00201         inline virtual EFeatureType get_feature_type();
00202 
00207         inline CAlphabet* get_alphabet()
00208         {
00209             SG_REF(alphabet);
00210             return alphabet;
00211         }
00212 
00217         virtual CFeatures* duplicate() const
00218         {
00219             return new CStringFeatures<ST>(*this);
00220         }
00221 
00226         void select_feature_vector(int32_t num)
00227         {
00228             ASSERT(features);
00229             ASSERT(num<num_vectors);
00230 
00231             selected_vector=num;
00232         }
00233 
00239         void get_string(ST** dst, int32_t* len)
00240         {
00241             ASSERT(features);
00242             ASSERT(selected_vector<num_vectors);
00243 
00244             *len=features[selected_vector].length;
00245             *dst=new ST[*len];
00246             memcpy(*dst, features[selected_vector].string, *len * sizeof(ST));
00247         }
00248 
00255         virtual ST* get_feature_vector(int32_t num, int32_t& len)
00256         {
00257             ASSERT(features);
00258             ASSERT(num<num_vectors);
00259 
00260             len=features[num].length;
00261             return features[num].string;
00262         }
00263 
00270         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
00271         {
00272             ASSERT(features);
00273             ASSERT(num<num_vectors);
00274 
00275             features[num].length=len ;
00276             features[num].string=string ;
00277         }
00278 
00285         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00286         {
00287             ASSERT(features && vec_num<num_vectors);
00288             ASSERT(feat_num<features[vec_num].length);
00289 
00290             return features[vec_num].string[feat_num];
00291         }
00292 
00298         virtual inline int32_t get_vector_length(int32_t vec_num)
00299         {
00300             ASSERT(features && vec_num<num_vectors);
00301             return features[vec_num].length;
00302         }
00303 
00308         virtual inline int32_t get_max_vector_length()
00309         {
00310             return max_string_length;
00311         }
00312 
00317         virtual inline int32_t get_num_vectors() { return num_vectors; }
00318 
00325         inline float128_t get_num_symbols() { return num_symbols; }
00326 
00333         inline float128_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00334 
00335         // these functions are necessary to find out about a former conversion process
00336 
00341         inline float128_t get_original_num_symbols() { return original_num_symbols; }
00342 
00347         inline int32_t get_order() { return order; }
00348 
00356         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00357         {
00358             ASSERT(symbol_mask_table);
00359             return symbol_mask_table[mask] & symbol;
00360         }
00361 
00368         inline ST shift_offset(ST offset, int32_t amount)
00369         {
00370             ASSERT(alphabet);
00371             return (offset << (amount*alphabet->get_num_bits()));
00372         }
00373 
00380         inline ST shift_symbol(ST symbol, int32_t amount)
00381         {
00382             ASSERT(alphabet);
00383             return (symbol >> (amount*alphabet->get_num_bits()));
00384         }
00385 
00391         virtual bool load(char* fname)
00392         {
00393             SG_INFO( "loading...\n");
00394             int64_t length=0;
00395             max_string_length=0;
00396 
00397             CFile f(fname, 'r', F_CHAR);
00398             char* feature_matrix=f.load_char_data(NULL, length);
00399 
00400             num_vectors=0;
00401 
00402             if (f.is_ok())
00403             {
00404                 for (int64_t i=0; i<length; i++)
00405                 {
00406                     if (feature_matrix[i]=='\n')
00407                         num_vectors++;
00408                 }
00409 
00410                 SG_INFO( "file contains %ld vectors\n", num_vectors);
00411                 features= new T_STRING<ST>[num_vectors];
00412 
00413                 int64_t index=0;
00414                 for (int32_t lines=0; lines<num_vectors; lines++)
00415                 {
00416                     char* p=&feature_matrix[index];
00417                     int32_t columns=0;
00418 
00419                     for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00420 
00421                     if (index+columns>=length && p[columns]!='\n') {
00422                         SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00423                     }
00424 
00425                     features[lines].length=columns;
00426                     features[lines].string=new ST[columns];
00427 
00428                     max_string_length=CMath::max(max_string_length,columns);
00429 
00430                     for (int32_t i=0; i<columns; i++)
00431                         features[lines].string[i]= ((ST) p[i]);
00432 
00433                     index+= features[lines].length+1;
00434                 }
00435 
00436                 num_symbols=4; //FIXME
00437                 return true;
00438             }
00439             else
00440                 SG_ERROR( "reading file failed\n");
00441 
00442             return false;
00443         }
00444 
00451         bool load_dna_file(char* fname, bool remap_to_bin=true)
00452         {
00453             bool result=false;
00454 
00455             size_t blocksize=1024*1024;
00456             size_t required_blocksize=0;
00457             uint8_t* dummy=new uint8_t[blocksize];
00458             uint8_t* overflow=NULL;
00459             int32_t overflow_len=0;
00460 
00461             num_symbols=4;
00462             cleanup();
00463 
00464             CAlphabet* alpha=new CAlphabet(DNA);
00465 
00466             FILE* f=fopen(fname, "ro");
00467 
00468             if (f)
00469             {
00470                 num_vectors=0;
00471                 max_string_length=0;
00472 
00473                 SG_INFO("counting line numbers in file %s\n", fname);
00474                 SG_DEBUG("block_size=%d\n", required_blocksize);
00475                 size_t sz=blocksize;
00476                 size_t block_offs=0;
00477                 size_t old_block_offs=0;
00478                 fseek(f, 0, SEEK_END);
00479                 size_t fsize=ftell(f);
00480                 rewind(f);
00481 
00482                 while (sz == blocksize)
00483                 {
00484                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00485                     bool contains_cr=false;
00486                     for (size_t i=0; i<sz; i++)
00487                     {
00488                         block_offs++;
00489                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00490                         {
00491                             num_vectors++;
00492                             contains_cr=true;
00493                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00494                             old_block_offs=block_offs;
00495                         }
00496                     }
00497                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00498                 }
00499 
00500                 SG_INFO("found %d strings\n", num_vectors);
00501                 delete[] dummy;
00502                 blocksize=required_blocksize;
00503                 dummy = new uint8_t[blocksize];
00504                 overflow = new uint8_t[blocksize];
00505                 features=new T_STRING<ST>[num_vectors];
00506 
00507                 rewind(f);
00508                 sz=blocksize;
00509                 int32_t lines=0;
00510                 while (sz == blocksize)
00511                 {
00512                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00513 
00514                     size_t old_sz=0;
00515                     for (size_t i=0; i<sz; i++)
00516                     {
00517                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00518                         {
00519                             int32_t len=i-old_sz;
00520                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00521                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00522 
00523                             features[lines].length=len;
00524                             features[lines].string=new ST[len];
00525 
00526                             if (remap_to_bin)
00527                             {
00528                                 for (int32_t j=0; j<overflow_len; j++)
00529                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00530                                 for (int32_t j=0; j<len; j++)
00531                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00532                             }
00533                             else
00534                             {
00535                                 for (int32_t j=0; j<overflow_len; j++)
00536                                     features[lines].string[j]=overflow[j];
00537                                 for (int32_t j=0; j<len; j++)
00538                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00539                             }
00540 
00541                             // clear overflow
00542                             overflow_len=0;
00543 
00544                             //CMath::display_vector(features[lines].string, len);
00545                             old_sz=i+1;
00546                             lines++;
00547                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00548                         }
00549                     }
00550                     for (size_t i=old_sz; i<sz; i++)
00551                         overflow[i-old_sz]=dummy[i];
00552 
00553                     overflow_len=sz-old_sz;
00554                 }
00555                 result=true;
00556                 SG_INFO("file successfully read\n");
00557                 SG_INFO("max_string_length=%d\n", max_string_length);
00558                 SG_INFO("num_strings=%d\n", num_vectors);
00559             }
00560 
00561             fclose(f);
00562             delete alpha;
00563             delete[] dummy;
00564 
00565 #ifdef HAVE_SWIG
00566             SG_UNREF(alphabet);
00567 #else
00568             delete alphabet;
00569 #endif
00570             if (remap_to_bin)
00571                 alphabet = new CAlphabet(RAWDNA);
00572             else
00573                 alphabet = new CAlphabet(DNA);
00574             SG_REF(alphabet);
00575 
00576             return result;
00577         }
00578 
00584         bool load_from_directory(char* dirname)
00585         {
00586             struct dirent **namelist;
00587             int32_t n;
00588 
00589             io.set_dirname(dirname);
00590 
00591             n = scandir(dirname, &namelist, io.filter, alphasort);
00592             if (n <= 0)
00593             {
00594                 SG_ERROR( "error calling scandir\n");
00595                 return false;
00596             }
00597             else
00598             {
00599                 T_STRING<ST>* strings=NULL;
00600                 alphabet->clear_histogram();
00601 
00602                 int32_t num=0;
00603                 int32_t max_len=-1;
00604 
00605                 //usually n==num_vec, but it might not in race conditions 
00606                 //(file perms modified, file erased)
00607                 strings=new T_STRING<ST>[n];
00608 
00609                 for (int32_t i=0; i<n; i++)
00610                 {
00611                     char* fname=io.concat_filename(namelist[i]->d_name);
00612 
00613                     struct stat s;
00614                     off_t filesize=0;
00615 
00616                     if (!stat(fname, &s) && s.st_size>0)
00617                     {
00618                         filesize=s.st_size/sizeof(ST);
00619 
00620                         FILE* f=fopen(fname, "ro");
00621                         if (f)
00622                         {
00623                             ST* str=new ST[filesize];
00624                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00625                             fread(str, sizeof(ST), filesize, f);
00626                             strings[num].string=str;
00627                             strings[num].length=filesize;
00628                             max_len=CMath::max(max_len, strings[num].length);
00629 
00630                             num++;
00631                             fclose(f);
00632                         }
00633                     }
00634                     else
00635                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
00636 
00637                     free(namelist[i]);
00638                 }
00639                 free(namelist);
00640 
00641                 if (num>0 && strings)
00642                 {
00643                     set_features(strings, num, max_len);
00644                     return true;
00645                 }
00646             }
00647             return false;
00648         }
00649 
00657         bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00658         {
00659             if (p_features)
00660             {
00661                 CAlphabet* alpha=new CAlphabet(alphabet);
00662 
00663                 //compute histogram for char/byte
00664                 for (int32_t i=0; i<p_num_vectors; i++)
00665                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00666 
00667                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00668                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00669 
00670                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00671                 {
00672                     cleanup();
00673 
00674 #ifdef HAVE_SWIG
00675                     SG_UNREF(alphabet);
00676 #else
00677                     delete alphabet;
00678 #endif
00679                     alphabet=alpha;
00680                     SG_REF(alphabet);
00681 
00682                     this->features=p_features;
00683                     this->num_vectors=p_num_vectors;
00684                     this->max_string_length=p_max_string_length;
00685 
00686                     return true;
00687                 }
00688                 else
00689                     delete alpha;
00690             }
00691 
00692             return false;
00693         }
00694 
00701         virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
00702         {
00703             num_str=num_vectors;
00704             max_str_len=max_string_length;
00705             return features;
00706         }
00707 
00713         virtual bool save(char* dest)
00714         {
00715             return false;
00716         }
00717 
00722         virtual int32_t get_size() { return sizeof(ST); }
00723 
00729         virtual bool apply_preproc(bool force_preprocessing=false)
00730         {
00731             SG_DEBUG( "force: %d\n", force_preprocessing);
00732 
00733             for (int32_t i=0; i<get_num_preproc(); i++)
00734             { 
00735                 if ( (!is_preprocessed(i) || force_preprocessing) )
00736                 {
00737                     set_preprocessed(i);
00738 
00739                     SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00740 
00741                     if (!((CStringPreProc<ST>*) get_preproc(i))->apply_to_string_features(this))
00742                         return false;
00743                 }
00744             }
00745             return true;
00746         }
00747 
00757         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
00758         {
00759             ASSERT(step_size>0);
00760             ASSERT(window_size>0);
00761             ASSERT(num_vectors==1 || single_string);
00762             ASSERT(max_string_length>=window_size ||
00763                     (single_string && length_of_single_string>=window_size));
00764 
00765             //in case we are dealing with a single remapped string
00766             //allow remapping
00767             if (single_string)
00768                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
00769             else if (num_vectors==1)
00770             {
00771                 num_vectors= (max_string_length-window_size)/step_size + 1;
00772                 length_of_single_string=max_string_length;
00773             }
00774 
00775             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00776             int32_t offs=0;
00777             for (int32_t i=0; i<num_vectors; i++)
00778             {
00779                 f[i].string=&features[0].string[offs+skip];
00780                 f[i].length=window_size-skip;
00781                 offs+=step_size;
00782             }
00783             single_string=features[0].string;
00784             delete[] features;
00785             features=f;
00786             selected_vector=0;
00787             max_string_length=window_size-skip;
00788 
00789             return num_vectors;
00790         }
00791 
00800         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
00801         {
00802             ASSERT(positions);
00803             ASSERT(window_size>0);
00804             ASSERT(num_vectors==1 || single_string);
00805             ASSERT(max_string_length>=window_size ||
00806                     (single_string && length_of_single_string>=window_size));
00807 
00808             num_vectors= positions->get_num_elements();
00809             ASSERT(num_vectors>0);
00810 
00811             int32_t len;
00812 
00813             //in case we are dealing with a single remapped string
00814             //allow remapping
00815             if (single_string)
00816                 len=length_of_single_string;
00817             else
00818             {
00819                 single_string=features[0].string;
00820                 len=max_string_length;
00821                 length_of_single_string=max_string_length;
00822             }
00823 
00824             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00825             for (int32_t i=0; i<num_vectors; i++)
00826             {
00827                 int32_t p=positions->get_element(i);
00828 
00829                 if (p>=0 && p<=len-window_size)
00830                 {
00831                     f[i].string=&features[0].string[p+skip];
00832                     f[i].length=window_size-skip;
00833                 }
00834                 else
00835                 {
00836                     num_vectors=1;
00837                     max_string_length=len;
00838                     features[0].length=len;
00839                     single_string=NULL;
00840                     delete[] f;
00841                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
00842                             window_size, i, p, len);
00843                     return -1;
00844                 }
00845             }
00846 
00847             delete[] features;
00848             features=f;
00849             selected_vector=0;
00850             max_string_length=window_size-skip;
00851 
00852             return num_vectors;
00853         }
00854 
00866         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
00867         {
00868             return obtain_from_char_features(sf, start, p_order, gap, rev);
00869         }
00870 
00880         template <class CT>
00881             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
00882             {
00883                 ASSERT(sf);
00884                 this->order=p_order;
00885                 cleanup();
00886                 delete[] symbol_mask_table;
00887                 symbol_mask_table=new ST[256];
00888 
00889                 num_vectors=sf->get_num_vectors();
00890                 ASSERT(num_vectors>0);
00891                 max_string_length=sf->get_max_vector_length()-start;
00892                 features=new T_STRING<ST>[num_vectors];
00893                 CAlphabet* alpha=sf->get_alphabet();
00894                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
00895 
00896                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*>\n", sf->get_num_symbols());
00897 
00898                 for (int32_t i=0; i<num_vectors; i++)
00899                 {
00900                     int32_t len=-1;
00901                     CT* c=sf->get_feature_vector(i, len);
00902 
00903                     features[i].string=new ST[len];
00904                     features[i].length=len;
00905 
00906                     ST* str=features[i].string;
00907                     for (int32_t j=0; j<len; j++)
00908                         str[j]=(ST) alpha->remap_to_bin(c[j]);
00909 
00910                 }
00911 
00912                 original_num_symbols=alpha->get_num_symbols();
00913                 int32_t max_val=alpha->get_num_bits();
00914 
00915                 if (p_order>1)
00916                     num_symbols=CMath::powl((float128_t) 2, (float128_t) max_val*p_order);
00917                 else
00918                     num_symbols=original_num_symbols;
00919                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
00920 
00921                 if ( ((float128_t) num_symbols) > CMath::powl(((float128_t) 2),((float128_t) sizeof(ST)*8)) )
00922                 {
00923                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00924                     return false;
00925                 }
00926 
00927                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
00928                 for (int32_t line=0; line<num_vectors; line++)
00929                 {
00930                     int32_t len=0;
00931                     ST* fv=get_feature_vector(line, len);
00932 
00933                     if (rev)
00934                         translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
00935                     else
00936                         translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
00937                     //translate_from_single_order(fv, len, start, p_order, max_val);
00938                     //translate_from_single_order_reversed(fv, len, start, p_order, max_val);
00939 
00940                     /* fix the length of the string -- hacky */
00941                     features[line].length-=start+gap ;
00942                     if (features[line].length<0)
00943                         features[line].length=0 ;
00944                 }         
00945 
00946                 uint64_t mask=0;
00947                 for (int32_t i=0; i< (int64_t) max_val; i++)
00948                     mask=(mask<<1) | 1;
00949 
00950                 for (int32_t i=0; i<256; i++)
00951                 {
00952                     uint8_t bits=(uint8_t) i;
00953                     symbol_mask_table[i]=0;
00954 
00955                     for (int32_t j=0; j<8; j++)
00956                     {
00957                         if (bits & 1)
00958                             symbol_mask_table[i]|=mask<<(max_val*j);
00959 
00960                         bits>>=1;
00961                     }
00962                 }
00963 
00964                 return true;
00965             }
00966 
00974         bool have_same_length(int32_t len)
00975         {
00976             if (len!=get_max_vector_length())
00977                 return false;
00978 
00979             for (int32_t i=0; i<num_vectors; i++)
00980             {
00981                 if (get_vector_length(i)!=len)
00982                     return false;
00983             }
00984 
00985             return true;
00986         }
00987 
00988     protected:
00997         void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00998         {
00999             int32_t i,j;
01000             ST value=0;
01001 
01002             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
01003             {
01004                 value=0;
01005                 for (j=i; j>=i-p_order+1; j--)
01006                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
01007 
01008                 obs[i]= (ST) value;
01009             }
01010 
01011             for (i=p_order-2;i>=0;i--)
01012             {
01013                 if (i>=sequence_length)
01014                     continue;
01015 
01016                 value=0;
01017                 for (j=i; j>=i-p_order+1; j--)
01018                 {
01019                     value= (value >> max_val);
01020                     if (j>=0 && j<sequence_length)
01021                         value|=obs[j] << (max_val * (p_order-1));
01022                 }
01023                 obs[i]=value;
01024             }
01025 
01026             for (i=start; i<sequence_length; i++)
01027                 obs[i-start]=obs[i];
01028         }
01029 
01038         void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01039         {
01040             int32_t i,j;
01041             ST value=0;
01042 
01043             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
01044             {
01045                 value=0;
01046                 for (j=i; j>=i-p_order+1; j--)
01047                     value= (value << max_val) | obs[j];
01048 
01049                 obs[i]= (ST) value;
01050             }
01051 
01052             for (i=p_order-2;i>=0;i--)
01053             {
01054                 if (i>=sequence_length)
01055                     continue;
01056 
01057                 value=0;
01058                 for (j=i; j>=i-p_order+1; j--)
01059                 {
01060                     value= (value << max_val);
01061                     if (j>=0 && j<sequence_length)
01062                         value|=obs[j];
01063                 }
01064                 obs[i]=value;
01065             }
01066 
01067             for (i=start; i<sequence_length; i++)
01068                 obs[i-start]=obs[i];
01069         }
01070 
01080         void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01081         {
01082             ASSERT(gap>=0);
01083 
01084             const int32_t start_gap=(p_order-gap)/2;
01085             const int32_t end_gap=start_gap+gap;
01086 
01087             int32_t i,j;
01088             ST value=0;
01089 
01090             // almost all positions
01091             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
01092             {
01093                 value=0;
01094                 for (j=i; j>=i-p_order+1; j--)
01095                 {
01096                     if (i-j<start_gap)
01097                     {
01098                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01099                     }
01100                     else if (i-j>=end_gap)
01101                     {
01102                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01103                     }
01104                 }
01105                 obs[i]= (ST) value;
01106             }
01107 
01108             // the remaining `order` positions
01109             for (i=p_order-2;i>=0;i--)
01110             {
01111                 if (i>=sequence_length)
01112                     continue;
01113 
01114                 value=0;
01115                 for (j=i; j>=i-p_order+1; j--)
01116                 {
01117                     if (i-j<start_gap)
01118                     {
01119                         value= (value >> max_val);
01120                         if (j>=0 && j<sequence_length)
01121                             value|=obs[j] << (max_val * (p_order-1-gap));
01122                     }
01123                     else if (i-j>=end_gap)
01124                     {
01125                         value= (value >> max_val);
01126                         if (j>=0 && j<sequence_length)
01127                             value|=obs[j] << (max_val * (p_order-1-gap));
01128                     }
01129                 }
01130                 obs[i]=value;
01131             }
01132 
01133             // shifting
01134             for (i=start; i<sequence_length; i++)
01135                 obs[i-start]=obs[i];
01136         }
01137 
01147         void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01148         {
01149             ASSERT(gap>=0);
01150 
01151             const int32_t start_gap=(p_order-gap)/2;
01152             const int32_t end_gap=start_gap+gap;
01153 
01154             int32_t i,j;
01155             ST value=0;
01156 
01157             // almost all positions
01158             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
01159             {
01160                 value=0;
01161                 for (j=i; j>=i-p_order+1; j--)
01162                 {
01163                     if (i-j<start_gap)
01164                         value= (value << max_val) | obs[j];
01165                     else if (i-j>=end_gap)
01166                         value= (value << max_val) | obs[j];
01167                 }
01168                 obs[i]= (ST) value;
01169             }
01170 
01171             // the remaining `order` positions
01172             for (i=p_order-2;i>=0;i--)
01173             {
01174                 if (i>=sequence_length)
01175                     continue;
01176 
01177                 value=0;
01178                 for (j=i; j>=i-p_order+1; j--)
01179                 {
01180                     if (i-j<start_gap)
01181                     {
01182                         value= value << max_val;
01183                         if (j>=0 && j<sequence_length)
01184                             value|=obs[j];
01185                     }
01186                     else if (i-j>=end_gap)
01187                     {
01188                         value= value << max_val;
01189                         if (j>=0 && j<sequence_length)
01190                             value|=obs[j];
01191                     }           
01192                 }
01193                 obs[i]=value;
01194             }
01195 
01196             // shifting
01197             for (i=start; i<sequence_length; i++)
01198                 obs[i-start]=obs[i];
01199         }
01200 
01201     protected:
01202 
              /** alphabet object; the EAlphabet constructor allocates it with new CAlphabet(alpha) */
01204         CAlphabet* alphabet;
01205 
              /** number of string vectors; per-vector loops in this class run over 0..num_vectors-1 */
01207         int32_t num_vectors;
01208 
              /** array of strings, each a T_STRING<ST> (pointer plus length); NULL by default.
               * NOTE(review): presumably holds num_vectors entries - confirm */
01210         T_STRING<ST>* features;
01211 
              /** NULL by default. NOTE(review): presumably one contiguous buffer used when
               * the features are built from a single long string - confirm against its users */
01213         ST* single_string;
01214 
              /** 0 by default. NOTE(review): presumably the length of single_string - confirm */
01216         int32_t length_of_single_string;
01217 
              /** 0 by default. NOTE(review): presumably the length of the longest string in features - confirm */
01219         int32_t max_string_length;
01220 
              /** symbol count, stored as float128_t.
               * NOTE(review): presumably the count after higher-order mapping - confirm */
01222         float128_t num_symbols;
01223 
              /** symbol count, stored as float128_t.
               * NOTE(review): presumably the count before higher-order mapping - confirm */
01225         float128_t original_num_symbols;
01226 
              /** 0 by default. NOTE(review): presumably the order used for the higher-order mapping - confirm */
01228         int32_t order;
01229 
              /** 0 by default. NOTE(review): presumably the index of a selected vector; its use is not visible here */
01231         int32_t selected_vector;
01232 
              /** NULL by default. NOTE(review): presumably a per-symbol mask lookup table; contents not visible here */
01234         ST* symbol_mask_table;
01235 };
01236 
01241 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01242 {
01243     return F_CHAR;
01244 }
01245 
01250 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01251 {
01252     return F_BYTE;
01253 }
01254 
01259 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01260 {
01261     return F_SHORT;
01262 }
01263 
01268 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01269 {
01270     return F_WORD;
01271 }
01272 
01277 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01278 {
01279     return F_INT;
01280 }
01281 
01286 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01287 {
01288     return F_UINT;
01289 }
01290 
01295 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01296 {
01297     return F_LONG;
01298 }
01299 
01304 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01305 {
01306     return F_ULONG;
01307 }
01308 
01313 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01314 {
01315     return F_DREAL;
01316 }
01317 #endif

SHOGUN Machine Learning Toolbox - Documentation