StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 
00016 #include "preproc/PreProc.h"
00017 #include "preproc/StringPreProc.h"
00018 #include "features/Features.h"
00019 #include "features/CharFeatures.h"
00020 #include "features/Alphabet.h"
00021 #include "lib/common.h"
00022 #include "lib/io.h"
00023 #include "lib/DynamicArray.h"
00024 #include "lib/File.h"
00025 #include "lib/Mathematics.h"
00026 
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033 
00034 class CFile;
00035 
00036 template <class ST> class CStringPreProc;
00037 
00039 template <class T> struct T_STRING
00040 {
00042     T* string;
00044     INT length;
00045 };
00046 
00047 template <class T> CHAR* get_zero_terminated_string_copy(T_STRING<T> str)
00048 {
00049     INT l=str.length;
00050     CHAR* s=new CHAR[l+1];
00051     memcpy(s, str.string, sizeof(CHAR)*l);
00052     s[l]='\0';
00053     return s;
00054 }
00055 
00059 template <class ST> class CStringFeatures : public CFeatures
00060 {
00061     public:
00066         CStringFeatures(E_ALPHABET alpha)
00067         : CFeatures(0), num_vectors(0), features(NULL),
00068             single_string(NULL),length_of_single_string(0),
00069             max_string_length(0), order(0), selected_vector(0),
00070             symbol_mask_table(NULL)
00071         {
00072             alphabet=new CAlphabet(alpha);
00073             SG_REF(alphabet);
00074             num_symbols=alphabet->get_num_symbols();
00075             original_num_symbols=num_symbols;
00076         }
00077 
00082         CStringFeatures(CAlphabet* alpha)
00083         : CFeatures(0), num_vectors(0), features(NULL),
00084             single_string(NULL),length_of_single_string(0),
00085             max_string_length(0), order(0), selected_vector(0),
00086             symbol_mask_table(NULL)
00087     {
00088         ASSERT(alpha);
00089         alphabet=new CAlphabet(alpha);
00090         num_symbols=alphabet->get_num_symbols();
00091         original_num_symbols=num_symbols;
00092     }
00093 
00095         CStringFeatures(const CStringFeatures & orig)
00096         : CFeatures(orig), num_vectors(orig.num_vectors),
00097             single_string(orig.single_string),
00098             length_of_single_string(orig.length_of_single_string),
00099             max_string_length(orig.max_string_length),
00100             num_symbols(orig.num_symbols),
00101             original_num_symbols(orig.original_num_symbols),
00102             order(orig.order), selected_vector(orig.selected_vector)
00103         {
00104             ASSERT(orig.single_string == NULL); //not implemented
00105 
00106             alphabet=new CAlphabet(orig.alphabet);
00107             SG_REF(alphabet);
00108 
00109             if (orig.features)
00110             {
00111                 features=new T_STRING<ST>[orig.num_vectors];
00112 
00113                 for (INT i=0; i<num_vectors; i++)
00114                 {
00115                     features[i].string=new ST[orig.features[i].length];
00116                     ASSERT(features[i].string);
00117                     features[i].length=orig.features[i].length;
00118                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length); 
00119                 }
00120             }
00121 
00122             if (orig.symbol_mask_table)
00123             {
00124                 symbol_mask_table=new ST[256];
00125                 for (INT i=0; i<256; i++)
00126                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00127             }
00128         }
00129 
00135         CStringFeatures(char* fname, E_ALPHABET alpha=DNA)
00136         : CFeatures(fname), num_vectors(0),
00137             features(NULL), single_string(NULL),
00138             length_of_single_string(0), max_string_length(0),
00139             order(0), selected_vector(0), symbol_mask_table(NULL)
00140         {
00141             alphabet=new CAlphabet(alpha);
00142             SG_REF(alphabet);
00143             num_symbols=alphabet->get_num_symbols();
00144             original_num_symbols=num_symbols;
00145             load(fname);
00146         }
00147 
00148         virtual ~CStringFeatures()
00149         {
00150             cleanup();
00151 
00152 #ifdef HAVE_SWIG
00153             SG_UNREF(alphabet);
00154 #else
00155             delete alphabet;
00156 #endif
00157         }
00158 
00160         void cleanup()
00161         {
00162             if (single_string)
00163             {
00164                 delete[] single_string;
00165                 single_string=NULL;
00166             }
00167             else
00168             {
00169                 for (int i=0; i<num_vectors; i++)
00170                 {
00171                     delete[] features[i].string;
00172                     features[i].length=0;
00173                 }
00174             }
00175             num_vectors=0;
00176             delete[] features;
00177 
00178             delete[] symbol_mask_table;
00179             alphabet->clear_histogram();
00180         }
00181 
00186         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00187 
00192         inline virtual EFeatureType get_feature_type();
00193 
00198         inline CAlphabet* get_alphabet()
00199         {
00200             SG_REF(alphabet);
00201             return alphabet;
00202         }
00203 
00208         virtual CFeatures* duplicate() const
00209         {
00210             return new CStringFeatures<ST>(*this);
00211         }
00212 
00217         void select_feature_vector(INT num)
00218         {
00219             ASSERT(features);
00220             ASSERT(num<num_vectors);
00221 
00222             selected_vector=num;
00223         }
00224 
00230         void get_string(ST** dst, INT* len)
00231         {
00232             ASSERT(features);
00233             ASSERT(selected_vector<num_vectors);
00234 
00235             *len=features[selected_vector].length;
00236             *dst=new ST[*len];
00237             memcpy(*dst, features[selected_vector].string, *len * sizeof(ST));
00238         }
00239 
00246         virtual ST* get_feature_vector(INT num, INT& len)
00247         {
00248             ASSERT(features);
00249             ASSERT(num<num_vectors);
00250 
00251             len=features[num].length;
00252             return features[num].string;
00253         }
00254 
00261         virtual void set_feature_vector(INT num, ST* string, INT len)
00262         {
00263             ASSERT(features);
00264             ASSERT(num<num_vectors);
00265 
00266             features[num].length=len ;
00267             features[num].string=string ;
00268         }
00269 
00276         virtual ST inline get_feature(INT vec_num, INT feat_num)
00277         {
00278             ASSERT(features && vec_num<num_vectors);
00279             ASSERT(feat_num<features[vec_num].length);
00280 
00281             return features[vec_num].string[feat_num];
00282         }
00283 
00289         virtual inline INT get_vector_length(INT vec_num)
00290         {
00291             ASSERT(features && vec_num<num_vectors);
00292             return features[vec_num].length;
00293         }
00294 
00299         virtual inline INT get_max_vector_length()
00300         {
00301             return max_string_length;
00302         }
00303 
00308         virtual inline INT get_num_vectors() { return num_vectors; }
00309 
00316         inline LONGREAL get_num_symbols() { return num_symbols; }
00317 
00324         inline LONGREAL get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00325 
00326         // these functions are necessary to find out about a former conversion process
00327 
00332         inline LONGREAL get_original_num_symbols() { return original_num_symbols; }
00333 
00338         inline INT get_order() { return order; }
00339 
00347         inline ST get_masked_symbols(ST symbol, BYTE mask)
00348         {
00349             ASSERT(symbol_mask_table);
00350             return symbol_mask_table[mask] & symbol;
00351         }
00352 
00359         inline ST shift_offset(ST offset, INT amount)
00360         {
00361             ASSERT(alphabet);
00362             return (offset << (amount*alphabet->get_num_bits()));
00363         }
00364 
00371         inline ST shift_symbol(ST symbol, INT amount)
00372         {
00373             ASSERT(alphabet);
00374             return (symbol >> (amount*alphabet->get_num_bits()));
00375         }
00376 
00382         virtual bool load(CHAR* fname)
00383         {
00384             SG_INFO( "loading...\n");
00385             LONG length=0;
00386             max_string_length=0;
00387 
00388             CFile f(fname, 'r', F_CHAR);
00389             CHAR* feature_matrix=f.load_char_data(NULL, length);
00390 
00391             num_vectors=0;
00392 
00393             if (f.is_ok())
00394             {
00395                 for (long i=0; i<length; i++)
00396                 {
00397                     if (feature_matrix[i]=='\n')
00398                         num_vectors++;
00399                 }
00400 
00401                 SG_INFO( "file contains %ld vectors\n", num_vectors);
00402                 features= new T_STRING<ST>[num_vectors];
00403 
00404                 long index=0;
00405                 for (INT lines=0; lines<num_vectors; lines++)
00406                 {
00407                     CHAR* p=&feature_matrix[index];
00408                     INT columns=0;
00409 
00410                     for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00411 
00412                     if (index+columns>=length && p[columns]!='\n') {
00413                         SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00414                     }
00415 
00416                     features[lines].length=columns;
00417                     features[lines].string=new ST[columns];
00418 
00419                     max_string_length=CMath::max(max_string_length,columns);
00420 
00421                     for (INT i=0; i<columns; i++)
00422                         features[lines].string[i]= ((ST) p[i]);
00423 
00424                     index+= features[lines].length+1;
00425                 }
00426 
00427                 num_symbols=4; //FIXME
00428                 return true;
00429             }
00430             else
00431                 SG_ERROR( "reading file failed\n");
00432 
00433             return false;
00434         }
00435 
00442         bool load_dna_file(CHAR* fname, bool remap_to_bin=true)
00443         {
00444             bool result=false;
00445 
00446             size_t blocksize=1024*1024;
00447             size_t required_blocksize=0;
00448             BYTE* dummy=new BYTE[blocksize];
00449             BYTE* overflow=NULL;
00450             INT overflow_len=0;
00451 
00452             num_symbols=4;
00453             cleanup();
00454 
00455             CAlphabet* alpha=new CAlphabet(DNA);
00456 
00457             FILE* f=fopen(fname, "ro");
00458 
00459             if (f)
00460             {
00461                 num_vectors=0;
00462                 max_string_length=0;
00463 
00464                 SG_INFO("counting line numbers in file %s\n", fname);
00465                 SG_DEBUG("block_size=%d\n", required_blocksize);
00466                 size_t sz=blocksize;
00467                 size_t block_offs=0;
00468                 size_t old_block_offs=0;
00469                 fseek(f, 0, SEEK_END);
00470                 size_t fsize=ftell(f);
00471                 rewind(f);
00472 
00473                 while (sz == blocksize)
00474                 {
00475                     sz=fread(dummy, sizeof(BYTE), blocksize, f);
00476                     bool contains_cr=false;
00477                     for (size_t i=0; i<sz; i++)
00478                     {
00479                         block_offs++;
00480                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00481                         {
00482                             num_vectors++;
00483                             contains_cr=true;
00484                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00485                             old_block_offs=block_offs;
00486                         }
00487                     }
00488                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00489                 }
00490 
00491                 SG_INFO("found %d strings\n", num_vectors);
00492                 delete[] dummy;
00493                 blocksize=required_blocksize;
00494                 dummy = new BYTE[blocksize];
00495                 overflow = new BYTE[blocksize];
00496                 features=new T_STRING<ST>[num_vectors];
00497 
00498                 rewind(f);
00499                 sz=blocksize;
00500                 INT lines=0;
00501                 while (sz == blocksize)
00502                 {
00503                     sz=fread(dummy, sizeof(BYTE), blocksize, f);
00504 
00505                     size_t old_sz=0;
00506                     for (size_t i=0; i<sz; i++)
00507                     {
00508                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00509                         {
00510                             INT len=i-old_sz;
00511                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00512                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00513 
00514                             features[lines].length=len;
00515                             features[lines].string=new ST[len];
00516 
00517                             if (remap_to_bin)
00518                             {
00519                                 for (INT j=0; j<overflow_len; j++)
00520                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00521                                 for (INT j=0; j<len; j++)
00522                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00523                             }
00524                             else
00525                             {
00526                                 for (INT j=0; j<overflow_len; j++)
00527                                     features[lines].string[j]=overflow[j];
00528                                 for (INT j=0; j<len; j++)
00529                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00530                             }
00531 
00532                             // clear overflow
00533                             overflow_len=0;
00534 
00535                             //CMath::display_vector(features[lines].string, len);
00536                             old_sz=i+1;
00537                             lines++;
00538                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00539                         }
00540                     }
00541                     for (size_t i=old_sz; i<sz; i++)
00542                         overflow[i-old_sz]=dummy[i];
00543 
00544                     overflow_len=sz-old_sz;
00545                 }
00546                 result=true;
00547                 SG_INFO("file successfully read\n");
00548                 SG_INFO("max_string_length=%d\n", max_string_length);
00549                 SG_INFO("num_strings=%d\n", num_vectors);
00550             }
00551 
00552             fclose(f);
00553             delete alpha;
00554             delete[] dummy;
00555 
00556 #ifdef HAVE_SWIG
00557             SG_UNREF(alphabet);
00558 #else
00559             delete alphabet;
00560 #endif
00561             if (remap_to_bin)
00562                 alphabet = new CAlphabet(RAWDNA);
00563             else
00564                 alphabet = new CAlphabet(DNA);
00565             SG_REF(alphabet);
00566 
00567             return result;
00568         }
00569 
00575         bool load_from_directory(CHAR* dirname)
00576         {
00577             struct dirent **namelist;
00578             int n;
00579 
00580             io.set_dirname(dirname);
00581 
00582             n = scandir(dirname, &namelist, io.filter, alphasort);
00583             if (n <= 0)
00584             {
00585                 SG_ERROR( "error calling scandir\n");
00586                 return false;
00587             }
00588             else
00589             {
00590                 T_STRING<ST>* strings=NULL;
00591                 alphabet->clear_histogram();
00592 
00593                 INT num=0;
00594                 INT max_len=-1;
00595 
00596                 //usually n==num_vec, but it might not in race conditions 
00597                 //(file perms modified, file erased)
00598                 strings=new T_STRING<ST>[n];
00599 
00600                 for (int i=0; i<n; i++)
00601                 {
00602                     CHAR* fname=io.concat_filename(namelist[i]->d_name);
00603 
00604                     struct stat s;
00605                     off_t filesize=0;
00606 
00607                     if (!stat(fname, &s) && s.st_size>0)
00608                     {
00609                         filesize=s.st_size/sizeof(ST);
00610 
00611                         FILE* f=fopen(fname, "ro");
00612                         if (f)
00613                         {
00614                             ST* str=new ST[filesize];
00615                             SG_DEBUG("%s:%ld\n", fname, (long int) filesize);
00616                             fread(str, sizeof(ST), filesize, f);
00617                             strings[num].string=str;
00618                             strings[num].length=filesize;
00619                             max_len=CMath::max(max_len, strings[num].length);
00620 
00621                             num++;
00622                             fclose(f);
00623                         }
00624                     }
00625                     else
00626                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
00627 
00628                     free(namelist[i]);
00629                 }
00630                 free(namelist);
00631 
00632                 if (num>0 && strings)
00633                 {
00634                     set_features(strings, num, max_len);
00635                     return true;
00636                 }
00637             }
00638             return false;
00639         }
00640 
00648         bool set_features(T_STRING<ST>* p_features, INT p_num_vectors, INT p_max_string_length)
00649         {
00650             if (p_features)
00651             {
00652                 CAlphabet* alpha=new CAlphabet(alphabet);
00653 
00654                 //compute histogram for char/byte
00655                 for (INT i=0; i<p_num_vectors; i++)
00656                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00657 
00658                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00659                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00660 
00661                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00662                 {
00663                     cleanup();
00664 
00665 #ifdef HAVE_SWIG
00666                     SG_UNREF(alphabet);
00667 #else
00668                     delete alphabet;
00669 #endif
00670                     alphabet=alpha;
00671                     SG_REF(alphabet);
00672 
00673                     this->features=p_features;
00674                     this->num_vectors=p_num_vectors;
00675                     this->max_string_length=p_max_string_length;
00676 
00677                     return true;
00678                 }
00679                 else
00680                     delete alpha;
00681             }
00682 
00683             return false;
00684         }
00685 
00692         virtual T_STRING<ST>* get_features(INT& num_str, INT& max_str_len)
00693         {
00694             num_str=num_vectors;
00695             max_str_len=max_string_length;
00696             return features;
00697         }
00698 
00704         virtual bool save(CHAR* dest)
00705         {
00706             return false;
00707         }
00708 
00713         virtual INT get_size() { return sizeof(ST); }
00714 
00720         virtual bool apply_preproc(bool force_preprocessing=false)
00721         {
00722             SG_DEBUG( "force: %d\n", force_preprocessing);
00723 
00724             for (INT i=0; i<get_num_preproc(); i++)
00725             { 
00726                 if ( (!is_preprocessed(i) || force_preprocessing) )
00727                 {
00728                     set_preprocessed(i);
00729 
00730                     SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00731 
00732                     if (!((CStringPreProc<ST>*) get_preproc(i))->apply_to_string_features(this))
00733                         return false;
00734                 }
00735             }
00736             return true;
00737         }
00738 
00748         INT obtain_by_sliding_window(INT window_size, INT step_size, INT skip=0)
00749         {
00750             ASSERT(step_size>0);
00751             ASSERT(window_size>0);
00752             ASSERT(num_vectors==1 || single_string);
00753             ASSERT(max_string_length>=window_size ||
00754                     (single_string && length_of_single_string>=window_size));
00755 
00756             //in case we are dealing with a single remapped string
00757             //allow remapping
00758             if (single_string)
00759                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
00760             else if (num_vectors==1)
00761             {
00762                 num_vectors= (max_string_length-window_size)/step_size + 1;
00763                 length_of_single_string=max_string_length;
00764             }
00765 
00766             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00767             INT offs=0;
00768             for (INT i=0; i<num_vectors; i++)
00769             {
00770                 f[i].string=&features[0].string[offs+skip];
00771                 f[i].length=window_size-skip;
00772                 offs+=step_size;
00773             }
00774             single_string=features[0].string;
00775             delete[] features;
00776             features=f;
00777             selected_vector=0;
00778             max_string_length=window_size-skip;
00779 
00780             return num_vectors;
00781         }
00782 
00791         INT obtain_by_position_list(INT window_size, CDynamicArray<INT>* positions, INT skip=0)
00792         {
00793             ASSERT(positions);
00794             ASSERT(window_size>0);
00795             ASSERT(num_vectors==1 || single_string);
00796             ASSERT(max_string_length>=window_size ||
00797                     (single_string && length_of_single_string>=window_size));
00798 
00799             num_vectors= positions->get_num_elements();
00800             ASSERT(num_vectors>0);
00801 
00802             INT len;
00803 
00804             //in case we are dealing with a single remapped string
00805             //allow remapping
00806             if (single_string)
00807                 len=length_of_single_string;
00808             else
00809             {
00810                 single_string=features[0].string;
00811                 len=max_string_length;
00812                 length_of_single_string=max_string_length;
00813             }
00814 
00815             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00816             for (INT i=0; i<num_vectors; i++)
00817             {
00818                 INT p=positions->get_element(i);
00819 
00820                 if (p>=0 && p<=len-window_size)
00821                 {
00822                     f[i].string=&features[0].string[p+skip];
00823                     f[i].length=window_size-skip;
00824                 }
00825                 else
00826                 {
00827                     num_vectors=1;
00828                     max_string_length=len;
00829                     features[0].length=len;
00830                     single_string=NULL;
00831                     delete[] f;
00832                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
00833                             window_size, i, p, len);
00834                     return -1;
00835                 }
00836             }
00837 
00838             delete[] features;
00839             features=f;
00840             selected_vector=0;
00841             max_string_length=window_size-skip;
00842 
00843             return num_vectors;
00844         }
00845 
00857         inline bool obtain_from_char(CStringFeatures<CHAR>* sf, INT start, INT p_order, INT gap, bool rev)
00858         {
00859             return obtain_from_char_features(sf, start, p_order, gap, rev);
00860         }
00861 
00871         template <class CT>
00872             bool obtain_from_char_features(CStringFeatures<CT>* sf, INT start, INT p_order, INT gap, bool rev)
00873             {
00874                 ASSERT(sf);
00875                 this->order=p_order;
00876                 cleanup();
00877                 delete[] symbol_mask_table;
00878                 symbol_mask_table=new ST[256];
00879 
00880                 num_vectors=sf->get_num_vectors();
00881                 ASSERT(num_vectors>0);
00882                 max_string_length=sf->get_max_vector_length()-start;
00883                 features=new T_STRING<ST>[num_vectors];
00884                 CAlphabet* alpha=sf->get_alphabet();
00885                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
00886 
00887                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*>\n", sf->get_num_symbols());
00888 
00889                 for (INT i=0; i<num_vectors; i++)
00890                 {
00891                     INT len=-1;
00892                     CT* c=sf->get_feature_vector(i, len);
00893 
00894                     features[i].string=new ST[len];
00895                     features[i].length=len;
00896 
00897                     ST* str=features[i].string;
00898                     for (INT j=0; j<len; j++)
00899                         str[j]=(ST) alpha->remap_to_bin(c[j]);
00900 
00901                 }
00902 
00903                 original_num_symbols=alpha->get_num_symbols();
00904                 INT max_val=alpha->get_num_bits();
00905 
00906                 if (p_order>1)
00907                     num_symbols=CMath::powl((long double) 2, (long double) max_val*p_order);
00908                 else
00909                     num_symbols=original_num_symbols;
00910                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
00911 
00912                 if ( ((long double) num_symbols) > CMath::powl(((long double) 2),((long double) sizeof(ST)*8)) )
00913                 {
00914                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00915                     return false;
00916                 }
00917 
00918                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
00919                 for (INT line=0; line<num_vectors; line++)
00920                 {
00921                     INT len=0;
00922                     ST* fv=get_feature_vector(line, len);
00923 
00924                     if (rev)
00925                         translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
00926                     else
00927                         translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
00928                     //translate_from_single_order(fv, len, start, p_order, max_val);
00929                     //translate_from_single_order_reversed(fv, len, start, p_order, max_val);
00930 
00931                     /* fix the length of the string -- hacky */
00932                     features[line].length-=start+gap ;
00933                     if (features[line].length<0)
00934                         features[line].length=0 ;
00935                 }         
00936 
00937                 ULONG mask=0;
00938                 for (INT i=0; i< (LONG) max_val; i++)
00939                     mask=(mask<<1) | 1;
00940 
00941                 for (INT i=0; i<256; i++)
00942                 {
00943                     BYTE bits=(BYTE) i;
00944                     symbol_mask_table[i]=0;
00945 
00946                     for (INT j=0; j<8; j++)
00947                     {
00948                         if (bits & 1)
00949                             symbol_mask_table[i]|=mask<<(max_val*j);
00950 
00951                         bits>>=1;
00952                     }
00953                 }
00954 
00955                 return true;
00956             }
00957 
00965         bool have_same_length(INT len)
00966         {
00967             if (len!=get_max_vector_length())
00968                 return false;
00969 
00970             for (INT i=0; i<num_vectors; i++)
00971             {
00972                 if (get_vector_length(i)!=len)
00973                     return false;
00974             }
00975 
00976             return true;
00977         }
00978 
00979     protected:
00988         void translate_from_single_order(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val)
00989         {
00990             INT i,j;
00991             ST value=0;
00992 
00993             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00994             {
00995                 value=0;
00996                 for (j=i; j>=i-p_order+1; j--)
00997                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00998 
00999                 obs[i]= (ST) value;
01000             }
01001 
01002             for (i=p_order-2;i>=0;i--)
01003             {
01004                 if (i>=sequence_length)
01005                     continue;
01006 
01007                 value=0;
01008                 for (j=i; j>=i-p_order+1; j--)
01009                 {
01010                     value= (value >> max_val);
01011                     if (j>=0 && j<sequence_length)
01012                         value|=obs[j] << (max_val * (p_order-1));
01013                 }
01014                 obs[i]=value;
01015             }
01016 
01017             for (i=start; i<sequence_length; i++)
01018                 obs[i-start]=obs[i];
01019         }
01020 
//            Same in-place conversion as translate_from_single_order, but with the
//            symbol order inside the word mirrored. With s[] denoting the original
//            symbols, position i ends up holding
//                s[i]<<(max_val*(p_order-1)) | ... | s[i-p_order+2]<<max_val | s[i-p_order+1],
//            i.e. the symbol at i occupies the highest slot and the oldest symbol of the
//            window the lowest max_val bits; symbols before index 0 count as 0.
//              obs             - symbol buffer, overwritten with the packed words
//              sequence_length - number of entries of obs that are processed
//              start           - after packing, entries are moved down by this many
//                                positions (obs[i-start]=obs[i]); not validated here
//              p_order         - number of symbols combined into one word
//              max_val         - used only as a shift count, i.e. bits per symbol
//            NOTE(review): assumes each input symbol fits into max_val bits and that
//            p_order*max_val fits into ST - confirm against callers.
01029         void translate_from_single_order_reversed(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val)
01030         {
01031             INT i,j;
01032             ST value=0;
01033 
//                main pass: positions with a complete window; walking from the end keeps
//                obs[j], j<i, unconverted until it has been read
01034             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
01035             {
01036                 value=0;
//                    append obs[j] in the lowest bits, earlier insertions move one slot up
01037                 for (j=i; j>=i-p_order+1; j--)
01038                     value= (value << max_val) | obs[j];
01039 
01040                 obs[i]= (ST) value;
01041             }
01042 
//                prefix pass: positions 0..p_order-2, whose window starts before index 0
01043             for (i=p_order-2;i>=0;i--)
01044             {
//                    sequence shorter than p_order-1: there is no entry i to fill
01045                 if (i>=sequence_length)
01046                     continue;
01047 
01048                 value=0;
01049                 for (j=i; j>=i-p_order+1; j--)
01050                 {
//                        always shift, so slots of missing symbols stay 0
01051                     value= (value << max_val);
01052                     if (j>=0 && j<sequence_length)
01053                         value|=obs[j];
01054                 }
01055                 obs[i]=value;
01056             }
01057 
//                move the words for positions start..sequence_length-1 to the front;
//                the last `start` entries of obs keep their previous contents
01058             for (i=start; i<sequence_length; i++)
01059                 obs[i-start]=obs[i];
01060         }
01061 
//            Gapped variant of the in-place conversion: every position i is replaced by
//            a word built from the window obs[i-p_order+1..i], leaving out `gap`
//            consecutive window positions. With d=i-j the distance back from i, symbols
//            with start_gap<=d<end_gap are skipped entirely (no shift, no insert); the
//            remaining p_order-gap symbols are packed into slots of max_val bits, the
//            first kept symbol (smallest d) ending up in the lowest slot.
//              obs             - symbol buffer, overwritten with the packed words
//              sequence_length - number of entries of obs that are processed
//              start           - after packing, entries are moved down by this many
//                                positions (obs[i-start]=obs[i]); not validated here
//              p_order         - window size in symbols, including the gap
//              max_val         - used only as a shift count, i.e. bits per symbol
//              gap             - number of skipped window positions; asserted >=0
//            NOTE(review): only gap>=0 is checked; the packing arithmetic presumably
//            expects gap<p_order and symbols that fit into max_val bits - confirm.
01071         void translate_from_single_order(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
01072         {
01073             ASSERT(gap>=0);
01074 
//                skipped distances are start_gap..end_gap-1 (gap placed in the middle of
//                the window, integer division rounds start_gap down)
01075             const INT start_gap=(p_order-gap)/2;
01076             const INT end_gap=start_gap+gap;
01077 
01078             INT i,j;
01079             ST value=0;
01080 
01081             // almost all positions
01082             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
01083             {
01084                 value=0;
01085                 for (j=i; j>=i-p_order+1; j--)
01086                 {
//                        both branches perform the same update; together they select the
//                        distances outside the gap
01087                     if (i-j<start_gap)
01088                     {
//                            insert into the top of the p_order-gap slots, older picks move down
01089                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01090                     }
01091                     else if (i-j>=end_gap)
01092                     {
01093                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01094                     }
01095                 }
01096                 obs[i]= (ST) value;
01097             }
01098 
01099             // the remaining `order` positions
//                (window starts before index 0; symbols that do not exist count as 0)
01100             for (i=p_order-2;i>=0;i--)
01101             {
//                    sequence shorter than p_order-1: there is no entry i to fill
01102                 if (i>=sequence_length)
01103                     continue;
01104 
01105                 value=0;
01106                 for (j=i; j>=i-p_order+1; j--)
01107                 {
01108                     if (i-j<start_gap)
01109                     {
//                            kept distance: always shift, insert only if j is inside the sequence
01110                         value= (value >> max_val);
01111                         if (j>=0 && j<sequence_length)
01112                             value|=obs[j] << (max_val * (p_order-1-gap));
01113                     }
01114                     else if (i-j>=end_gap)
01115                     {
01116                         value= (value >> max_val);
01117                         if (j>=0 && j<sequence_length)
01118                             value|=obs[j] << (max_val * (p_order-1-gap));
01119                     }
01120                 }
01121                 obs[i]=value;
01122             }
01123 
01124             // shifting
//                (words for positions start..sequence_length-1 go to the front; the last
//                `start` entries of obs keep their previous contents)
01125             for (i=start; i<sequence_length; i++)
01126                 obs[i-start]=obs[i];
01127         }
01128 
//            Gapped variant of translate_from_single_order_reversed: every position i is
//            replaced by a word built from the window obs[i-p_order+1..i], leaving out
//            `gap` consecutive window positions. With d=i-j the distance back from i,
//            symbols with start_gap<=d<end_gap are skipped entirely (no shift, no
//            insert); each kept symbol is appended in the lowest max_val bits, so the
//            first kept symbol (smallest d) ends up in the highest used slot.
//              obs             - symbol buffer, overwritten with the packed words
//              sequence_length - number of entries of obs that are processed
//              start           - after packing, entries are moved down by this many
//                                positions (obs[i-start]=obs[i]); not validated here
//              p_order         - window size in symbols, including the gap
//              max_val         - used only as a shift count, i.e. bits per symbol
//              gap             - number of skipped window positions; asserted >=0
//            NOTE(review): only gap>=0 is checked; symbols are presumably expected to
//            fit into max_val bits and (p_order-gap)*max_val into ST - confirm.
01138         void translate_from_single_order_reversed(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
01139         {
01140             ASSERT(gap>=0);
01141 
//                skipped distances are start_gap..end_gap-1 (gap placed in the middle of
//                the window, integer division rounds start_gap down)
01142             const INT start_gap=(p_order-gap)/2;
01143             const INT end_gap=start_gap+gap;
01144 
01145             INT i,j;
01146             ST value=0;
01147 
01148             // almost all positions
01149             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
01150             {
01151                 value=0;
01152                 for (j=i; j>=i-p_order+1; j--)
01153                 {
//                        both branches perform the same update; together they select the
//                        distances outside the gap
01154                     if (i-j<start_gap)
01155                         value= (value << max_val) | obs[j];
01156                     else if (i-j>=end_gap)
01157                         value= (value << max_val) | obs[j];
01158                 }
01159                 obs[i]= (ST) value;
01160             }
01161 
01162             // the remaining `order` positions
//                (window starts before index 0; symbols that do not exist count as 0)
01163             for (i=p_order-2;i>=0;i--)
01164             {
//                    sequence shorter than p_order-1: there is no entry i to fill
01165                 if (i>=sequence_length)
01166                     continue;
01167 
01168                 value=0;
01169                 for (j=i; j>=i-p_order+1; j--)
01170                 {
01171                     if (i-j<start_gap)
01172                     {
//                            kept distance: always shift, insert only if j is inside the sequence
01173                         value= value << max_val;
01174                         if (j>=0 && j<sequence_length)
01175                             value|=obs[j];
01176                     }
01177                     else if (i-j>=end_gap)
01178                     {
01179                         value= value << max_val;
01180                         if (j>=0 && j<sequence_length)
01181                             value|=obs[j];
01182                     }           
01183                 }
01184                 obs[i]=value;
01185             }
01186 
01187             // shifting
//                (words for positions start..sequence_length-1 go to the front; the last
//                `start` entries of obs keep their previous contents)
01188             for (i=start; i<sequence_length; i++)
01189                 obs[i-start]=obs[i];
01190         }
01191 
01192     protected:
01193 
//            Alphabet object; the E_ALPHABET constructor allocates it with new CAlphabet(alpha).
01195         CAlphabet* alphabet;
01196 
//            Set to 0 by the E_ALPHABET constructor; presumably the number of entries
//            in `features` - TODO confirm.
01198         INT num_vectors;
01199 
//            Pointer to T_STRING<ST> records (each a `string` pointer plus `length`);
//            NULL after the E_ALPHABET constructor.
01201         T_STRING<ST>* features;
01202 
//            NULL after the E_ALPHABET constructor; presumably one contiguous string
//            used as alternative storage - TODO confirm usage.
01204         ST* single_string;
01205 
//            Set to 0 by the E_ALPHABET constructor; presumably the length of
//            `single_string` - TODO confirm.
01207         INT length_of_single_string;
01208 
//            Set to 0 by the E_ALPHABET constructor; presumably the length of the
//            longest string in `features` - TODO confirm.
01210         INT max_string_length;
01211 
//            NOTE(review): looks like the symbol count of the current (possibly
//            higher-order) representation - confirm where it is set.
01213         LONGREAL num_symbols;
01214 
//            NOTE(review): looks like the symbol count before any order conversion -
//            confirm where it is set.
01216         LONGREAL original_num_symbols;
01217 
//            Set to 0 by the E_ALPHABET constructor; presumably the order the stored
//            symbols were converted to - TODO confirm.
01219         INT order;
01220 
//            Set to 0 by the E_ALPHABET constructor; presumably the index of the
//            currently selected string - TODO confirm.
01222         INT selected_vector;
01223 
//            NULL after the E_ALPHABET constructor; NOTE(review): purpose of the table
//            is not visible in this part of the file.
01225         ST* symbol_mask_table;
01226 };
01227 
//    Explicit specialization for ST=CHAR: reports F_CHAR as the feature type.
01232 template<> inline EFeatureType CStringFeatures<CHAR>::get_feature_type()
01233 {
01234     return F_CHAR;
01235 }
01236 
//    Explicit specialization for ST=BYTE: reports F_BYTE as the feature type.
01241 template<> inline EFeatureType CStringFeatures<BYTE>::get_feature_type()
01242 {
01243     return F_BYTE;
01244 }
01245 
//    Explicit specialization for ST=SHORT: reports F_SHORT as the feature type.
01250 template<> inline EFeatureType CStringFeatures<SHORT>::get_feature_type()
01251 {
01252     return F_SHORT;
01253 }
01254 
//    Explicit specialization for ST=WORD: reports F_WORD as the feature type.
01259 template<> inline EFeatureType CStringFeatures<WORD>::get_feature_type()
01260 {
01261     return F_WORD;
01262 }
01263 
//    Explicit specialization for ST=INT: reports F_INT as the feature type.
01268 template<> inline EFeatureType CStringFeatures<INT>::get_feature_type()
01269 {
01270     return F_INT;
01271 }
01272 
//    Explicit specialization for ST=UINT: reports F_UINT as the feature type.
01277 template<> inline EFeatureType CStringFeatures<UINT>::get_feature_type()
01278 {
01279     return F_UINT;
01280 }
01281 
//    Explicit specialization for ST=LONG: reports F_LONG as the feature type.
01286 template<> inline EFeatureType CStringFeatures<LONG>::get_feature_type()
01287 {
01288     return F_LONG;
01289 }
01290 
//    Explicit specialization for ST=ULONG: reports F_ULONG as the feature type.
01295 template<> inline EFeatureType CStringFeatures<ULONG>::get_feature_type()
01296 {
01297     return F_ULONG;
01298 }
01299 
//    Explicit specialization for ST=DREAL: reports F_DREAL as the feature type.
01304 template<> inline EFeatureType CStringFeatures<DREAL>::get_feature_type()
01305 {
01306     return F_DREAL;
01307 }
01308 #endif

SHOGUN Machine Learning Toolbox - Documentation