00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015
00016 #include "preproc/PreProc.h"
00017 #include "preproc/StringPreProc.h"
00018 #include "features/Features.h"
00019 #include "features/CharFeatures.h"
00020 #include "features/Alphabet.h"
00021 #include "lib/common.h"
00022 #include "lib/io.h"
00023 #include "lib/DynamicArray.h"
00024 #include "lib/File.h"
00025 #include "lib/Mathematics.h"
00026
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033
00034 class CFile;
00035
00036 template <class ST> class CStringPreProc;
00037
00039 template <class T> struct T_STRING
00040 {
00042 T* string;
00044 INT length;
00045 };
00046
00047 template <class T> CHAR* get_zero_terminated_string_copy(T_STRING<T> str)
00048 {
00049 INT l=str.length;
00050 CHAR* s=new CHAR[l+1];
00051 memcpy(s, str.string, sizeof(CHAR)*l);
00052 s[l]='\0';
00053 return s;
00054 }
00055
00059 template <class ST> class CStringFeatures : public CFeatures
00060 {
00061 public:
00066 CStringFeatures(E_ALPHABET alpha)
00067 : CFeatures(0), num_vectors(0), features(NULL),
00068 single_string(NULL),length_of_single_string(0),
00069 max_string_length(0), order(0), selected_vector(0),
00070 symbol_mask_table(NULL)
00071 {
00072 alphabet=new CAlphabet(alpha);
00073 SG_REF(alphabet);
00074 num_symbols=alphabet->get_num_symbols();
00075 original_num_symbols=num_symbols;
00076 }
00077
00082 CStringFeatures(CAlphabet* alpha)
00083 : CFeatures(0), num_vectors(0), features(NULL),
00084 single_string(NULL),length_of_single_string(0),
00085 max_string_length(0), order(0), selected_vector(0),
00086 symbol_mask_table(NULL)
00087 {
00088 ASSERT(alpha);
00089 alphabet=new CAlphabet(alpha);
00090 num_symbols=alphabet->get_num_symbols();
00091 original_num_symbols=num_symbols;
00092 }
00093
00095 CStringFeatures(const CStringFeatures & orig)
00096 : CFeatures(orig), num_vectors(orig.num_vectors),
00097 single_string(orig.single_string),
00098 length_of_single_string(orig.length_of_single_string),
00099 max_string_length(orig.max_string_length),
00100 num_symbols(orig.num_symbols),
00101 original_num_symbols(orig.original_num_symbols),
00102 order(orig.order), selected_vector(orig.selected_vector)
00103 {
00104 ASSERT(orig.single_string == NULL);
00105
00106 alphabet=new CAlphabet(orig.alphabet);
00107 SG_REF(alphabet);
00108
00109 if (orig.features)
00110 {
00111 features=new T_STRING<ST>[orig.num_vectors];
00112
00113 for (INT i=0; i<num_vectors; i++)
00114 {
00115 features[i].string=new ST[orig.features[i].length];
00116 ASSERT(features[i].string);
00117 features[i].length=orig.features[i].length;
00118 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00119 }
00120 }
00121
00122 if (orig.symbol_mask_table)
00123 {
00124 symbol_mask_table=new ST[256];
00125 for (INT i=0; i<256; i++)
00126 symbol_mask_table[i]=orig.symbol_mask_table[i];
00127 }
00128 }
00129
00135 CStringFeatures(char* fname, E_ALPHABET alpha=DNA)
00136 : CFeatures(fname), num_vectors(0),
00137 features(NULL), single_string(NULL),
00138 length_of_single_string(0), max_string_length(0),
00139 order(0), selected_vector(0), symbol_mask_table(NULL)
00140 {
00141 alphabet=new CAlphabet(alpha);
00142 SG_REF(alphabet);
00143 num_symbols=alphabet->get_num_symbols();
00144 original_num_symbols=num_symbols;
00145 load(fname);
00146 }
00147
00148 virtual ~CStringFeatures()
00149 {
00150 cleanup();
00151
00152 #ifdef HAVE_SWIG
00153 SG_UNREF(alphabet);
00154 #else
00155 delete alphabet;
00156 #endif
00157 }
00158
00160 void cleanup()
00161 {
00162 if (single_string)
00163 {
00164 delete[] single_string;
00165 single_string=NULL;
00166 }
00167 else
00168 {
00169 for (int i=0; i<num_vectors; i++)
00170 {
00171 delete[] features[i].string;
00172 features[i].length=0;
00173 }
00174 }
00175 num_vectors=0;
00176 delete[] features;
00177
00178 delete[] symbol_mask_table;
00179 alphabet->clear_histogram();
00180 }
00181
00186 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00187
/** get feature type
 *
 * Only declared here; the definitions are the per-type template
 * specializations that follow the class.
 *
 * @return feature type matching the template argument ST
 */
00192 inline virtual EFeatureType get_feature_type();
00193
00198 inline CAlphabet* get_alphabet()
00199 {
00200 SG_REF(alphabet);
00201 return alphabet;
00202 }
00203
00208 virtual CFeatures* duplicate() const
00209 {
00210 return new CStringFeatures<ST>(*this);
00211 }
00212
00217 void select_feature_vector(INT num)
00218 {
00219 ASSERT(features);
00220 ASSERT(num<num_vectors);
00221
00222 selected_vector=num;
00223 }
00224
00230 void get_string(ST** dst, INT* len)
00231 {
00232 ASSERT(features);
00233 ASSERT(selected_vector<num_vectors);
00234
00235 *len=features[selected_vector].length;
00236 *dst=new ST[*len];
00237 memcpy(*dst, features[selected_vector].string, *len * sizeof(ST));
00238 }
00239
00246 virtual ST* get_feature_vector(INT num, INT& len)
00247 {
00248 ASSERT(features);
00249 ASSERT(num<num_vectors);
00250
00251 len=features[num].length;
00252 return features[num].string;
00253 }
00254
00261 virtual void set_feature_vector(INT num, ST* string, INT len)
00262 {
00263 ASSERT(features);
00264 ASSERT(num<num_vectors);
00265
00266 features[num].length=len ;
00267 features[num].string=string ;
00268 }
00269
00276 virtual ST inline get_feature(INT vec_num, INT feat_num)
00277 {
00278 ASSERT(features && vec_num<num_vectors);
00279 ASSERT(feat_num<features[vec_num].length);
00280
00281 return features[vec_num].string[feat_num];
00282 }
00283
00289 virtual inline INT get_vector_length(INT vec_num)
00290 {
00291 ASSERT(features && vec_num<num_vectors);
00292 return features[vec_num].length;
00293 }
00294
00299 virtual inline INT get_max_vector_length()
00300 {
00301 return max_string_length;
00302 }
00303
00308 virtual inline INT get_num_vectors() { return num_vectors; }
00309
00316 inline LONGREAL get_num_symbols() { return num_symbols; }
00317
00324 inline LONGREAL get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00325
00326
00327
00332 inline LONGREAL get_original_num_symbols() { return original_num_symbols; }
00333
00338 inline INT get_order() { return order; }
00339
00347 inline ST get_masked_symbols(ST symbol, BYTE mask)
00348 {
00349 ASSERT(symbol_mask_table);
00350 return symbol_mask_table[mask] & symbol;
00351 }
00352
00359 inline ST shift_offset(ST offset, INT amount)
00360 {
00361 ASSERT(alphabet);
00362 return (offset << (amount*alphabet->get_num_bits()));
00363 }
00364
00371 inline ST shift_symbol(ST symbol, INT amount)
00372 {
00373 ASSERT(alphabet);
00374 return (symbol >> (amount*alphabet->get_num_bits()));
00375 }
00376
00382 virtual bool load(CHAR* fname)
00383 {
00384 SG_INFO( "loading...\n");
00385 LONG length=0;
00386 max_string_length=0;
00387
00388 CFile f(fname, 'r', F_CHAR);
00389 CHAR* feature_matrix=f.load_char_data(NULL, length);
00390
00391 num_vectors=0;
00392
00393 if (f.is_ok())
00394 {
00395 for (long i=0; i<length; i++)
00396 {
00397 if (feature_matrix[i]=='\n')
00398 num_vectors++;
00399 }
00400
00401 SG_INFO( "file contains %ld vectors\n", num_vectors);
00402 features= new T_STRING<ST>[num_vectors];
00403
00404 long index=0;
00405 for (INT lines=0; lines<num_vectors; lines++)
00406 {
00407 CHAR* p=&feature_matrix[index];
00408 INT columns=0;
00409
00410 for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00411
00412 if (index+columns>=length && p[columns]!='\n') {
00413 SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00414 }
00415
00416 features[lines].length=columns;
00417 features[lines].string=new ST[columns];
00418
00419 max_string_length=CMath::max(max_string_length,columns);
00420
00421 for (INT i=0; i<columns; i++)
00422 features[lines].string[i]= ((ST) p[i]);
00423
00424 index+= features[lines].length+1;
00425 }
00426
00427 num_symbols=4;
00428 return true;
00429 }
00430 else
00431 SG_ERROR( "reading file failed\n");
00432
00433 return false;
00434 }
00435
00442 bool load_dna_file(CHAR* fname, bool remap_to_bin=true)
00443 {
00444 bool result=false;
00445
00446 size_t blocksize=1024*1024;
00447 size_t required_blocksize=0;
00448 BYTE* dummy=new BYTE[blocksize];
00449 BYTE* overflow=NULL;
00450 INT overflow_len=0;
00451
00452 num_symbols=4;
00453 cleanup();
00454
00455 CAlphabet* alpha=new CAlphabet(DNA);
00456
00457 FILE* f=fopen(fname, "ro");
00458
00459 if (f)
00460 {
00461 num_vectors=0;
00462 max_string_length=0;
00463
00464 SG_INFO("counting line numbers in file %s\n", fname);
00465 SG_DEBUG("block_size=%d\n", required_blocksize);
00466 size_t sz=blocksize;
00467 size_t block_offs=0;
00468 size_t old_block_offs=0;
00469 fseek(f, 0, SEEK_END);
00470 size_t fsize=ftell(f);
00471 rewind(f);
00472
00473 while (sz == blocksize)
00474 {
00475 sz=fread(dummy, sizeof(BYTE), blocksize, f);
00476 bool contains_cr=false;
00477 for (size_t i=0; i<sz; i++)
00478 {
00479 block_offs++;
00480 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00481 {
00482 num_vectors++;
00483 contains_cr=true;
00484 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00485 old_block_offs=block_offs;
00486 }
00487 }
00488 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00489 }
00490
00491 SG_INFO("found %d strings\n", num_vectors);
00492 delete[] dummy;
00493 blocksize=required_blocksize;
00494 dummy = new BYTE[blocksize];
00495 overflow = new BYTE[blocksize];
00496 features=new T_STRING<ST>[num_vectors];
00497
00498 rewind(f);
00499 sz=blocksize;
00500 INT lines=0;
00501 while (sz == blocksize)
00502 {
00503 sz=fread(dummy, sizeof(BYTE), blocksize, f);
00504
00505 size_t old_sz=0;
00506 for (size_t i=0; i<sz; i++)
00507 {
00508 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00509 {
00510 INT len=i-old_sz;
00511
00512 max_string_length=CMath::max(max_string_length, len+overflow_len);
00513
00514 features[lines].length=len;
00515 features[lines].string=new ST[len];
00516
00517 if (remap_to_bin)
00518 {
00519 for (INT j=0; j<overflow_len; j++)
00520 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00521 for (INT j=0; j<len; j++)
00522 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00523 }
00524 else
00525 {
00526 for (INT j=0; j<overflow_len; j++)
00527 features[lines].string[j]=overflow[j];
00528 for (INT j=0; j<len; j++)
00529 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00530 }
00531
00532
00533 overflow_len=0;
00534
00535
00536 old_sz=i+1;
00537 lines++;
00538 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00539 }
00540 }
00541 for (size_t i=old_sz; i<sz; i++)
00542 overflow[i-old_sz]=dummy[i];
00543
00544 overflow_len=sz-old_sz;
00545 }
00546 result=true;
00547 SG_INFO("file successfully read\n");
00548 SG_INFO("max_string_length=%d\n", max_string_length);
00549 SG_INFO("num_strings=%d\n", num_vectors);
00550 }
00551
00552 fclose(f);
00553 delete alpha;
00554 delete[] dummy;
00555
00556 #ifdef HAVE_SWIG
00557 SG_UNREF(alphabet);
00558 #else
00559 delete alphabet;
00560 #endif
00561 if (remap_to_bin)
00562 alphabet = new CAlphabet(RAWDNA);
00563 else
00564 alphabet = new CAlphabet(DNA);
00565 SG_REF(alphabet);
00566
00567 return result;
00568 }
00569
00575 bool load_from_directory(CHAR* dirname)
00576 {
00577 struct dirent **namelist;
00578 int n;
00579
00580 io.set_dirname(dirname);
00581
00582 n = scandir(dirname, &namelist, io.filter, alphasort);
00583 if (n <= 0)
00584 {
00585 SG_ERROR( "error calling scandir\n");
00586 return false;
00587 }
00588 else
00589 {
00590 T_STRING<ST>* strings=NULL;
00591 alphabet->clear_histogram();
00592
00593 INT num=0;
00594 INT max_len=-1;
00595
00596
00597
00598 strings=new T_STRING<ST>[n];
00599
00600 for (int i=0; i<n; i++)
00601 {
00602 CHAR* fname=io.concat_filename(namelist[i]->d_name);
00603
00604 struct stat s;
00605 off_t filesize=0;
00606
00607 if (!stat(fname, &s) && s.st_size>0)
00608 {
00609 filesize=s.st_size/sizeof(ST);
00610
00611 FILE* f=fopen(fname, "ro");
00612 if (f)
00613 {
00614 ST* str=new ST[filesize];
00615 SG_DEBUG("%s:%ld\n", fname, (long int) filesize);
00616 fread(str, sizeof(ST), filesize, f);
00617 strings[num].string=str;
00618 strings[num].length=filesize;
00619 max_len=CMath::max(max_len, strings[num].length);
00620
00621 num++;
00622 fclose(f);
00623 }
00624 }
00625 else
00626 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00627
00628 free(namelist[i]);
00629 }
00630 free(namelist);
00631
00632 if (num>0 && strings)
00633 {
00634 set_features(strings, num, max_len);
00635 return true;
00636 }
00637 }
00638 return false;
00639 }
00640
00648 bool set_features(T_STRING<ST>* p_features, INT p_num_vectors, INT p_max_string_length)
00649 {
00650 if (p_features)
00651 {
00652 CAlphabet* alpha=new CAlphabet(alphabet);
00653
00654
00655 for (INT i=0; i<p_num_vectors; i++)
00656 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00657
00658 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00659 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00660
00661 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00662 {
00663 cleanup();
00664
00665 #ifdef HAVE_SWIG
00666 SG_UNREF(alphabet);
00667 #else
00668 delete alphabet;
00669 #endif
00670 alphabet=alpha;
00671 SG_REF(alphabet);
00672
00673 this->features=p_features;
00674 this->num_vectors=p_num_vectors;
00675 this->max_string_length=p_max_string_length;
00676
00677 return true;
00678 }
00679 else
00680 delete alpha;
00681 }
00682
00683 return false;
00684 }
00685
00692 virtual T_STRING<ST>* get_features(INT& num_str, INT& max_str_len)
00693 {
00694 num_str=num_vectors;
00695 max_str_len=max_string_length;
00696 return features;
00697 }
00698
00704 virtual bool save(CHAR* dest)
00705 {
00706 return false;
00707 }
00708
00713 virtual INT get_size() { return sizeof(ST); }
00714
00720 virtual bool apply_preproc(bool force_preprocessing=false)
00721 {
00722 SG_DEBUG( "force: %d\n", force_preprocessing);
00723
00724 for (INT i=0; i<get_num_preproc(); i++)
00725 {
00726 if ( (!is_preprocessed(i) || force_preprocessing) )
00727 {
00728 set_preprocessed(i);
00729
00730 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00731
00732 if (!((CStringPreProc<ST>*) get_preproc(i))->apply_to_string_features(this))
00733 return false;
00734 }
00735 }
00736 return true;
00737 }
00738
00748 INT obtain_by_sliding_window(INT window_size, INT step_size, INT skip=0)
00749 {
00750 ASSERT(step_size>0);
00751 ASSERT(window_size>0);
00752 ASSERT(num_vectors==1 || single_string);
00753 ASSERT(max_string_length>=window_size ||
00754 (single_string && length_of_single_string>=window_size));
00755
00756
00757
00758 if (single_string)
00759 num_vectors= (length_of_single_string-window_size)/step_size + 1;
00760 else if (num_vectors==1)
00761 {
00762 num_vectors= (max_string_length-window_size)/step_size + 1;
00763 length_of_single_string=max_string_length;
00764 }
00765
00766 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00767 INT offs=0;
00768 for (INT i=0; i<num_vectors; i++)
00769 {
00770 f[i].string=&features[0].string[offs+skip];
00771 f[i].length=window_size-skip;
00772 offs+=step_size;
00773 }
00774 single_string=features[0].string;
00775 delete[] features;
00776 features=f;
00777 selected_vector=0;
00778 max_string_length=window_size-skip;
00779
00780 return num_vectors;
00781 }
00782
00791 INT obtain_by_position_list(INT window_size, CDynamicArray<INT>* positions, INT skip=0)
00792 {
00793 ASSERT(positions);
00794 ASSERT(window_size>0);
00795 ASSERT(num_vectors==1 || single_string);
00796 ASSERT(max_string_length>=window_size ||
00797 (single_string && length_of_single_string>=window_size));
00798
00799 num_vectors= positions->get_num_elements();
00800 ASSERT(num_vectors>0);
00801
00802 INT len;
00803
00804
00805
00806 if (single_string)
00807 len=length_of_single_string;
00808 else
00809 {
00810 single_string=features[0].string;
00811 len=max_string_length;
00812 length_of_single_string=max_string_length;
00813 }
00814
00815 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00816 for (INT i=0; i<num_vectors; i++)
00817 {
00818 INT p=positions->get_element(i);
00819
00820 if (p>=0 && p<=len-window_size)
00821 {
00822 f[i].string=&features[0].string[p+skip];
00823 f[i].length=window_size-skip;
00824 }
00825 else
00826 {
00827 num_vectors=1;
00828 max_string_length=len;
00829 features[0].length=len;
00830 single_string=NULL;
00831 delete[] f;
00832 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
00833 window_size, i, p, len);
00834 return -1;
00835 }
00836 }
00837
00838 delete[] features;
00839 features=f;
00840 selected_vector=0;
00841 max_string_length=window_size-skip;
00842
00843 return num_vectors;
00844 }
00845
00857 inline bool obtain_from_char(CStringFeatures<CHAR>* sf, INT start, INT p_order, INT gap, bool rev)
00858 {
00859 return obtain_from_char_features(sf, start, p_order, gap, rev);
00860 }
00861
00871 template <class CT>
00872 bool obtain_from_char_features(CStringFeatures<CT>* sf, INT start, INT p_order, INT gap, bool rev)
00873 {
00874 ASSERT(sf);
00875 this->order=p_order;
00876 cleanup();
00877 delete[] symbol_mask_table;
00878 symbol_mask_table=new ST[256];
00879
00880 num_vectors=sf->get_num_vectors();
00881 ASSERT(num_vectors>0);
00882 max_string_length=sf->get_max_vector_length()-start;
00883 features=new T_STRING<ST>[num_vectors];
00884 CAlphabet* alpha=sf->get_alphabet();
00885 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
00886
00887 SG_DEBUG( "%1.0llf symbols in StringFeatures<*>\n", sf->get_num_symbols());
00888
00889 for (INT i=0; i<num_vectors; i++)
00890 {
00891 INT len=-1;
00892 CT* c=sf->get_feature_vector(i, len);
00893
00894 features[i].string=new ST[len];
00895 features[i].length=len;
00896
00897 ST* str=features[i].string;
00898 for (INT j=0; j<len; j++)
00899 str[j]=(ST) alpha->remap_to_bin(c[j]);
00900
00901 }
00902
00903 original_num_symbols=alpha->get_num_symbols();
00904 INT max_val=alpha->get_num_bits();
00905
00906 if (p_order>1)
00907 num_symbols=CMath::powl((long double) 2, (long double) max_val*p_order);
00908 else
00909 num_symbols=original_num_symbols;
00910 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
00911
00912 if ( ((long double) num_symbols) > CMath::powl(((long double) 2),((long double) sizeof(ST)*8)) )
00913 {
00914 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00915 return false;
00916 }
00917
00918 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
00919 for (INT line=0; line<num_vectors; line++)
00920 {
00921 INT len=0;
00922 ST* fv=get_feature_vector(line, len);
00923
00924 if (rev)
00925 translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
00926 else
00927 translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
00928
00929
00930
00931
00932 features[line].length-=start+gap ;
00933 if (features[line].length<0)
00934 features[line].length=0 ;
00935 }
00936
00937 ULONG mask=0;
00938 for (INT i=0; i< (LONG) max_val; i++)
00939 mask=(mask<<1) | 1;
00940
00941 for (INT i=0; i<256; i++)
00942 {
00943 BYTE bits=(BYTE) i;
00944 symbol_mask_table[i]=0;
00945
00946 for (INT j=0; j<8; j++)
00947 {
00948 if (bits & 1)
00949 symbol_mask_table[i]|=mask<<(max_val*j);
00950
00951 bits>>=1;
00952 }
00953 }
00954
00955 return true;
00956 }
00957
00965 bool have_same_length(INT len)
00966 {
00967 if (len!=get_max_vector_length())
00968 return false;
00969
00970 for (INT i=0; i<num_vectors; i++)
00971 {
00972 if (get_vector_length(i)!=len)
00973 return false;
00974 }
00975
00976 return true;
00977 }
00978
00979 protected:
00988 void translate_from_single_order(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val)
00989 {
00990 INT i,j;
00991 ST value=0;
00992
00993 for (i=sequence_length-1; i>= p_order-1; i--)
00994 {
00995 value=0;
00996 for (j=i; j>=i-p_order+1; j--)
00997 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00998
00999 obs[i]= (ST) value;
01000 }
01001
01002 for (i=p_order-2;i>=0;i--)
01003 {
01004 if (i>=sequence_length)
01005 continue;
01006
01007 value=0;
01008 for (j=i; j>=i-p_order+1; j--)
01009 {
01010 value= (value >> max_val);
01011 if (j>=0 && j<sequence_length)
01012 value|=obs[j] << (max_val * (p_order-1));
01013 }
01014 obs[i]=value;
01015 }
01016
01017 for (i=start; i<sequence_length; i++)
01018 obs[i-start]=obs[i];
01019 }
01020
01029 void translate_from_single_order_reversed(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val)
01030 {
01031 INT i,j;
01032 ST value=0;
01033
01034 for (i=sequence_length-1; i>= p_order-1; i--)
01035 {
01036 value=0;
01037 for (j=i; j>=i-p_order+1; j--)
01038 value= (value << max_val) | obs[j];
01039
01040 obs[i]= (ST) value;
01041 }
01042
01043 for (i=p_order-2;i>=0;i--)
01044 {
01045 if (i>=sequence_length)
01046 continue;
01047
01048 value=0;
01049 for (j=i; j>=i-p_order+1; j--)
01050 {
01051 value= (value << max_val);
01052 if (j>=0 && j<sequence_length)
01053 value|=obs[j];
01054 }
01055 obs[i]=value;
01056 }
01057
01058 for (i=start; i<sequence_length; i++)
01059 obs[i-start]=obs[i];
01060 }
01061
01071 void translate_from_single_order(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
01072 {
01073 ASSERT(gap>=0);
01074
01075 const INT start_gap=(p_order-gap)/2;
01076 const INT end_gap=start_gap+gap;
01077
01078 INT i,j;
01079 ST value=0;
01080
01081
01082 for (i=sequence_length-1; i>=p_order-1; i--)
01083 {
01084 value=0;
01085 for (j=i; j>=i-p_order+1; j--)
01086 {
01087 if (i-j<start_gap)
01088 {
01089 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01090 }
01091 else if (i-j>=end_gap)
01092 {
01093 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01094 }
01095 }
01096 obs[i]= (ST) value;
01097 }
01098
01099
01100 for (i=p_order-2;i>=0;i--)
01101 {
01102 if (i>=sequence_length)
01103 continue;
01104
01105 value=0;
01106 for (j=i; j>=i-p_order+1; j--)
01107 {
01108 if (i-j<start_gap)
01109 {
01110 value= (value >> max_val);
01111 if (j>=0 && j<sequence_length)
01112 value|=obs[j] << (max_val * (p_order-1-gap));
01113 }
01114 else if (i-j>=end_gap)
01115 {
01116 value= (value >> max_val);
01117 if (j>=0 && j<sequence_length)
01118 value|=obs[j] << (max_val * (p_order-1-gap));
01119 }
01120 }
01121 obs[i]=value;
01122 }
01123
01124
01125 for (i=start; i<sequence_length; i++)
01126 obs[i-start]=obs[i];
01127 }
01128
01138 void translate_from_single_order_reversed(ST* obs, INT sequence_length, INT start, INT p_order, INT max_val, INT gap)
01139 {
01140 ASSERT(gap>=0);
01141
01142 const INT start_gap=(p_order-gap)/2;
01143 const INT end_gap=start_gap+gap;
01144
01145 INT i,j;
01146 ST value=0;
01147
01148
01149 for (i=sequence_length-1; i>=p_order-1; i--)
01150 {
01151 value=0;
01152 for (j=i; j>=i-p_order+1; j--)
01153 {
01154 if (i-j<start_gap)
01155 value= (value << max_val) | obs[j];
01156 else if (i-j>=end_gap)
01157 value= (value << max_val) | obs[j];
01158 }
01159 obs[i]= (ST) value;
01160 }
01161
01162
01163 for (i=p_order-2;i>=0;i--)
01164 {
01165 if (i>=sequence_length)
01166 continue;
01167
01168 value=0;
01169 for (j=i; j>=i-p_order+1; j--)
01170 {
01171 if (i-j<start_gap)
01172 {
01173 value= value << max_val;
01174 if (j>=0 && j<sequence_length)
01175 value|=obs[j];
01176 }
01177 else if (i-j>=end_gap)
01178 {
01179 value= value << max_val;
01180 if (j>=0 && j<sequence_length)
01181 value|=obs[j];
01182 }
01183 }
01184 obs[i]=value;
01185 }
01186
01187
01188 for (i=start; i<sequence_length; i++)
01189 obs[i-start]=obs[i];
01190 }
01191
01192 protected:
01193
/** alphabet of the strings; created in the constructors and replaced by set_features() and load_dna_file() */
01195 CAlphabet* alphabet;
01196
/** number of strings stored in features */
01198 INT num_vectors;
01199
/** array of num_vectors strings */
01201 T_STRING<ST>* features;
01202
/** buffer all strings in features are windows into; set by the obtain_by_*() functions, NULL otherwise */
01204 ST* single_string;
01205
/** number of elements of single_string */
01207 INT length_of_single_string;
01208
/** cached maximum string length, reported by get_max_vector_length() */
01210 INT max_string_length;
01211
/** number of symbols; initialised from the alphabet, recomputed by obtain_from_char_features() */
01213 LONGREAL num_symbols;
01214
/** number of symbols of the alphabet before any higher order mapping */
01216 LONGREAL original_num_symbols;
01217
/** order set by obtain_from_char_features() */
01219 INT order;
01220
/** index of the string returned by get_string() */
01222 INT selected_vector;
01223
/** table of 256 masks used by get_masked_symbols(); built by obtain_from_char_features() */
01225 ST* symbol_mask_table;
01226 };
01227
01232 template<> inline EFeatureType CStringFeatures<CHAR>::get_feature_type()
01233 {
01234 return F_CHAR;
01235 }
01236
01241 template<> inline EFeatureType CStringFeatures<BYTE>::get_feature_type()
01242 {
01243 return F_BYTE;
01244 }
01245
01250 template<> inline EFeatureType CStringFeatures<SHORT>::get_feature_type()
01251 {
01252 return F_SHORT;
01253 }
01254
01259 template<> inline EFeatureType CStringFeatures<WORD>::get_feature_type()
01260 {
01261 return F_WORD;
01262 }
01263
01268 template<> inline EFeatureType CStringFeatures<INT>::get_feature_type()
01269 {
01270 return F_INT;
01271 }
01272
01277 template<> inline EFeatureType CStringFeatures<UINT>::get_feature_type()
01278 {
01279 return F_UINT;
01280 }
01281
01286 template<> inline EFeatureType CStringFeatures<LONG>::get_feature_type()
01287 {
01288 return F_LONG;
01289 }
01290
01295 template<> inline EFeatureType CStringFeatures<ULONG>::get_feature_type()
01296 {
01297 return F_ULONG;
01298 }
01299
01304 template<> inline EFeatureType CStringFeatures<DREAL>::get_feature_type()
01305 {
01306 return F_DREAL;
01307 }
01308 #endif