00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015
00016 #include "preproc/PreProc.h"
00017 #include "preproc/StringPreProc.h"
00018 #include "features/Features.h"
00019 #include "features/CharFeatures.h"
00020 #include "features/Alphabet.h"
00021 #include "lib/common.h"
00022 #include "lib/io.h"
00023 #include "lib/DynamicArray.h"
00024 #include "lib/File.h"
00025 #include "lib/Mathematics.h"
00026
00027 #include <sys/types.h>
00028 #include <sys/stat.h>
00029 #include <dirent.h>
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <unistd.h>
00033
00034 class CFile;
00035
00036 template <class ST> class CStringPreProc;
00037
/** Length-tagged string: a raw buffer together with its element count.
 *
 * The struct has no constructor or destructor, so it neither initialises
 * nor releases the memory behind `string`; whoever creates it manages that.
 */
template <class T> struct T_STRING
{
    /** pointer to the first element of the string (not zero terminated) */
    T* string;
    /** number of elements reachable through string */
    int32_t length;
};

/** Create a zero terminated char copy of a T_STRING.
 *
 * Each element is converted to char individually, so symbol types wider
 * than one byte yield one output character per element (a plain byte copy
 * of `length` bytes would only cover part of such a string).
 * A NULL buffer or a negative length is treated as the empty string.
 *
 * @param str string to copy
 * @return buffer of length+1 chars allocated with new[]; the caller has to
 *         release it with delete[]
 */
template <class T> char* get_zero_terminated_string_copy(T_STRING<T> str)
{
    const int32_t l=(str.string && str.length>0) ? str.length : 0;
    char* s=new char[l+1];

    for (int32_t i=0; i<l; i++)
        s[i]=(char) str.string[i];

    s[l]='\0';
    return s;
}
00055
00068 template <class ST> class CStringFeatures : public CFeatures
00069 {
00070 public:
00075 CStringFeatures(EAlphabet alpha)
00076 : CFeatures(0), num_vectors(0), features(NULL),
00077 single_string(NULL),length_of_single_string(0),
00078 max_string_length(0), order(0), selected_vector(0),
00079 symbol_mask_table(NULL)
00080 {
00081 alphabet=new CAlphabet(alpha);
00082 SG_REF(alphabet);
00083 num_symbols=alphabet->get_num_symbols();
00084 original_num_symbols=num_symbols;
00085 }
00086
00091 CStringFeatures(CAlphabet* alpha)
00092 : CFeatures(0), num_vectors(0), features(NULL),
00093 single_string(NULL),length_of_single_string(0),
00094 max_string_length(0), order(0), selected_vector(0),
00095 symbol_mask_table(NULL)
00096 {
00097 ASSERT(alpha);
00098 alphabet=new CAlphabet(alpha);
00099 num_symbols=alphabet->get_num_symbols();
00100 original_num_symbols=num_symbols;
00101 }
00102
00104 CStringFeatures(const CStringFeatures & orig)
00105 : CFeatures(orig), num_vectors(orig.num_vectors),
00106 single_string(orig.single_string),
00107 length_of_single_string(orig.length_of_single_string),
00108 max_string_length(orig.max_string_length),
00109 num_symbols(orig.num_symbols),
00110 original_num_symbols(orig.original_num_symbols),
00111 order(orig.order), selected_vector(orig.selected_vector)
00112 {
00113 ASSERT(orig.single_string == NULL);
00114
00115 alphabet=new CAlphabet(orig.alphabet);
00116 SG_REF(alphabet);
00117
00118 if (orig.features)
00119 {
00120 features=new T_STRING<ST>[orig.num_vectors];
00121
00122 for (int32_t i=0; i<num_vectors; i++)
00123 {
00124 features[i].string=new ST[orig.features[i].length];
00125 ASSERT(features[i].string);
00126 features[i].length=orig.features[i].length;
00127 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00128 }
00129 }
00130
00131 if (orig.symbol_mask_table)
00132 {
00133 symbol_mask_table=new ST[256];
00134 for (int32_t i=0; i<256; i++)
00135 symbol_mask_table[i]=orig.symbol_mask_table[i];
00136 }
00137 }
00138
00144 CStringFeatures(char* fname, EAlphabet alpha=DNA)
00145 : CFeatures(fname), num_vectors(0),
00146 features(NULL), single_string(NULL),
00147 length_of_single_string(0), max_string_length(0),
00148 order(0), selected_vector(0), symbol_mask_table(NULL)
00149 {
00150 alphabet=new CAlphabet(alpha);
00151 SG_REF(alphabet);
00152 num_symbols=alphabet->get_num_symbols();
00153 original_num_symbols=num_symbols;
00154 load(fname);
00155 }
00156
00157 virtual ~CStringFeatures()
00158 {
00159 cleanup();
00160
00161 #ifdef HAVE_SWIG
00162 SG_UNREF(alphabet);
00163 #else
00164 delete alphabet;
00165 #endif
00166 }
00167
00169 void cleanup()
00170 {
00171 if (single_string)
00172 {
00173 delete[] single_string;
00174 single_string=NULL;
00175 }
00176 else
00177 {
00178 for (int32_t i=0; i<num_vectors; i++)
00179 {
00180 delete[] features[i].string;
00181 features[i].length=0;
00182 }
00183 }
00184 num_vectors=0;
00185 delete[] features;
00186
00187 delete[] symbol_mask_table;
00188 alphabet->clear_histogram();
00189 }
00190
00195 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00196
/** Get the feature type matching the symbol type ST.
 *
 * Only declared here; explicit specialisations for the individual symbol
 * types follow after the class definition.
 *
 * @return feature type constant for ST
 */
inline virtual EFeatureType get_feature_type();
00202
00207 inline CAlphabet* get_alphabet()
00208 {
00209 SG_REF(alphabet);
00210 return alphabet;
00211 }
00212
00217 virtual CFeatures* duplicate() const
00218 {
00219 return new CStringFeatures<ST>(*this);
00220 }
00221
00226 void select_feature_vector(int32_t num)
00227 {
00228 ASSERT(features);
00229 ASSERT(num<num_vectors);
00230
00231 selected_vector=num;
00232 }
00233
00239 void get_string(ST** dst, int32_t* len)
00240 {
00241 ASSERT(features);
00242 ASSERT(selected_vector<num_vectors);
00243
00244 *len=features[selected_vector].length;
00245 *dst=new ST[*len];
00246 memcpy(*dst, features[selected_vector].string, *len * sizeof(ST));
00247 }
00248
00255 virtual ST* get_feature_vector(int32_t num, int32_t& len)
00256 {
00257 ASSERT(features);
00258 ASSERT(num<num_vectors);
00259
00260 len=features[num].length;
00261 return features[num].string;
00262 }
00263
00270 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
00271 {
00272 ASSERT(features);
00273 ASSERT(num<num_vectors);
00274
00275 features[num].length=len ;
00276 features[num].string=string ;
00277 }
00278
00285 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00286 {
00287 ASSERT(features && vec_num<num_vectors);
00288 ASSERT(feat_num<features[vec_num].length);
00289
00290 return features[vec_num].string[feat_num];
00291 }
00292
00298 virtual inline int32_t get_vector_length(int32_t vec_num)
00299 {
00300 ASSERT(features && vec_num<num_vectors);
00301 return features[vec_num].length;
00302 }
00303
00308 virtual inline int32_t get_max_vector_length()
00309 {
00310 return max_string_length;
00311 }
00312
00317 virtual inline int32_t get_num_vectors() { return num_vectors; }
00318
00325 inline float128_t get_num_symbols() { return num_symbols; }
00326
00333 inline float128_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00334
00335
00336
00341 inline float128_t get_original_num_symbols() { return original_num_symbols; }
00342
00347 inline int32_t get_order() { return order; }
00348
00356 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00357 {
00358 ASSERT(symbol_mask_table);
00359 return symbol_mask_table[mask] & symbol;
00360 }
00361
00368 inline ST shift_offset(ST offset, int32_t amount)
00369 {
00370 ASSERT(alphabet);
00371 return (offset << (amount*alphabet->get_num_bits()));
00372 }
00373
00380 inline ST shift_symbol(ST symbol, int32_t amount)
00381 {
00382 ASSERT(alphabet);
00383 return (symbol >> (amount*alphabet->get_num_bits()));
00384 }
00385
00391 virtual bool load(char* fname)
00392 {
00393 SG_INFO( "loading...\n");
00394 int64_t length=0;
00395 max_string_length=0;
00396
00397 CFile f(fname, 'r', F_CHAR);
00398 char* feature_matrix=f.load_char_data(NULL, length);
00399
00400 num_vectors=0;
00401
00402 if (f.is_ok())
00403 {
00404 for (int64_t i=0; i<length; i++)
00405 {
00406 if (feature_matrix[i]=='\n')
00407 num_vectors++;
00408 }
00409
00410 SG_INFO( "file contains %ld vectors\n", num_vectors);
00411 features= new T_STRING<ST>[num_vectors];
00412
00413 int64_t index=0;
00414 for (int32_t lines=0; lines<num_vectors; lines++)
00415 {
00416 char* p=&feature_matrix[index];
00417 int32_t columns=0;
00418
00419 for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00420
00421 if (index+columns>=length && p[columns]!='\n') {
00422 SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00423 }
00424
00425 features[lines].length=columns;
00426 features[lines].string=new ST[columns];
00427
00428 max_string_length=CMath::max(max_string_length,columns);
00429
00430 for (int32_t i=0; i<columns; i++)
00431 features[lines].string[i]= ((ST) p[i]);
00432
00433 index+= features[lines].length+1;
00434 }
00435
00436 num_symbols=4;
00437 return true;
00438 }
00439 else
00440 SG_ERROR( "reading file failed\n");
00441
00442 return false;
00443 }
00444
00451 bool load_dna_file(char* fname, bool remap_to_bin=true)
00452 {
00453 bool result=false;
00454
00455 size_t blocksize=1024*1024;
00456 size_t required_blocksize=0;
00457 uint8_t* dummy=new uint8_t[blocksize];
00458 uint8_t* overflow=NULL;
00459 int32_t overflow_len=0;
00460
00461 num_symbols=4;
00462 cleanup();
00463
00464 CAlphabet* alpha=new CAlphabet(DNA);
00465
00466 FILE* f=fopen(fname, "ro");
00467
00468 if (f)
00469 {
00470 num_vectors=0;
00471 max_string_length=0;
00472
00473 SG_INFO("counting line numbers in file %s\n", fname);
00474 SG_DEBUG("block_size=%d\n", required_blocksize);
00475 size_t sz=blocksize;
00476 size_t block_offs=0;
00477 size_t old_block_offs=0;
00478 fseek(f, 0, SEEK_END);
00479 size_t fsize=ftell(f);
00480 rewind(f);
00481
00482 while (sz == blocksize)
00483 {
00484 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00485 bool contains_cr=false;
00486 for (size_t i=0; i<sz; i++)
00487 {
00488 block_offs++;
00489 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00490 {
00491 num_vectors++;
00492 contains_cr=true;
00493 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00494 old_block_offs=block_offs;
00495 }
00496 }
00497 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00498 }
00499
00500 SG_INFO("found %d strings\n", num_vectors);
00501 delete[] dummy;
00502 blocksize=required_blocksize;
00503 dummy = new uint8_t[blocksize];
00504 overflow = new uint8_t[blocksize];
00505 features=new T_STRING<ST>[num_vectors];
00506
00507 rewind(f);
00508 sz=blocksize;
00509 int32_t lines=0;
00510 while (sz == blocksize)
00511 {
00512 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00513
00514 size_t old_sz=0;
00515 for (size_t i=0; i<sz; i++)
00516 {
00517 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00518 {
00519 int32_t len=i-old_sz;
00520
00521 max_string_length=CMath::max(max_string_length, len+overflow_len);
00522
00523 features[lines].length=len;
00524 features[lines].string=new ST[len];
00525
00526 if (remap_to_bin)
00527 {
00528 for (int32_t j=0; j<overflow_len; j++)
00529 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00530 for (int32_t j=0; j<len; j++)
00531 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00532 }
00533 else
00534 {
00535 for (int32_t j=0; j<overflow_len; j++)
00536 features[lines].string[j]=overflow[j];
00537 for (int32_t j=0; j<len; j++)
00538 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00539 }
00540
00541
00542 overflow_len=0;
00543
00544
00545 old_sz=i+1;
00546 lines++;
00547 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00548 }
00549 }
00550 for (size_t i=old_sz; i<sz; i++)
00551 overflow[i-old_sz]=dummy[i];
00552
00553 overflow_len=sz-old_sz;
00554 }
00555 result=true;
00556 SG_INFO("file successfully read\n");
00557 SG_INFO("max_string_length=%d\n", max_string_length);
00558 SG_INFO("num_strings=%d\n", num_vectors);
00559 }
00560
00561 fclose(f);
00562 delete alpha;
00563 delete[] dummy;
00564
00565 #ifdef HAVE_SWIG
00566 SG_UNREF(alphabet);
00567 #else
00568 delete alphabet;
00569 #endif
00570 if (remap_to_bin)
00571 alphabet = new CAlphabet(RAWDNA);
00572 else
00573 alphabet = new CAlphabet(DNA);
00574 SG_REF(alphabet);
00575
00576 return result;
00577 }
00578
00584 bool load_from_directory(char* dirname)
00585 {
00586 struct dirent **namelist;
00587 int32_t n;
00588
00589 io.set_dirname(dirname);
00590
00591 n = scandir(dirname, &namelist, io.filter, alphasort);
00592 if (n <= 0)
00593 {
00594 SG_ERROR( "error calling scandir\n");
00595 return false;
00596 }
00597 else
00598 {
00599 T_STRING<ST>* strings=NULL;
00600 alphabet->clear_histogram();
00601
00602 int32_t num=0;
00603 int32_t max_len=-1;
00604
00605
00606
00607 strings=new T_STRING<ST>[n];
00608
00609 for (int32_t i=0; i<n; i++)
00610 {
00611 char* fname=io.concat_filename(namelist[i]->d_name);
00612
00613 struct stat s;
00614 off_t filesize=0;
00615
00616 if (!stat(fname, &s) && s.st_size>0)
00617 {
00618 filesize=s.st_size/sizeof(ST);
00619
00620 FILE* f=fopen(fname, "ro");
00621 if (f)
00622 {
00623 ST* str=new ST[filesize];
00624 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
00625 fread(str, sizeof(ST), filesize, f);
00626 strings[num].string=str;
00627 strings[num].length=filesize;
00628 max_len=CMath::max(max_len, strings[num].length);
00629
00630 num++;
00631 fclose(f);
00632 }
00633 }
00634 else
00635 SG_ERROR("empty or non readable file \'%s\'\n", fname);
00636
00637 free(namelist[i]);
00638 }
00639 free(namelist);
00640
00641 if (num>0 && strings)
00642 {
00643 set_features(strings, num, max_len);
00644 return true;
00645 }
00646 }
00647 return false;
00648 }
00649
00657 bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
00658 {
00659 if (p_features)
00660 {
00661 CAlphabet* alpha=new CAlphabet(alphabet);
00662
00663
00664 for (int32_t i=0; i<p_num_vectors; i++)
00665 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
00666
00667 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
00668 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
00669
00670 if (alpha->check_alphabet_size() && alpha->check_alphabet())
00671 {
00672 cleanup();
00673
00674 #ifdef HAVE_SWIG
00675 SG_UNREF(alphabet);
00676 #else
00677 delete alphabet;
00678 #endif
00679 alphabet=alpha;
00680 SG_REF(alphabet);
00681
00682 this->features=p_features;
00683 this->num_vectors=p_num_vectors;
00684 this->max_string_length=p_max_string_length;
00685
00686 return true;
00687 }
00688 else
00689 delete alpha;
00690 }
00691
00692 return false;
00693 }
00694
00701 virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
00702 {
00703 num_str=num_vectors;
00704 max_str_len=max_string_length;
00705 return features;
00706 }
00707
00713 virtual bool save(char* dest)
00714 {
00715 return false;
00716 }
00717
00722 virtual int32_t get_size() { return sizeof(ST); }
00723
00729 virtual bool apply_preproc(bool force_preprocessing=false)
00730 {
00731 SG_DEBUG( "force: %d\n", force_preprocessing);
00732
00733 for (int32_t i=0; i<get_num_preproc(); i++)
00734 {
00735 if ( (!is_preprocessed(i) || force_preprocessing) )
00736 {
00737 set_preprocessed(i);
00738
00739 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00740
00741 if (!((CStringPreProc<ST>*) get_preproc(i))->apply_to_string_features(this))
00742 return false;
00743 }
00744 }
00745 return true;
00746 }
00747
00757 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
00758 {
00759 ASSERT(step_size>0);
00760 ASSERT(window_size>0);
00761 ASSERT(num_vectors==1 || single_string);
00762 ASSERT(max_string_length>=window_size ||
00763 (single_string && length_of_single_string>=window_size));
00764
00765
00766
00767 if (single_string)
00768 num_vectors= (length_of_single_string-window_size)/step_size + 1;
00769 else if (num_vectors==1)
00770 {
00771 num_vectors= (max_string_length-window_size)/step_size + 1;
00772 length_of_single_string=max_string_length;
00773 }
00774
00775 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00776 int32_t offs=0;
00777 for (int32_t i=0; i<num_vectors; i++)
00778 {
00779 f[i].string=&features[0].string[offs+skip];
00780 f[i].length=window_size-skip;
00781 offs+=step_size;
00782 }
00783 single_string=features[0].string;
00784 delete[] features;
00785 features=f;
00786 selected_vector=0;
00787 max_string_length=window_size-skip;
00788
00789 return num_vectors;
00790 }
00791
00800 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
00801 {
00802 ASSERT(positions);
00803 ASSERT(window_size>0);
00804 ASSERT(num_vectors==1 || single_string);
00805 ASSERT(max_string_length>=window_size ||
00806 (single_string && length_of_single_string>=window_size));
00807
00808 num_vectors= positions->get_num_elements();
00809 ASSERT(num_vectors>0);
00810
00811 int32_t len;
00812
00813
00814
00815 if (single_string)
00816 len=length_of_single_string;
00817 else
00818 {
00819 single_string=features[0].string;
00820 len=max_string_length;
00821 length_of_single_string=max_string_length;
00822 }
00823
00824 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
00825 for (int32_t i=0; i<num_vectors; i++)
00826 {
00827 int32_t p=positions->get_element(i);
00828
00829 if (p>=0 && p<=len-window_size)
00830 {
00831 f[i].string=&features[0].string[p+skip];
00832 f[i].length=window_size-skip;
00833 }
00834 else
00835 {
00836 num_vectors=1;
00837 max_string_length=len;
00838 features[0].length=len;
00839 single_string=NULL;
00840 delete[] f;
00841 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
00842 window_size, i, p, len);
00843 return -1;
00844 }
00845 }
00846
00847 delete[] features;
00848 features=f;
00849 selected_vector=0;
00850 max_string_length=window_size-skip;
00851
00852 return num_vectors;
00853 }
00854
00866 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
00867 {
00868 return obtain_from_char_features(sf, start, p_order, gap, rev);
00869 }
00870
00880 template <class CT>
00881 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
00882 {
00883 ASSERT(sf);
00884 this->order=p_order;
00885 cleanup();
00886 delete[] symbol_mask_table;
00887 symbol_mask_table=new ST[256];
00888
00889 num_vectors=sf->get_num_vectors();
00890 ASSERT(num_vectors>0);
00891 max_string_length=sf->get_max_vector_length()-start;
00892 features=new T_STRING<ST>[num_vectors];
00893 CAlphabet* alpha=sf->get_alphabet();
00894 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
00895
00896 SG_DEBUG( "%1.0llf symbols in StringFeatures<*>\n", sf->get_num_symbols());
00897
00898 for (int32_t i=0; i<num_vectors; i++)
00899 {
00900 int32_t len=-1;
00901 CT* c=sf->get_feature_vector(i, len);
00902
00903 features[i].string=new ST[len];
00904 features[i].length=len;
00905
00906 ST* str=features[i].string;
00907 for (int32_t j=0; j<len; j++)
00908 str[j]=(ST) alpha->remap_to_bin(c[j]);
00909
00910 }
00911
00912 original_num_symbols=alpha->get_num_symbols();
00913 int32_t max_val=alpha->get_num_bits();
00914
00915 if (p_order>1)
00916 num_symbols=CMath::powl((float128_t) 2, (float128_t) max_val*p_order);
00917 else
00918 num_symbols=original_num_symbols;
00919 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
00920
00921 if ( ((float128_t) num_symbols) > CMath::powl(((float128_t) 2),((float128_t) sizeof(ST)*8)) )
00922 {
00923 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
00924 return false;
00925 }
00926
00927 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
00928 for (int32_t line=0; line<num_vectors; line++)
00929 {
00930 int32_t len=0;
00931 ST* fv=get_feature_vector(line, len);
00932
00933 if (rev)
00934 translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
00935 else
00936 translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
00937
00938
00939
00940
00941 features[line].length-=start+gap ;
00942 if (features[line].length<0)
00943 features[line].length=0 ;
00944 }
00945
00946 uint64_t mask=0;
00947 for (int32_t i=0; i< (int64_t) max_val; i++)
00948 mask=(mask<<1) | 1;
00949
00950 for (int32_t i=0; i<256; i++)
00951 {
00952 uint8_t bits=(uint8_t) i;
00953 symbol_mask_table[i]=0;
00954
00955 for (int32_t j=0; j<8; j++)
00956 {
00957 if (bits & 1)
00958 symbol_mask_table[i]|=mask<<(max_val*j);
00959
00960 bits>>=1;
00961 }
00962 }
00963
00964 return true;
00965 }
00966
00974 bool have_same_length(int32_t len)
00975 {
00976 if (len!=get_max_vector_length())
00977 return false;
00978
00979 for (int32_t i=0; i<num_vectors; i++)
00980 {
00981 if (get_vector_length(i)!=len)
00982 return false;
00983 }
00984
00985 return true;
00986 }
00987
00988 protected:
00997 void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00998 {
00999 int32_t i,j;
01000 ST value=0;
01001
01002 for (i=sequence_length-1; i>= p_order-1; i--)
01003 {
01004 value=0;
01005 for (j=i; j>=i-p_order+1; j--)
01006 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
01007
01008 obs[i]= (ST) value;
01009 }
01010
01011 for (i=p_order-2;i>=0;i--)
01012 {
01013 if (i>=sequence_length)
01014 continue;
01015
01016 value=0;
01017 for (j=i; j>=i-p_order+1; j--)
01018 {
01019 value= (value >> max_val);
01020 if (j>=0 && j<sequence_length)
01021 value|=obs[j] << (max_val * (p_order-1));
01022 }
01023 obs[i]=value;
01024 }
01025
01026 for (i=start; i<sequence_length; i++)
01027 obs[i-start]=obs[i];
01028 }
01029
01038 void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
01039 {
01040 int32_t i,j;
01041 ST value=0;
01042
01043 for (i=sequence_length-1; i>= p_order-1; i--)
01044 {
01045 value=0;
01046 for (j=i; j>=i-p_order+1; j--)
01047 value= (value << max_val) | obs[j];
01048
01049 obs[i]= (ST) value;
01050 }
01051
01052 for (i=p_order-2;i>=0;i--)
01053 {
01054 if (i>=sequence_length)
01055 continue;
01056
01057 value=0;
01058 for (j=i; j>=i-p_order+1; j--)
01059 {
01060 value= (value << max_val);
01061 if (j>=0 && j<sequence_length)
01062 value|=obs[j];
01063 }
01064 obs[i]=value;
01065 }
01066
01067 for (i=start; i<sequence_length; i++)
01068 obs[i-start]=obs[i];
01069 }
01070
01080 void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01081 {
01082 ASSERT(gap>=0);
01083
01084 const int32_t start_gap=(p_order-gap)/2;
01085 const int32_t end_gap=start_gap+gap;
01086
01087 int32_t i,j;
01088 ST value=0;
01089
01090
01091 for (i=sequence_length-1; i>=p_order-1; i--)
01092 {
01093 value=0;
01094 for (j=i; j>=i-p_order+1; j--)
01095 {
01096 if (i-j<start_gap)
01097 {
01098 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01099 }
01100 else if (i-j>=end_gap)
01101 {
01102 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
01103 }
01104 }
01105 obs[i]= (ST) value;
01106 }
01107
01108
01109 for (i=p_order-2;i>=0;i--)
01110 {
01111 if (i>=sequence_length)
01112 continue;
01113
01114 value=0;
01115 for (j=i; j>=i-p_order+1; j--)
01116 {
01117 if (i-j<start_gap)
01118 {
01119 value= (value >> max_val);
01120 if (j>=0 && j<sequence_length)
01121 value|=obs[j] << (max_val * (p_order-1-gap));
01122 }
01123 else if (i-j>=end_gap)
01124 {
01125 value= (value >> max_val);
01126 if (j>=0 && j<sequence_length)
01127 value|=obs[j] << (max_val * (p_order-1-gap));
01128 }
01129 }
01130 obs[i]=value;
01131 }
01132
01133
01134 for (i=start; i<sequence_length; i++)
01135 obs[i-start]=obs[i];
01136 }
01137
01147 void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
01148 {
01149 ASSERT(gap>=0);
01150
01151 const int32_t start_gap=(p_order-gap)/2;
01152 const int32_t end_gap=start_gap+gap;
01153
01154 int32_t i,j;
01155 ST value=0;
01156
01157
01158 for (i=sequence_length-1; i>=p_order-1; i--)
01159 {
01160 value=0;
01161 for (j=i; j>=i-p_order+1; j--)
01162 {
01163 if (i-j<start_gap)
01164 value= (value << max_val) | obs[j];
01165 else if (i-j>=end_gap)
01166 value= (value << max_val) | obs[j];
01167 }
01168 obs[i]= (ST) value;
01169 }
01170
01171
01172 for (i=p_order-2;i>=0;i--)
01173 {
01174 if (i>=sequence_length)
01175 continue;
01176
01177 value=0;
01178 for (j=i; j>=i-p_order+1; j--)
01179 {
01180 if (i-j<start_gap)
01181 {
01182 value= value << max_val;
01183 if (j>=0 && j<sequence_length)
01184 value|=obs[j];
01185 }
01186 else if (i-j>=end_gap)
01187 {
01188 value= value << max_val;
01189 if (j>=0 && j<sequence_length)
01190 value|=obs[j];
01191 }
01192 }
01193 obs[i]=value;
01194 }
01195
01196
01197 for (i=start; i<sequence_length; i++)
01198 obs[i-start]=obs[i];
01199 }
01200
01201 protected:
01202
/** alphabet of the stored symbols; created by every constructor and
 * replaced by set_features() and load_dna_file() */
CAlphabet* alphabet;

/** number of entries in features */
int32_t num_vectors;

/** array of num_vectors strings, NULL while nothing is stored */
T_STRING<ST>* features;

/** set once a single string has been cut into windows by
 * obtain_by_sliding_window()/obtain_by_position_list(); the entries of
 * features then point into this buffer, and cleanup() frees only this
 * buffer instead of the individual strings */
ST* single_string;

/** length of the string that was cut into windows */
int32_t length_of_single_string;

/** recorded length of the longest string (window length after cutting
 * a single string into windows) */
int32_t max_string_length;

/** current number of symbols; grows when higher order symbols are built
 * by obtain_from_char_features() */
float128_t num_symbols;

/** number of symbols of the alphabet itself */
float128_t original_num_symbols;

/** order passed to the last obtain_from_char_features() call, 0 before */
int32_t order;

/** index of the vector returned by get_string(), see
 * select_feature_vector() */
int32_t selected_vector;

/** 256 entry table used by get_masked_symbols(), built by
 * obtain_from_char_features(); NULL before that */
ST* symbol_mask_table;
01235 };
01236
01241 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01242 {
01243 return F_CHAR;
01244 }
01245
01250 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01251 {
01252 return F_BYTE;
01253 }
01254
01259 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01260 {
01261 return F_SHORT;
01262 }
01263
01268 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01269 {
01270 return F_WORD;
01271 }
01272
01277 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01278 {
01279 return F_INT;
01280 }
01281
01286 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01287 {
01288 return F_UINT;
01289 }
01290
01295 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01296 {
01297 return F_LONG;
01298 }
01299
01304 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01305 {
01306 return F_ULONG;
01307 }
01308
01313 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01314 {
01315 return F_DREAL;
01316 }
01317 #endif