Alphabet.h
Go to the documentation of this file.
00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017
00018
/** Identifiers for the alphabet types a CAlphabet can represent.
 *
 * The numeric values are part of the interface and must not be renumbered.
 * Note that the value 4 is not assigned to any enumerator, so the values are
 * NOT contiguous and must not be used directly as a dense array index.
 */
enum EAlphabet
{
	/** DNA alphabet */
	DNA=0,

	/** raw DNA alphabet */
	RAWDNA=1,

	/** RNA alphabet */
	RNA=2,

	/** protein alphabet */
	PROTEIN=3,

	/* NOTE(review): 4 is intentionally left unassigned here; presumably it
	 * is reserved for (or was once used by) another alphabet - confirm
	 * before reusing it. */

	/** alphanumeric alphabet */
	ALPHANUM=5,

	/** cube alphabet */
	CUBE=6,

	/** raw byte alphabet */
	RAWBYTE=7,

	/** IUPAC nucleic acid alphabet */
	IUPAC_NUCLEIC_ACID=8,

	/** IUPAC amino acid alphabet */
	IUPAC_AMINO_ACID=9,

	/** no alphabet */
	NONE=10,

	/** unknown alphabet (no trailing comma: ill-formed before C++11) */
	UNKNOWN=11
};
00055
00056
00065 class CAlphabet : public CSGObject
00066 {
00067 public:
00073 CAlphabet(char* alpha, int32_t len);
00074
00079 CAlphabet(EAlphabet alpha);
00080
00085 CAlphabet(CAlphabet* alpha);
00086 ~CAlphabet();
00087
00092 bool set_alphabet(EAlphabet alpha);
00093
00098 inline EAlphabet get_alphabet()
00099 {
00100 return alphabet;
00101 }
00102
00107 inline int32_t get_num_symbols()
00108 {
00109 return num_symbols;
00110 }
00111
00117 inline int32_t get_num_bits()
00118 {
00119 return num_bits;
00120 }
00121
00127 inline uint8_t remap_to_bin(uint8_t c)
00128 {
00129 return maptable_to_bin[c];
00130 }
00131
00137 inline uint8_t remap_to_char(uint8_t c)
00138 {
00139 return maptable_to_char[c];
00140 }
00141
00143 void clear_histogram();
00144
00150 void add_string_to_histogram(char* p, int64_t len);
00151
00157 void add_string_to_histogram(uint8_t* p, int64_t len);
00158
00164 void add_string_to_histogram(int16_t* p, int64_t len);
00165
00171 void add_string_to_histogram(uint16_t* p, int64_t len);
00172
00178 void add_string_to_histogram(int32_t* p, int64_t len);
00179
00185 void add_string_to_histogram(uint32_t* p, int64_t len);
00186
00192 void add_string_to_histogram(int64_t* p, int64_t len);
00193
00199 void add_string_to_histogram(uint64_t* p, int64_t len);
00200
00205 inline void add_byte_to_histogram(uint8_t p)
00206 {
00207 histogram[(int32_t) p]++;
00208 }
00209
00211 void print_histogram();
00212
00218 inline void get_hist(int64_t** h, int32_t* len)
00219 {
00220 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00221 ASSERT(h && len);
00222 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00223 ASSERT(*h);
00224 *len=hist_size;
00225 ASSERT(*len);
00226 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00227 }
00228
00230 inline const int64_t* get_histogram()
00231 {
00232 return &histogram[0];
00233 }
00234
00241 bool check_alphabet(bool print_error=true);
00242
00248 bool check_alphabet_size(bool print_error=true);
00249
00254 int32_t get_num_symbols_in_histogram();
00255
00260 int32_t get_max_value_in_histogram();
00261
00268 int32_t get_num_bits_in_histogram();
00269
00274 static const char* get_alphabet_name(EAlphabet alphabet);
00275
00276 protected:
00278 void init_map_table();
00279
00284 void copy_histogram(CAlphabet* src);
00285
00286 public:
00288 static const uint8_t B_A;
00290 static const uint8_t B_C;
00292 static const uint8_t B_G;
00294 static const uint8_t B_T;
00296 static const uint8_t MAPTABLE_UNDEF;
00298 static const char* alphabet_names[11];
00299
00300 protected:
00302 EAlphabet alphabet;
00304 int32_t num_symbols;
00306 int32_t num_bits;
00308 uint8_t valid_chars[1 << (sizeof(uint8_t)*8)];
00310 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00312 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00314 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00315 };
00316 #endif