Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2008 Soeren Sonnenburg
00008  * Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 
00018 
/**
 * Identifiers for the alphabet types understood by CAlphabet.
 *
 * The numeric values are assigned explicitly and are not contiguous
 * (the value 4 is unassigned). Keep them stable: code outside this header
 * may depend on the exact numbers.
 *
 * NOTE(review): the per-enumerator descriptions below are inferred from the
 * enumerator names only -- confirm against the implementation file.
 */
enum EAlphabet
{
    /** presumably the DNA nucleotide alphabet */
    DNA=0,

    /** presumably DNA in a raw (already numerically encoded) form */
    RAWDNA=1,

    /** presumably the RNA nucleotide alphabet */
    RNA=2,

    /** presumably the protein (amino acid) alphabet */
    PROTEIN=3,

    /** presumably alphanumeric symbols */
    ALPHANUM=5,

    /** presumably the faces of a die */
    CUBE=6,

    /** presumably arbitrary byte values */
    RAWBYTE=7,

    /** presumably the IUPAC nucleic acid codes */
    IUPAC_NUCLEIC_ACID=8,

    /** presumably the IUPAC amino acid codes */
    IUPAC_AMINO_ACID=9,

    /** presumably "no alphabet" */
    NONE=10,

    /** presumably "alphabet not known" */
    UNKNOWN=11
};
00055 
00056 
00065 class CAlphabet : public CSGObject
00066 {
00067     public:
00073         CAlphabet(char* alpha, int32_t len);
00074 
00079         CAlphabet(EAlphabet alpha);
00080 
00085         CAlphabet(CAlphabet* alpha);
00086         ~CAlphabet();
00087 
00092         bool set_alphabet(EAlphabet alpha);
00093 
00098         inline EAlphabet get_alphabet()
00099         {
00100             return alphabet;
00101         }
00102 
00107         inline int32_t get_num_symbols()
00108         {
00109             return num_symbols;
00110         }
00111 
00117         inline int32_t get_num_bits()
00118         {
00119             return num_bits;
00120         }
00121 
00127         inline uint8_t remap_to_bin(uint8_t c)
00128         {
00129             return maptable_to_bin[c];
00130         }
00131 
00137         inline uint8_t remap_to_char(uint8_t c)
00138         {
00139             return maptable_to_char[c];
00140         }
00141 
00143         void clear_histogram();
00144 
00150         void add_string_to_histogram(char* p, int64_t len);
00151 
00157         void add_string_to_histogram(uint8_t* p, int64_t len);
00158 
00164         void add_string_to_histogram(int16_t* p, int64_t len);
00165 
00171         void add_string_to_histogram(uint16_t* p, int64_t len);
00172 
00178         void add_string_to_histogram(int32_t* p, int64_t len);
00179 
00185         void add_string_to_histogram(uint32_t* p, int64_t len);
00186 
00192         void add_string_to_histogram(int64_t* p, int64_t len);
00193 
00199         void add_string_to_histogram(uint64_t* p, int64_t len);
00200 
00205         inline void add_byte_to_histogram(uint8_t p)
00206         {
00207             histogram[(int32_t) p]++;
00208         }
00209 
00211         void print_histogram();
00212 
00218         inline void get_hist(int64_t** h, int32_t* len)
00219         {
00220             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00221             ASSERT(h && len);
00222             *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00223             ASSERT(*h);
00224             *len=hist_size;
00225             ASSERT(*len);
00226             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00227         }
00228 
00230         inline const int64_t* get_histogram()
00231         {
00232             return &histogram[0];
00233         }
00234 
00241         bool check_alphabet(bool print_error=true);
00242 
00248         bool check_alphabet_size(bool print_error=true);
00249 
00254         int32_t get_num_symbols_in_histogram();
00255 
00260         int32_t get_max_value_in_histogram();
00261 
00268         int32_t get_num_bits_in_histogram();
00269 
00274         static const char* get_alphabet_name(EAlphabet alphabet);
00275 
00276     protected:
00278         void init_map_table();
00279 
00284         void copy_histogram(CAlphabet* src);
00285 
00286     public:
00288         static const uint8_t B_A;
00290         static const uint8_t B_C;
00292         static const uint8_t B_G;
00294         static const uint8_t B_T;
00296         static const uint8_t MAPTABLE_UNDEF;
00298         static const char* alphabet_names[11];
00299 
00300     protected:
00302         EAlphabet alphabet;
00304         int32_t num_symbols;
00306         int32_t num_bits;
00308         uint8_t valid_chars[1 << (sizeof(uint8_t)*8)];
00310         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00312         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00314         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00315 };
00316 #endif

SHOGUN Machine Learning Toolbox - Documentation