Alphabet.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2008 Soeren Sonnenburg
00008  * Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include "features/Alphabet.h"
00015 #include "lib/io.h"
00016 
00017 //define numbers for the bases 
00018 const BYTE CAlphabet::B_A=0;
00019 const BYTE CAlphabet::B_C=1;
00020 const BYTE CAlphabet::B_G=2;
00021 const BYTE CAlphabet::B_T=3;
00022 const BYTE CAlphabet::MAPTABLE_UNDEF=0xff;
00023 const CHAR* CAlphabet::alphabet_names[11]={"DNA", "RAWDNA", "RNA", "PROTEIN", "ALPHANUM", "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", "NONE", "UNKNOWN"};
00024 
00025 CAlphabet::CAlphabet(CHAR* al, INT len)
00026 : CSGObject()
00027 {
00028     E_ALPHABET alpha=NONE;
00029 
00030     if (len>=(INT) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00031         alpha = DNA;
00032     else if (len>=(INT) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00033         alpha = RAWDNA;
00034     else if (len>=(INT) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00035         alpha = RNA;
00036     else if (len>=(INT) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00037         alpha = PROTEIN;
00038     else if (len>=(INT) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00039         alpha = ALPHANUM;
00040     else if (len>=(INT) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00041         alpha = CUBE;
00042     else if ((len>=(INT) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) || 
00043             (len>=(INT) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00044         alpha = RAWBYTE;
00045     else if (len>=(INT) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00046         alpha = IUPAC_NUCLEIC_ACID;
00047     else if (len>=(INT) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00048         alpha = IUPAC_AMINO_ACID;
00049     else {
00050       SG_ERROR( "unknown alphabet %s\n", al);
00051    }
00052     
00053     set_alphabet(alpha);
00054 }
00055 
00056 CAlphabet::CAlphabet(E_ALPHABET alpha)
00057 : CSGObject()
00058 {
00059     set_alphabet(alpha);
00060 }
00061 
00062 CAlphabet::CAlphabet(CAlphabet* a)
00063 : CSGObject()
00064 {
00065     ASSERT(a);
00066     set_alphabet(a->get_alphabet());
00067     copy_histogram(a);
00068 }
00069 
00070 CAlphabet::~CAlphabet()
00071 {
00072 }
00073 
00074 bool CAlphabet::set_alphabet(E_ALPHABET alpha)
00075 {
00076     bool result=true;
00077     alphabet=alpha;
00078 
00079     switch (alphabet)
00080     {
00081         case DNA:
00082         case RAWDNA:
00083             num_symbols = 4;
00084             break;
00085         case RNA:
00086             num_symbols = 4;
00087             break;
00088         case PROTEIN:
00089             num_symbols = 26;
00090             break;
00091         case ALPHANUM:
00092             num_symbols = 36;
00093             break;
00094         case CUBE:
00095             num_symbols = 6;
00096             break;
00097         case RAWBYTE:
00098             num_symbols = 256;
00099             break;
00100         case IUPAC_NUCLEIC_ACID:
00101             num_symbols = 16;
00102             break;
00103         case IUPAC_AMINO_ACID:
00104             num_symbols = 23;
00105             break;
00106         case NONE:
00107             num_symbols = 0;
00108             break;
00109         default:
00110             num_symbols = 0;
00111             result=false;
00112             break;
00113     }
00114 
00115     num_bits=(INT) ceil(log((double) num_symbols)/log((double) 2));
00116     init_map_table();
00117     clear_histogram();
00118 
00119     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00120 
00121     return result;
00122 }
00123 
00124 void CAlphabet::init_map_table()
00125 {
00126     INT i;
00127     for (i=0; i<(1<<(8*sizeof(BYTE))); i++)
00128     {
00129         maptable_to_bin[i] = MAPTABLE_UNDEF;
00130         maptable_to_char[i] = MAPTABLE_UNDEF;
00131         valid_chars[i] = 0;
00132     }
00133 
00134     switch (alphabet)
00135     {
00136         case CUBE:
00137             valid_chars[(BYTE) '1']=1;
00138             valid_chars[(BYTE) '2']=1;
00139             valid_chars[(BYTE) '3']=1;
00140             valid_chars[(BYTE) '4']=1;  
00141             valid_chars[(BYTE) '5']=1;  
00142             valid_chars[(BYTE) '6']=1;  //Translation '123456' -> 012345
00143 
00144             maptable_to_bin[(BYTE) '1']=0;
00145             maptable_to_bin[(BYTE) '2']=1;
00146             maptable_to_bin[(BYTE) '3']=2;
00147             maptable_to_bin[(BYTE) '4']=3;  
00148             maptable_to_bin[(BYTE) '5']=4;  
00149             maptable_to_bin[(BYTE) '6']=5;  //Translation '123456' -> 012345
00150 
00151             maptable_to_char[(BYTE) 0]='1';
00152             maptable_to_char[(BYTE) 1]='2';
00153             maptable_to_char[(BYTE) 2]='3';
00154             maptable_to_char[(BYTE) 3]='4';
00155             maptable_to_char[(BYTE) 4]='5';
00156             maptable_to_char[(BYTE) 5]='6'; //Translation 012345->'123456'
00157             break;
00158 
00159         case PROTEIN:
00160             {
00161                 INT skip=0 ;
00162                 for (i=0; i<21; i++)
00163                 {
00164                     if (i==1) skip++ ;
00165                     if (i==8) skip++ ;
00166                     if (i==12) skip++ ;
00167                     if (i==17) skip++ ;
00168                     valid_chars['A'+i+skip]=1;
00169                     maptable_to_bin['A'+i+skip]=i ;
00170                     maptable_to_char[i]='A'+i+skip ;
00171                 } ;                   //Translation 012345->acde...xy -- the protein code
00172             } ;
00173             break;
00174 
00175         case ALPHANUM:
00176             {
00177                 for (i=0; i<26; i++)
00178                 {
00179                     valid_chars['A'+i]=1;
00180                     maptable_to_bin['A'+i]=i ;
00181                     maptable_to_char[i]='A'+i ;
00182                 } ;
00183                 for (i=0; i<10; i++)
00184                 {
00185                     valid_chars['0'+i]=1;
00186                     maptable_to_bin['0'+i]=26+i ;
00187                     maptable_to_char[26+i]='0'+i ;
00188                 } ;        //Translation 012345->acde...xy0123456789
00189             } ;
00190             break;
00191 
00192         case RAWBYTE:
00193             {
00194                 //identity
00195                 for (i=0; i<256; i++)
00196                 {
00197                     valid_chars[i]=1;
00198                     maptable_to_bin[i]=i;
00199                     maptable_to_char[i]=i;
00200                 }
00201             }
00202             break;
00203 
00204         case DNA:
00205             valid_chars[(BYTE) 'A']=1;
00206             valid_chars[(BYTE) 'C']=1;
00207             valid_chars[(BYTE) 'G']=1;
00208             valid_chars[(BYTE) 'T']=1;  
00209 
00210             maptable_to_bin[(BYTE) 'A']=B_A;
00211             maptable_to_bin[(BYTE) 'C']=B_C;
00212             maptable_to_bin[(BYTE) 'G']=B_G;
00213             maptable_to_bin[(BYTE) 'T']=B_T;    
00214 
00215             maptable_to_char[B_A]='A';
00216             maptable_to_char[B_C]='C';
00217             maptable_to_char[B_G]='G';
00218             maptable_to_char[B_T]='T';
00219             break;
00220         case RAWDNA:
00221             {
00222                 //identity
00223                 for (i=0; i<4; i++)
00224                 {
00225                     valid_chars[i]=1;
00226                     maptable_to_bin[i]=i;
00227                     maptable_to_char[i]=i;
00228                 }
00229             }
00230             break;
00231 
00232         case RNA:
00233             valid_chars[(BYTE) 'A']=1;
00234             valid_chars[(BYTE) 'C']=1;
00235             valid_chars[(BYTE) 'G']=1;
00236             valid_chars[(BYTE) 'U']=1;  
00237 
00238             maptable_to_bin[(BYTE) 'A']=B_A;
00239             maptable_to_bin[(BYTE) 'C']=B_C;
00240             maptable_to_bin[(BYTE) 'G']=B_G;
00241             maptable_to_bin[(BYTE) 'U']=B_T;    
00242 
00243             maptable_to_char[B_A]='A';
00244             maptable_to_char[B_C]='C';
00245             maptable_to_char[B_G]='G';
00246             maptable_to_char[B_T]='U';
00247             break;
00248 
00249         case IUPAC_NUCLEIC_ACID:
00250             valid_chars[(BYTE) 'A']=1; // A Adenine
00251             valid_chars[(BYTE) 'C']=1; // C Cytosine
00252             valid_chars[(BYTE) 'G']=1; // G Guanine
00253             valid_chars[(BYTE) 'T']=1; // T Thymine
00254             valid_chars[(BYTE) 'U']=1; // U Uracil
00255             valid_chars[(BYTE) 'R']=1; // R Purine (A or G)
00256             valid_chars[(BYTE) 'Y']=1; // Y Pyrimidine (C, T, or U)
00257             valid_chars[(BYTE) 'M']=1; // M C or A
00258             valid_chars[(BYTE) 'K']=1; // K T, U, or G
00259             valid_chars[(BYTE) 'W']=1; // W T, U, or A
00260             valid_chars[(BYTE) 'S']=1; // S C or G
00261             valid_chars[(BYTE) 'B']=1; // B C, T, U, or G (not A)
00262             valid_chars[(BYTE) 'D']=1; // D A, T, U, or G (not C)
00263             valid_chars[(BYTE) 'H']=1; // H A, T, U, or C (not G)
00264             valid_chars[(BYTE) 'V']=1; // V A, C, or G (not T, not U)
00265             valid_chars[(BYTE) 'N']=1; // N Any base (A, C, G, T, or U)
00266 
00267             maptable_to_bin[(BYTE) 'A']=0; // A Adenine
00268             maptable_to_bin[(BYTE) 'C']=1; // C Cytosine
00269             maptable_to_bin[(BYTE) 'G']=2; // G Guanine
00270             maptable_to_bin[(BYTE) 'T']=3; // T Thymine
00271             maptable_to_bin[(BYTE) 'U']=4; // U Uracil
00272             maptable_to_bin[(BYTE) 'R']=5; // R Purine (A or G)
00273             maptable_to_bin[(BYTE) 'Y']=6; // Y Pyrimidine (C, T, or U)
00274             maptable_to_bin[(BYTE) 'M']=7; // M C or A
00275             maptable_to_bin[(BYTE) 'K']=8; // K T, U, or G
00276             maptable_to_bin[(BYTE) 'W']=9; // W T, U, or A
00277             maptable_to_bin[(BYTE) 'S']=10; // S    C or G
00278             maptable_to_bin[(BYTE) 'B']=11; // B    C, T, U, or G (not A)
00279             maptable_to_bin[(BYTE) 'D']=12; // D    A, T, U, or G (not C)
00280             maptable_to_bin[(BYTE) 'H']=13; // H    A, T, U, or C (not G)
00281             maptable_to_bin[(BYTE) 'V']=14; // V    A, C, or G (not T, not U)
00282             maptable_to_bin[(BYTE) 'N']=15; // N    Any base (A, C, G, T, or U)
00283 
00284             maptable_to_char[0]=(BYTE) 'A'; // A    Adenine
00285             maptable_to_char[1]=(BYTE) 'C'; // C    Cytosine
00286             maptable_to_char[2]=(BYTE) 'G'; // G    Guanine
00287             maptable_to_char[3]=(BYTE) 'T'; // T    Thymine
00288             maptable_to_char[4]=(BYTE) 'U'; // U    Uracil
00289             maptable_to_char[5]=(BYTE) 'R'; // R    Purine (A or G)
00290             maptable_to_char[6]=(BYTE) 'Y'; // Y    Pyrimidine (C, T, or U)
00291             maptable_to_char[7]=(BYTE) 'M'; // M    C or A
00292             maptable_to_char[8]=(BYTE) 'K'; // K    T, U, or G
00293             maptable_to_char[9]=(BYTE) 'W'; // W    T, U, or A
00294             maptable_to_char[10]=(BYTE) 'S'; // S   C or G
00295             maptable_to_char[11]=(BYTE) 'B'; // B   C, T, U, or G (not A)
00296             maptable_to_char[12]=(BYTE) 'D'; // D   A, T, U, or G (not C)
00297             maptable_to_char[13]=(BYTE) 'H'; // H   A, T, U, or C (not G)
00298             maptable_to_char[14]=(BYTE) 'V'; // V   A, C, or G (not T, not U)
00299             maptable_to_char[15]=(BYTE) 'N'; // N   Any base (A, C, G, T, or U)
00300             break;
00301 
00302         case IUPAC_AMINO_ACID:
00303             valid_chars[(BYTE) 'A']=0;  //A Ala Alanine
00304             valid_chars[(BYTE) 'R']=1;  //R Arg Arginine
00305             valid_chars[(BYTE) 'N']=2;  //N Asn Asparagine
00306             valid_chars[(BYTE) 'D']=3;  //D Asp Aspartic acid
00307             valid_chars[(BYTE) 'C']=4;  //C Cys Cysteine
00308             valid_chars[(BYTE) 'Q']=5;  //Q Gln Glutamine
00309             valid_chars[(BYTE) 'E']=6;  //E Glu Glutamic acid
00310             valid_chars[(BYTE) 'G']=7;  //G Gly Glycine
00311             valid_chars[(BYTE) 'H']=8;  //H His Histidine
00312             valid_chars[(BYTE) 'I']=9;  //I Ile Isoleucine
00313             valid_chars[(BYTE) 'L']=10; //L Leu Leucine
00314             valid_chars[(BYTE) 'K']=11; //K Lys Lysine
00315             valid_chars[(BYTE) 'M']=12; //M Met Methionine
00316             valid_chars[(BYTE) 'F']=13; //F Phe Phenylalanine
00317             valid_chars[(BYTE) 'P']=14; //P Pro Proline
00318             valid_chars[(BYTE) 'S']=15; //S Ser Serine
00319             valid_chars[(BYTE) 'T']=16; //T Thr Threonine
00320             valid_chars[(BYTE) 'W']=17; //W Trp Tryptophan
00321             valid_chars[(BYTE) 'Y']=18; //Y Tyr Tyrosine
00322             valid_chars[(BYTE) 'V']=19; //V Val Valine
00323             valid_chars[(BYTE) 'B']=20; //B Asx Aspartic acid or Asparagine
00324             valid_chars[(BYTE) 'Z']=21; //Z Glx Glutamine or Glutamic acid
00325             valid_chars[(BYTE) 'X']=22; //X Xaa Any amino acid
00326 
00327             maptable_to_bin[(BYTE) 'A']=0;  //A Ala Alanine
00328             maptable_to_bin[(BYTE) 'R']=1;  //R Arg Arginine
00329             maptable_to_bin[(BYTE) 'N']=2;  //N Asn Asparagine
00330             maptable_to_bin[(BYTE) 'D']=3;  //D Asp Aspartic acid
00331             maptable_to_bin[(BYTE) 'C']=4;  //C Cys Cysteine
00332             maptable_to_bin[(BYTE) 'Q']=5;  //Q Gln Glutamine
00333             maptable_to_bin[(BYTE) 'E']=6;  //E Glu Glutamic acid
00334             maptable_to_bin[(BYTE) 'G']=7;  //G Gly Glycine
00335             maptable_to_bin[(BYTE) 'H']=8;  //H His Histidine
00336             maptable_to_bin[(BYTE) 'I']=9;  //I Ile Isoleucine
00337             maptable_to_bin[(BYTE) 'L']=10; //L Leu Leucine
00338             maptable_to_bin[(BYTE) 'K']=11; //K Lys Lysine
00339             maptable_to_bin[(BYTE) 'M']=12; //M Met Methionine
00340             maptable_to_bin[(BYTE) 'F']=13; //F Phe Phenylalanine
00341             maptable_to_bin[(BYTE) 'P']=14; //P Pro Proline
00342             maptable_to_bin[(BYTE) 'S']=15; //S Ser Serine
00343             maptable_to_bin[(BYTE) 'T']=16; //T Thr Threonine
00344             maptable_to_bin[(BYTE) 'W']=17; //W Trp Tryptophan
00345             maptable_to_bin[(BYTE) 'Y']=18; //Y Tyr Tyrosine
00346             maptable_to_bin[(BYTE) 'V']=19; //V Val Valine
00347             maptable_to_bin[(BYTE) 'B']=20; //B Asx Aspartic acid or Asparagine
00348             maptable_to_bin[(BYTE) 'Z']=21; //Z Glx Glutamine or Glutamic acid
00349             maptable_to_bin[(BYTE) 'X']=22; //X Xaa Any amino acid
00350 
00351             maptable_to_char[0]=(BYTE) 'A';  //A    Ala Alanine
00352             maptable_to_char[1]=(BYTE) 'R';  //R    Arg Arginine
00353             maptable_to_char[2]=(BYTE) 'N';  //N    Asn Asparagine
00354             maptable_to_char[3]=(BYTE) 'D';  //D    Asp Aspartic acid
00355             maptable_to_char[4]=(BYTE) 'C';  //C    Cys Cysteine
00356             maptable_to_char[5]=(BYTE) 'Q';  //Q    Gln Glutamine
00357             maptable_to_char[6]=(BYTE) 'E';  //E    Glu Glutamic acid
00358             maptable_to_char[7]=(BYTE) 'G';  //G    Gly Glycine
00359             maptable_to_char[8]=(BYTE) 'H';  //H    His Histidine
00360             maptable_to_char[9]=(BYTE) 'I';  //I    Ile Isoleucine
00361             maptable_to_char[10]=(BYTE) 'L'; //L    Leu Leucine
00362             maptable_to_char[11]=(BYTE) 'K'; //K    Lys Lysine
00363             maptable_to_char[12]=(BYTE) 'M'; //M    Met Methionine
00364             maptable_to_char[13]=(BYTE) 'F'; //F    Phe Phenylalanine
00365             maptable_to_char[14]=(BYTE) 'P'; //P    Pro Proline
00366             maptable_to_char[15]=(BYTE) 'S'; //S    Ser Serine
00367             maptable_to_char[16]=(BYTE) 'T'; //T    Thr Threonine
00368             maptable_to_char[17]=(BYTE) 'W'; //W    Trp Tryptophan
00369             maptable_to_char[18]=(BYTE) 'Y'; //Y    Tyr Tyrosine
00370             maptable_to_char[19]=(BYTE) 'V'; //V    Val Valine
00371             maptable_to_char[20]=(BYTE) 'B'; //B    Asx Aspartic acid or Asparagine
00372             maptable_to_char[21]=(BYTE) 'Z'; //Z    Glx Glutamine or Glutamic acid
00373             maptable_to_char[22]=(BYTE) 'X'; //X    Xaa Any amino acid
00374         default:
00375             break; //leave uninitialised
00376     };
00377 }
00378 
00379 void CAlphabet::clear_histogram()
00380 {
00381     memset(histogram, 0, sizeof(histogram));
00382     print_histogram();
00383 }
00384 
00385 void CAlphabet::add_string_to_histogram(BYTE* p, LONG len)
00386 {
00387     for (LONG i=0; i<len; i++)
00388         add_byte_to_histogram(p[i]);
00389 }
00390 
00391 void CAlphabet::add_string_to_histogram(CHAR* p, LONG len)
00392 {
00393     for (LONG i=0; i<len; i++)
00394         add_byte_to_histogram(p[i]);
00395 }
00396 
00397 void CAlphabet::add_string_to_histogram(WORD* p, LONG len)
00398 {
00399     SG_WARNING("computing byte histogram over word strings\n");
00400     BYTE* b= (BYTE*) p;
00401     for (LONG i=0; i<((LONG) sizeof(WORD))*len; i++)
00402         add_byte_to_histogram(b[i]);
00403 }
00404 
00405 void CAlphabet::add_string_to_histogram(SHORT* p, LONG len)
00406 {
00407     SG_WARNING("computing byte histogram over word strings\n");
00408     BYTE* b= (BYTE*) p;
00409     for (LONG i=0; i<((LONG) sizeof(SHORT))*len; i++)
00410         add_byte_to_histogram(b[i]);
00411 }
00412 
00413 void CAlphabet::add_string_to_histogram(INT* p, LONG len)
00414 {
00415     SG_WARNING("computing byte histogram over word strings\n");
00416     BYTE* b= (BYTE*) p;
00417     for (LONG i=0; i<((LONG) sizeof(INT))*len; i++)
00418         add_byte_to_histogram(b[i]);
00419 }
00420 
00421 void CAlphabet::add_string_to_histogram(UINT* p, LONG len)
00422 {
00423     SG_WARNING("computing byte histogram over word strings\n");
00424     BYTE* b= (BYTE*) p;
00425     for (LONG i=0; i<((LONG) sizeof(UINT))*len; i++)
00426         add_byte_to_histogram(b[i]);
00427 }
00428 
00429 void CAlphabet::add_string_to_histogram(LONG* p, LONG len)
00430 {
00431     SG_WARNING("computing byte histogram over word strings\n");
00432     BYTE* b= (BYTE*) p;
00433     for (LONG i=0; i<((LONG) sizeof(LONG))*len; i++)
00434         add_byte_to_histogram(b[i]);
00435 }
00436 
00437 void CAlphabet::add_string_to_histogram(ULONG* p, LONG len)
00438 {
00439     SG_WARNING("computing byte histogram over word strings\n");
00440     BYTE* b= (BYTE*) p;
00441     for (LONG i=0; i<((LONG) sizeof(ULONG))*len; i++)
00442         add_byte_to_histogram(b[i]);
00443 }
00444 
00445 INT CAlphabet::get_max_value_in_histogram()
00446 {
00447     INT max_sym=-1;
00448     for (INT i=(INT) (1 <<(sizeof(BYTE)*8))-1;i>=0; i--)
00449     {
00450         if (histogram[i])
00451         {
00452             max_sym=i;
00453             break;
00454         }
00455     }
00456 
00457     return max_sym;
00458 }
00459 
00460 INT CAlphabet::get_num_symbols_in_histogram()
00461 {
00462     INT num_sym=0;
00463     for (INT i=0; i<(INT) (1 <<(sizeof(BYTE)*8)); i++)
00464     {
00465         if (histogram[i])
00466             num_sym++;
00467     }
00468 
00469     return num_sym;
00470 }
00471 
00472 INT CAlphabet::get_num_bits_in_histogram()
00473 {
00474     INT num_sym=get_num_symbols_in_histogram();
00475     if (num_sym>0)
00476         return (INT) ceil(log((double) num_sym)/log((double) 2));
00477     else
00478         return 0;
00479 }
00480 
00481 void CAlphabet::print_histogram()
00482 {
00483     for (INT i=0; i<(INT) (1 <<(sizeof(BYTE)*8)); i++)
00484     {
00485         if (histogram[i])
00486             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00487     }
00488 }
00489 
00490 bool CAlphabet::check_alphabet(bool print_error)
00491 {
00492     bool result = true;
00493 
00494     for (INT i=0; i<(INT) (1 <<(sizeof(BYTE)*8)); i++)
00495     {
00496         if (histogram[i]>0 && valid_chars[i]==0)
00497         {
00498             result=false;
00499             break;
00500         }
00501     }
00502 
00503     if (!result && print_error)
00504     {
00505         print_histogram();
00506       SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00507     }
00508 
00509     return result;
00510 }
00511 
00512 bool CAlphabet::check_alphabet_size(bool print_error)
00513 {
00514     if (get_num_bits_in_histogram() > get_num_bits())
00515     {
00516         if (print_error)
00517         {
00518             print_histogram();
00519             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00520          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00521         }
00522         return false;
00523     }
00524     else
00525         return true;
00526 
00527 }
00528 
00529 void CAlphabet::copy_histogram(CAlphabet* a)
00530 {
00531     memcpy(histogram, a->get_histogram(), sizeof(histogram));
00532 }
00533 
00534 const CHAR* CAlphabet::get_alphabet_name(E_ALPHABET alphabet)
00535 {
00536     
00537     INT idx;
00538     switch (alphabet)
00539     {
00540         case DNA:
00541             idx=0;
00542             break;
00543         case RAWDNA:
00544             idx=1;
00545             break;
00546         case RNA:
00547             idx=2;
00548             break;
00549         case PROTEIN:
00550             idx=3;
00551             break;
00552         case ALPHANUM:
00553             idx=4;
00554             break;
00555         case CUBE:
00556             idx=5;
00557             break;
00558         case RAWBYTE:
00559             idx=6;
00560             break;
00561         case IUPAC_NUCLEIC_ACID:
00562             idx=7;
00563             break;
00564         case IUPAC_AMINO_ACID:
00565             idx=8;
00566             break;
00567         case NONE:
00568             idx=9;
00569             break;
00570         default:
00571             idx=10;
00572             break;
00573     }
00574     return alphabet_names[idx];
00575 }

SHOGUN Machine Learning Toolbox - Documentation