MindyGramFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006 Konrad Rieck
00008  * Written (W) 2006-2008 Soeren Sonnenburg
00009  * Copyright (C) 2006-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 
00014 #ifdef HAVE_MINDY
00015 
00016 #ifndef _CMINDYGRAMFEATURES__H__
00017 #define _CMINDYGRAMFEATURES__H__
00018 
00019 #include "features/Features.h"
00020 #include "features/CharFeatures.h"
00021 #include "features/StringFeatures.h"
00022 
00023 #include <mindy.h>
00024 
00025 // MindyGramFeatures
00026 class CMindyGramFeatures : public CFeatures
00027 {
00028     public:
00036         CMindyGramFeatures(CHAR *aname, CHAR *embed, CHAR *delim, BYTE nlen)
00037         : CFeatures(0)
00038         {
00039             ASSERT(aname && embed && delim);
00040 
00041             /* Allocate and generate gram configuration (words) */
00042             SG_DEBUG( "Initializing Mindy gram features\n");
00043             if (nlen == 0)
00044                 cfg = micfg_words(alph_get_type(aname), delim);
00045             else
00046                 cfg = micfg_ngrams(alph_get_type(aname), (byte_t) nlen);
00047 
00048             /* Set delimiters */
00049             if (strlen(delim) > 0)
00050                 micfg_set_delim(cfg, delim);
00051             
00052             /* Set embedding */
00053             if (!strcasecmp(embed, "freq"))
00054                 micfg_set_embed(cfg, ME_FREQ);
00055             else if (!strcasecmp(embed, "count"))
00056                 micfg_set_embed(cfg, ME_COUNT);
00057             else if (!strcasecmp(embed, "bin"))
00058                 micfg_set_embed(cfg, ME_BIN);
00059             else
00060                 SG_ERROR("Unknown embedding mode '%s'", embed);
00061 
00062             if (nlen == 0)
00063                SG_INFO("Mindy in word mode (d: '%s', a: %s, e: %s)\n",
00064                        delim, aname, micfg_get_embed(cfg->embed));
00065             else  
00066                SG_INFO("Mindy in n-gram mode (n: '%d', a: %s, e: %s)\n",
00067                        nlen, aname, micfg_get_embed(cfg->embed));
00068         }
00069 
00074 #if 0
00075          CMindyGramFeatures(const CMindyGramFeatures & orig) : CFeatures(orig)
00076          {
00077                  SG_DEBUG( "Duplicating Mindy gram features\n");
00078                  num_vectors = orig.num_vectors;
00079 
00080                  /* Clone configuration */
00081                  cfg = micfg_clone(orig.cfg);
00082 
00083                  /* Clone gram vectors */
00084                  vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00085                  for (INT i = 0; i < num_vectors; i++)
00086                          vectors[i] = gram_clone(orig.vectors[i]);
00087                 }
00088 #endif
00089 
00095         template <class T> 
00096         bool import_features(CStringFeatures<T> *sf)
00097         {
00098             INT i;
00099             num_vectors = sf->get_num_vectors();
00100             SG_INFO( "Importing %ld string features\n", num_vectors);
00101 
00102             vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00103             if (!vectors) {
00104                 SG_ERROR( "Could not allocate memory\n");
00105                 return false;
00106             }
00107 
00108             for (i = 0; i < num_vectors; i++) {
00109                 INT len;
00110                 T *s = sf->get_feature_vector(i, len);
00111                 vectors[i] = gram_extract(cfg, (byte_t *) s, (size_t) len);
00112 
00113                 SG_DEBUG( "Extracted gram vector %d: %d grams\n", i, 
00114                         vectors[i]->num);
00115             }
00116 
00117             return true;
00118         }
00119 
00120         /* Destructors */
00121         ~CMindyGramFeatures();
00122 
00123         CFeatures *duplicate() const;
00124 
00125         /* Feature and vector functions */
00126         gram_t *get_feature_vector(INT i);
00127         void set_feature_vector(INT i, gram_t * g);
00128         ULONG get_feature(INT i, INT j);
00129         INT get_vector_length(INT i);
00130         void trim_max(double m);
00131 
00132         /* Simple functions */
00133         virtual INT get_num_vectors() { return num_vectors; }
00134         virtual INT get_size() { return sizeof(gram_t *); }
00135         EFeatureClass get_feature_class() { return C_MINDYGRAM; }
00136         EFeatureType get_feature_type() { return F_ULONG; }
00137 
00138     protected:
00139         /* Import and load functions */
00140         virtual bool load(CHAR * fname);
00141 
00142     private:
00144         INT num_vectors;
00146         gram_t **vectors;
00148         micfg_t *cfg;
00149 };
00150 #endif
00151 #endif

SHOGUN Machine Learning Toolbox - Documentation