MindyGramFeatures.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/config.h"
00013
00014 #ifdef HAVE_MINDY
00015
00016 #ifndef _CMINDYGRAMFEATURES__H__
00017 #define _CMINDYGRAMFEATURES__H__
00018
00019 #include "features/Features.h"
00020 #include "features/CharFeatures.h"
00021 #include "features/StringFeatures.h"
00022
00023 #include <mindy.h>
00024
00025
00026 class CMindyGramFeatures : public CFeatures
00027 {
00028 public:
00036 CMindyGramFeatures(char *aname, char *embed, char *delim, uint8_t nlen)
00037 : CFeatures(0)
00038 {
00039 ASSERT(aname && embed && delim);
00040
00041
00042 SG_DEBUG( "Initializing Mindy gram features\n");
00043 if (nlen == 0)
00044 cfg = micfg_words(alph_get_type(aname), delim);
00045 else
00046 cfg = micfg_ngrams(alph_get_type(aname), (byte_t) nlen);
00047
00048
00049 if (strlen(delim) > 0)
00050 micfg_set_delim(cfg, delim);
00051
00052
00053 if (!strcasecmp(embed, "freq"))
00054 micfg_set_embed(cfg, ME_FREQ);
00055 else if (!strcasecmp(embed, "count"))
00056 micfg_set_embed(cfg, ME_COUNT);
00057 else if (!strcasecmp(embed, "bin"))
00058 micfg_set_embed(cfg, ME_BIN);
00059 else
00060 SG_ERROR("Unknown embedding mode '%s'", embed);
00061
00062 if (nlen == 0)
00063 SG_INFO("Mindy in word mode (d: '%s', a: %s, e: %s)\n",
00064 delim, aname, micfg_get_embed(cfg->embed));
00065 else
00066 SG_INFO("Mindy in n-gram mode (n: '%d', a: %s, e: %s)\n",
00067 nlen, aname, micfg_get_embed(cfg->embed));
00068 }
00069
00074 #if 0
00075 CMindyGramFeatures(const CMindyGramFeatures & orig) : CFeatures(orig)
00076 {
00077 SG_DEBUG( "Duplicating Mindy gram features\n");
00078 num_vectors = orig.num_vectors;
00079
00080
00081 cfg = micfg_clone(orig.cfg);
00082
00083
00084 vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00085 for (int32_t i = 0; i < num_vectors; i++)
00086 vectors[i] = gram_clone(orig.vectors[i]);
00087 }
00088 #endif
00089
00095 template <class T>
00096 bool import_features(CStringFeatures<T> *sf)
00097 {
00098 int32_t i;
00099 num_vectors = sf->get_num_vectors();
00100 SG_INFO( "Importing %ld string features\n", num_vectors);
00101
00102 vectors = (gram_t **) calloc(num_vectors, sizeof(gram_t *));
00103 if (!vectors) {
00104 SG_ERROR( "Could not allocate memory\n");
00105 return false;
00106 }
00107
00108 for (i = 0; i < num_vectors; i++) {
00109 int32_t len;
00110 T *s = sf->get_feature_vector(i, len);
00111 vectors[i] = gram_extract(cfg, (byte_t *) s, (size_t) len);
00112
00113 SG_DEBUG( "Extracted gram vector %d: %d grams\n", i,
00114 vectors[i]->num);
00115 }
00116
00117 return true;
00118 }
00119
00120
00121 ~CMindyGramFeatures();
00122
00123 CFeatures *duplicate() const;
00124
00125
00126 gram_t *get_feature_vector(int32_t i);
00127 void set_feature_vector(int32_t i, gram_t * g);
00128 uint64_t get_feature(int32_t i, int32_t j);
00129 int32_t get_vector_length(int32_t i);
00130 void trim_max(float64_t m);
00131
00132
00133 virtual int32_t get_num_vectors() { return num_vectors; }
00134 virtual int32_t get_size() { return sizeof(gram_t *); }
00135 EFeatureClass get_feature_class() { return C_MINDYGRAM; }
00136 EFeatureType get_feature_type() { return F_ULONG; }
00137
00138 protected:
00139
00140 virtual bool load(char * fname);
00141
00142 private:
00144 int32_t num_vectors;
00146 gram_t **vectors;
00148 micfg_t *cfg;
00149 };
00150 #endif
00151 #endif