Histogram.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "distributions/histogram/Histogram.h"
00013 #include "lib/common.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016 #include "lib/Mathematics.h"
00017 
00018 
00019 CHistogram::CHistogram()
00020 : CDistribution()
00021 {
00022     hist=new float64_t[1<<16];
00023 }
00024 
00025 CHistogram::CHistogram(CStringFeatures<uint16_t> *f)
00026 : CDistribution()
00027 {
00028     hist=new float64_t[1<<16];
00029     features=f;
00030 }
00031 
00032 CHistogram::~CHistogram()
00033 {
00034     delete[] hist;
00035 }
00036 
00037 bool CHistogram::train()
00038 {
00039     int32_t vec;
00040     int32_t feat;
00041     int32_t i;
00042 
00043     ASSERT(features);
00044     ASSERT(features->get_feature_class()==C_STRING);
00045     ASSERT(features->get_feature_type()==F_WORD);
00046 
00047     for (i=0; i< (int32_t) (1<<16); i++)
00048         hist[i]=0;
00049 
00050     for (vec=0; vec<features->get_num_vectors(); vec++)
00051     {
00052         int32_t len;
00053 
00054         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00055             get_feature_vector(vec, len);
00056 
00057         for (feat=0; feat<len ; feat++)
00058             hist[vector[feat]]++;
00059     }
00060 
00061     for (i=0; i< (int32_t) (1<<16); i++)
00062         hist[i]=log(hist[i]);
00063 
00064     return true;
00065 }
00066 
00067 float64_t CHistogram::get_log_likelihood_example(int32_t num_example)
00068 {
00069     ASSERT(features);
00070     ASSERT(features->get_feature_class()==C_STRING);
00071     ASSERT(features->get_feature_type()==F_WORD);
00072 
00073     int32_t len;
00074     float64_t loglik=0;
00075 
00076     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00077         get_feature_vector(num_example, len);
00078 
00079     for (int32_t i=0; i<len; i++)
00080         loglik+=hist[vector[i]];
00081 
00082     return loglik;
00083 }
00084 
00085 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example)
00086 {
00087     if (hist[num_param] < CMath::ALMOST_NEG_INFTY)
00088         return -CMath::INFTY;
00089     else
00090     {
00091         ASSERT(features);
00092         ASSERT(features->get_feature_class()==C_STRING);
00093         ASSERT(features->get_feature_type()==F_WORD);
00094 
00095         int32_t len;
00096         float64_t deriv=0;
00097 
00098         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00099             get_feature_vector(num_example, len);
00100 
00101         int32_t num_occurences=0;
00102 
00103         for (int32_t i=0; i<len; i++)
00104         {
00105             deriv+=hist[vector[i]];
00106 
00107             if (vector[i]==num_param)
00108                 num_occurences++;
00109         }
00110 
00111         if (num_occurences>0)
00112             deriv+=log(num_occurences)-hist[num_param];
00113         else
00114             deriv=-CMath::INFTY;
00115 
00116         return deriv;
00117     }
00118 }
00119 
00120 float64_t CHistogram::get_log_model_parameter(int32_t num_param)
00121 {
00122     return hist[num_param];
00123 }
00124 
00125 bool CHistogram::set_histogram(float64_t* src, int32_t num)
00126 {
00127     ASSERT(num==get_num_model_parameters());
00128 
00129     delete[] hist;
00130     hist=new float64_t[num];
00131     for (int32_t i=0; i<num; i++) {
00132         hist[i]=src[i];
00133     }
00134 
00135     return true;
00136 }
00137 
00138 void CHistogram::get_histogram(float64_t** dst, int32_t* num)
00139 {
00140     *num=get_num_model_parameters();
00141     size_t sz=sizeof(*hist)*(*num);
00142     *dst=(float64_t*) malloc(sz);
00143     ASSERT(dst);
00144 
00145     memcpy(*dst, hist, sz);
00146 }
00147 

SHOGUN Machine Learning Toolbox - Documentation