00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "distributions/histogram/Histogram.h" 00013 #include "lib/common.h" 00014 #include "features/StringFeatures.h" 00015 #include "lib/io.h" 00016 #include "lib/Mathematics.h" 00017 00018 00019 CHistogram::CHistogram() 00020 : CDistribution() 00021 { 00022 hist=new DREAL[1<<16]; 00023 } 00024 00025 CHistogram::CHistogram(CStringFeatures<WORD> *f) 00026 : CDistribution() 00027 { 00028 hist=new DREAL[1<<16]; 00029 features=f; 00030 } 00031 00032 CHistogram::~CHistogram() 00033 { 00034 delete[] hist; 00035 } 00036 00037 bool CHistogram::train() 00038 { 00039 INT vec; 00040 INT feat; 00041 INT i; 00042 00043 ASSERT(features); 00044 ASSERT(features->get_feature_class()==C_STRING); 00045 ASSERT(features->get_feature_type()==F_WORD); 00046 00047 for (i=0; i< (INT) (1<<16); i++) 00048 hist[i]=0; 00049 00050 for (vec=0; vec<features->get_num_vectors(); vec++) 00051 { 00052 INT len; 00053 00054 WORD* vector=((CStringFeatures<WORD>*) features)->get_feature_vector(vec, len); 00055 00056 for (feat=0; feat<len ; feat++) 00057 hist[vector[feat]]++; 00058 } 00059 00060 for (i=0; i< (INT) (1<<16); i++) 00061 hist[i]=log(hist[i]); 00062 00063 return true; 00064 } 00065 00066 DREAL CHistogram::get_log_likelihood_example(INT num_example) 00067 { 00068 ASSERT(features); 00069 ASSERT(features->get_feature_class()==C_STRING); 00070 ASSERT(features->get_feature_type()==F_WORD); 00071 00072 INT len; 00073 DREAL loglik=0; 00074 00075 WORD* vector=((CStringFeatures<WORD>*) features)->get_feature_vector(num_example, len); 00076 00077 for (INT i=0; i<len; i++) 00078 loglik+=hist[vector[i]]; 00079 00080 return loglik; 00081 } 00082 00083 DREAL CHistogram::get_log_derivative(INT num_param, INT num_example) 00084 { 00085 if (hist[num_param] < CMath::ALMOST_NEG_INFTY) 00086 return -CMath::INFTY; 00087 else 00088 { 00089 ASSERT(features); 00090 ASSERT(features->get_feature_class()==C_STRING); 00091 ASSERT(features->get_feature_type()==F_WORD); 00092 00093 INT len; 00094 DREAL deriv=0; 00095 00096 WORD* vector=((CStringFeatures<WORD>*) features)->get_feature_vector(num_example, len); 00097 00098 INT num_occurences=0; 00099 00100 for (INT i=0; i<len; i++) 00101 { 00102 deriv+=hist[vector[i]]; 00103 00104 if (vector[i]==num_param) 00105 num_occurences++; 00106 } 00107 00108 if (num_occurences>0) 00109 deriv+=log(num_occurences)-hist[num_param]; 00110 else 00111 deriv=-CMath::INFTY; 00112 00113 return deriv; 00114 } 00115 } 00116 00117 DREAL CHistogram::get_log_model_parameter(INT num_param) 00118 { 00119 return hist[num_param]; 00120 } 00121 00122 bool CHistogram::set_histogram(DREAL* src, INT num) 00123 { 00124 ASSERT(num==get_num_model_parameters()); 00125 00126 delete[] hist; 00127 hist=new DREAL[num]; 00128 for (INT i=0; i<num; i++) { 00129 hist[i]=src[i]; 00130 } 00131 00132 return true; 00133 } 00134 00135 void CHistogram::get_histogram(DREAL** dst, INT* num) 00136 { 00137 *num=get_num_model_parameters(); 00138 size_t sz=sizeof(*hist)*(*num); 00139 *dst=(DREAL*) malloc(sz); 00140 ASSERT(dst); 00141 00142 memcpy(*dst, hist, sz); 00143 } 00144