PruneVarSubMean.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "preproc/PruneVarSubMean.h"
00013 #include "preproc/SimplePreProc.h"
00014 #include "features/Features.h"
00015 #include "features/RealFeatures.h"
00016 #include "lib/io.h"
00017 #include "lib/Mathematics.h"
00018 
00019 CPruneVarSubMean::CPruneVarSubMean(bool divide)
00020 : CSimplePreProc<DREAL>("PruneVarSubMean","PVSM"), idx(NULL), mean(NULL),
00021     std(NULL), num_idx(0), divide_by_std(divide), initialized(false)
00022 {
00023 }
00024 
00025 CPruneVarSubMean::~CPruneVarSubMean()
00026 {
00027     cleanup();
00028 }
00029 
00031 bool CPruneVarSubMean::init(CFeatures* p_f)
00032 {
00033     if (!initialized)
00034     {
00035         ASSERT(p_f->get_feature_class()==C_SIMPLE);
00036         ASSERT(p_f->get_feature_type()==F_DREAL);
00037 
00038         CRealFeatures *f=(CRealFeatures*) p_f;
00039         INT num_examples=f->get_num_vectors();
00040         INT num_features=((CRealFeatures*)f)->get_num_features();
00041 
00042         delete[] mean;
00043         delete[] idx;
00044         delete[] std;
00045         mean=NULL;
00046         idx=NULL;
00047         std=NULL;
00048 
00049         mean=new double[num_features];
00050         double* var=new double[num_features];
00051         INT i,j;
00052 
00053         for (i=0; i<num_features; i++)
00054         {
00055             mean[i]=0;
00056             var[i]=0 ;
00057         }
00058 
00059         // compute mean
00060         for (i=0; i<num_examples; i++)
00061         {
00062             INT len ; bool free ;
00063             DREAL* feature=f->get_feature_vector(i, len, free) ;
00064 
00065             for (j=0; j<len; j++)
00066                 mean[j]+=feature[j];
00067 
00068             f->free_feature_vector(feature, i, free) ;
00069         }
00070 
00071         for (j=0; j<num_features; j++)
00072             mean[j]/=num_examples ;
00073 
00074         // compute var
00075         for (i=0; i<num_examples; i++)
00076         {
00077             INT len ; bool free ;
00078             DREAL* feature=f->get_feature_vector(i, len, free) ;
00079 
00080             for (j=0; j<num_features; j++)
00081                 var[j]+=(mean[j]-feature[j])*(mean[j]-feature[j]) ;
00082 
00083             f->free_feature_vector(feature, i, free) ;
00084         }
00085 
00086         INT num_ok=0;
00087         INT* idx_ok=new int[num_features];
00088 
00089         for (j=0; j<num_features; j++)
00090         {
00091             var[j]/=num_examples ;
00092 
00093             if (var[j]>=1e-14) 
00094             {
00095                 idx_ok[num_ok]=j ;
00096                 num_ok++ ;
00097             }
00098         }
00099 
00100         SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ;
00101 
00102         delete[] idx ;
00103         idx=new int[num_ok];
00104         DREAL* new_mean=new DREAL[num_ok];
00105         std=new DREAL[num_ok];
00106 
00107         for (j=0; j<num_ok; j++)
00108         {
00109             idx[j]=idx_ok[j] ;
00110             new_mean[j]=mean[idx_ok[j]];
00111             std[j]=sqrt(var[idx_ok[j]]);
00112         }
00113         num_idx=num_ok ;
00114         delete[] idx_ok ;
00115         delete[] mean;
00116         delete[] var;
00117         mean=new_mean;
00118 
00119         initialized=true;
00120         return true ;
00121     }
00122     else
00123         return false;
00124 }
00125 
00127 void CPruneVarSubMean::cleanup()
00128 {
00129     delete[] idx;
00130     idx=NULL;
00131     delete[] mean;
00132     mean=NULL;
00133     delete[] std;
00134     std=NULL;
00135 }
00136 
00140 DREAL* CPruneVarSubMean::apply_to_feature_matrix(CFeatures* f)
00141 {
00142     ASSERT(initialized);
00143 
00144     INT num_vectors=0;
00145     INT num_features=0;
00146     DREAL* m=((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00147 
00148     SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features);
00149     SG_INFO( "Preprocessing feature matrix\n");
00150     for (INT vec=0; vec<num_vectors; vec++)
00151     {
00152         DREAL* v_src=&m[num_features*vec];
00153         DREAL* v_dst=&m[num_idx*vec];
00154 
00155         if (divide_by_std)
00156         {
00157             for (INT feat=0; feat<num_idx; feat++)
00158                 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat];
00159         }
00160         else
00161         {
00162             for (INT feat=0; feat<num_idx; feat++)
00163                 v_dst[feat]=(v_src[idx[feat]]-mean[feat]);
00164         }
00165     }
00166 
00167     ((CRealFeatures*) f)->set_num_features(num_idx);
00168     ((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00169     SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features);
00170 
00171     return m;
00172 }
00173 
00176 DREAL* CPruneVarSubMean::apply_to_feature_vector(DREAL* f, INT &len)
00177 {
00178     DREAL* ret=NULL;
00179 
00180     if (initialized)
00181     {
00182         ret=new DREAL[num_idx] ;
00183 
00184         if (divide_by_std)
00185         {
00186             for (INT i=0; i<num_idx; i++)
00187                 ret[i]=(f[idx[i]]-mean[i])/std[i];
00188         }
00189         else
00190         {
00191             for (INT i=0; i<num_idx; i++)
00192                 ret[i]=(f[idx[i]]-mean[i]);
00193         }
00194         len=num_idx ;
00195     }
00196     else
00197     {
00198         ret=new DREAL[len] ;
00199         for (INT i=0; i<len; i++)
00200             ret[i]=f[i];
00201     }
00202 
00203     return ret;
00204 }
00205 
00207 bool CPruneVarSubMean::load_init_data(FILE* src)
00208 {
00209     bool result=false;
00210     INT divide=0;
00211 
00212     ASSERT(fread(&divide, sizeof(int), 1, src)==1);
00213     ASSERT(fread(&num_idx, sizeof(int), 1, src)==1);
00214     SG_INFO( "divide:%d num_idx:%d\n", divide, num_idx);
00215     delete[] mean;
00216     delete[] idx;
00217     delete[] std;
00218     idx=new int[num_idx];
00219     mean=new DREAL[num_idx];
00220     std=new DREAL[num_idx];
00221     ASSERT (mean!=NULL && idx!=NULL && std!=NULL);
00222     ASSERT(fread(idx, sizeof(int), num_idx, src)==(UINT) num_idx);
00223     ASSERT(fread(mean, sizeof(DREAL), num_idx, src)==(UINT) num_idx);
00224     ASSERT(fread(std, sizeof(DREAL), num_idx, src)==(UINT) num_idx);
00225 
00226     result=true;
00227     divide_by_std=(divide==1);
00228     initialized=true;
00229     return result;
00230 }
00231 
00233 bool CPruneVarSubMean::save_init_data(FILE* dst)
00234 {
00235     return false;
00236 }

SHOGUN Machine Learning Toolbox - Documentation