PCACut.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 #include "lib/Mathematics.h"
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #ifdef HAVE_LAPACK
00019 #include "lib/lapack.h"
00020 
00021 #include "lib/common.h"
00022 #include "preproc/PCACut.h"
00023 #include "preproc/SimplePreProc.h"
00024 #include "features/Features.h"
00025 #include "features/RealFeatures.h"
00026 #include "lib/io.h"
00027 
00028 CPCACut::CPCACut(int32_t do_whitening_, float64_t thresh_)
00029 : CSimplePreProc<float64_t>("PCACut", "PCAC"), T(NULL), num_dim(0), mean(NULL),
00030     initialized(false), do_whitening(do_whitening_), thresh(thresh_)
00031 {
00032 }
00033 
00034 CPCACut::~CPCACut()
00035 {
00036     delete[] T;
00037     delete[] mean;
00038 }
00039 
00041 bool CPCACut::init(CFeatures* f)
00042 {
00043     if (!initialized)
00044     {
00045         ASSERT(f->get_feature_class()==C_SIMPLE);
00046         ASSERT(f->get_feature_type()==F_DREAL);
00047 
00048         SG_INFO("calling CPCACut::init\n") ;
00049         int32_t num_vectors=((CRealFeatures*)f)->get_num_vectors() ;
00050         int32_t num_features=((CRealFeatures*)f)->get_num_features() ;
00051         SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00052         delete[] mean ;
00053         mean=new float64_t[num_features+1] ;
00054 
00055         int32_t i,j;
00056 
00058 
00059         // clear
00060         for (j=0; j<num_features; j++)
00061             mean[j]=0 ; 
00062 
00063         // sum 
00064         for (i=0; i<num_vectors; i++)
00065         {
00066             int32_t len;
00067             bool free;
00068             float64_t* vec=((CRealFeatures*) f)->get_feature_vector(i, len, free);
00069             for (j=0; j<num_features; j++)
00070                 mean[j]+= vec[j];
00071 
00072             ((CRealFeatures*) f)->free_feature_vector(vec, i, free);
00073         }
00074 
00075         //divide
00076         for (j=0; j<num_features; j++)
00077             mean[j]/=num_vectors;
00078 
00079         SG_DONE();
00080         SG_DEBUG("Computing covariance matrix... of size %.2f M\n", num_features*num_features/1024.0/1024.0);
00081         float64_t *cov=new float64_t[num_features*num_features];
00082 
00083         for (j=0; j<num_features*num_features; j++)
00084             cov[j]=0.0 ;
00085 
00086         for (i=0; i<num_vectors; i++)
00087         {
00088             if (!(i % (num_vectors/10+1)))
00089                 SG_PROGRESS(i, 0, num_vectors);
00090 
00091             int32_t len;
00092             bool free;
00093 
00094             float64_t* vec=((CRealFeatures*) f)->get_feature_vector(i, len, free) ;
00095 
00096             for (int32_t jj=0; jj<num_features; jj++)
00097                 vec[jj]-=mean[jj] ;
00098 
00100             int nf = (int) num_features; /* calling external lib */
00101             double* vec_double = (double*) vec; /* calling external lib */
00102             cblas_dger(CblasColMajor, nf, nf, 1.0, vec_double, 1, vec_double,
00103                 1, (double*) cov, nf);
00104 
00105             //for (int32_t k=0; k<num_features; k++)
00106             //  for (int32_t l=0; l<num_features; l++)
00107             //          cov[k*num_features+l]+=feature[l]*feature[k] ;
00108 
00109             ((CRealFeatures*) f)->free_feature_vector(vec, i, free) ;
00110         }
00111 
00112         SG_DONE();
00113 
00114         for (i=0; i<num_features; i++)
00115             for (j=0; j<num_features; j++)
00116                 cov[i*num_features+j]/=num_vectors ;
00117 
00118         SG_DONE();
00119 
00120         SG_INFO("Computing Eigenvalues ... ") ;
00121         char V='V';
00122         char U='U';
00123         int32_t info;
00124         int32_t ord= num_features;
00125         int32_t lda= num_features;
00126         float64_t* eigenvalues=new float64_t[num_features] ;
00127 
00128         for (i=0; i<num_features; i++)
00129             eigenvalues[i]=0;
00130 
00131         // lapack sym matrix eigenvalues+vectors
00132         wrap_dsyev(V, U, (int) ord, (double*) cov, (int) lda,
00133             (double*) eigenvalues, (int*) &info);
00134 
00135 
00136         num_dim=0;
00137         for (i=0; i<num_features; i++)
00138         {
00139             //    SG_DEBUG( "EV[%i]=%e\n", i, values[i]) ;
00140             if (eigenvalues[i]>thresh)
00141                 num_dim++ ;
00142         } ;
00143 
00144         SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00145 
00146         delete[] T;
00147         T=new float64_t[num_dim*num_features];
00148         num_old_dim=num_features;
00149 
00150         if (do_whitening)
00151         {
00152             int32_t offs=0 ;
00153             for (i=0; i<num_features; i++)
00154             {
00155                 if (eigenvalues[i]>thresh)
00156                 {
00157                     for (int32_t jj=0; jj<num_features; jj++)
00158                         T[offs+jj*num_dim]=cov[num_features*i+jj]/sqrt(eigenvalues[i]) ;
00159                     offs++ ;
00160                 } ;
00161             }
00162         } ;
00163 
00164         delete[] eigenvalues;
00165         delete[] cov;
00166         initialized=true;
00167         SG_INFO("Done\n") ;
00168         return true ;
00169     }
00170     return 
00171         false;
00172 }
00173 
00175 void CPCACut::cleanup()
00176 {
00177     delete[] T ;
00178     T=NULL ;
00179 }
00180 
00184 float64_t* CPCACut::apply_to_feature_matrix(CFeatures* f)
00185 {
00186     int32_t num_vectors=0;
00187     int32_t num_features=0;
00188 
00189     float64_t* m=((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00190     SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features) ;
00191 
00192     if (m)
00193     {
00194         SG_INFO("Preprocessing feature matrix\n");
00195         float64_t* res= new float64_t[num_dim];
00196         float64_t* sub_mean= new float64_t[num_features];
00197 
00198         for (int32_t vec=0; vec<num_vectors; vec++)
00199         {
00200             int32_t i;
00201 
00202             for (i=0; i<num_features; i++)
00203                 sub_mean[i]=m[num_features*vec+i]-mean[i] ;
00204 
00205             int nd = (int) num_dim; /* calling external lib */
00206             cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) num_features,
00207                 1.0, T, nd, (double*) sub_mean, 1, 0, (double*) res, 1);
00208 
00209             float64_t* m_transformed=&m[num_dim*vec];
00210             for (i=0; i<num_dim; i++)
00211                 m_transformed[i]=m[i];
00212         }
00213         delete[] res;
00214         delete[] sub_mean;
00215 
00216         ((CRealFeatures*) f)->set_num_features(num_dim);
00217         ((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00218         SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00219     }
00220 
00221     return m;
00222 }
00223 
00226 float64_t* CPCACut::apply_to_feature_vector(float64_t* f, int32_t &len)
00227 {
00228     float64_t *ret=new float64_t[num_dim];
00229     float64_t *sub_mean=new float64_t[len];
00230     for (int32_t i=0; i<len; i++)
00231         sub_mean[i]=f[i]-mean[i];
00232 
00233     int nd = (int) num_dim;  /* calling external lib */
00234     cblas_dgemv(CblasColMajor, CblasNoTrans, nd, (int) len, 1.0, (double*) T,
00235         nd, (double*) sub_mean, 1, 0, (double*) ret, 1);
00236 
00237     delete[] sub_mean ;
00238     len=num_dim ;
00239     //    SG_DEBUG( "num_dim: %d\n", num_dim);
00240     return ret;
00241 }
00242 
00244 bool CPCACut::load_init_data(FILE* src)
00245 {
00246     ASSERT(fread(&num_dim, sizeof(int), 1, src)==1);
00247     ASSERT(fread(&num_old_dim, sizeof(int), 1, src)==1);
00248     delete[] mean;
00249     delete[] T;
00250     mean=new float64_t[num_dim];
00251     T=new float64_t[num_dim*num_old_dim];
00252     ASSERT (mean!=NULL && T!=NULL);
00253     ASSERT(fread(mean, sizeof(float64_t), num_old_dim, src)==(uint32_t) num_old_dim);
00254     ASSERT(fread(T, sizeof(float64_t), num_dim*num_old_dim, src)==(uint32_t) num_old_dim*num_dim);
00255     return true;
00256 }
00257 
00259 bool CPCACut::save_init_data(FILE* dst)
00260 {
00261     ASSERT(fwrite(&num_dim, sizeof(int), 1, dst)==1);
00262     ASSERT(fwrite(&num_old_dim, sizeof(int), 1, dst)==1);
00263     ASSERT(fwrite(mean, sizeof(float64_t), num_old_dim, dst)==(uint32_t) num_old_dim);
00264     ASSERT(fwrite(T, sizeof(float64_t), num_dim*num_old_dim, dst)==(uint32_t) num_old_dim*num_dim);
00265     return true;
00266 }
00267 #endif // HAVE_LAPACK

SHOGUN Machine Learning Toolbox - Documentation