PCACut.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 #include "lib/Mathematics.h"
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #ifdef HAVE_LAPACK
00019 #include "lib/lapack.h"
00020 
00021 #include "lib/common.h"
00022 #include "preproc/PCACut.h"
00023 #include "preproc/SimplePreProc.h"
00024 #include "features/Features.h"
00025 #include "features/RealFeatures.h"
00026 #include "lib/io.h"
00027 
00028 CPCACut::CPCACut(INT do_whitening_, double thresh_)
00029 : CSimplePreProc<DREAL>("PCACut", "PCAC"), T(NULL), num_dim(0), mean(NULL),
00030     initialized(false), do_whitening(do_whitening_), thresh(thresh_)
00031 {
00032 }
00033 
00034 CPCACut::~CPCACut()
00035 {
00036     delete[] T;
00037     delete[] mean;
00038 }
00039 
00041 bool CPCACut::init(CFeatures* f)
00042 {
00043     if (!initialized)
00044     {
00045         ASSERT(f->get_feature_class()==C_SIMPLE);
00046         ASSERT(f->get_feature_type()==F_DREAL);
00047 
00048         SG_INFO("calling CPCACut::init\n") ;
00049         INT num_vectors=((CRealFeatures*)f)->get_num_vectors() ;
00050         INT num_features=((CRealFeatures*)f)->get_num_features() ;
00051         SG_INFO("num_examples: %ld num_features: %ld \n", num_vectors, num_features);
00052         delete[] mean ;
00053         mean=new double[num_features+1] ;
00054 
00055         INT i,j;
00056 
00058 
00059         // clear
00060         for (j=0; j<num_features; j++)
00061         {
00062             mean[j]=0 ; 
00063         }
00064 
00065         // sum 
00066         for (i=0; i<num_vectors; i++)
00067         {
00068             INT len;
00069             bool free;
00070             DREAL* vec=((CRealFeatures*) f)->get_feature_vector(i, len, free);
00071             for (j=0; j<num_features; j++)
00072             {
00073                 mean[j]+= vec[j];
00074             }
00075             ((CRealFeatures*) f)->free_feature_vector(vec, i, free);
00076         }
00077 
00078         //divide
00079         for (j=0; j<num_features; j++)
00080             mean[j]/=num_vectors;
00081 
00082         SG_DONE();
00083         SG_DEBUG("Computing covariance matrix... of size %.2f M\n", num_features*num_features/1024.0/1024.0);
00084         double *cov=new double[num_features*num_features];
00085 
00086         for (j=0; j<num_features*num_features; j++)
00087             cov[j]=0.0 ;
00088 
00089         for (i=0; i<num_vectors; i++)
00090         {
00091             if (!(i % (num_vectors/10+1)))
00092                 SG_PROGRESS(i, 0, num_vectors);
00093 
00094             INT len;
00095             bool free;
00096 
00097             DREAL* vec=((CRealFeatures*) f)->get_feature_vector(i, len, free) ;
00098 
00099             for (INT jj=0; jj<num_features; jj++)
00100                 vec[jj]-=mean[jj] ;
00101 
00103             cblas_dger(CblasColMajor, num_features,num_features, 1.0, vec, 1, 
00104                     vec, 1, cov, (int)num_features) ;
00105 
00106             //for (INT k=0; k<num_features; k++)
00107             //  for (INT l=0; l<num_features; l++)
00108             //          cov[k*num_features+l]+=feature[l]*feature[k] ;
00109 
00110             ((CRealFeatures*) f)->free_feature_vector(vec, i, free) ;
00111         }
00112 
00113         SG_DONE();
00114 
00115         for (i=0; i<num_features; i++)
00116             for (j=0; j<num_features; j++)
00117                 cov[i*num_features+j]/=num_vectors ;
00118 
00119         SG_DONE();
00120 
00121         SG_INFO("Computing Eigenvalues ... ") ;
00122         char V='V';
00123         char U='U';
00124         //#ifdef DARWIN
00125         //      __CLPK_integer ord= (int) num_features;
00126         //      __CLPK_integer lda= (int) num_features;
00127         //      __CLPK_integer info;
00128         //      __CLPK_integer lwork=3*num_features ;
00129         //      __CLPK_doublereal* work=new __CLPK_doublereal[lwork] ;
00130         //      __CLPK_doublereal* eigenvalues=new __CLPK_doublereal[num_features] ;
00131         //#else
00132         int info;
00133         int ord= (int) num_features;
00134         int lda= (int) num_features;
00135         double* eigenvalues=new double[num_features] ;
00136         //#endif
00137 
00138         for (i=0; i<num_features; i++)
00139             eigenvalues[i]=0;
00140 
00141         // lapack sym matrix eigenvalues+vectors
00142         wrap_dsyev(V, U, ord, cov, lda, eigenvalues, &info);
00143 
00144         num_dim=0;
00145         for (i=0; i<num_features; i++)
00146         {
00147             //    SG_DEBUG( "EV[%i]=%e\n", i, values[i]) ;
00148             if (eigenvalues[i]>thresh)
00149                 num_dim++ ;
00150         } ;
00151 
00152         SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim) ;
00153 
00154         delete[] T;
00155         T=new DREAL[num_dim*num_features];
00156         num_old_dim=num_features;
00157 
00158         if (do_whitening)
00159         {
00160             INT offs=0 ;
00161             for (i=0; i<num_features; i++)
00162             {
00163                 if (eigenvalues[i]>1e-6)
00164                 {
00165                     for (INT jj=0; jj<num_features; jj++)
00166                         T[offs+jj*num_dim]=cov[num_features*i+jj]/sqrt(eigenvalues[i]) ;
00167                     offs++ ;
00168                 } ;
00169             }
00170         } ;
00171 
00172         delete[] eigenvalues;
00173         delete[] cov;
00174         initialized=true;
00175         SG_INFO("Done\n") ;
00176         return true ;
00177     }
00178     return 
00179         false;
00180 }
00181 
00183 void CPCACut::cleanup()
00184 {
00185     delete[] T ;
00186     T=NULL ;
00187 }
00188 
00192 DREAL* CPCACut::apply_to_feature_matrix(CFeatures* f)
00193 {
00194     INT num_vectors=0;
00195     INT num_features=0;
00196 
00197     DREAL* m=((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00198     SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features) ;
00199 
00200     if (m)
00201     {
00202         SG_INFO("Preprocessing feature matrix\n");
00203         DREAL* res= new DREAL[num_dim];
00204         double* sub_mean= new double[num_features];
00205 
00206         for (INT vec=0; vec<num_vectors; vec++)
00207         {
00208             INT i;
00209 
00210             for (i=0; i<num_features; i++)
00211                 sub_mean[i]=m[num_features*vec+i]-mean[i] ;
00212 
00213             cblas_dgemv(CblasColMajor, CblasNoTrans, num_dim, num_features, 1.0,
00214                     T, num_dim, sub_mean, 1, 0, res, 1); 
00215 
00216             DREAL* m_transformed=&m[num_dim*vec];
00217             for (i=0; i<num_dim; i++)
00218                 m_transformed[i]=m[i];
00219         }
00220         delete[] res;
00221         delete[] sub_mean;
00222 
00223         ((CRealFeatures*) f)->set_num_features(num_dim);
00224         ((CRealFeatures*) f)->get_feature_matrix(num_features, num_vectors);
00225         SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
00226     }
00227 
00228     return m;
00229 }
00230 
00233 DREAL* CPCACut::apply_to_feature_vector(DREAL* f, INT &len)
00234 {
00235     DREAL *ret=new DREAL[num_dim];
00236     DREAL *sub_mean=new DREAL[len];
00237     for (INT i=0; i<len; i++)
00238         sub_mean[i]=f[i]-mean[i];
00239 
00240     cblas_dgemv(CblasColMajor, CblasNoTrans, num_dim, len, 1.0 , T, num_dim, sub_mean, 1, 0, ret, 1) ;
00241     //void cblas_dgemv(const enum CBLAS_ORDER order,
00242     //                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
00243     //                 const double alpha, const double *A, const int lda,
00244     //                 const double *X, const int incX, const double beta,
00245     //                 double *Y, const int incY);
00246     //
00247 
00248     delete[] sub_mean ;
00249     len=num_dim ;
00250     //    SG_DEBUG( "num_dim: %d\n", num_dim);
00251     return ret;
00252 }
00253 
00255 bool CPCACut::load_init_data(FILE* src)
00256 {
00257     ASSERT(fread(&num_dim, sizeof(int), 1, src)==1);
00258     ASSERT(fread(&num_old_dim, sizeof(int), 1, src)==1);
00259     delete[] mean;
00260     delete[] T;
00261     mean=new double[num_dim];
00262     T=new double[num_dim*num_old_dim];
00263     ASSERT (mean!=NULL && T!=NULL);
00264     ASSERT(fread(mean, sizeof(double), num_old_dim, src)==(UINT) num_old_dim);
00265     ASSERT(fread(T, sizeof(double), num_dim*num_old_dim, src)==(UINT) num_old_dim*num_dim);
00266     return true;
00267 }
00268 
00270 bool CPCACut::save_init_data(FILE* dst)
00271 {
00272     ASSERT(fwrite(&num_dim, sizeof(int), 1, dst)==1);
00273     ASSERT(fwrite(&num_old_dim, sizeof(int), 1, dst)==1);
00274     ASSERT(fwrite(mean, sizeof(double), num_old_dim, dst)==(UINT) num_old_dim);
00275     ASSERT(fwrite(T, sizeof(double), num_dim*num_old_dim, dst)==(UINT) num_old_dim*num_dim);
00276     return true;
00277 }
00278 #endif // HAVE_LAPACK

SHOGUN Machine Learning Toolbox - Documentation