Kernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/config.h"
00013 
00014 #include "lib/common.h"
00015 #include "lib/io.h"
00016 #include "lib/File.h"
00017 #include "lib/Time.h"
00018 #include "base/Parallel.h"
00019 
00020 #include "kernel/Kernel.h"
00021 #include "kernel/IdentityKernelNormalizer.h"
00022 #include "features/Features.h"
00023 
00024 #include "classifier/svm/SVM.h"
00025 
00026 #include <string.h>
00027 #include <unistd.h>
00028 #include <math.h>
00029 
00030 #ifndef WIN32
00031 #include <pthread.h>
00032 #endif
00033 
00034 CKernel::CKernel(int32_t size)
00035 : CSGObject(), kernel_matrix(NULL), lhs(NULL),
00036     rhs(NULL), combined_kernel_weight(1), optimization_initialized(false),
00037     opt_type(FASTBUTMEMHUNGRY), properties(KP_NONE), normalizer(NULL)
00038 {
00039     if (size<10)
00040         size=10;
00041 
00042     cache_size=size;
00043 
00044 
00045     if (get_is_initialized())
00046         SG_ERROR( "COptimizableKernel still initialized on destruction");
00047 
00048     set_normalizer(new CIdentityKernelNormalizer());
00049 }
00050 
00051 
00052 CKernel::CKernel(CFeatures* p_lhs, CFeatures* p_rhs, int32_t size) : CSGObject(),
00053     kernel_matrix(NULL), lhs(NULL), rhs(NULL), combined_kernel_weight(1),
00054     optimization_initialized(false), opt_type(FASTBUTMEMHUNGRY),
00055     properties(KP_NONE), normalizer(NULL)
00056 {
00057     if (size<10)
00058         size=10;
00059 
00060     cache_size=size;
00061 
00062     if (get_is_initialized())
00063         SG_ERROR("Kernel initialized on construction.\n");
00064 
00065     set_normalizer(new CIdentityKernelNormalizer());
00066     init(p_lhs, p_rhs);
00067 }
00068 
00069 CKernel::~CKernel()
00070 {
00071     if (get_is_initialized())
00072         SG_ERROR("Kernel still initialized on destruction.\n");
00073 
00074     remove_lhs_and_rhs();
00075 
00076     SG_INFO("Kernel deleted (%p).\n", this);
00077 }
00078 
00079 void CKernel::get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n)
00080 {
00081     ASSERT(dst && m && n);
00082 
00083     float64_t* result = NULL;
00084     CFeatures* f1 = lhs;
00085     CFeatures* f2 = rhs;
00086 
00087     if (f1 && f2)
00088     {
00089         int32_t num_vec1=f1->get_num_vectors();
00090         int32_t num_vec2=f2->get_num_vectors();
00091         *m=num_vec1;
00092         *n=num_vec2;
00093 
00094         int64_t total_num = num_vec1 * num_vec2;
00095         int32_t num_done = 0;
00096         SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00097 
00098         result=(float64_t*) malloc(sizeof(float64_t)*total_num);
00099         ASSERT(result);
00100 
00101         if ( (f1 == f2) && (num_vec1 == num_vec2) )
00102         {
00103             for (int32_t i=0; i<num_vec1; i++)
00104             {
00105                 for (int32_t j=i; j<num_vec1; j++)
00106                 {
00107                     float64_t v=kernel(i,j);
00108 
00109                     result[i+j*num_vec1]=v;
00110                     result[j+i*num_vec1]=v;
00111 
00112                     if (num_done%100000)
00113                         SG_PROGRESS(num_done, 0, total_num-1);
00114 
00115                     if (i!=j)
00116                         num_done+=2;
00117                     else
00118                         num_done+=1;
00119                 }
00120             }
00121         }
00122         else
00123         {
00124             for (int32_t i=0; i<num_vec1; i++)
00125             {
00126                 for (int32_t j=0; j<num_vec2; j++)
00127                 {
00128                     result[i+j*num_vec1]=kernel(i,j) ;
00129 
00130                     if (num_done%100000)
00131                         SG_PROGRESS(num_done, 0, total_num-1);
00132 
00133                     num_done++;
00134                 }
00135             }
00136         }
00137 
00138         SG_DONE();
00139     }
00140     else
00141       SG_ERROR( "no features assigned to kernel\n");
00142 
00143     *dst=result;
00144 }
00145 
00146 float32_t* CKernel::get_kernel_matrix_shortreal(
00147     int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
00148 {
00149     float32_t* result = NULL;
00150     CFeatures* f1 = lhs;
00151     CFeatures* f2 = rhs;
00152 
00153     if (f1 && f2)
00154     {
00155         if (target && (num_vec1!=f1->get_num_vectors() ||
00156                     num_vec2!=f2->get_num_vectors()) )
00157             SG_ERROR( "kernel matrix does not fit into target\n");
00158 
00159         num_vec1=f1->get_num_vectors();
00160         num_vec2=f2->get_num_vectors();
00161         int64_t total_num = num_vec1 * num_vec2;
00162         int32_t num_done = 0;
00163 
00164         SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00165 
00166         if (target)
00167             result=target;
00168         else
00169             result=new float32_t[total_num];
00170 
00171         if (f1==f2 && num_vec1==num_vec2)
00172         {
00173             for (int32_t i=0; i<num_vec1; i++)
00174             {
00175                 for (int32_t j=i; j<num_vec1; j++)
00176                 {
00177                     float64_t v=kernel(i,j);
00178 
00179                     result[i+j*num_vec1]=v;
00180                     result[j+i*num_vec1]=v;
00181 
00182                     if (num_done%100000)
00183                         SG_PROGRESS(num_done, 0, total_num-1);
00184 
00185                     if (i!=j)
00186                         num_done+=2;
00187                     else
00188                         num_done+=1;
00189                 }
00190             }
00191         }
00192         else
00193         {
00194             for (int32_t i=0; i<num_vec1; i++)
00195             {
00196                 for (int32_t j=0; j<num_vec2; j++)
00197                 {
00198                     result[i+j*num_vec1]=kernel(i,j) ;
00199 
00200                     if (num_done%100000)
00201                         SG_PROGRESS(num_done, 0, total_num-1);
00202 
00203                     num_done++;
00204                 }
00205             }
00206         }
00207 
00208         SG_DONE();
00209     }
00210     else
00211       SG_ERROR( "no features assigned to kernel\n");
00212 
00213     return result;
00214 }
00215 
00216 float64_t* CKernel::get_kernel_matrix_real(
00217     int32_t &num_vec1, int32_t &num_vec2, float64_t* target)
00218 {
00219     float64_t* result = NULL;
00220     CFeatures* f1 = lhs;
00221     CFeatures* f2 = rhs;
00222 
00223     if (f1 && f2)
00224     {
00225         if (target && (num_vec1!=f1->get_num_vectors() ||
00226                     num_vec2!=f2->get_num_vectors()) )
00227             SG_ERROR( "kernel matrix does not fit into target\n");
00228 
00229         num_vec1=f1->get_num_vectors();
00230         num_vec2=f2->get_num_vectors();
00231         int64_t total_num = num_vec1 * num_vec2;
00232         int32_t num_done = 0;
00233 
00234         SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00235 
00236         if (target)
00237             result=target;
00238         else
00239             result=new float64_t[total_num];
00240 
00241         if (f1==f2 && num_vec1==num_vec2)
00242         {
00243             for (int32_t i=0; i<num_vec1; i++)
00244             {
00245                 for (int32_t j=i; j<num_vec1; j++)
00246                 {
00247                     float64_t v=kernel(i,j);
00248 
00249                     result[i+j*num_vec1]=v;
00250                     result[j+i*num_vec1]=v;
00251 
00252                     if (num_done%100000)
00253                         SG_PROGRESS(num_done, 0, total_num-1);
00254 
00255                     if (i!=j)
00256                         num_done+=2;
00257                     else
00258                         num_done+=1;
00259                 }
00260             }
00261         }
00262         else
00263         {
00264             for (int32_t i=0; i<num_vec1; i++)
00265             {
00266                 for (int32_t j=0; j<num_vec2; j++)
00267                 {
00268                     result[i+j*num_vec1]=kernel(i,j) ;
00269 
00270                     if (num_done%100000)
00271                         SG_PROGRESS(num_done, 0, total_num-1);
00272 
00273                     num_done++;
00274                 }
00275             }
00276         }
00277 
00278         SG_DONE();
00279     }
00280     else
00281       SG_ERROR( "no features assigned to kernel\n");
00282 
00283     return result;
00284 }
00285 
00286 
00287 
00288 
00289 bool CKernel::init(CFeatures* l, CFeatures* r)
00290 {
00291     //make sure features were indeed supplied
00292     ASSERT(l);
00293     ASSERT(r);
00294 
00295     //make sure features are compatible
00296     ASSERT(l->get_feature_class()==r->get_feature_class());
00297     ASSERT(l->get_feature_type()==r->get_feature_type());
00298 
00299     //remove references to previous features
00300     remove_lhs_and_rhs();
00301 
00302     //increase reference counts
00303     SG_REF(l);
00304     if (l!=r)
00305         SG_REF(r);
00306 
00307     lhs=l;
00308     rhs=r;
00309 
00310     return true;
00311 }
00312 
00313 bool CKernel::set_normalizer(CKernelNormalizer* n)
00314 {
00315     SG_REF(n);
00316     SG_UNREF(normalizer);
00317 #ifndef HAVE_SWIG
00318     delete normalizer;
00319 #endif
00320     normalizer=n;
00321 
00322     return (normalizer!=NULL);
00323 }
00324 
00325 CKernelNormalizer* CKernel::get_normalizer()
00326 {
00327     SG_REF(normalizer)
00328     return normalizer;
00329 }
00330 
00331 bool CKernel::init_normalizer()
00332 {
00333     return normalizer->init(this);
00334 }
00335 
00336 void CKernel::cleanup()
00337 {
00338     remove_lhs_and_rhs();
00339 }
00340 
00341 
00342 
00343 bool CKernel::load(char* fname)
00344 {
00345     return false;
00346 }
00347 
00348 bool CKernel::save(char* fname)
00349 {
00350     int32_t i=0;
00351     int32_t num_left=lhs->get_num_vectors();
00352     int32_t num_right=rhs->get_num_vectors();
00353     KERNELCACHE_IDX num_total=num_left*num_right;
00354 
00355     CFile f(fname, 'w', F_DREAL);
00356 
00357     for (int32_t l=0; l< (int32_t) num_left && f.is_ok(); l++)
00358     {
00359         for (int32_t r=0; r< (int32_t) num_right && f.is_ok(); r++)
00360         {
00361             if (!(i % (num_total/10+1)))
00362                 SG_PRINT("%02d%%.", (int32_t) (100.0*i/num_total));
00363             else if (!(i % (num_total/200+1)))
00364                 SG_PRINT(".");
00365 
00366             float64_t k=kernel(l,r);
00367             f.save_real_data(&k, 1);
00368 
00369             i++;
00370         }
00371     }
00372 
00373     if (f.is_ok())
00374         SG_INFO( "kernel matrix of size %ld x %ld written (filesize: %ld)\n", num_left, num_right, num_total*sizeof(KERNELCACHE_ELEM));
00375 
00376     return (f.is_ok());
00377 }
00378 
00379 void CKernel::remove_lhs_and_rhs()
00380 {
00381     if (rhs!=lhs)
00382         SG_UNREF(rhs);
00383     rhs = NULL;
00384 
00385     SG_UNREF(lhs);
00386     lhs = NULL;
00387 
00388 
00389 }
00390 
00391 void CKernel::remove_lhs()
00392 { 
00393     SG_UNREF(lhs);
00394     lhs = NULL;
00395 
00396 
00397 }
00398 
00400 void CKernel::remove_rhs()
00401 {
00402     if (rhs!=lhs)
00403         SG_UNREF(rhs);
00404     rhs = NULL;
00405 
00406 
00407 }
00408 
00409 
00410 void CKernel::list_kernel()
00411 {
00412     SG_INFO( "0x%p - \"%s\" weight=%1.2f OPT:%s", this, get_name(),
00413             get_combined_kernel_weight(),
00414             get_optimization_type()==FASTBUTMEMHUNGRY ? "FASTBUTMEMHUNGRY" :
00415             "SLOWBUTMEMEFFICIENT");
00416 
00417     switch (get_kernel_type())
00418     {
00419         case K_UNKNOWN:
00420             SG_INFO( "K_UNKNOWN ");
00421             break;
00422         case K_LINEAR:
00423             SG_INFO( "K_LINEAR ");
00424             break;
00425         case K_SPARSELINEAR:
00426             SG_INFO( "K_SPARSELINEAR ");
00427             break;
00428         case K_POLY:
00429             SG_INFO( "K_POLY ");
00430             break;
00431         case K_GAUSSIAN:
00432             SG_INFO( "K_GAUSSIAN ");
00433             break;
00434         case K_SPARSEGAUSSIAN:
00435             SG_INFO( "K_SPARSEGAUSSIAN ");
00436             break;
00437         case K_GAUSSIANSHIFT:
00438             SG_INFO( "K_GAUSSIANSHIFT ");
00439             break;
00440         case K_HISTOGRAM:
00441             SG_INFO( "K_HISTOGRAM ");
00442             break;
00443         case K_SALZBERG:
00444             SG_INFO( "K_SALZBERG ");
00445             break;
00446         case K_LOCALITYIMPROVED:
00447             SG_INFO( "K_LOCALITYIMPROVED ");
00448             break;
00449         case K_SIMPLELOCALITYIMPROVED:
00450             SG_INFO( "K_SIMPLELOCALITYIMPROVED ");
00451             break;
00452         case K_FIXEDDEGREE:
00453             SG_INFO( "K_FIXEDDEGREE ");
00454             break;
00455         case K_WEIGHTEDDEGREE:
00456             SG_INFO( "K_WEIGHTEDDEGREE ");
00457             break;
00458         case K_WEIGHTEDDEGREEPOS:
00459             SG_INFO( "K_WEIGHTEDDEGREEPOS ");
00460             break;
00461         case K_WEIGHTEDCOMMWORDSTRING:
00462             SG_INFO( "K_WEIGHTEDCOMMWORDSTRING ");
00463             break;
00464         case K_POLYMATCH:
00465             SG_INFO( "K_POLYMATCH ");
00466             break;
00467         case K_ALIGNMENT:
00468             SG_INFO( "K_ALIGNMENT ");
00469             break;
00470         case K_COMMWORDSTRING:
00471             SG_INFO( "K_COMMWORDSTRING ");
00472             break;
00473         case K_COMMULONGSTRING:
00474             SG_INFO( "K_COMMULONGSTRING ");
00475             break;
00476         case K_COMBINED:
00477             SG_INFO( "K_COMBINED ");
00478             break;
00479         case K_AUC:
00480             SG_INFO( "K_AUC ");
00481             break;
00482         case K_CUSTOM:
00483             SG_INFO( "K_CUSTOM ");
00484             break;
00485         case K_SIGMOID:
00486             SG_INFO( "K_SIGMOID ");
00487             break;
00488         case K_CHI2:
00489             SG_INFO( "K_CHI2 ");
00490             break;
00491         case K_DIAG:
00492             SG_INFO( "K_DIAG ");
00493             break;
00494         case K_CONST:
00495             SG_INFO( "K_CONST ");
00496             break;
00497         case K_MINDYGRAM:
00498             SG_INFO( "K_MINDYGRAM ");
00499             break;
00500         case K_DISTANCE:
00501             SG_INFO( "K_DISTANCE ");
00502             break;
00503         case K_LOCALALIGNMENT:
00504             SG_INFO( "K_LOCALALIGNMENT ");
00505             break;
00506         default:
00507          SG_ERROR( "ERROR UNKNOWN KERNEL TYPE");
00508             break;
00509     }
00510 
00511     switch (get_feature_class())
00512     {
00513         case C_UNKNOWN:
00514             SG_INFO( "C_UNKNOWN ");
00515             break;
00516         case C_SIMPLE:
00517             SG_INFO( "C_SIMPLE ");
00518             break;
00519         case C_SPARSE:
00520             SG_INFO( "C_SPARSE ");
00521             break;
00522         case C_STRING:
00523             SG_INFO( "C_STRING ");
00524             break;
00525         case C_COMBINED:
00526             SG_INFO( "C_COMBINED ");
00527             break;
00528         case C_ANY:
00529             SG_INFO( "C_ANY ");
00530             break;
00531         default:
00532          SG_ERROR( "ERROR UNKNOWN FEATURE CLASS");
00533     }
00534 
00535     switch (get_feature_type())
00536     {
00537         case F_UNKNOWN:
00538             SG_INFO( "F_UNKNOWN ");
00539             break;
00540         case F_DREAL:
00541             SG_INFO( "F_REAL ");
00542             break;
00543         case F_SHORT:
00544             SG_INFO( "F_SHORT ");
00545             break;
00546         case F_CHAR:
00547             SG_INFO( "F_CHAR ");
00548             break;
00549         case F_INT:
00550             SG_INFO( "F_INT ");
00551             break;
00552         case F_BYTE:
00553             SG_INFO( "F_BYTE ");
00554             break;
00555         case F_WORD:
00556             SG_INFO( "F_WORD ");
00557             break;
00558         case F_ULONG:
00559             SG_INFO( "F_ULONG ");
00560             break;
00561         case F_ANY:
00562             SG_INFO( "F_ANY ");
00563             break;
00564         default:
00565          SG_ERROR( "ERROR UNKNOWN FEATURE TYPE");
00566             break;
00567     }
00568     SG_INFO( "\n");
00569 }
00570 
00571 bool CKernel::init_optimization(
00572     int32_t count, int32_t *IDX, float64_t * weights)
00573 {
00574    SG_ERROR( "kernel does not support linadd optimization\n");
00575     return false ;
00576 }
00577 
00578 bool CKernel::delete_optimization() 
00579 {
00580    SG_ERROR( "kernel does not support linadd optimization\n");
00581     return false;
00582 }
00583 
00584 float64_t CKernel::compute_optimized(int32_t vector_idx)
00585 {
00586    SG_ERROR( "kernel does not support linadd optimization\n");
00587     return 0;
00588 }
00589 
00590 void CKernel::compute_batch(
00591     int32_t num_vec, int32_t* vec_idx, float64_t* target, int32_t num_suppvec,
00592     int32_t* IDX, float64_t* weights, float64_t factor)
00593 {
00594    SG_ERROR( "kernel does not support batch computation\n");
00595 }
00596 
00597 void CKernel::add_to_normal(int32_t vector_idx, float64_t weight)
00598 {
00599    SG_ERROR( "kernel does not support linadd optimization, add_to_normal not implemented\n");
00600 }
00601 
00602 void CKernel::clear_normal()
00603 {
00604    SG_ERROR( "kernel does not support linadd optimization, clear_normal not implemented\n");
00605 }
00606 
00607 int32_t CKernel::get_num_subkernels()
00608 {
00609     return 1;
00610 }
00611 
00612 void CKernel::compute_by_subkernel(
00613     int32_t vector_idx, float64_t * subkernel_contrib)
00614 {
00615    SG_ERROR( "kernel compute_by_subkernel not implemented\n");
00616 }
00617 
00618 const float64_t* CKernel::get_subkernel_weights(int32_t &num_weights)
00619 {
00620     num_weights=1 ;
00621     return &combined_kernel_weight ;
00622 }
00623 
00624 void CKernel::set_subkernel_weights(float64_t* weights, int32_t num_weights)
00625 {
00626     combined_kernel_weight = weights[0] ;
00627     if (num_weights!=1)
00628       SG_ERROR( "number of subkernel weights should be one ...\n");
00629 }
00630 
00631 bool CKernel::init_optimization_svm(CSVM * svm)
00632 {
00633     int32_t num_suppvec=svm->get_num_support_vectors();
00634     int32_t* sv_idx=new int32_t[num_suppvec];
00635     float64_t* sv_weight=new float64_t[num_suppvec];
00636 
00637     for (int32_t i=0; i<num_suppvec; i++)
00638     {
00639         sv_idx[i]    = svm->get_support_vector(i);
00640         sv_weight[i] = svm->get_alpha(i);
00641     }
00642     bool ret = init_optimization(num_suppvec, sv_idx, sv_weight);
00643 
00644     delete[] sv_idx;
00645     delete[] sv_weight;
00646     return ret;
00647 }
00648 

SHOGUN Machine Learning Toolbox - Documentation