00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/config.h"
00013
00014 #include "lib/common.h"
00015 #include "lib/io.h"
00016 #include "lib/File.h"
00017 #include "lib/Time.h"
00018 #include "base/Parallel.h"
00019
00020 #include "kernel/Kernel.h"
00021 #include "kernel/IdentityKernelNormalizer.h"
00022 #include "features/Features.h"
00023
00024 #include "classifier/svm/SVM.h"
00025
00026 #include <string.h>
00027 #include <unistd.h>
00028 #include <math.h>
00029
00030 #ifndef WIN32
00031 #include <pthread.h>
00032 #endif
00033
00034 CKernel::CKernel(int32_t size)
00035 : CSGObject(), kernel_matrix(NULL), lhs(NULL),
00036 rhs(NULL), combined_kernel_weight(1), optimization_initialized(false),
00037 opt_type(FASTBUTMEMHUNGRY), properties(KP_NONE), normalizer(NULL)
00038 {
00039 if (size<10)
00040 size=10;
00041
00042 cache_size=size;
00043
00044
00045 if (get_is_initialized())
00046 SG_ERROR( "COptimizableKernel still initialized on destruction");
00047
00048 set_normalizer(new CIdentityKernelNormalizer());
00049 }
00050
00051
00052 CKernel::CKernel(CFeatures* p_lhs, CFeatures* p_rhs, int32_t size) : CSGObject(),
00053 kernel_matrix(NULL), lhs(NULL), rhs(NULL), combined_kernel_weight(1),
00054 optimization_initialized(false), opt_type(FASTBUTMEMHUNGRY),
00055 properties(KP_NONE), normalizer(NULL)
00056 {
00057 if (size<10)
00058 size=10;
00059
00060 cache_size=size;
00061
00062 if (get_is_initialized())
00063 SG_ERROR("Kernel initialized on construction.\n");
00064
00065 set_normalizer(new CIdentityKernelNormalizer());
00066 init(p_lhs, p_rhs);
00067 }
00068
/** Destructor: releases the attached features. An optimized kernel must
 * have been deinitialized before destruction. */
CKernel::~CKernel()
{
	// optimization state should have been torn down by the owner first
	if (get_is_initialized())
		SG_ERROR("Kernel still initialized on destruction.\n");

	// drop references to lhs/rhs features
	remove_lhs_and_rhs();

	SG_INFO("Kernel deleted (%p).\n", this);
}
00078
00079 void CKernel::get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n)
00080 {
00081 ASSERT(dst && m && n);
00082
00083 float64_t* result = NULL;
00084 CFeatures* f1 = lhs;
00085 CFeatures* f2 = rhs;
00086
00087 if (f1 && f2)
00088 {
00089 int32_t num_vec1=f1->get_num_vectors();
00090 int32_t num_vec2=f2->get_num_vectors();
00091 *m=num_vec1;
00092 *n=num_vec2;
00093
00094 int64_t total_num = num_vec1 * num_vec2;
00095 int32_t num_done = 0;
00096 SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00097
00098 result=(float64_t*) malloc(sizeof(float64_t)*total_num);
00099 ASSERT(result);
00100
00101 if ( (f1 == f2) && (num_vec1 == num_vec2) )
00102 {
00103 for (int32_t i=0; i<num_vec1; i++)
00104 {
00105 for (int32_t j=i; j<num_vec1; j++)
00106 {
00107 float64_t v=kernel(i,j);
00108
00109 result[i+j*num_vec1]=v;
00110 result[j+i*num_vec1]=v;
00111
00112 if (num_done%100000)
00113 SG_PROGRESS(num_done, 0, total_num-1);
00114
00115 if (i!=j)
00116 num_done+=2;
00117 else
00118 num_done+=1;
00119 }
00120 }
00121 }
00122 else
00123 {
00124 for (int32_t i=0; i<num_vec1; i++)
00125 {
00126 for (int32_t j=0; j<num_vec2; j++)
00127 {
00128 result[i+j*num_vec1]=kernel(i,j) ;
00129
00130 if (num_done%100000)
00131 SG_PROGRESS(num_done, 0, total_num-1);
00132
00133 num_done++;
00134 }
00135 }
00136 }
00137
00138 SG_DONE();
00139 }
00140 else
00141 SG_ERROR( "no features assigned to kernel\n");
00142
00143 *dst=result;
00144 }
00145
00146 float32_t* CKernel::get_kernel_matrix_shortreal(
00147 int32_t &num_vec1, int32_t &num_vec2, float32_t* target)
00148 {
00149 float32_t* result = NULL;
00150 CFeatures* f1 = lhs;
00151 CFeatures* f2 = rhs;
00152
00153 if (f1 && f2)
00154 {
00155 if (target && (num_vec1!=f1->get_num_vectors() ||
00156 num_vec2!=f2->get_num_vectors()) )
00157 SG_ERROR( "kernel matrix does not fit into target\n");
00158
00159 num_vec1=f1->get_num_vectors();
00160 num_vec2=f2->get_num_vectors();
00161 int64_t total_num = num_vec1 * num_vec2;
00162 int32_t num_done = 0;
00163
00164 SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00165
00166 if (target)
00167 result=target;
00168 else
00169 result=new float32_t[total_num];
00170
00171 if (f1==f2 && num_vec1==num_vec2)
00172 {
00173 for (int32_t i=0; i<num_vec1; i++)
00174 {
00175 for (int32_t j=i; j<num_vec1; j++)
00176 {
00177 float64_t v=kernel(i,j);
00178
00179 result[i+j*num_vec1]=v;
00180 result[j+i*num_vec1]=v;
00181
00182 if (num_done%100000)
00183 SG_PROGRESS(num_done, 0, total_num-1);
00184
00185 if (i!=j)
00186 num_done+=2;
00187 else
00188 num_done+=1;
00189 }
00190 }
00191 }
00192 else
00193 {
00194 for (int32_t i=0; i<num_vec1; i++)
00195 {
00196 for (int32_t j=0; j<num_vec2; j++)
00197 {
00198 result[i+j*num_vec1]=kernel(i,j) ;
00199
00200 if (num_done%100000)
00201 SG_PROGRESS(num_done, 0, total_num-1);
00202
00203 num_done++;
00204 }
00205 }
00206 }
00207
00208 SG_DONE();
00209 }
00210 else
00211 SG_ERROR( "no features assigned to kernel\n");
00212
00213 return result;
00214 }
00215
00216 float64_t* CKernel::get_kernel_matrix_real(
00217 int32_t &num_vec1, int32_t &num_vec2, float64_t* target)
00218 {
00219 float64_t* result = NULL;
00220 CFeatures* f1 = lhs;
00221 CFeatures* f2 = rhs;
00222
00223 if (f1 && f2)
00224 {
00225 if (target && (num_vec1!=f1->get_num_vectors() ||
00226 num_vec2!=f2->get_num_vectors()) )
00227 SG_ERROR( "kernel matrix does not fit into target\n");
00228
00229 num_vec1=f1->get_num_vectors();
00230 num_vec2=f2->get_num_vectors();
00231 int64_t total_num = num_vec1 * num_vec2;
00232 int32_t num_done = 0;
00233
00234 SG_DEBUG( "returning kernel matrix of size %dx%d\n", num_vec1, num_vec2);
00235
00236 if (target)
00237 result=target;
00238 else
00239 result=new float64_t[total_num];
00240
00241 if (f1==f2 && num_vec1==num_vec2)
00242 {
00243 for (int32_t i=0; i<num_vec1; i++)
00244 {
00245 for (int32_t j=i; j<num_vec1; j++)
00246 {
00247 float64_t v=kernel(i,j);
00248
00249 result[i+j*num_vec1]=v;
00250 result[j+i*num_vec1]=v;
00251
00252 if (num_done%100000)
00253 SG_PROGRESS(num_done, 0, total_num-1);
00254
00255 if (i!=j)
00256 num_done+=2;
00257 else
00258 num_done+=1;
00259 }
00260 }
00261 }
00262 else
00263 {
00264 for (int32_t i=0; i<num_vec1; i++)
00265 {
00266 for (int32_t j=0; j<num_vec2; j++)
00267 {
00268 result[i+j*num_vec1]=kernel(i,j) ;
00269
00270 if (num_done%100000)
00271 SG_PROGRESS(num_done, 0, total_num-1);
00272
00273 num_done++;
00274 }
00275 }
00276 }
00277
00278 SG_DONE();
00279 }
00280 else
00281 SG_ERROR( "no features assigned to kernel\n");
00282
00283 return result;
00284 }
00285
00286
00287
00288
/** Attach left- and right-hand side features to the kernel.
 *
 * Previously attached features are released first. Both sides must be
 * non-NULL and have matching feature class and feature type.
 *
 * @param l features for the left-hand side
 * @param r features for the right-hand side
 * @return always true (violations are reported via ASSERT)
 */
bool CKernel::init(CFeatures* l, CFeatures* r)
{
	// both sides must be present
	ASSERT(l);
	ASSERT(r);

	// both sides must be compatible
	ASSERT(l->get_feature_class()==r->get_feature_class());
	ASSERT(l->get_feature_type()==r->get_feature_type());

	// drop references to any previously attached features
	remove_lhs_and_rhs();

	// take a reference per distinct object: when l==r only ONE reference
	// is held (remove_lhs_and_rhs()/remove_rhs() rely on this invariant)
	SG_REF(l);
	if (l!=r)
		SG_REF(r);

	lhs=l;
	rhs=r;

	return true;
}
00312
/** Replace the kernel normalizer.
 *
 * The new normalizer is referenced before the old one is released, so
 * passing the currently installed normalizer is safe.
 *
 * @param n new normalizer (may be NULL to clear)
 * @return true if a non-NULL normalizer is now installed
 */
bool CKernel::set_normalizer(CKernelNormalizer* n)
{
	// ref first so n==normalizer does not destroy the object prematurely
	SG_REF(n);
	SG_UNREF(normalizer);
#ifndef HAVE_SWIG
	// NOTE(review): presumably SG_UNREF is a no-op in non-SWIG builds,
	// making this explicit delete necessary; if SG_UNREF does delete in
	// this configuration this is a double free — verify against the
	// SG_UNREF definition for the active build
	delete normalizer;
#endif
	normalizer=n;

	return (normalizer!=NULL);
}
00324
/** Get the installed kernel normalizer.
 *
 * The caller receives an additional reference and is responsible for
 * releasing it via SG_UNREF.
 *
 * @return the current normalizer (may be NULL)
 */
CKernelNormalizer* CKernel::get_normalizer()
{
	SG_REF(normalizer)
	return normalizer;
}
00330
00331 bool CKernel::init_normalizer()
00332 {
00333 return normalizer->init(this);
00334 }
00335
/** Reset the kernel: releases the attached lhs/rhs features. */
void CKernel::cleanup()
{
	remove_lhs_and_rhs();
}
00340
00341
00342
/** Load kernel matrix from file — not implemented in the base class.
 *
 * @param fname file name (ignored)
 * @return always false
 */
bool CKernel::load(char* fname)
{
	return false;
}
00347
00348 bool CKernel::save(char* fname)
00349 {
00350 int32_t i=0;
00351 int32_t num_left=lhs->get_num_vectors();
00352 int32_t num_right=rhs->get_num_vectors();
00353 KERNELCACHE_IDX num_total=num_left*num_right;
00354
00355 CFile f(fname, 'w', F_DREAL);
00356
00357 for (int32_t l=0; l< (int32_t) num_left && f.is_ok(); l++)
00358 {
00359 for (int32_t r=0; r< (int32_t) num_right && f.is_ok(); r++)
00360 {
00361 if (!(i % (num_total/10+1)))
00362 SG_PRINT("%02d%%.", (int32_t) (100.0*i/num_total));
00363 else if (!(i % (num_total/200+1)))
00364 SG_PRINT(".");
00365
00366 float64_t k=kernel(l,r);
00367 f.save_real_data(&k, 1);
00368
00369 i++;
00370 }
00371 }
00372
00373 if (f.is_ok())
00374 SG_INFO( "kernel matrix of size %ld x %ld written (filesize: %ld)\n", num_left, num_right, num_total*sizeof(KERNELCACHE_ELEM));
00375
00376 return (f.is_ok());
00377 }
00378
00379 void CKernel::remove_lhs_and_rhs()
00380 {
00381 if (rhs!=lhs)
00382 SG_UNREF(rhs);
00383 rhs = NULL;
00384
00385 SG_UNREF(lhs);
00386 lhs = NULL;
00387
00388
00389 }
00390
00391 void CKernel::remove_lhs()
00392 {
00393 SG_UNREF(lhs);
00394 lhs = NULL;
00395
00396
00397 }
00398
/** Release the right-hand side features.
 *
 * rhs only owns its own reference when it differs from lhs (see init());
 * when they are the same object the single reference stays with lhs and
 * only the pointer is cleared here.
 */
void CKernel::remove_rhs()
{
	if (rhs!=lhs)
		SG_UNREF(rhs);
	rhs = NULL;
}
00408
00409
/** Print a one-line description of this kernel to the log: address, name,
 * combined weight, optimization strategy, kernel type, feature class and
 * feature type. Unknown enum values are reported via SG_ERROR. */
void CKernel::list_kernel()
{
	SG_INFO( "0x%p - \"%s\" weight=%1.2f OPT:%s", this, get_name(),
			get_combined_kernel_weight(),
			get_optimization_type()==FASTBUTMEMHUNGRY ? "FASTBUTMEMHUNGRY" :
			"SLOWBUTMEMEFFICIENT");

	// print the kernel type's symbolic name
	switch (get_kernel_type())
	{
		case K_UNKNOWN:
			SG_INFO( "K_UNKNOWN ");
			break;
		case K_LINEAR:
			SG_INFO( "K_LINEAR ");
			break;
		case K_SPARSELINEAR:
			SG_INFO( "K_SPARSELINEAR ");
			break;
		case K_POLY:
			SG_INFO( "K_POLY ");
			break;
		case K_GAUSSIAN:
			SG_INFO( "K_GAUSSIAN ");
			break;
		case K_SPARSEGAUSSIAN:
			SG_INFO( "K_SPARSEGAUSSIAN ");
			break;
		case K_GAUSSIANSHIFT:
			SG_INFO( "K_GAUSSIANSHIFT ");
			break;
		case K_HISTOGRAM:
			SG_INFO( "K_HISTOGRAM ");
			break;
		case K_SALZBERG:
			SG_INFO( "K_SALZBERG ");
			break;
		case K_LOCALITYIMPROVED:
			SG_INFO( "K_LOCALITYIMPROVED ");
			break;
		case K_SIMPLELOCALITYIMPROVED:
			SG_INFO( "K_SIMPLELOCALITYIMPROVED ");
			break;
		case K_FIXEDDEGREE:
			SG_INFO( "K_FIXEDDEGREE ");
			break;
		case K_WEIGHTEDDEGREE:
			SG_INFO( "K_WEIGHTEDDEGREE ");
			break;
		case K_WEIGHTEDDEGREEPOS:
			SG_INFO( "K_WEIGHTEDDEGREEPOS ");
			break;
		case K_WEIGHTEDCOMMWORDSTRING:
			SG_INFO( "K_WEIGHTEDCOMMWORDSTRING ");
			break;
		case K_POLYMATCH:
			SG_INFO( "K_POLYMATCH ");
			break;
		case K_ALIGNMENT:
			SG_INFO( "K_ALIGNMENT ");
			break;
		case K_COMMWORDSTRING:
			SG_INFO( "K_COMMWORDSTRING ");
			break;
		case K_COMMULONGSTRING:
			SG_INFO( "K_COMMULONGSTRING ");
			break;
		case K_COMBINED:
			SG_INFO( "K_COMBINED ");
			break;
		case K_AUC:
			SG_INFO( "K_AUC ");
			break;
		case K_CUSTOM:
			SG_INFO( "K_CUSTOM ");
			break;
		case K_SIGMOID:
			SG_INFO( "K_SIGMOID ");
			break;
		case K_CHI2:
			SG_INFO( "K_CHI2 ");
			break;
		case K_DIAG:
			SG_INFO( "K_DIAG ");
			break;
		case K_CONST:
			SG_INFO( "K_CONST ");
			break;
		case K_MINDYGRAM:
			SG_INFO( "K_MINDYGRAM ");
			break;
		case K_DISTANCE:
			SG_INFO( "K_DISTANCE ");
			break;
		case K_LOCALALIGNMENT:
			SG_INFO( "K_LOCALALIGNMENT ");
			break;
		default:
			SG_ERROR( "ERROR UNKNOWN KERNEL TYPE");
			break;
	}

	// print the feature class's symbolic name
	switch (get_feature_class())
	{
		case C_UNKNOWN:
			SG_INFO( "C_UNKNOWN ");
			break;
		case C_SIMPLE:
			SG_INFO( "C_SIMPLE ");
			break;
		case C_SPARSE:
			SG_INFO( "C_SPARSE ");
			break;
		case C_STRING:
			SG_INFO( "C_STRING ");
			break;
		case C_COMBINED:
			SG_INFO( "C_COMBINED ");
			break;
		case C_ANY:
			SG_INFO( "C_ANY ");
			break;
		default:
			SG_ERROR( "ERROR UNKNOWN FEATURE CLASS");
	}

	// print the feature type's symbolic name
	switch (get_feature_type())
	{
		case F_UNKNOWN:
			SG_INFO( "F_UNKNOWN ");
			break;
		case F_DREAL:
			SG_INFO( "F_REAL ");
			break;
		case F_SHORT:
			SG_INFO( "F_SHORT ");
			break;
		case F_CHAR:
			SG_INFO( "F_CHAR ");
			break;
		case F_INT:
			SG_INFO( "F_INT ");
			break;
		case F_BYTE:
			SG_INFO( "F_BYTE ");
			break;
		case F_WORD:
			SG_INFO( "F_WORD ");
			break;
		case F_ULONG:
			SG_INFO( "F_ULONG ");
			break;
		case F_ANY:
			SG_INFO( "F_ANY ");
			break;
		default:
			SG_ERROR( "ERROR UNKNOWN FEATURE TYPE");
			break;
	}
	SG_INFO( "\n");
}
00570
/** Base-class stub: linadd optimization must be provided by subclasses.
 *
 * @param count number of support vectors (unused here)
 * @param IDX support vector indices (unused here)
 * @param weights support vector weights (unused here)
 * @return always false
 */
bool CKernel::init_optimization(
	int32_t count, int32_t *IDX, float64_t * weights)
{
	SG_ERROR( "kernel does not support linadd optimization\n");
	return false ;
}
00577
/** Base-class stub: linadd optimization must be provided by subclasses.
 *
 * @return always false
 */
bool CKernel::delete_optimization()
{
	SG_ERROR( "kernel does not support linadd optimization\n");
	return false;
}
00583
/** Base-class stub: linadd optimization must be provided by subclasses.
 *
 * @param vector_idx index of the vector to evaluate (unused here)
 * @return always 0
 */
float64_t CKernel::compute_optimized(int32_t vector_idx)
{
	SG_ERROR( "kernel does not support linadd optimization\n");
	return 0;
}
00589
/** Base-class stub: batch computation must be provided by subclasses.
 *
 * @param num_vec number of vectors (unused here)
 * @param vec_idx vector indices (unused here)
 * @param target output buffer (unused here)
 * @param num_suppvec number of support vectors (unused here)
 * @param IDX support vector indices (unused here)
 * @param weights support vector weights (unused here)
 * @param factor scaling factor (unused here)
 */
void CKernel::compute_batch(
	int32_t num_vec, int32_t* vec_idx, float64_t* target, int32_t num_suppvec,
	int32_t* IDX, float64_t* weights, float64_t factor)
{
	SG_ERROR( "kernel does not support batch computation\n");
}
00596
/** Base-class stub: linadd optimization must be provided by subclasses.
 *
 * @param vector_idx index of the vector to add (unused here)
 * @param weight weight of the vector (unused here)
 */
void CKernel::add_to_normal(int32_t vector_idx, float64_t weight)
{
	SG_ERROR( "kernel does not support linadd optimization, add_to_normal not implemented\n");
}
00601
/** Base-class stub: linadd optimization must be provided by subclasses. */
void CKernel::clear_normal()
{
	SG_ERROR( "kernel does not support linadd optimization, clear_normal not implemented\n");
}
00606
/** Number of subkernels; a plain (non-combined) kernel has exactly one.
 *
 * @return always 1 in the base class
 */
int32_t CKernel::get_num_subkernels()
{
	return 1;
}
00611
/** Base-class stub: per-subkernel computation must be provided by
 * subclasses (e.g. combined kernels).
 *
 * @param vector_idx index of the vector (unused here)
 * @param subkernel_contrib output buffer for contributions (unused here)
 */
void CKernel::compute_by_subkernel(
	int32_t vector_idx, float64_t * subkernel_contrib)
{
	SG_ERROR( "kernel compute_by_subkernel not implemented\n");
}
00617
/** Get the subkernel weights; a plain kernel exposes its single combined
 * weight.
 *
 * @param num_weights receives the number of weights (always 1 here)
 * @return pointer to the kernel's combined weight (owned by the kernel)
 */
const float64_t* CKernel::get_subkernel_weights(int32_t &num_weights)
{
	num_weights=1 ;
	return &combined_kernel_weight ;
}
00623
00624 void CKernel::set_subkernel_weights(float64_t* weights, int32_t num_weights)
00625 {
00626 combined_kernel_weight = weights[0] ;
00627 if (num_weights!=1)
00628 SG_ERROR( "number of subkernel weights should be one ...\n");
00629 }
00630
00631 bool CKernel::init_optimization_svm(CSVM * svm)
00632 {
00633 int32_t num_suppvec=svm->get_num_support_vectors();
00634 int32_t* sv_idx=new int32_t[num_suppvec];
00635 float64_t* sv_weight=new float64_t[num_suppvec];
00636
00637 for (int32_t i=0; i<num_suppvec; i++)
00638 {
00639 sv_idx[i] = svm->get_support_vector(i);
00640 sv_weight[i] = svm->get_alpha(i);
00641 }
00642 bool ret = init_optimization(num_suppvec, sv_idx, sv_weight);
00643
00644 delete[] sv_idx;
00645 delete[] sv_weight;
00646 return ret;
00647 }
00648