CommUlongStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "lib/common.h"
00012 #include "kernel/CommUlongStringKernel.h"
00013 #include "kernel/SqrtDiagKernelNormalizer.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016 
00017 CCommUlongStringKernel::CCommUlongStringKernel(int32_t size, bool us)
00018 : CStringKernel<uint64_t>(size), use_sign(us)
00019 {
00020     properties |= KP_LINADD;
00021     clear_normal();
00022 
00023     set_normalizer(new CSqrtDiagKernelNormalizer());
00024 }
00025 
00026 CCommUlongStringKernel::CCommUlongStringKernel(
00027     CStringFeatures<uint64_t>* l, CStringFeatures<uint64_t>* r, bool us,
00028     int32_t size)
00029 : CStringKernel<uint64_t>(size), use_sign(us)
00030 {
00031     properties |= KP_LINADD;
00032     clear_normal();
00033     set_normalizer(new CSqrtDiagKernelNormalizer());
00034     init(l,r);
00035 }
00036 
00037 CCommUlongStringKernel::~CCommUlongStringKernel()
00038 {
00039     cleanup();
00040 }
00041 
00042 void CCommUlongStringKernel::remove_lhs()
00043 {
00044     delete_optimization();
00045 
00046 #ifdef SVMLIGHT
00047     if (lhs)
00048         cache_reset();
00049 #endif
00050 
00051     lhs = NULL ; 
00052     rhs = NULL ; 
00053 }
00054 
00055 void CCommUlongStringKernel::remove_rhs()
00056 {
00057 #ifdef SVMLIGHT
00058     if (rhs)
00059         cache_reset();
00060 #endif
00061 
00062     rhs = lhs;
00063 }
00064 
00065 bool CCommUlongStringKernel::init(CFeatures* l, CFeatures* r)
00066 {
00067     CStringKernel<uint64_t>::init(l,r);
00068     return init_normalizer();
00069 }
00070 
00071 void CCommUlongStringKernel::cleanup()
00072 {
00073     delete_optimization();
00074     clear_normal();
00075     CKernel::cleanup();
00076 }
00077 
00078 bool CCommUlongStringKernel::load_init(FILE* src)
00079 {
00080     return false;
00081 }
00082 
00083 bool CCommUlongStringKernel::save_init(FILE* dest)
00084 {
00085     return false;
00086 }
00087 
00088 float64_t CCommUlongStringKernel::compute(int32_t idx_a, int32_t idx_b)
00089 {
00090     int32_t alen, blen;
00091     uint64_t* avec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(idx_a, alen);
00092     uint64_t* bvec=((CStringFeatures<uint64_t>*) rhs)->get_feature_vector(idx_b, blen);
00093 
00094     float64_t result=0;
00095 
00096     int32_t left_idx=0;
00097     int32_t right_idx=0;
00098 
00099     if (use_sign)
00100     {
00101         while (left_idx < alen && right_idx < blen)
00102         {
00103             if (avec[left_idx]==bvec[right_idx])
00104             {
00105                 uint64_t sym=avec[left_idx];
00106 
00107                 while (left_idx< alen && avec[left_idx]==sym)
00108                     left_idx++;
00109 
00110                 while (right_idx< blen && bvec[right_idx]==sym)
00111                     right_idx++;
00112 
00113                 result++;
00114             }
00115             else if (avec[left_idx]<bvec[right_idx])
00116                 left_idx++;
00117             else
00118                 right_idx++;
00119         }
00120     }
00121     else
00122     {
00123         while (left_idx < alen && right_idx < blen)
00124         {
00125             if (avec[left_idx]==bvec[right_idx])
00126             {
00127                 int32_t old_left_idx=left_idx;
00128                 int32_t old_right_idx=right_idx;
00129 
00130                 uint64_t sym=avec[left_idx];
00131 
00132                 while (left_idx< alen && avec[left_idx]==sym)
00133                     left_idx++;
00134 
00135                 while (right_idx< blen && bvec[right_idx]==sym)
00136                     right_idx++;
00137 
00138                 result+=((float64_t) (left_idx-old_left_idx)) * ((float64_t) (right_idx-old_right_idx));
00139             }
00140             else if (avec[left_idx]<bvec[right_idx])
00141                 left_idx++;
00142             else
00143                 right_idx++;
00144         }
00145     }
00146 
00147     return result;
00148 }
00149 
00150 void CCommUlongStringKernel::add_to_normal(int32_t vec_idx, float64_t weight)
00151 {
00152     int32_t t=0;
00153     int32_t j=0;
00154     int32_t k=0;
00155     int32_t last_j=0;
00156     int32_t len=-1;
00157     uint64_t* vec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(vec_idx, len);
00158 
00159     if (vec && len>0)
00160     {
00161         //use malloc not new [] as DynamicArray uses it
00162         uint64_t* dic= (uint64_t*) malloc(
00163             (len+dictionary.get_num_elements())*sizeof(uint64_t));
00164         float64_t* dic_weights= (float64_t*) malloc(
00165             (len+dictionary.get_num_elements())*sizeof(float64_t));
00166 
00167         if (use_sign)
00168         {
00169             for (j=1; j<len; j++)
00170             {
00171                 if (vec[j]==vec[j-1])
00172                     continue;
00173 
00174                 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
00175             }
00176 
00177             merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx);
00178 
00179             while (k<dictionary.get_num_elements())
00180             {
00181                 dic[t]=dictionary[k];
00182                 dic_weights[t]=dictionary_weights[k];
00183                 t++;
00184                 k++;
00185             }
00186         }
00187         else
00188         {
00189             for (j=1; j<len; j++)
00190             {
00191                 if (vec[j]==vec[j-1])
00192                     continue;
00193 
00194                 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
00195                 last_j = j;
00196             }
00197 
00198             merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx);
00199 
00200             while (k<dictionary.get_num_elements())
00201             {
00202                 dic[t]=dictionary[k];
00203                 dic_weights[t]=dictionary_weights[k];
00204                 t++;
00205                 k++;
00206             }
00207         }
00208 
00209         dictionary.set_array(dic, t, len+dictionary.get_num_elements());
00210         dictionary_weights.set_array(dic_weights, t, len+dictionary.get_num_elements());
00211     }
00212 
00213     set_is_initialized(true);
00214 }
00215 
00216 void CCommUlongStringKernel::clear_normal()
00217 {
00218     dictionary.resize_array(0);
00219     dictionary_weights.resize_array(0);
00220     set_is_initialized(false);
00221 }
00222 
00223 bool CCommUlongStringKernel::init_optimization(
00224     int32_t count, int32_t *IDX, float64_t * weights)
00225 {
00226     clear_normal();
00227 
00228     if (count<=0)
00229     {
00230         set_is_initialized(true);
00231         SG_DEBUG( "empty set of SVs\n");
00232         return true;
00233     }
00234 
00235     SG_DEBUG( "initializing CCommUlongStringKernel optimization\n");
00236 
00237     for (int32_t i=0; i<count; i++)
00238     {
00239         if ( (i % (count/10+1)) == 0)
00240             SG_PROGRESS(i, 0, count);
00241 
00242         add_to_normal(IDX[i], weights[i]);
00243     }
00244 
00245     SG_PRINT( "Done.         \n");
00246     
00247     set_is_initialized(true);
00248     return true;
00249 }
00250 
00251 bool CCommUlongStringKernel::delete_optimization() 
00252 {
00253     SG_DEBUG( "deleting CCommUlongStringKernel optimization\n");
00254     clear_normal();
00255     return true;
00256 }
00257 
00258 // binary search for each feature. trick: as features are sorted save last found idx in old_idx and
00259 // only search in the remainder of the dictionary
00260 float64_t CCommUlongStringKernel::compute_optimized(int32_t i)
00261 { 
00262     float64_t result = 0;
00263     int32_t j, last_j=0;
00264     int32_t old_idx = 0;
00265 
00266     if (!get_is_initialized())
00267     {
00268       SG_ERROR( "CCommUlongStringKernel optimization not initialized\n");
00269         return 0 ; 
00270     }
00271 
00272 
00273 
00274     int32_t alen = -1;
00275     uint64_t* avec=((CStringFeatures<uint64_t>*) rhs)->
00276         get_feature_vector(i, alen);
00277 
00278     if (avec && alen>0)
00279     {
00280         if (use_sign)
00281         {
00282             for (j=1; j<alen; j++)
00283             {
00284                 if (avec[j]==avec[j-1])
00285                     continue;
00286 
00287                 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]);
00288 
00289                 if (idx!=-1)
00290                 {
00291                     if (dictionary[idx+old_idx] == avec[j-1])
00292                         result += dictionary_weights[idx+old_idx];
00293 
00294                     old_idx+=idx;
00295                 }
00296             }
00297 
00298             int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]);
00299             if (idx!=-1)
00300                 result += dictionary_weights[idx+old_idx];
00301         }
00302         else
00303         {
00304             for (j=1; j<alen; j++)
00305             {
00306                 if (avec[j]==avec[j-1])
00307                     continue;
00308 
00309                 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]);
00310 
00311                 if (idx!=-1)
00312                 {
00313                     if (dictionary[idx+old_idx] == avec[j-1])
00314                         result += dictionary_weights[idx+old_idx]*(j-last_j);
00315 
00316                     old_idx+=idx;
00317                 }
00318 
00319                 last_j = j;
00320             }
00321 
00322             int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]);
00323             if (idx!=-1)
00324                 result += dictionary_weights[idx+old_idx]*(alen-last_j);
00325         }
00326     }
00327 
00328     return normalizer->normalize_rhs(result, i);
00329 }

SHOGUN Machine Learning Toolbox - Documentation