HammingWordDistance.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2007 Christian Gehl
00008  * Written (W) 1999-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "distance/HammingWordDistance.h"
00014 #include "features/Features.h"
00015 #include "features/StringFeatures.h"
00016 #include "lib/io.h"
00017 
00018 CHammingWordDistance::CHammingWordDistance(bool sign)
00019 : CStringDistance<uint16_t>(), use_sign(sign)
00020 {
00021     SG_DEBUG( "CHammingWordDistance with sign: %d created\n", (sign) ? 1 : 0);
00022     dictionary_size= 1<<(sizeof(uint16_t)*8);
00023     dictionary_weights = new float64_t[dictionary_size];
00024     SG_DEBUG( "using dictionary of %d bytes\n", dictionary_size);
00025 }
00026 
00027 CHammingWordDistance::CHammingWordDistance(
00028     CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool sign)
00029 : CStringDistance<uint16_t>(), use_sign(sign)
00030 {
00031     SG_DEBUG( "CHammingWordDistance with sign: %d created\n", (sign) ? 1 : 0);
00032     dictionary_size= 1<<(sizeof(uint16_t)*8);
00033     dictionary_weights = new float64_t[dictionary_size];
00034     SG_DEBUG( "using dictionary of %d bytes\n", dictionary_size);
00035 
00036     init(l, r);
00037 }
00038 
00039 CHammingWordDistance::~CHammingWordDistance()
00040 {
00041     cleanup();
00042 
00043     delete[] dictionary_weights;
00044 }
00045   
00046 bool CHammingWordDistance::init(CFeatures* l, CFeatures* r)
00047 {
00048     bool result=CStringDistance<uint16_t>::init(l,r);
00049     return result;
00050 }
00051 
00052 void CHammingWordDistance::cleanup()
00053 {
00054 }
00055 
00056 bool CHammingWordDistance::load_init(FILE* src)
00057 {
00058     return false;
00059 }
00060 
00061 bool CHammingWordDistance::save_init(FILE* dest)
00062 {
00063     return false;
00064 }
00065 
00066 float64_t CHammingWordDistance::compute(int32_t idx_a, int32_t idx_b)
00067 {
00068     int32_t alen, blen;
00069 
00070     uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->
00071         get_feature_vector(idx_a, alen);
00072     uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->
00073         get_feature_vector(idx_b, blen);
00074 
00075     int32_t result=0;
00076 
00077     int32_t left_idx=0;
00078     int32_t right_idx=0;
00079 
00080     if (use_sign)
00081     {
00082         // hamming of: if words appear in both vectors 
00083         while (left_idx < alen && right_idx < blen)
00084         {
00085             uint16_t sym=avec[left_idx];
00086             if (avec[left_idx]==bvec[right_idx])
00087             {
00088                 while (left_idx< alen && avec[left_idx]==sym)
00089                     left_idx++;
00090 
00091                 while (right_idx< blen && bvec[right_idx]==sym)
00092                     right_idx++;
00093             }
00094             else if (avec[left_idx]<bvec[right_idx])
00095             {
00096                 result++;
00097 
00098                 while (left_idx< alen && avec[left_idx]==sym)
00099                     left_idx++;
00100             }
00101             else
00102             {
00103                 sym=bvec[right_idx];
00104                 result++;
00105 
00106                 while (right_idx< blen && bvec[right_idx]==sym)
00107                     right_idx++;
00108             }
00109         }
00110     }
00111     else
00112     {
00113         //hamming of: if words appear in both vectors _the same number_ of times
00114         while (left_idx < alen && right_idx < blen)
00115         {
00116             uint16_t sym=avec[left_idx];
00117             if (avec[left_idx]==bvec[right_idx])
00118             {
00119                 int32_t old_left_idx=left_idx;
00120                 int32_t old_right_idx=right_idx;
00121 
00122                 while (left_idx< alen && avec[left_idx]==sym)
00123                     left_idx++;
00124 
00125                 while (right_idx< blen && bvec[right_idx]==sym)
00126                     right_idx++;
00127 
00128                 if ((left_idx-old_left_idx)!=(right_idx-old_right_idx))
00129                     result++;
00130             }
00131             else if (avec[left_idx]<bvec[right_idx])
00132             {
00133                 result++;
00134 
00135                 while (left_idx< alen && avec[left_idx]==sym)
00136                     left_idx++;
00137             }
00138             else
00139             {
00140                 sym=bvec[right_idx];
00141                 result++;
00142 
00143                 while (right_idx< blen && bvec[right_idx]==sym)
00144                     right_idx++;
00145             }
00146         }
00147     }
00148 
00149     while (left_idx < alen)
00150     {
00151         uint16_t sym=avec[left_idx];
00152         result++;
00153 
00154         while (left_idx< alen && avec[left_idx]==sym)
00155             left_idx++;
00156     }
00157 
00158     while (right_idx < blen)
00159     {
00160         uint16_t sym=bvec[right_idx];
00161         result++;
00162 
00163         while (right_idx< blen && bvec[right_idx]==sym)
00164             right_idx++;
00165     }
00166 
00167     return result;
00168 }

SHOGUN Machine Learning Toolbox - Documentation