WDSVMOcas.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2007-2008 Vojtech Franc
00008  * Written (W) 2007-2008 Soeren Sonnenburg
00009  * Copyright (C) 2007-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "features/Labels.h"
00013 #include "lib/Mathematics.h"
00014 #include "lib/DynamicArray.h"
00015 #include "lib/Time.h"
00016 #include "base/Parallel.h"
00017 #include "classifier/Classifier.h"
00018 #include "classifier/svm/libocas.h"
00019 #include "classifier/svm/WDSVMOcas.h"
00020 #include "features/StringFeatures.h"
00021 #include "features/Alphabet.h"
00022 #include "features/Labels.h"
00023 
00024 
00025 struct wdocas_thread_params_output
00026 {
00027     float32_t* out;
00028     int32_t* val;
00029     float64_t* output;
00030     CWDSVMOcas* wdocas;
00031     int32_t start;
00032     int32_t end;
00033 };
00034 
00035 struct wdocas_thread_params_add
00036 {
00037     CWDSVMOcas* wdocas;
00038     float32_t* new_a;
00039     uint32_t* new_cut;
00040     int32_t start;
00041     int32_t end;
00042     uint32_t cut_length;
00043 };
00044 
00045 CWDSVMOcas::CWDSVMOcas(E_SVM_TYPE type)
00046 : CClassifier(), use_bias(false), bufsize(3000), C1(1), C2(1),
00047     epsilon(1e-3), method(type)
00048 {
00049     w=NULL;
00050     old_w=NULL;
00051     degree=6;
00052     from_degree=40;
00053     wd_weights=NULL;
00054     w_offsets=NULL;
00055     normalization_const=1.0;
00056 }
00057 
00058 CWDSVMOcas::CWDSVMOcas(
00059     float64_t C, int32_t d, int32_t from_d, CStringFeatures<uint8_t>* traindat,
00060     CLabels* trainlab)
00061 : CClassifier(), use_bias(false), bufsize(3000), C1(C), C2(C), epsilon(1e-3),
00062     degree(d), from_degree(from_d)
00063 {
00064     w=NULL;
00065     old_w=NULL;
00066     method=SVM_OCAS;
00067     features=traindat;
00068     set_labels(trainlab);
00069     wd_weights=NULL;
00070     w_offsets=NULL;
00071     normalization_const=1.0;
00072 }
00073 
00074 
00075 CWDSVMOcas::~CWDSVMOcas()
00076 {
00077 }
00078 
00079 CLabels* CWDSVMOcas::classify(CLabels* output)
00080 {
00081     set_wd_weights();
00082     set_normalization_const();
00083 
00084     if (features)
00085     {
00086         int32_t num=features->get_num_vectors();
00087         ASSERT(num>0);
00088 
00089         if (!output)
00090             output=new CLabels(num);
00091 
00092         for (int32_t i=0; i<num; i++)
00093             output->set_label(i, classify_example(i));
00094 
00095         return output;
00096     }
00097 
00098     return NULL;
00099 }
00100 
00101 int32_t CWDSVMOcas::set_wd_weights()
00102 {
00103     ASSERT(degree>0 && degree<=8);
00104     delete[] wd_weights;
00105     wd_weights=new float32_t[degree];
00106     delete[] w_offsets;
00107     w_offsets=new int32_t[degree];
00108     int32_t w_dim_single_c=0;
00109 
00110     for (int32_t i=0; i<degree; i++)
00111     {
00112         w_offsets[i]=CMath::pow(alphabet_size, i+1);
00113         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00114         w_dim_single_c+=w_offsets[i];
00115     }
00116     return w_dim_single_c;
00117 }
00118 
00119 bool CWDSVMOcas::train()
00120 {
00121     SG_INFO("C=%f, epsilon=%f, bufsize=%d\n", get_C1(), get_epsilon(), bufsize);
00122 
00123     ASSERT(labels);
00124     ASSERT(get_features());
00125     ASSERT(labels->is_two_class_labeling());
00126     CAlphabet* alphabet=get_features()->get_alphabet();
00127     ASSERT(alphabet && alphabet->get_alphabet()==RAWDNA);
00128 
00129     alphabet_size=alphabet->get_num_symbols();
00130     string_length=features->get_num_vectors();
00131     int32_t num_train_labels=0;
00132     lab=labels->get_labels(num_train_labels);
00133 
00134     w_dim_single_char=set_wd_weights();
00135     //CMath::display_vector(wd_weights, degree, "wd_weights");
00136     SG_DEBUG("w_dim_single_char=%d\n", w_dim_single_char);
00137     w_dim=string_length*w_dim_single_char;
00138     SG_DEBUG("cutting plane has %d dims\n", w_dim);
00139     num_vec=get_features()->get_max_vector_length();
00140 
00141     set_normalization_const();
00142     SG_INFO("num_vec: %d num_lab: %d\n", num_vec, num_train_labels);
00143     ASSERT(num_vec==num_train_labels);
00144     ASSERT(num_vec>0);
00145 
00146     delete[] w;
00147     w=new float32_t[w_dim];
00148     memset(w, 0, w_dim*sizeof(float32_t));
00149 
00150     delete[] old_w;
00151     old_w=new float32_t[w_dim];
00152     memset(old_w, 0, w_dim*sizeof(float32_t));
00153     bias=0;
00154 
00155     cuts=new float32_t*[bufsize];
00156     memset(cuts, 0, sizeof(*cuts)*bufsize);
00157 
00159     /*float64_t* tmp = new float64_t[num_vec];
00160     float64_t start=CTime::get_curtime();
00161     CMath::random_vector(w, w_dim, (float32_t) 0, (float32_t) 1000);
00162     compute_output(tmp, this);
00163     start=CTime::get_curtime()-start;
00164     SG_PRINT("timing:%f\n", start);
00165     delete[] tmp;
00166     exit(1);*/
00168     float64_t TolAbs=0;
00169     float64_t QPBound=0;
00170     uint8_t Method=0;
00171     if (method == SVM_OCAS)
00172         Method = 1;
00173     ocas_return_value_T result = svm_ocas_solver( get_C1(), num_vec, get_epsilon(),
00174             TolAbs, QPBound, bufsize, Method,
00175             &CWDSVMOcas::compute_W,
00176             &CWDSVMOcas::update_W,
00177             &CWDSVMOcas::add_new_cut,
00178             &CWDSVMOcas::compute_output,
00179             &CWDSVMOcas::sort,
00180             this);
00181 
00182     SG_INFO("Ocas Converged after %d iterations\n"
00183             "==================================\n"
00184             "timing statistics:\n"
00185             "output_time: %f s\n"
00186             "sort_time: %f s\n"
00187             "add_time: %f s\n"
00188             "w_time: %f s\n"
00189             "solver_time %f s\n"
00190             "ocas_time %f s\n\n", result.nIter, result.output_time, result.sort_time,
00191             result.add_time, result.w_time, result.solver_time, result.ocas_time);
00192 
00193     for (int32_t i=bufsize-1; i>=0; i--)
00194         delete[] cuts[i];
00195     delete[] cuts;
00196 
00197     delete[] lab;
00198     lab=NULL;
00199 
00200     SG_UNREF(alphabet);
00201 
00202     return true;
00203 }
00204 
00205 /*----------------------------------------------------------------------------------
00206   sq_norm_W = sparse_update_W( t ) does the following:
00207 
00208   W = oldW*(1-t) + t*W;
00209   sq_norm_W = W'*W;
00210 
00211   ---------------------------------------------------------------------------------*/
00212 float64_t CWDSVMOcas::update_W( float64_t t, void* ptr )
00213 {
00214   float64_t sq_norm_W = 0;         
00215   CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00216   uint32_t nDim = (uint32_t) o->w_dim;
00217   float32_t* W=o->w;
00218   float32_t* oldW=o->old_w;
00219 
00220   for(uint32_t j=0; j <nDim; j++)
00221   {
00222       W[j] = oldW[j]*(1-t) + t*W[j];
00223       sq_norm_W += W[j]*W[j];
00224   }          
00225 
00226   return( sq_norm_W );
00227 }
00228 
00229 /*----------------------------------------------------------------------------------
00230   sparse_add_new_cut( new_col_H, new_cut, cut_length, nSel ) does the following:
00231 
00232     new_a = sum(data_X(:,find(new_cut ~=0 )),2);
00233     new_col_H = [sparse_A(:,1:nSel)'*new_a ; new_a'*new_a];
00234     sparse_A(:,nSel+1) = new_a;
00235 
00236   ---------------------------------------------------------------------------------*/
00237 void* CWDSVMOcas::add_new_cut_helper( void* ptr)
00238 {
00239     wdocas_thread_params_add* p = (wdocas_thread_params_add*) ptr;
00240     CWDSVMOcas* o = p->wdocas;
00241     int32_t start = p->start;
00242     int32_t end = p->end;
00243     int32_t string_length = o->string_length;
00244     //uint32_t nDim=(uint32_t) o->w_dim;
00245     uint32_t cut_length=p->cut_length;
00246     uint32_t* new_cut=p->new_cut;
00247     int32_t* w_offsets = o->w_offsets;
00248     float64_t* y = o->lab;
00249     int32_t alphabet_size = o->alphabet_size;
00250     float32_t* wd_weights = o->wd_weights;
00251     int32_t degree = o->degree;
00252     CStringFeatures<uint8_t>* f = o->features;
00253     float64_t normalization_const = o->normalization_const;
00254 
00255     // temporary vector
00256     float32_t* new_a = p->new_a;
00257     //float32_t* new_a = new float32_t[nDim];
00258     //memset(new_a, 0, sizeof(float32_t)*nDim);
00259 
00260     int32_t* val=new int32_t[cut_length];
00261     for (int32_t j=start; j<end; j++)
00262     {
00263         int32_t offs=o->w_dim_single_char*j;
00264         memset(val,0,sizeof(int32_t)*cut_length);
00265         int32_t lim=CMath::min(degree, string_length-j);
00266         int32_t len;
00267 
00268         for (int32_t k=0; k<lim; k++)
00269         {
00270             uint8_t* vec = f->get_feature_vector(j+k, len);
00271             float32_t wd = wd_weights[k]/normalization_const;
00272 
00273             for(uint32_t i=0; i < cut_length; i++) 
00274             {
00275                 val[i]=val[i]*alphabet_size + vec[new_cut[i]];
00276                 new_a[offs+val[i]]+=wd * y[new_cut[i]];
00277             }
00278             offs+=w_offsets[k];
00279         }
00280     }
00281 
00282     //p->new_a=new_a;
00283     delete[] val;
00284     return NULL;
00285 }
00286 
00287 void CWDSVMOcas::add_new_cut(
00288     float64_t *new_col_H, uint32_t *new_cut, uint32_t cut_length,
00289     uint32_t nSel, void* ptr)
00290 {
00291     CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00292     int32_t string_length = o->string_length;
00293     uint32_t nDim=(uint32_t) o->w_dim;
00294     float32_t** cuts=o->cuts;
00295 
00296     uint32_t i;
00297     wdocas_thread_params_add* params_add=new wdocas_thread_params_add[o->parallel.get_num_threads()];
00298     pthread_t* threads=new pthread_t[o->parallel.get_num_threads()];
00299     float32_t* new_a=new float32_t[nDim];
00300     memset(new_a, 0, sizeof(float32_t)*nDim);
00301 
00302     int32_t t;
00303     int32_t nthreads=o->parallel.get_num_threads()-1;
00304     int32_t end=0;
00305     int32_t step= string_length/o->parallel.get_num_threads();
00306 
00307     if (step<1)
00308     {
00309         nthreads=string_length-1;
00310         step=1;
00311     }
00312 
00313     for (t=0; t<nthreads; t++)
00314     {
00315         params_add[t].wdocas=o;
00316         //params_add[t].new_a=NULL;
00317         params_add[t].new_a=new_a;
00318         params_add[t].new_cut=new_cut;
00319         params_add[t].start = step*t;
00320         params_add[t].end = step*(t+1);
00321         params_add[t].cut_length = cut_length;
00322 
00323         if (pthread_create(&threads[t], NULL, &CWDSVMOcas::add_new_cut_helper, (void*)&params_add[t]) != 0)
00324         {
00325             nthreads=t;
00326             SG_SWARNING("thread creation failed\n");
00327             break;
00328         }
00329         end=params_add[t].end;
00330     }
00331 
00332     params_add[t].wdocas=o;
00333     //params_add[t].new_a=NULL;
00334     params_add[t].new_a=new_a;
00335     params_add[t].new_cut=new_cut;
00336     params_add[t].start = step*t;
00337     params_add[t].end = string_length;
00338     params_add[t].cut_length = cut_length;
00339     add_new_cut_helper(&params_add[t]);
00340     //float32_t* new_a=params_add[t].new_a;
00341 
00342     for (t=0; t<nthreads; t++)
00343     {
00344         if (pthread_join(threads[t], NULL) != 0)
00345             SG_SWARNING( "pthread_join failed\n");
00346 
00347         //float32_t* a=params_add[t].new_a;
00348         //for (i=0; i<nDim; i++)
00349         //  new_a[i]+=a[i];
00350         //delete[] a;
00351     }
00352 
00353     // insert new_a into the last column of sparse_A
00354     for(i=0; i < nSel; i++)
00355         new_col_H[i] = CMath::dot(new_a, cuts[i], nDim);
00356     new_col_H[nSel] = CMath::dot(new_a, new_a, nDim);
00357 
00358     cuts[nSel]=new_a;
00359     //CMath::display_vector(new_col_H, nSel+1, "new_col_H");
00360     //CMath::display_vector(cuts[nSel], nDim, "cut[nSel]");
00361     //
00362     delete[] threads;
00363     delete[] params_add;
00364 }
00365 
00366 void CWDSVMOcas::sort( float64_t* vals, uint32_t* idx, uint32_t size)
00367 {
00368     CMath::qsort_index(vals, idx, size);
00369 }
00370 
00371 /*----------------------------------------------------------------------
00372   sparse_compute_output( output ) does the follwing:
00373 
00374   output = data_X'*W;
00375   ----------------------------------------------------------------------*/
00376 void* CWDSVMOcas::compute_output_helper(void* ptr)
00377 {
00378     wdocas_thread_params_output* p = (wdocas_thread_params_output*) ptr;
00379     CWDSVMOcas* o = p->wdocas;
00380     int32_t start = p->start;
00381     int32_t end = p->end;
00382     float32_t* out = p->out;
00383     float64_t* output = p->output;
00384     int32_t* val = p->val;
00385 
00386     CStringFeatures<uint8_t>* f=o->get_features();
00387 
00388     int32_t degree = o->degree;
00389     int32_t string_length = o->string_length;
00390     int32_t alphabet_size = o->alphabet_size;
00391     int32_t* w_offsets = o->w_offsets;
00392     float32_t* wd_weights = o->wd_weights;
00393     float32_t* w= o->w;
00394 
00395     float64_t* y = o->lab;
00396     float64_t normalization_const = o->normalization_const;
00397 
00398 
00399     for (int32_t j=0; j<string_length; j++)
00400     {
00401         int32_t offs=o->w_dim_single_char*j;
00402         for (int32_t i=start ; i<end; i++)
00403             val[i]=0;
00404 
00405         int32_t lim=CMath::min(degree, string_length-j);
00406         int32_t len;
00407 
00408         for (int32_t k=0; k<lim; k++)
00409         {
00410             uint8_t* vec=f->get_feature_vector(j+k, len);
00411             float32_t wd = wd_weights[k];
00412 
00413             for (int32_t i=start; i<end; i++) // quite fast 1.9s
00414             {
00415                 val[i]=val[i]*alphabet_size + vec[i];
00416                 out[i]+=wd*w[offs+val[i]];
00417             }
00418 
00419             /*for (int32_t i=0; i<nData/4; i++) // slowest 2s
00420             {
00421                 uint32_t x=((uint32_t*) vec)[i];
00422                 int32_t ii=4*i;
00423                 val[ii]=val[ii]*alphabet_size + (x&255);
00424                 val[ii+1]=val[ii+1]*alphabet_size + ((x>>8)&255);
00425                 val[ii+2]=val[ii+2]*alphabet_size + ((x>>16)&255);
00426                 val[ii+3]=val[ii+3]*alphabet_size + (x>>24);
00427                 out[ii]+=wd*w[offs+val[ii]];
00428                 out[ii+1]+=wd*w[offs+val[ii+1]];
00429                 out[ii+2]+=wd*w[offs+val[ii+2]];
00430                 out[ii+3]+=wd*w[offs+val[ii+3]];
00431             }*/
00432 
00433             /*for (int32_t i=0; i<nData>>3; i++) // fastest on 64bit: 1.5s
00434             {
00435                 uint64_t x=((uint64_t*) vec)[i];
00436                 int32_t ii=i<<3;
00437                 val[ii]=val[ii]*alphabet_size + (x&255);
00438                 val[ii+1]=val[ii+1]*alphabet_size + ((x>>8)&255);
00439                 val[ii+2]=val[ii+2]*alphabet_size + ((x>>16)&255);
00440                 val[ii+3]=val[ii+3]*alphabet_size + ((x>>24)&255);
00441                 val[ii+4]=val[ii+4]*alphabet_size + ((x>>32)&255);
00442                 val[ii+5]=val[ii+5]*alphabet_size + ((x>>40)&255);
00443                 val[ii+6]=val[ii+6]*alphabet_size + ((x>>48)&255);
00444                 val[ii+7]=val[ii+7]*alphabet_size + (x>>56);
00445                 out[ii]+=wd*w[offs+val[ii]];
00446                 out[ii+1]+=wd*w[offs+val[ii+1]];
00447                 out[ii+2]+=wd*w[offs+val[ii+2]];
00448                 out[ii+3]+=wd*w[offs+val[ii+3]];
00449                 out[ii+4]+=wd*w[offs+val[ii+4]];
00450                 out[ii+5]+=wd*w[offs+val[ii+5]];
00451                 out[ii+6]+=wd*w[offs+val[ii+6]];
00452                 out[ii+7]+=wd*w[offs+val[ii+7]];
00453             }*/
00454             offs+=w_offsets[k];
00455         }
00456     }
00457 
00458     for (int32_t i=start; i<end; i++)
00459         output[i]=out[i]*y[i]/normalization_const;
00460 
00461     //CMath::display_vector(o->w, o->w_dim, "w");
00462     //CMath::display_vector(output, nData, "out");
00463     return NULL;
00464 }
00465 
00466 void CWDSVMOcas::compute_output( float64_t *output, void* ptr )
00467 {
00468     CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00469     int32_t nData=o->num_vec;
00470     wdocas_thread_params_output* params_output=new wdocas_thread_params_output[o->parallel.get_num_threads()];
00471     pthread_t* threads = new pthread_t[o->parallel.get_num_threads()];
00472 
00473     float32_t* out=new float32_t[nData];
00474     int32_t* val=new int32_t[nData];
00475     memset(out, 0, sizeof(float32_t)*nData);
00476 
00477     int32_t t;
00478     int32_t nthreads=o->parallel.get_num_threads()-1;
00479     int32_t end=0;
00480     int32_t step= nData/o->parallel.get_num_threads();
00481 
00482     if (step<1)
00483     {
00484         nthreads=nData-1;
00485         step=1;
00486     }
00487 
00488     for (t=0; t<nthreads; t++)
00489     {
00490         params_output[t].wdocas=o;
00491         params_output[t].output=output;
00492         params_output[t].out=out;
00493         params_output[t].val=val;
00494         params_output[t].start = step*t;
00495         params_output[t].end = step*(t+1);
00496 
00497         //SG_SPRINT("t=%d start=%d end=%d output=%p\n", t, params_output[t].start, params_output[t].end, params_output[t].output);
00498         if (pthread_create(&threads[t], NULL, &CWDSVMOcas::compute_output_helper, (void*)&params_output[t]) != 0)
00499         {
00500             nthreads=t;
00501             SG_SWARNING("thread creation failed\n");
00502             break;
00503         }
00504         end=params_output[t].end;
00505     }
00506 
00507     params_output[t].wdocas=o;
00508     params_output[t].output=output;
00509     params_output[t].out=out;
00510     params_output[t].val=val;
00511     params_output[t].start = step*t;
00512     params_output[t].end = nData;
00513     compute_output_helper(&params_output[t]);
00514     //SG_SPRINT("t=%d start=%d end=%d output=%p\n", t, params_output[t].start, params_output[t].end, params_output[t].output);
00515 
00516     for (t=0; t<nthreads; t++)
00517     {
00518         if (pthread_join(threads[t], NULL) != 0)
00519             SG_SWARNING( "pthread_join failed\n");
00520     }
00521     delete[] threads;
00522     delete[] params_output;
00523     delete[] val;
00524     delete[] out;
00525 }
00526 /*----------------------------------------------------------------------
00527   sq_norm_W = compute_W( alpha, nSel ) does the following:
00528 
00529   oldW = W;
00530   W = sparse_A(:,1:nSel)'*alpha;
00531   sq_norm_W = W'*W;
00532   dp_WoldW = W'*oldW';
00533 
00534   ----------------------------------------------------------------------*/
00535 void CWDSVMOcas::compute_W(
00536     float64_t *sq_norm_W, float64_t *dp_WoldW, float64_t *alpha, uint32_t nSel,
00537     void* ptr)
00538 {
00539     CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00540     uint32_t nDim= (uint32_t) o->w_dim;
00541     CMath::swap(o->w, o->old_w);
00542     float32_t* W=o->w;
00543     float32_t* oldW=o->old_w;
00544     float32_t** cuts=o->cuts;
00545     memset(W, 0, sizeof(float32_t)*nDim);
00546 
00547     for (uint32_t i=0; i<nSel; i++)
00548     {
00549         if (alpha[i] > 0)
00550             CMath::vec1_plus_scalar_times_vec2(W, (float32_t) alpha[i], cuts[i], nDim);
00551     }
00552 
00553     *sq_norm_W = CMath::dot(W,W, nDim);
00554     *dp_WoldW = CMath::dot(W,oldW, nDim);;
00555     //SG_PRINT("nSel=%d sq_norm_W=%f dp_WoldW=%f\n", nSel, *sq_norm_W, *dp_WoldW);
00556 }

SHOGUN Machine Learning Toolbox - Documentation