00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "features/Labels.h"
00013 #include "lib/Mathematics.h"
00014 #include "lib/DynamicArray.h"
00015 #include "lib/Time.h"
00016 #include "base/Parallel.h"
00017 #include "classifier/Classifier.h"
00018 #include "classifier/svm/libocas.h"
00019 #include "classifier/svm/WDSVMOcas.h"
00020 #include "features/StringFeatures.h"
00021 #include "features/Alphabet.h"
00022 #include "features/Labels.h"
00023
00024
00025 struct wdocas_thread_params_output
00026 {
00027 float32_t* out;
00028 int32_t* val;
00029 float64_t* output;
00030 CWDSVMOcas* wdocas;
00031 int32_t start;
00032 int32_t end;
00033 };
00034
00035 struct wdocas_thread_params_add
00036 {
00037 CWDSVMOcas* wdocas;
00038 float32_t* new_a;
00039 uint32_t* new_cut;
00040 int32_t start;
00041 int32_t end;
00042 uint32_t cut_length;
00043 };
00044
00045 CWDSVMOcas::CWDSVMOcas(E_SVM_TYPE type)
00046 : CClassifier(), use_bias(false), bufsize(3000), C1(1), C2(1),
00047 epsilon(1e-3), method(type)
00048 {
00049 w=NULL;
00050 old_w=NULL;
00051 degree=6;
00052 from_degree=40;
00053 wd_weights=NULL;
00054 w_offsets=NULL;
00055 normalization_const=1.0;
00056 }
00057
00058 CWDSVMOcas::CWDSVMOcas(
00059 float64_t C, int32_t d, int32_t from_d, CStringFeatures<uint8_t>* traindat,
00060 CLabels* trainlab)
00061 : CClassifier(), use_bias(false), bufsize(3000), C1(C), C2(C), epsilon(1e-3),
00062 degree(d), from_degree(from_d)
00063 {
00064 w=NULL;
00065 old_w=NULL;
00066 method=SVM_OCAS;
00067 features=traindat;
00068 set_labels(trainlab);
00069 wd_weights=NULL;
00070 w_offsets=NULL;
00071 normalization_const=1.0;
00072 }
00073
00074
00075 CWDSVMOcas::~CWDSVMOcas()
00076 {
00077 }
00078
00079 CLabels* CWDSVMOcas::classify(CLabels* output)
00080 {
00081 set_wd_weights();
00082 set_normalization_const();
00083
00084 if (features)
00085 {
00086 int32_t num=features->get_num_vectors();
00087 ASSERT(num>0);
00088
00089 if (!output)
00090 output=new CLabels(num);
00091
00092 for (int32_t i=0; i<num; i++)
00093 output->set_label(i, classify_example(i));
00094
00095 return output;
00096 }
00097
00098 return NULL;
00099 }
00100
00101 int32_t CWDSVMOcas::set_wd_weights()
00102 {
00103 ASSERT(degree>0 && degree<=8);
00104 delete[] wd_weights;
00105 wd_weights=new float32_t[degree];
00106 delete[] w_offsets;
00107 w_offsets=new int32_t[degree];
00108 int32_t w_dim_single_c=0;
00109
00110 for (int32_t i=0; i<degree; i++)
00111 {
00112 w_offsets[i]=CMath::pow(alphabet_size, i+1);
00113 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00114 w_dim_single_c+=w_offsets[i];
00115 }
00116 return w_dim_single_c;
00117 }
00118
00119 bool CWDSVMOcas::train()
00120 {
00121 SG_INFO("C=%f, epsilon=%f, bufsize=%d\n", get_C1(), get_epsilon(), bufsize);
00122
00123 ASSERT(labels);
00124 ASSERT(get_features());
00125 ASSERT(labels->is_two_class_labeling());
00126 CAlphabet* alphabet=get_features()->get_alphabet();
00127 ASSERT(alphabet && alphabet->get_alphabet()==RAWDNA);
00128
00129 alphabet_size=alphabet->get_num_symbols();
00130 string_length=features->get_num_vectors();
00131 int32_t num_train_labels=0;
00132 lab=labels->get_labels(num_train_labels);
00133
00134 w_dim_single_char=set_wd_weights();
00135
00136 SG_DEBUG("w_dim_single_char=%d\n", w_dim_single_char);
00137 w_dim=string_length*w_dim_single_char;
00138 SG_DEBUG("cutting plane has %d dims\n", w_dim);
00139 num_vec=get_features()->get_max_vector_length();
00140
00141 set_normalization_const();
00142 SG_INFO("num_vec: %d num_lab: %d\n", num_vec, num_train_labels);
00143 ASSERT(num_vec==num_train_labels);
00144 ASSERT(num_vec>0);
00145
00146 delete[] w;
00147 w=new float32_t[w_dim];
00148 memset(w, 0, w_dim*sizeof(float32_t));
00149
00150 delete[] old_w;
00151 old_w=new float32_t[w_dim];
00152 memset(old_w, 0, w_dim*sizeof(float32_t));
00153 bias=0;
00154
00155 cuts=new float32_t*[bufsize];
00156 memset(cuts, 0, sizeof(*cuts)*bufsize);
00157
00159
00160
00161
00162
00163
00164
00165
00166
00168 float64_t TolAbs=0;
00169 float64_t QPBound=0;
00170 uint8_t Method=0;
00171 if (method == SVM_OCAS)
00172 Method = 1;
00173 ocas_return_value_T result = svm_ocas_solver( get_C1(), num_vec, get_epsilon(),
00174 TolAbs, QPBound, bufsize, Method,
00175 &CWDSVMOcas::compute_W,
00176 &CWDSVMOcas::update_W,
00177 &CWDSVMOcas::add_new_cut,
00178 &CWDSVMOcas::compute_output,
00179 &CWDSVMOcas::sort,
00180 this);
00181
00182 SG_INFO("Ocas Converged after %d iterations\n"
00183 "==================================\n"
00184 "timing statistics:\n"
00185 "output_time: %f s\n"
00186 "sort_time: %f s\n"
00187 "add_time: %f s\n"
00188 "w_time: %f s\n"
00189 "solver_time %f s\n"
00190 "ocas_time %f s\n\n", result.nIter, result.output_time, result.sort_time,
00191 result.add_time, result.w_time, result.solver_time, result.ocas_time);
00192
00193 for (int32_t i=bufsize-1; i>=0; i--)
00194 delete[] cuts[i];
00195 delete[] cuts;
00196
00197 delete[] lab;
00198 lab=NULL;
00199
00200 SG_UNREF(alphabet);
00201
00202 return true;
00203 }
00204
00205
00206
00207
00208
00209
00210
00211
00212 float64_t CWDSVMOcas::update_W( float64_t t, void* ptr )
00213 {
00214 float64_t sq_norm_W = 0;
00215 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00216 uint32_t nDim = (uint32_t) o->w_dim;
00217 float32_t* W=o->w;
00218 float32_t* oldW=o->old_w;
00219
00220 for(uint32_t j=0; j <nDim; j++)
00221 {
00222 W[j] = oldW[j]*(1-t) + t*W[j];
00223 sq_norm_W += W[j]*W[j];
00224 }
00225
00226 return( sq_norm_W );
00227 }
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237 void* CWDSVMOcas::add_new_cut_helper( void* ptr)
00238 {
00239 wdocas_thread_params_add* p = (wdocas_thread_params_add*) ptr;
00240 CWDSVMOcas* o = p->wdocas;
00241 int32_t start = p->start;
00242 int32_t end = p->end;
00243 int32_t string_length = o->string_length;
00244
00245 uint32_t cut_length=p->cut_length;
00246 uint32_t* new_cut=p->new_cut;
00247 int32_t* w_offsets = o->w_offsets;
00248 float64_t* y = o->lab;
00249 int32_t alphabet_size = o->alphabet_size;
00250 float32_t* wd_weights = o->wd_weights;
00251 int32_t degree = o->degree;
00252 CStringFeatures<uint8_t>* f = o->features;
00253 float64_t normalization_const = o->normalization_const;
00254
00255
00256 float32_t* new_a = p->new_a;
00257
00258
00259
00260 int32_t* val=new int32_t[cut_length];
00261 for (int32_t j=start; j<end; j++)
00262 {
00263 int32_t offs=o->w_dim_single_char*j;
00264 memset(val,0,sizeof(int32_t)*cut_length);
00265 int32_t lim=CMath::min(degree, string_length-j);
00266 int32_t len;
00267
00268 for (int32_t k=0; k<lim; k++)
00269 {
00270 uint8_t* vec = f->get_feature_vector(j+k, len);
00271 float32_t wd = wd_weights[k]/normalization_const;
00272
00273 for(uint32_t i=0; i < cut_length; i++)
00274 {
00275 val[i]=val[i]*alphabet_size + vec[new_cut[i]];
00276 new_a[offs+val[i]]+=wd * y[new_cut[i]];
00277 }
00278 offs+=w_offsets[k];
00279 }
00280 }
00281
00282
00283 delete[] val;
00284 return NULL;
00285 }
00286
00287 void CWDSVMOcas::add_new_cut(
00288 float64_t *new_col_H, uint32_t *new_cut, uint32_t cut_length,
00289 uint32_t nSel, void* ptr)
00290 {
00291 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00292 int32_t string_length = o->string_length;
00293 uint32_t nDim=(uint32_t) o->w_dim;
00294 float32_t** cuts=o->cuts;
00295
00296 uint32_t i;
00297 wdocas_thread_params_add* params_add=new wdocas_thread_params_add[o->parallel.get_num_threads()];
00298 pthread_t* threads=new pthread_t[o->parallel.get_num_threads()];
00299 float32_t* new_a=new float32_t[nDim];
00300 memset(new_a, 0, sizeof(float32_t)*nDim);
00301
00302 int32_t t;
00303 int32_t nthreads=o->parallel.get_num_threads()-1;
00304 int32_t end=0;
00305 int32_t step= string_length/o->parallel.get_num_threads();
00306
00307 if (step<1)
00308 {
00309 nthreads=string_length-1;
00310 step=1;
00311 }
00312
00313 for (t=0; t<nthreads; t++)
00314 {
00315 params_add[t].wdocas=o;
00316
00317 params_add[t].new_a=new_a;
00318 params_add[t].new_cut=new_cut;
00319 params_add[t].start = step*t;
00320 params_add[t].end = step*(t+1);
00321 params_add[t].cut_length = cut_length;
00322
00323 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::add_new_cut_helper, (void*)¶ms_add[t]) != 0)
00324 {
00325 nthreads=t;
00326 SG_SWARNING("thread creation failed\n");
00327 break;
00328 }
00329 end=params_add[t].end;
00330 }
00331
00332 params_add[t].wdocas=o;
00333
00334 params_add[t].new_a=new_a;
00335 params_add[t].new_cut=new_cut;
00336 params_add[t].start = step*t;
00337 params_add[t].end = string_length;
00338 params_add[t].cut_length = cut_length;
00339 add_new_cut_helper(¶ms_add[t]);
00340
00341
00342 for (t=0; t<nthreads; t++)
00343 {
00344 if (pthread_join(threads[t], NULL) != 0)
00345 SG_SWARNING( "pthread_join failed\n");
00346
00347
00348
00349
00350
00351 }
00352
00353
00354 for(i=0; i < nSel; i++)
00355 new_col_H[i] = CMath::dot(new_a, cuts[i], nDim);
00356 new_col_H[nSel] = CMath::dot(new_a, new_a, nDim);
00357
00358 cuts[nSel]=new_a;
00359
00360
00361
00362 delete[] threads;
00363 delete[] params_add;
00364 }
00365
00366 void CWDSVMOcas::sort( float64_t* vals, uint32_t* idx, uint32_t size)
00367 {
00368 CMath::qsort_index(vals, idx, size);
00369 }
00370
00371
00372
00373
00374
00375
00376 void* CWDSVMOcas::compute_output_helper(void* ptr)
00377 {
00378 wdocas_thread_params_output* p = (wdocas_thread_params_output*) ptr;
00379 CWDSVMOcas* o = p->wdocas;
00380 int32_t start = p->start;
00381 int32_t end = p->end;
00382 float32_t* out = p->out;
00383 float64_t* output = p->output;
00384 int32_t* val = p->val;
00385
00386 CStringFeatures<uint8_t>* f=o->get_features();
00387
00388 int32_t degree = o->degree;
00389 int32_t string_length = o->string_length;
00390 int32_t alphabet_size = o->alphabet_size;
00391 int32_t* w_offsets = o->w_offsets;
00392 float32_t* wd_weights = o->wd_weights;
00393 float32_t* w= o->w;
00394
00395 float64_t* y = o->lab;
00396 float64_t normalization_const = o->normalization_const;
00397
00398
00399 for (int32_t j=0; j<string_length; j++)
00400 {
00401 int32_t offs=o->w_dim_single_char*j;
00402 for (int32_t i=start ; i<end; i++)
00403 val[i]=0;
00404
00405 int32_t lim=CMath::min(degree, string_length-j);
00406 int32_t len;
00407
00408 for (int32_t k=0; k<lim; k++)
00409 {
00410 uint8_t* vec=f->get_feature_vector(j+k, len);
00411 float32_t wd = wd_weights[k];
00412
00413 for (int32_t i=start; i<end; i++)
00414 {
00415 val[i]=val[i]*alphabet_size + vec[i];
00416 out[i]+=wd*w[offs+val[i]];
00417 }
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454 offs+=w_offsets[k];
00455 }
00456 }
00457
00458 for (int32_t i=start; i<end; i++)
00459 output[i]=out[i]*y[i]/normalization_const;
00460
00461
00462
00463 return NULL;
00464 }
00465
00466 void CWDSVMOcas::compute_output( float64_t *output, void* ptr )
00467 {
00468 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00469 int32_t nData=o->num_vec;
00470 wdocas_thread_params_output* params_output=new wdocas_thread_params_output[o->parallel.get_num_threads()];
00471 pthread_t* threads = new pthread_t[o->parallel.get_num_threads()];
00472
00473 float32_t* out=new float32_t[nData];
00474 int32_t* val=new int32_t[nData];
00475 memset(out, 0, sizeof(float32_t)*nData);
00476
00477 int32_t t;
00478 int32_t nthreads=o->parallel.get_num_threads()-1;
00479 int32_t end=0;
00480 int32_t step= nData/o->parallel.get_num_threads();
00481
00482 if (step<1)
00483 {
00484 nthreads=nData-1;
00485 step=1;
00486 }
00487
00488 for (t=0; t<nthreads; t++)
00489 {
00490 params_output[t].wdocas=o;
00491 params_output[t].output=output;
00492 params_output[t].out=out;
00493 params_output[t].val=val;
00494 params_output[t].start = step*t;
00495 params_output[t].end = step*(t+1);
00496
00497
00498 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::compute_output_helper, (void*)¶ms_output[t]) != 0)
00499 {
00500 nthreads=t;
00501 SG_SWARNING("thread creation failed\n");
00502 break;
00503 }
00504 end=params_output[t].end;
00505 }
00506
00507 params_output[t].wdocas=o;
00508 params_output[t].output=output;
00509 params_output[t].out=out;
00510 params_output[t].val=val;
00511 params_output[t].start = step*t;
00512 params_output[t].end = nData;
00513 compute_output_helper(¶ms_output[t]);
00514
00515
00516 for (t=0; t<nthreads; t++)
00517 {
00518 if (pthread_join(threads[t], NULL) != 0)
00519 SG_SWARNING( "pthread_join failed\n");
00520 }
00521 delete[] threads;
00522 delete[] params_output;
00523 delete[] val;
00524 delete[] out;
00525 }
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535 void CWDSVMOcas::compute_W(
00536 float64_t *sq_norm_W, float64_t *dp_WoldW, float64_t *alpha, uint32_t nSel,
00537 void* ptr)
00538 {
00539 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00540 uint32_t nDim= (uint32_t) o->w_dim;
00541 CMath::swap(o->w, o->old_w);
00542 float32_t* W=o->w;
00543 float32_t* oldW=o->old_w;
00544 float32_t** cuts=o->cuts;
00545 memset(W, 0, sizeof(float32_t)*nDim);
00546
00547 for (uint32_t i=0; i<nSel; i++)
00548 {
00549 if (alpha[i] > 0)
00550 CMath::vec1_plus_scalar_times_vec2(W, (float32_t) alpha[i], cuts[i], nDim);
00551 }
00552
00553 *sq_norm_W = CMath::dot(W,W, nDim);
00554 *dp_WoldW = CMath::dot(W,oldW, nDim);;
00555
00556 }