00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "features/Labels.h"
00013 #include "lib/Mathematics.h"
00014 #include "lib/DynamicArray.h"
00015 #include "lib/Time.h"
00016 #include "base/Parallel.h"
00017 #include "classifier/Classifier.h"
00018 #include "classifier/svm/libocas.h"
00019 #include "classifier/svm/WDSVMOcas.h"
00020 #include "features/StringFeatures.h"
00021 #include "features/Alphabet.h"
00022 #include "features/Labels.h"
00023
00024
00025 struct wdocas_thread_params_output
00026 {
00027 SHORTREAL* out;
00028 INT* val;
00029 double* output;
00030 CWDSVMOcas* wdocas;
00031 INT start;
00032 INT end;
00033 };
00034
00035 struct wdocas_thread_params_add
00036 {
00037 CWDSVMOcas* wdocas;
00038 SHORTREAL* new_a;
00039 uint32_t* new_cut;
00040 INT start;
00041 INT end;
00042 uint32_t cut_length;
00043 };
00044
00045 CWDSVMOcas::CWDSVMOcas(E_SVM_TYPE type)
00046 : CClassifier(), use_bias(false), bufsize(3000), C1(1), C2(1),
00047 epsilon(1e-3), method(type)
00048 {
00049 w=NULL;
00050 old_w=NULL;
00051 degree=6;
00052 from_degree=40;
00053 wd_weights=NULL;
00054 w_offsets=NULL;
00055 normalization_const=1.0;
00056 }
00057
00058 CWDSVMOcas::CWDSVMOcas(DREAL C, INT d, INT from_d, CStringFeatures<BYTE>* traindat, CLabels* trainlab)
00059 : CClassifier(), use_bias(false), bufsize(3000), C1(C), C2(C), epsilon(1e-3),
00060 degree(d), from_degree(from_d)
00061 {
00062 w=NULL;
00063 old_w=NULL;
00064 method=SVM_OCAS;
00065 features=traindat;
00066 CClassifier::labels=trainlab;
00067 wd_weights=NULL;
00068 w_offsets=NULL;
00069 normalization_const=1.0;
00070 }
00071
00072
00073 CWDSVMOcas::~CWDSVMOcas()
00074 {
00075 }
00076
00077 CLabels* CWDSVMOcas::classify(CLabels* output)
00078 {
00079 set_wd_weights();
00080 set_normalization_const();
00081
00082 if (features)
00083 {
00084 INT num=features->get_num_vectors();
00085 ASSERT(num>0);
00086
00087 if (!output)
00088 output=new CLabels(num);
00089
00090 for (INT i=0; i<num; i++)
00091 output->set_label(i, classify_example(i));
00092
00093 return output;
00094 }
00095
00096 return NULL;
00097 }
00098
00099 INT CWDSVMOcas::set_wd_weights()
00100 {
00101 ASSERT(degree>0 && degree<=8);
00102 delete[] wd_weights;
00103 wd_weights=new SHORTREAL[degree];
00104 delete[] w_offsets;
00105 w_offsets=new INT[degree];
00106 INT w_dim_single_c=0;
00107
00108 for (INT i=0; i<degree; i++)
00109 {
00110 w_offsets[i]=CMath::pow(alphabet_size, i+1);
00111 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00112 w_dim_single_c+=w_offsets[i];
00113 }
00114 return w_dim_single_c;
00115 }
00116
00117 bool CWDSVMOcas::train()
00118 {
00119 SG_INFO("C=%f, epsilon=%f, bufsize=%d\n", get_C1(), get_epsilon(), bufsize);
00120
00121 ASSERT(labels);
00122 ASSERT(get_features());
00123 ASSERT(labels->is_two_class_labeling());
00124 CAlphabet* alphabet=get_features()->get_alphabet();
00125 ASSERT(alphabet && alphabet->get_alphabet()==RAWDNA);
00126
00127 alphabet_size=alphabet->get_num_symbols();
00128 string_length=features->get_num_vectors();
00129 INT num_train_labels=0;
00130 lab=labels->get_labels(num_train_labels);
00131
00132 w_dim_single_char=set_wd_weights();
00133 CMath::display_vector(wd_weights, degree, "wd_weights");
00134 SG_DEBUG("w_dim_single_char=%d\n", w_dim_single_char);
00135 w_dim=string_length*w_dim_single_char;
00136 SG_DEBUG("cutting plane has %d dims\n", w_dim);
00137 num_vec=get_features()->get_max_vector_length();
00138
00139 set_normalization_const();
00140 SG_INFO("num_vec: %d num_lab: %d\n", num_vec, num_train_labels);
00141 ASSERT(num_vec==num_train_labels);
00142 ASSERT(num_vec>0);
00143
00144 delete[] w;
00145 w=new SHORTREAL[w_dim];
00146 memset(w, 0, w_dim*sizeof(SHORTREAL));
00147
00148 delete[] old_w;
00149 old_w=new SHORTREAL[w_dim];
00150 memset(old_w, 0, w_dim*sizeof(SHORTREAL));
00151 bias=0;
00152
00153 cuts=new SHORTREAL*[bufsize];
00154 memset(cuts, 0, sizeof(*cuts)*bufsize);
00155
00157
00158
00159
00160
00161
00162
00163
00164
00166 double TolAbs=0;
00167 double QPBound=0;
00168 int Method=0;
00169 if (method == SVM_OCAS)
00170 Method = 1;
00171 ocas_return_value_T result = svm_ocas_solver( get_C1(), num_vec, get_epsilon(),
00172 TolAbs, QPBound, bufsize, Method,
00173 &CWDSVMOcas::compute_W,
00174 &CWDSVMOcas::update_W,
00175 &CWDSVMOcas::add_new_cut,
00176 &CWDSVMOcas::compute_output,
00177 &CWDSVMOcas::sort,
00178 this);
00179
00180 SG_INFO("Ocas Converged after %d iterations\n"
00181 "==================================\n"
00182 "timing statistics:\n"
00183 "output_time: %f s\n"
00184 "sort_time: %f s\n"
00185 "add_time: %f s\n"
00186 "w_time: %f s\n"
00187 "solver_time %f s\n"
00188 "ocas_time %f s\n\n", result.nIter, result.output_time, result.sort_time,
00189 result.add_time, result.w_time, result.solver_time, result.ocas_time);
00190
00191 for (INT i=bufsize-1; i>=0; i--)
00192 delete[] cuts[i];
00193 delete[] cuts;
00194
00195 delete[] lab;
00196 lab=NULL;
00197
00198 SG_UNREF(alphabet);
00199
00200 return true;
00201 }
00202
00203
00204
00205
00206
00207
00208
00209
00210 double CWDSVMOcas::update_W( double t, void* ptr )
00211 {
00212 double sq_norm_W = 0;
00213 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00214 uint32_t nDim = (uint32_t) o->w_dim;
00215 SHORTREAL* W=o->w;
00216 SHORTREAL* oldW=o->old_w;
00217
00218 for(uint32_t j=0; j <nDim; j++)
00219 {
00220 W[j] = oldW[j]*(1-t) + t*W[j];
00221 sq_norm_W += W[j]*W[j];
00222 }
00223
00224 return( sq_norm_W );
00225 }
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235 void* CWDSVMOcas::add_new_cut_helper( void* ptr)
00236 {
00237 wdocas_thread_params_add* p = (wdocas_thread_params_add*) ptr;
00238 CWDSVMOcas* o = p->wdocas;
00239 INT start = p->start;
00240 INT end = p->end;
00241 INT string_length = o->string_length;
00242
00243 uint32_t cut_length=p->cut_length;
00244 uint32_t* new_cut=p->new_cut;
00245 INT* w_offsets = o->w_offsets;
00246 DREAL* y = o->lab;
00247 INT alphabet_size = o->alphabet_size;
00248 SHORTREAL* wd_weights = o->wd_weights;
00249 INT degree = o->degree;
00250 CStringFeatures<BYTE>* f = o->features;
00251 DREAL normalization_const = o->normalization_const;
00252
00253
00254 SHORTREAL* new_a = p->new_a;
00255
00256
00257
00258 INT* val=new INT[cut_length];
00259 for (INT j=start; j<end; j++)
00260 {
00261 INT offs=o->w_dim_single_char*j;
00262 memset(val,0,sizeof(INT)*cut_length);
00263 INT lim=CMath::min(degree, string_length-j);
00264 INT len;
00265
00266 for (INT k=0; k<lim; k++)
00267 {
00268 BYTE* vec = f->get_feature_vector(j+k, len);
00269 SHORTREAL wd = wd_weights[k]/normalization_const;
00270
00271 for(uint32_t i=0; i < cut_length; i++)
00272 {
00273 val[i]=val[i]*alphabet_size + vec[new_cut[i]];
00274 new_a[offs+val[i]]+=wd * y[new_cut[i]];
00275 }
00276 offs+=w_offsets[k];
00277 }
00278 }
00279
00280
00281 delete[] val;
00282 return NULL;
00283 }
00284
00285 void CWDSVMOcas::add_new_cut( double *new_col_H,
00286 uint32_t *new_cut,
00287 uint32_t cut_length,
00288 uint32_t nSel,
00289 void* ptr)
00290 {
00291 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00292 INT string_length = o->string_length;
00293 uint32_t nDim=(uint32_t) o->w_dim;
00294 SHORTREAL** cuts=o->cuts;
00295
00296 uint32_t i;
00297 wdocas_thread_params_add* params_add=new wdocas_thread_params_add[o->parallel.get_num_threads()];
00298 pthread_t* threads=new pthread_t[o->parallel.get_num_threads()];
00299 SHORTREAL* new_a=new SHORTREAL[nDim];
00300 memset(new_a, 0, sizeof(SHORTREAL)*nDim);
00301
00302 INT t;
00303 INT nthreads=o->parallel.get_num_threads()-1;
00304 INT end=0;
00305 INT step= string_length/o->parallel.get_num_threads();
00306
00307 if (step<1)
00308 {
00309 nthreads=string_length-1;
00310 step=1;
00311 }
00312
00313 for (t=0; t<nthreads; t++)
00314 {
00315 params_add[t].wdocas=o;
00316
00317 params_add[t].new_a=new_a;
00318 params_add[t].new_cut=new_cut;
00319 params_add[t].start = step*t;
00320 params_add[t].end = step*(t+1);
00321 params_add[t].cut_length = cut_length;
00322
00323 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::add_new_cut_helper, (void*)¶ms_add[t]) != 0)
00324 {
00325 nthreads=t;
00326 SG_SWARNING("thread creation failed\n");
00327 break;
00328 }
00329 end=params_add[t].end;
00330 }
00331
00332 params_add[t].wdocas=o;
00333
00334 params_add[t].new_a=new_a;
00335 params_add[t].new_cut=new_cut;
00336 params_add[t].start = step*t;
00337 params_add[t].end = string_length;
00338 params_add[t].cut_length = cut_length;
00339 add_new_cut_helper(¶ms_add[t]);
00340
00341
00342 for (t=0; t<nthreads; t++)
00343 {
00344 if (pthread_join(threads[t], NULL) != 0)
00345 SG_SWARNING( "pthread_join failed\n");
00346
00347
00348
00349
00350
00351 }
00352
00353
00354 for(i=0; i < nSel; i++)
00355 new_col_H[i] = CMath::dot(new_a, cuts[i], nDim);
00356 new_col_H[nSel] = CMath::dot(new_a, new_a, nDim);
00357
00358 cuts[nSel]=new_a;
00359
00360
00361
00362 delete[] threads;
00363 delete[] params_add;
00364 }
00365
00366 void CWDSVMOcas::sort( double* vals, uint32_t* idx, uint32_t size)
00367 {
00368 CMath::qsort_index(vals, idx, size);
00369 }
00370
00371
00372
00373
00374
00375
00376 void* CWDSVMOcas::compute_output_helper(void* ptr)
00377 {
00378 wdocas_thread_params_output* p = (wdocas_thread_params_output*) ptr;
00379 CWDSVMOcas* o = p->wdocas;
00380 INT start = p->start;
00381 INT end = p->end;
00382 SHORTREAL* out = p->out;
00383 double* output = p->output;
00384 INT* val = p->val;
00385
00386 CStringFeatures<BYTE>* f=o->get_features();
00387
00388 INT degree = o->degree;
00389 INT string_length = o->string_length;
00390 INT alphabet_size = o->alphabet_size;
00391 INT* w_offsets = o->w_offsets;
00392 SHORTREAL* wd_weights = o->wd_weights;
00393 SHORTREAL* w= o->w;
00394
00395 DREAL* y = o->lab;
00396 DREAL normalization_const = o->normalization_const;
00397
00398
00399 for (INT j=0; j<string_length; j++)
00400 {
00401 INT offs=o->w_dim_single_char*j;
00402 for (INT i=start ; i<end; i++)
00403 val[i]=0;
00404
00405 INT lim=CMath::min(degree, string_length-j);
00406 INT len;
00407
00408 for (INT k=0; k<lim; k++)
00409 {
00410 BYTE* vec=f->get_feature_vector(j+k, len);
00411 SHORTREAL wd = wd_weights[k];
00412
00413 for (INT i=start; i<end; i++)
00414 {
00415 val[i]=val[i]*alphabet_size + vec[i];
00416 out[i]+=wd*w[offs+val[i]];
00417 }
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454 offs+=w_offsets[k];
00455 }
00456 }
00457
00458 for (INT i=start; i<end; i++)
00459 output[i]=out[i]*y[i]/normalization_const;
00460
00461
00462
00463 return NULL;
00464 }
00465
00466 void CWDSVMOcas::compute_output( double *output, void* ptr )
00467 {
00468 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00469 INT nData=o->num_vec;
00470 wdocas_thread_params_output* params_output=new wdocas_thread_params_output[o->parallel.get_num_threads()];
00471 pthread_t* threads = new pthread_t[o->parallel.get_num_threads()];
00472
00473 SHORTREAL* out=new SHORTREAL[nData];
00474 INT* val=new INT[nData];
00475 memset(out, 0, sizeof(SHORTREAL)*nData);
00476
00477 INT t;
00478 INT nthreads=o->parallel.get_num_threads()-1;
00479 INT end=0;
00480 INT step= nData/o->parallel.get_num_threads();
00481
00482 if (step<1)
00483 {
00484 nthreads=nData-1;
00485 step=1;
00486 }
00487
00488 for (t=0; t<nthreads; t++)
00489 {
00490 params_output[t].wdocas=o;
00491 params_output[t].output=output;
00492 params_output[t].out=out;
00493 params_output[t].val=val;
00494 params_output[t].start = step*t;
00495 params_output[t].end = step*(t+1);
00496
00497
00498 if (pthread_create(&threads[t], NULL, &CWDSVMOcas::compute_output_helper, (void*)¶ms_output[t]) != 0)
00499 {
00500 nthreads=t;
00501 SG_SWARNING("thread creation failed\n");
00502 break;
00503 }
00504 end=params_output[t].end;
00505 }
00506
00507 params_output[t].wdocas=o;
00508 params_output[t].output=output;
00509 params_output[t].out=out;
00510 params_output[t].val=val;
00511 params_output[t].start = step*t;
00512 params_output[t].end = nData;
00513 compute_output_helper(¶ms_output[t]);
00514
00515
00516 for (t=0; t<nthreads; t++)
00517 {
00518 if (pthread_join(threads[t], NULL) != 0)
00519 SG_SWARNING( "pthread_join failed\n");
00520 }
00521 delete[] threads;
00522 delete[] params_output;
00523 delete[] val;
00524 delete[] out;
00525 }
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535 void CWDSVMOcas::compute_W( double *sq_norm_W, double *dp_WoldW, double *alpha, uint32_t nSel, void* ptr )
00536 {
00537 CWDSVMOcas* o = (CWDSVMOcas*) ptr;
00538 uint32_t nDim= (uint32_t) o->w_dim;
00539 CMath::swap(o->w, o->old_w);
00540 SHORTREAL* W=o->w;
00541 SHORTREAL* oldW=o->old_w;
00542 SHORTREAL** cuts=o->cuts;
00543 memset(W, 0, sizeof(SHORTREAL)*nDim);
00544
00545 for (uint32_t i=0; i<nSel; i++)
00546 {
00547 if (alpha[i] > 0)
00548 CMath::vec1_plus_scalar_times_vec2(W, (SHORTREAL) alpha[i], cuts[i], nDim);
00549 }
00550
00551 *sq_norm_W = CMath::dot(W,W, nDim);
00552 *dp_WoldW = CMath::dot(W,oldW, nDim);;
00553
00554 }