ssl.cpp

/*    Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
      SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

      This file is part of SVM-lin.

      SVM-lin is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation; either version 2 of the License, or
      (at your option) any later version.

      SVM-lin is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with SVM-lin (see gpl.txt); if not, write to the Free Software
      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>   // exp/log/isinf/isnan used below
#include <algorithm>

#include "lib/io.h"
#include "lib/Mathematics.h"
#include "features/SparseFeatures.h"
#include "classifier/svm/ssl.h"

#define VERBOSE 1

void ssl_train(struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    // initialize
    initialize(Weights,Data->n,0.0);
    initialize(Outputs,Data->m,0.0);
    vector_int    *Subset  = new vector_int[1];
    initialize(Subset,Data->m);
    // call the right algorithm
    int32_t optimality = 0;
    switch(Options->algo)
    {
        case -1:
            SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case RLS:
            SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case SVM:
            SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
            optimality=L2_SVM_MFN(Data,Options,Weights,Outputs,0);
            break;
        case TSVM:
            SG_SINFO("Transductive L2-SVM (TSVM)\n");
            optimality=TSVM_MFN(Data,Options,Weights,Outputs);
            break;
        case DA_SVM:
            SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
            optimality=DA_S3VM(Data,Options,Weights,Outputs);
            break;
        default:
            SG_SERROR("Algorithm unspecified");
    }

    delete[] Subset->vec;
    delete[] Subset;
    return;
}

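/* Example (a sketch, not part of SVM-lin): a minimal supervised call to
   ssl_train(). The struct fields and constants follow their use in this file
   (the declarations live in ssl.h, not shown here); the surrounding setup --
   feature matrix, label array, constant bias dimension -- is hypothetical.

       void example_train(CSparseFeatures<float64_t>* feats,
               float64_t* labels, int32_t num_vec, int32_t num_feat)
       {
           struct data Data;
           Data.features = feats;            // sparse examples, last dim = bias
           Data.Y = labels;                  // +1/-1; 0 marks unlabeled points
           Data.C = new float64_t[num_vec];  // per-example costs
           for (int32_t i=0; i<num_vec; i++)
               Data.C[i] = 1.0;
           Data.m = num_vec;                 // number of examples
           Data.n = num_feat;                // dimensionality (incl. bias dim)

           struct options Options;
           Options.algo = SVM;               // L2-SVM-MFN
           Options.lambda = 1.0;             // regularization strength
           Options.cgitermax = CGITERMAX;
           Options.epsilon = EPSILON;
           Options.bias = 1.0;

           struct vector_double Weights, Outputs; // allocated by ssl_train
           ssl_train(&Data, &Options, &Weights, &Outputs);

           // ... use Weights.vec (model) and Outputs.vec (training outputs) ...
           delete[] Data.C;
           delete[] Weights.vec;
           delete[] Outputs.vec;
       }
*/
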
int32_t CGLS(
    const struct data *Data, const struct options *Options,
    const struct vector_int *Subset, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    SG_SDEBUG("CGLS starting...");

    /* Disassemble the structures */
    int32_t active = Subset->d;
    int32_t *J = Subset->vec;
    CSparseFeatures<float64_t>* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    float64_t lambda = Options->lambda;
    int32_t cgitermax = Options->cgitermax;
    float64_t epsilon = Options->epsilon;
    float64_t *beta = Weights->vec;
    float64_t *o  = Outputs->vec;
    // initialize z
    float64_t *z = new float64_t[active];
    float64_t *q = new float64_t[active];
    int32_t ii=0;
    for (int32_t i = active ; i-- ;){
        ii=J[i];
        z[i]  = C[ii]*(Y[ii] - o[ii]);
    }
    float64_t *r = new float64_t[n];
    for (int32_t i = n ; i-- ;)
        r[i] = 0.0;
    for (int32_t j=0; j < active; j++)
    {
        ii=J[j];
        int32_t num_entries=0;
        bool free_vec=false;

        TSparseEntry<float64_t>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
        for (int32_t i=0; i<num_entries; i++)
            r[vec[i].feat_index]+= vec[i].entry*z[j];
        features->free_sparse_feature_vector(vec, num_entries, free_vec);
        r[n-1]+=Options->bias*z[j]; //bias (modelled as last dim)
    }
    float64_t *p = new float64_t[n];
    float64_t omega1 = 0.0;
    for (int32_t i = n ; i-- ;)
    {
        r[i] -= lambda*beta[i];
        p[i] = r[i];
        omega1 += r[i]*r[i];
    }
    float64_t omega_p = omega1;
    float64_t omega_q = 0.0;
    float64_t inv_omega2 = 1/omega1;
    float64_t scale = 0.0;
    float64_t omega_z=0.0;
    float64_t gamma = 0.0;
    int32_t cgiter = 0;
    int32_t optimality = 0;
    float64_t epsilon2 = epsilon*epsilon;
    // iterate
    while(cgiter < cgitermax)
    {
        cgiter++;
        omega_q=0.0;
        float64_t t=0.0;
        int32_t i,j;
        // #pragma omp parallel for private(i,j)
        for (i=0; i < active; i++)
        {
            ii=J[i];
            t=0.0;
            int32_t num_entries=0;
            bool free_vec=false;
            TSparseEntry<float64_t>* vec=features->get_sparse_feature_vector(ii,
                    num_entries, free_vec);
            for (j=0; j<num_entries; j++)
                t+=vec[j].entry*p[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*p[n-1]; //bias (modelled as last dim)
            q[i]=t;
            omega_q += C[ii]*t*t;
        }
        gamma = omega1/(lambda*omega_p + omega_q);
        inv_omega2 = 1/omega1;
        for (i = n ; i-- ;)
        {
            r[i] = 0.0;
            beta[i] += gamma*p[i];
        }
        omega_z=0.0;
        for (i = active ; i-- ;)
        {
            ii=J[i];
            o[ii] += gamma*q[i];
            z[i] -= gamma*C[ii]*q[i];
            omega_z+=z[i]*z[i];
        }
        for (j=0; j < active; j++)
        {
            ii=J[j];
            t=z[j];
            int32_t num_entries=0;
            bool free_vec=false;

            TSparseEntry<float64_t>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (i=0; i<num_entries; i++)
                r[vec[i].feat_index]+= vec[i].entry*t;
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            r[n-1]+=Options->bias*t; //bias (modelled as last dim)
        }
        omega1 = 0.0;
        for (i = n ; i-- ;)
        {
            r[i] -= lambda*beta[i];
            omega1 += r[i]*r[i];
        }
        if(omega1 < epsilon2*omega_z)
        {
            optimality=1;
            break;
        }
        omega_p=0.0;
        scale=omega1*inv_omega2;
        for(i = n ; i-- ;)
        {
            p[i] = r[i] + p[i]*scale;
            omega_p += p[i]*p[i];
        }
    }
    SG_SDEBUG("...Done.");
    SG_SINFO("CGLS finished in %d iteration(s)", cgiter);

    delete[] z;
    delete[] q;
    delete[] r;
    delete[] p;
    return optimality;
}

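/* What CGLS computes (a sketch of the math, read off the code above): for the
   active index set J it runs conjugate gradient on the regularized
   least-squares problem
   \f[
       \min_{\beta}\;\frac{\lambda}{2}\|\beta\|^2
       + \frac{1}{2}\sum_{i \in J} C_i\,(y_i - x_i^\top \beta)^2 ,
   \f]
   i.e. on the normal equations
   \f[
       (\lambda I + X_J^\top \mathrm{diag}(C)\, X_J)\,\beta
       = X_J^\top \mathrm{diag}(C)\, y_J .
   \f]
   In the loop, z holds the weighted residuals C_i (y_i - o_i), r the negative
   gradient X_J^T z - lambda*beta, p the conjugate direction and q = X_J p; the
   stopping rule ||r||^2 < epsilon^2 ||z||^2 is the relative tolerance passed
   in Options->epsilon. */
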
int32_t L2_SVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs,
    int32_t ini)
{
    /* Disassemble the structures */
    CSparseFeatures<float64_t>* features=Data->features;
    float64_t *Y = Data->Y;
    float64_t *C = Data->C;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int32_t[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0) {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}
    for (int32_t i=0;i<n;i++) F+=w[i]*w[i];
    F=0.5*lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    for (int32_t i=0; i<m ; i++)
    {
        diff=1-Y[i]*o[i];
        if(diff>0)
        {
            ActiveSubset->vec[active]=i;
            active++;
            F+=0.5*C[i]*diff*diff;
        }
        else
        {
            ActiveSubset->vec[inactive]=i;
            inactive--;
        }
    }
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    float64_t *w_bar = new float64_t[n];
    float64_t *o_bar = new float64_t[m];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m;
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
        for (int32_t i=n; i-- ;)
            w_bar[i]=w[i];
        for (int32_t i=m; i-- ;)
            o_bar[i]=o[i];

        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(int32_t i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=0.0;

            int32_t num_entries=0;
            bool free_vec=false;

            TSparseEntry<float64_t>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (int32_t j=0; j<num_entries; j++)
                t+=vec[j].entry*w_bar[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for (int32_t i=0;i<m;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
                opt2=(opt2 && (Y[ii]*o_bar[ii]<=1+epsilon));
            else
                opt2=(opt2 && (Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // optimal
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for (int32_t i=n; i-- ;)
                    w[i]=w_bar[i];
                for (int32_t i=m; i-- ;)
                    o[i]=o_bar[i];
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                SG_SINFO("L2_SVM_MFN converged (optimality) in %d iterations", iter);
                return 1;
            }
        }
        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m);
        SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
        F_old=F;
        F=0.0;
        for (int32_t i=n; i-- ;) {
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=0.5*lambda*F;
        active=0;
        inactive=m-1;
        for (int32_t i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            diff=1-Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=0.5*C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<RELATIVE_STOP_EPS*CMath::abs(F_old))
        {
            // free work buffers before the early return
            delete[] ActiveSubset->vec;
            delete[] ActiveSubset;
            delete[] o_bar;
            delete[] w_bar;
            delete[] Weights_bar;
            delete[] Outputs_bar;
            SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
            return 2;
        }
    }
    delete[] ActiveSubset->vec;
    delete[] ActiveSubset;
    delete[] o_bar;
    delete[] w_bar;
    delete[] Weights_bar;
    delete[] Outputs_bar;
    SG_SINFO("L2_SVM_MFN reached MFNITERMAX (%d iterations)", iter);
    return 0;
}

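/* The objective minimized by L2_SVM_MFN, as accumulated in F above (the
   squared hinge loss, a.k.a. L2-SVM):
   \f[
       F(w) = \frac{\lambda}{2}\,\|w\|^2
       + \frac{1}{2}\sum_{i=1}^{m} C_i \max(0,\,1 - y_i o_i)^2 ,
       \qquad o_i = w^\top x_i .
   \f]
   Each outer iteration freezes the active set {i : y_i o_i < 1}, solves the
   resulting weighted regularized least-squares problem with CGLS, and moves
   towards that solution by an exact line search (see line_search below). */
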
float64_t line_search(float64_t *w,
        float64_t *w_bar,
        float64_t lambda,
        float64_t *o,
        float64_t *o_bar,
        float64_t *Y,
        float64_t *C,
        int32_t d, /* data dimensionality -- 'n' */
        int32_t l) /* number of examples */
{
    float64_t omegaL = 0.0;
    float64_t omegaR = 0.0;
    float64_t diff=0.0;
    for(int32_t i=d; i--; )
    {
        diff=w_bar[i]-w[i];
        omegaL+=w[i]*diff;
        omegaR+=w_bar[i]*diff;
    }
    omegaL=lambda*omegaL;
    omegaR=lambda*omegaR;
    float64_t L=0.0;
    float64_t R=0.0;
    int32_t ii=0;
    for (int32_t i=0;i<l;i++)
    {
        if(Y[i]*o[i]<1)
        {
            diff=C[i]*(o_bar[i]-o[i]);
            L+=(o[i]-Y[i])*diff;
            R+=(o_bar[i]-Y[i])*diff;
        }
    }
    L+=omegaL;
    R+=omegaR;
    Delta* deltas=new Delta[l];
    int32_t p=0;
    for(int32_t i=0;i<l;i++)
    {
        diff=Y[i]*(o_bar[i]-o[i]);

        if(Y[i]*o[i]<1)
        {
            if(diff>0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=-1;
                p++;
            }
        }
        else
        {
            if(diff<0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=1;
                p++;
            }
        }
    }
    std::sort(deltas,deltas+p);
    float64_t delta_prime=0.0;
    for (int32_t i=0;i<p;i++)
    {
        delta_prime = L + deltas[i].delta*(R-L);
        if(delta_prime>=0)
            break;
        ii=deltas[i].index;
        diff=(deltas[i].s)*C[ii]*(o_bar[ii]-o[ii]);
        L+=diff*(o[ii]-Y[ii]);
        R+=diff*(o_bar[ii]-Y[ii]);
    }
    delete [] deltas;
    return (-L/(R-L));
}

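/* line_search exactly minimizes the one-dimensional piecewise-quadratic
   \f[
       \phi(\delta) = F\bigl(w + \delta\,(\bar w - w)\bigr),
       \qquad \delta \in [0,1].
   \f]
   Its derivative is piecewise linear with breakpoints at
   \f$ \delta_i = (1 - y_i o_i) / (y_i(\bar o_i - o_i)) \f$, where example i
   enters or leaves the active set; L and R are the slopes of the current
   linear piece at its two ends. The loop sweeps the sorted breakpoints until
   the slope turns non-negative and returns the root of that piece,
   \f$ \delta^* = -L/(R-L) \f$. */
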
int32_t TSVM_MFN(
    const struct data *Data, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs)
{
    /* Setup labeled-only examples and train L2_SVM_MFN */
    struct data *Data_Labeled = new data[1];
    struct vector_double *Outputs_Labeled = new vector_double[1];
    initialize(Outputs_Labeled,Data->l,0.0);
    SG_SDEBUG("Initializing weights, unknown labels");
    GetLabeledData(Data_Labeled,Data); /* gets labeled data and sets C=1/l */
    L2_SVM_MFN(Data_Labeled, Options, Weights,Outputs_Labeled,0);
    /* Use this weight vector to classify R*u unlabeled examples as
       positive */
    int32_t p=0,q=0;
    float64_t t=0.0;
    int32_t *JU = new int32_t[Data->u];
    float64_t *ou = new float64_t[Data->u];
    float64_t lambda_0 = TSVM_LAMBDA_SMALL;
    for (int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            t=0.0;
            int32_t num_entries=0;
            bool free_vec=false;

            TSparseEntry<float64_t>* vec=Data->features->get_sparse_feature_vector(i, num_entries, free_vec);
            for (int32_t j=0; j<num_entries; j++)
                t+=vec[j].entry*Weights->vec[vec[j].feat_index];
            Data->features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*Weights->vec[Data->n-1]; //bias (modelled as last dim)

            Outputs->vec[i]=t;
            Data->C[i]=lambda_0*1.0/Data->u;
            JU[q]=i;
            ou[q]=t;
            q++;
        }
        else
        {
            Outputs->vec[i]=Outputs_Labeled->vec[p];
            Data->C[i]=1.0/Data->l;
            p++;
        }
    }
    delete [] Outputs_Labeled->vec;
    delete [] Outputs_Labeled;
    delete [] Data_Labeled;
    int32_t pos = int32_t((1-Options->R)*Data->u - 1); // threshold position
    std::nth_element(ou, ou+pos, ou+Data->u);
    float64_t thresh = ou[pos];
    delete [] ou;
    for (int32_t i=0;i<Data->u;i++)
    {
        if(Outputs->vec[JU[i]]>thresh)
            Data->Y[JU[i]]=1.0;
        else
            Data->Y[JU[i]]=-1.0;
    }
    for (int32_t i=0;i<Data->n;i++)
        Weights->vec[i]=0.0;
    for (int32_t i=0;i<Data->m;i++)
        Outputs->vec[i]=0.0;
    L2_SVM_MFN(Data,Options,Weights,Outputs,0);
    int32_t num_switches=0;
    int32_t s=0;
    int32_t last_round=0;
    while(lambda_0 <= Options->lambda_u)
    {
        int32_t iter2=0;
        while(1){
            s=switch_labels(Data->Y,Outputs->vec,JU,Data->u,Options->S);
            if(s==0) break;
            iter2++;
            SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
            SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
            num_switches+=s;
            SG_SDEBUG("Optimizing weights\n");
            L2_SVM_MFN(Data,Options,Weights,Outputs,1);
        }
        if(last_round==1) break;
        lambda_0=TSVM_ANNEALING_RATE*lambda_0;
        if(lambda_0 >= Options->lambda_u) {lambda_0 = Options->lambda_u; last_round=1;}
        for (int32_t i=0;i<Data->u;i++)
            Data->C[JU[i]]=lambda_0*1.0/Data->u;
        SG_SDEBUG("****** lambda_0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
        SG_SDEBUG("Optimizing weights\n");
        L2_SVM_MFN(Data,Options,Weights,Outputs,1);
    }
    SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
    /* reset labels */
    for (int32_t i=0;i<Data->u;i++) Data->Y[JU[i]] = 0.0;
    float64_t F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    SG_SDEBUG("Objective Value = %f\n",F);
    delete [] JU;
    return num_switches;
}

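/* The transductive objective behind this loop (cf. transductive_cost below),
   optimized jointly over w and the unknown labels \f$y_j \in \{\pm 1\}\f$:
   \f[
       \min_{w,\,\{y_j\}} \; \frac{\lambda}{2}\|w\|^2
       + \frac{1}{2l}\sum_{i\,\mathrm{labeled}} \max(0,1-y_i o_i)^2
       + \frac{\lambda_u}{2u}\sum_{j\,\mathrm{unlabeled}} \max(0,1-y_j o_j)^2 ,
   \f]
   subject to a balance constraint fixing the fraction Options->R of unlabeled
   examples assigned to the positive class. The unlabeled weight is annealed
   from TSVM_LAMBDA_SMALL up to Options->lambda_u by TSVM_ANNEALING_RATE,
   re-optimizing w and switching label pairs at every rung. */
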
int32_t switch_labels(float64_t* Y, float64_t* o, int32_t* JU, int32_t u, int32_t S)
{
    int32_t npos=0;
    int32_t nneg=0;
    for (int32_t i=0;i<u;i++)
    {
        if((Y[JU[i]]>0) && (o[JU[i]]<1.0)) npos++;
        if((Y[JU[i]]<0) && (-o[JU[i]]<1.0)) nneg++;
    }
    Delta* positive=new Delta[npos];
    Delta* negative=new Delta[nneg];
    int32_t p=0;
    int32_t n=0;
    int32_t ii=0;
    for (int32_t i=0;i<u;i++)
    {
        ii=JU[i];
        if((Y[ii]>0.0) && (o[ii]<1.0))
        {
            positive[p].delta=o[ii];
            positive[p].index=ii;
            positive[p].s=0;
            p++;
        }
        if((Y[ii]<0.0) && (-o[ii]<1.0))
        {
            negative[n].delta=-o[ii];
            negative[n].index=ii;
            negative[n].s=0;
            n++;
        }
    }
    std::sort(positive,positive+npos);
    std::sort(negative,negative+nneg);
    int32_t s=-1;
    while(1)
    {
        s++;
        // check the bounds before indexing into positive/negative
        if((s>=S) || (s>=npos) || (s>=nneg) || (positive[s].delta>=-negative[s].delta))
            break;
        Y[positive[s].index]=-1.0;
        Y[negative[s].index]= 1.0;
    }
    delete [] positive;
    delete [] negative;
    return s;
}

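/* Why the swap test works (a sketch; it assumes both examples stay inside the
   margin after the swap): relabeling a tentative positive i to -1 and a
   tentative negative j to +1 changes their squared hinge losses by
   \f[
       (1+o_i)^2 + (1-o_j)^2 - (1-o_i)^2 - (1+o_j)^2 = 4\,(o_i - o_j),
   \f]
   so a swap lowers the objective exactly when \f$ o_i < o_j \f$. Sorting pairs
   the worst offenders first; the loop stops after S swaps or when no improving
   pair is left. */
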
int32_t DA_S3VM(
    struct data *Data, struct options *Options, struct vector_double *Weights,
    struct vector_double *Outputs)
{
    float64_t T = DA_INIT_TEMP*Options->lambda_u;
    int32_t iter1 = 0, iter2 = 0;
    float64_t *p = new float64_t[Data->u];
    float64_t *q = new float64_t[Data->u];
    float64_t *g = new float64_t[Data->u];
    float64_t F,F_min;
    float64_t *w_min = new float64_t[Data->n];
    float64_t *o_min = new float64_t[Data->m];
    float64_t *w = Weights->vec;
    float64_t *o = Outputs->vec;
    float64_t kl_divergence = 1.0;
    /* initialize */
    SG_SDEBUG("Initializing weights, p");
    for (int32_t i=0;i<Data->u; i++)
        p[i] = Options->R;
    /* record which examples are unlabeled */
    int32_t *JU = new int32_t[Data->u];
    int32_t j=0;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {JU[j]=i;j++;}
    }
    float64_t H = entropy(p,Data->u);
    optimize_w(Data,p,Options,Weights,Outputs,0);
    F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    F_min = F;
    for (int32_t i=0;i<Weights->d;i++)
        w_min[i]=w[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o_min[i]=o[i];
    while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon))
    {
        iter1++;
        iter2=0;
        kl_divergence=1.0;
        while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon))
        {
            iter2++;
            for (int32_t i=0;i<Data->u;i++)
            {
                q[i]=p[i];
                g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]]< -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]])));
            }
            SG_SDEBUG("Optimizing p.\n");
            optimize_p(g,Data->u,T,Options->R,p);
            kl_divergence=KL(p,q,Data->u);
            SG_SDEBUG("Optimizing weights\n");
            optimize_w(Data,p,Options,Weights,Outputs,1);
            F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
            if(F < F_min)
            {
                F_min = F;
                for (int32_t i=0;i<Weights->d;i++)
                    w_min[i]=w[i];
                for (int32_t i=0;i<Outputs->d;i++)
                    o_min[i]=o[i];
            }
            SG_SDEBUG("***** outer_iter = %d  T = %g  inner_iter = %d  kl = %g  cost = %g *****\n",iter1,T,iter2,kl_divergence,F);
        }
        H = entropy(p,Data->u);
        SG_SDEBUG("***** Finished outer_iter = %d T = %g  Entropy = %g ***\n", iter1,T,H);
        T = T/DA_ANNEALING_RATE;
    }
    for (int32_t i=0;i<Weights->d;i++)
        w[i]=w_min[i];
    for (int32_t i=0;i<Outputs->d;i++)
        o[i]=o_min[i];
    /* may want to reset the original Y */
    delete [] p;
    delete [] q;
    delete [] g;
    delete [] JU;
    delete [] w_min;
    delete [] o_min;
    SG_SINFO("(min) Objective Value = %f", F_min);
    return 1;
}

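/* The annealing view (a sketch, up to constant scalings): each unknown label
   is relaxed to a probability \f$p_j = P(y_j = +1)\f$ and, at temperature T,
   the algorithm alternates between minimizing over w (optimize_w) and over p
   (optimize_p) an expected-loss-plus-entropy "free energy" of the form
   \f[
       \mathcal{F}_T(w,p) = \mathrm{loss}_{\mathrm{labeled}}(w)
       + \frac{\lambda_u}{2u}\sum_j \bigl[p_j (1-o_j)_+^2 + (1-p_j)(1+o_j)_+^2\bigr]
       - T\, H(p) ,
   \f]
   with H the mean binary entropy. High T keeps p smooth; dividing T by
   DA_ANNEALING_RATE each round hardens p, and the loop ends once H(p) drops
   below Options->epsilon. The best (w,o) pair seen across all temperatures is
   kept. */
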
int32_t optimize_w(
    const struct data *Data, const float64_t *p, struct options *Options,
    struct vector_double *Weights, struct vector_double *Outputs, int32_t ini)
{
    int32_t i,j;
    CSparseFeatures<float64_t>* features=Data->features;
    int32_t n  = Data->n;
    int32_t m  = Data->m;
    int32_t u  = Data->u;
    float64_t lambda = Options->lambda;
    float64_t epsilon;
    float64_t *w = Weights->vec;
    float64_t *o = new float64_t[m+u];
    float64_t *Y = new float64_t[m+u];
    float64_t *C = new float64_t[m+u];
    int32_t *labeled_indices = new int32_t[m];
    float64_t F_old = 0.0;
    float64_t F = 0.0;
    float64_t diff=0.0;
    float64_t lambda_u_by_u = Options->lambda_u/u;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int32_t[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}

    for(i=0;i<n;i++) F+=w[i]*w[i];
    F=lambda*F;
    int32_t active=0;
    int32_t inactive=m-1; // l-1
    float64_t temp1;
    float64_t temp2;

    j = 0;
    for(i=0; i<m ; i++)
    {
        o[i]=Outputs->vec[i];
        if(Data->Y[i]==0.0)
        {
            labeled_indices[i]=0;
            o[m+j]=o[i];
            Y[i]=1;
            Y[m+j]=-1;
            C[i]=lambda_u_by_u*p[j];
            C[m+j]=lambda_u_by_u*(1-p[j]);
            ActiveSubset->vec[active]=i;
            active++;
            diff = 1 - CMath::abs(o[i]);
            if(diff>0)
            {
                Data->Y[i] = 2*p[j]-1;
                Data->C[i] = lambda_u_by_u;
                temp1 = (1 - o[i]);
                temp2 = (1 + o[i]);
                F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
            }
            else
            {
                if(o[i]>0)
                {
                    Data->Y[i] = -1.0;
                    Data->C[i] = C[m+j];
                }
                else
                {
                    Data->Y[i] = 1.0;
                    Data->C[i] = C[i];
                }
                temp1 = (1-Data->Y[i]*o[i]);
                F+= Data->C[i]*temp1*temp1;
            }
            j++;
        }
        else
        {
            labeled_indices[i]=1;
            Y[i]=Data->Y[i];
            C[i]=1.0/Data->l;
            Data->C[i]=1.0/Data->l;
            diff=1-Data->Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=Data->C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
    }
    F=0.5*F;
    ActiveSubset->d=active;
    int32_t iter=0;
    int32_t opt=0;
    int32_t opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    float64_t *w_bar = new float64_t[n];
    float64_t *o_bar = new float64_t[m+u];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m; /* read only the top m ; bottom u will be copies */
    float64_t delta=0.0;
    float64_t t=0.0;
    int32_t ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)", iter, active, F);
        for(i=n; i-- ;)
            w_bar[i]=w[i];

        for(i=m+u; i-- ;)
            o_bar[i]=o[i];
        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=0.0;
            int32_t num_entries=0;
            bool free_vec=false;

            TSparseEntry<float64_t>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (j=0; j<num_entries; j++)
                t+=vec[j].entry*w_bar[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        // make o_bar consistent in the bottom half
        j=0;
        for(i=0; i<m;i++)
        {
            if(labeled_indices[i]==0)
            {o_bar[m+j]=o_bar[i]; j++;}
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for(i=0; i < m ;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
            {
                if(labeled_indices[ii]==1)
                    opt2=(opt2 && (Data->Y[ii]*o_bar[ii]<=1+epsilon));
                else
                {
                    if(CMath::abs(o[ii])<1)
                        opt2=(opt2 && (CMath::abs(o_bar[ii])<=1+epsilon));
                    else
                        opt2=(opt2 && (CMath::abs(o_bar[ii])>=1-epsilon));
                }
            }
            else
                opt2=(opt2 && (Data->Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2) // optimal
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for(i=n; i-- ;)
                    w[i]=w_bar[i];
                for(i=m; i-- ;)
                    Outputs->vec[i]=o_bar[i];
                for(i=m; i-- ;)
                {
                    if(labeled_indices[i]==0)
                        Data->Y[i]=0.0;
                }
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] o;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                delete[] Y;
                delete[] C;
                delete[] labeled_indices;
                SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
                return 1;
            }
        }

        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m+u);
        SG_SDEBUG("LINE_SEARCH delta = %f", delta);
        F_old=F;
        F=0.0;
        for(i=0;i<n;i++) {w[i]+=delta*(w_bar[i]-w[i]);  F+=w[i]*w[i];}
        F=lambda*F;
        j=0;
        active=0;
        inactive=m-1;
        for(i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            if(labeled_indices[i]==0)
            {
                o[m+j]=o[i];
                ActiveSubset->vec[active]=i;
                active++;
                diff = 1 - CMath::abs(o[i]);
                if(diff>0)
                {
                    Data->Y[i] = 2*p[j]-1;
                    Data->C[i] = lambda_u_by_u;
                    temp1 = (1 - o[i]);
                    temp2 = (1 + o[i]);
                    F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
                }
                else
                {
                    if(o[i]>0)
                    {
                        Data->Y[i] = -1;
                        Data->C[i] = C[m+j];
                    }
                    else
                    {
                        Data->Y[i] = +1;
                        Data->C[i] = C[i];
                    }
                    temp1=(1-Data->Y[i]*o[i]);
                    F+= Data->C[i]*temp1*temp1;
                }
                j++;
            }
            else
            {
                diff=1-Data->Y[i]*o[i];
                if(diff>0)
                {
                    ActiveSubset->vec[active]=i;
                    active++;
                    F+=Data->C[i]*diff*diff;
                }
                else
                {
                    ActiveSubset->vec[inactive]=i;
                    inactive--;
                }
            }
        }
        F=0.5*F;
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<EPSILON)
            break;
    }
    for(i=m; i-- ;)
    {
        Outputs->vec[i]=o[i];
        if(labeled_indices[i]==0)
            Data->Y[i]=0.0;
    }
    delete[] ActiveSubset->vec;
    delete[] labeled_indices;
    delete[] ActiveSubset;
    delete[] o_bar;
    delete[] w_bar;
    delete[] o;
    delete[] Weights_bar;
    delete[] Outputs_bar;
    delete[] Y;
    delete[] C;
    SG_SINFO("L2_SVM_MFN converged in %d iterations", iter);
    return 0;
}

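/* How optimize_w reuses the supervised solver: every unlabeled example is
   presented twice, once with label +1 and cost (lambda_u/u) p_j and once with
   label -1 and cost (lambda_u/u)(1-p_j), so its expected loss
   \f[
       \frac{\lambda_u}{u}\Bigl[p_j\,(1-o_j)_+^2 + (1-p_j)\,(1+o_j)_+^2\Bigr]
   \f]
   is exactly a weighted squared hinge loss over the extended arrays o/Y/C of
   length m+u; the bottom u entries are those mirror copies and are kept in
   sync with the top half after every CGLS call. */
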
void optimize_p(
    const float64_t* g, int32_t u, float64_t T, float64_t r, float64_t* p)
{
    int32_t iter=0;
    float64_t epsilon=1e-10;
    int32_t maxiter=500;
    float64_t nu_minus=g[0];
    float64_t nu_plus=g[0];
    for (int32_t i=0;i<u;i++)
    {
        if(g[i]<nu_minus) nu_minus=g[i];
        if(g[i]>nu_plus) nu_plus=g[i];
    }

    float64_t b=T*log((1-r)/r);
    nu_minus-=b;
    nu_plus-=b;
    float64_t nu=(nu_plus+nu_minus)/2;
    float64_t Bnu=0.0;
    float64_t BnuPrime=0.0;
    float64_t s=0.0;
    float64_t tmp=0.0;
    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(!(isinf(s)))
        {
            tmp=1.0/(1.0+s);
            Bnu+=tmp;
            BnuPrime+=s*tmp*tmp;
        }
    }
    Bnu=Bnu/u;
    Bnu-=r;
    BnuPrime=BnuPrime/(T*u);
    float64_t nuHat=0.0;
    while((CMath::abs(Bnu)>epsilon) && (iter < maxiter))
    {
        iter++;
        if(CMath::abs(BnuPrime)>0.0)
            nuHat=nu-Bnu/BnuPrime;
        // fall back to bisection when the derivative vanishes or the
        // Newton step leaves the bracket [nu_minus, nu_plus]
        if((CMath::abs(BnuPrime) <= 0.0) || (nuHat>nu_plus) || (nuHat < nu_minus))
            nu=(nu_minus+nu_plus)/2.0;
        else
            nu=nuHat;
        Bnu=0.0;
        BnuPrime=0.0;
        for(int32_t i=0;i<u;i++)
        {
            s=exp((g[i]-nu)/T);
            if(!(isinf(s)))
            {
                tmp=1.0/(1.0+s);
                Bnu+=tmp;
                BnuPrime+=s*tmp*tmp;
            }
        }
        Bnu=Bnu/u;
        Bnu-=r;
        BnuPrime=BnuPrime/(T*u);
        if(Bnu<0)
            nu_minus=nu;
        else
            nu_plus=nu;
        if(CMath::abs(nu_minus-nu_plus)<epsilon)
            break;
    }
    if(CMath::abs(Bnu)>epsilon)
        SG_SWARNING("Root not found to the required precision\n");

    for (int32_t i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(isinf(s)) p[i]=0.0;
        else p[i]=1.0/(1.0+s);
    }
    SG_SINFO("root (nu) = %f B(nu) = %f", nu, Bnu);
}

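/* The p-step reduces to a one-dimensional root-finding problem in the
   Lagrange multiplier nu of the balance constraint
   \f$\frac{1}{u}\sum_i p_i = r\f$:
   \f[
       B(\nu) = \frac{1}{u}\sum_{i=1}^{u} \frac{1}{1 + e^{(g_i - \nu)/T}} - r = 0,
       \qquad p_i = \frac{1}{1 + e^{(g_i - \nu)/T}} ,
   \f]
   where g_i is the loss difference between labeling example i positive and
   negative. B is monotonically increasing in nu, and nu is bracketed by
   \f$[\min_i g_i - b,\; \max_i g_i - b]\f$ with \f$b = T\log((1-r)/r)\f$, so
   the safeguarded Newton/bisection loop above converges. */
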
float64_t transductive_cost(
    float64_t normWeights, float64_t *Y, float64_t *Outputs, int32_t m,
    float64_t lambda, float64_t lambda_u)
{
    float64_t F1=0.0,F2=0.0, o=0.0, y=0.0;
    int32_t u=0,l=0;
    for (int32_t i=0;i<m;i++)
    {
        o=Outputs[i];
        y=Y[i];
        if(y==0.0)
        {F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o)); u++;}
        else
        {F2 += y*o > 1 ? 0 : (1-y*o)*(1-y*o); l++;}
    }
    float64_t F;
    F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
    return F;
}

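/* The reported value, with the labeled part averaged over the l labeled and
   the unlabeled part over the u unlabeled examples:
   \f[
       F = \frac{1}{2}\Bigl(\lambda\,\|w\|^2
       + \frac{\lambda_u}{u}\sum_{y_i = 0}\bigl(1 - |o_i|\bigr)_+^2
       + \frac{1}{l}\sum_{y_i \neq 0}\bigl(1 - y_i o_i\bigr)_+^2\Bigr);
   \f]
   for unlabeled examples the better of the two labels is charged, hence the
   \f$(1-|o_i|)_+^2\f$ term. */
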
float64_t entropy(const float64_t *p, int32_t u)
{
    float64_t h=0.0;
    float64_t q=0.0;
    for (int32_t i=0;i<u;i++)
    {
        q=p[i];
        if(q>0 && q<1)
            h+= -(q*CMath::log2(q) + (1-q)*CMath::log2(1-q));
    }
    return h/u;
}

float64_t KL(const float64_t *p, const float64_t *q, int32_t u)
{
    float64_t h=0.0;
    float64_t p1=0.0;
    float64_t q1=0.0;
    float64_t g=0.0;
    for (int32_t i=0;i<u;i++)
    {
        p1=p[i];
        q1=q[i];
        // clamp the probabilities away from 0 and 1
        if(p1>1-1e-8) p1-=1e-8;
        if(p1<1e-8) p1+=1e-8;
        if(q1>1-1e-8) q1-=1e-8;
        if(q1<1e-8) q1+=1e-8;
        g= (p1*CMath::log2(p1/q1) + (1-p1)*CMath::log2((1-p1)/(1-q1)));
        if(CMath::abs(g)<1e-12 || isnan(g)) g=0.0;
        h+=g;
    }
    return h/u;
}

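/* Both helpers are per-example averages over Bernoulli distributions:
   \f[
       H(p) = -\frac{1}{u}\sum_i \bigl[p_i \log_2 p_i + (1-p_i)\log_2(1-p_i)\bigr],
       \qquad
       \mathrm{KL}(p\,\|\,q) = \frac{1}{u}\sum_i \Bigl[p_i \log_2\frac{p_i}{q_i}
       + (1-p_i)\log_2\frac{1-p_i}{1-q_i}\Bigr],
   \f]
   with the probabilities nudged away from 0 and 1 so the logarithms stay
   finite. */
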
/********************** UTILITIES ********************/
float64_t norm_square(const vector_double *A)
{
    float64_t x=0.0, t=0.0;
    for(int32_t i=0;i<A->d;i++)
    {
        t=A->vec[i];
        x+=t*t;
    }
    return x;
}

void initialize(struct vector_double *A, int32_t k, float64_t a)
{
    float64_t *vec = new float64_t[k];
    for (int32_t i=0;i<k;i++)
        vec[i]=a;
    A->vec = vec;
    A->d   = k;
    return;
}

void initialize(struct vector_int *A, int32_t k)
{
    int32_t *vec = new int32_t[k];
    for(int32_t i=0;i<k;i++)
        vec[i]=i;
    A->vec = vec;
    A->d   = k;
    return;
}

void GetLabeledData(struct data *D, const struct data *Data)
{
    /* FIXME
    int32_t *J = new int32_t[Data->l];
    D->C   = new float64_t[Data->l];
    D->Y   = new float64_t[Data->l];
    int32_t nz=0;
    int32_t k=0;
    int32_t rowptrs_=Data->l;
    for(int32_t i=0;i<Data->m;i++)
    {
        if(Data->Y[i]!=0.0)
        {
            J[k]=i;
            D->Y[k]=Data->Y[i];
            D->C[k]=1.0/Data->l;
            nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
            k++;
        }
    }
    D->val    = new float64_t[nz];
    D->colind = new int32_t[nz];
    D->rowptr = new int32_t[rowptrs_+1];
    nz=0;
    for(int32_t i=0;i<Data->l;i++)
    {
        D->rowptr[i]=nz;
        for(int32_t j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
        {
            D->val[nz] = Data->val[j];
            D->colind[nz] = Data->colind[j];
            nz++;
        }
    }
    D->rowptr[rowptrs_]=nz;
    D->nz=nz;
    D->l=Data->l;
    D->m=Data->l;
    D->n=Data->n;
    D->u=0;
    delete [] J; */
}
