ssl.cpp

/*    Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
      SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

      This file is part of SVM-lin.

      SVM-lin is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation; either version 2 of the License, or
      (at your option) any later version.

      SVM-lin is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with SVM-lin (see gpl.txt); if not, write to the Free Software
      Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
      */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <algorithm>

#include "lib/io.h"
#include "lib/Mathematics.h"
#include "features/SparseFeatures.h"
#include "classifier/svm/ssl.h"

#define VERBOSE 1
void ssl_train(struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    // initialize
    initialize(Weights,Data->n,0.0);
    initialize(Outputs,Data->m,0.0);
    vector_int *Subset = new vector_int[1];
    initialize(Subset,Data->m);
    // call the right algorithm
    int optimality = 0;
    switch(Options->algo)
    {
        case -1:
            SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case RLS:
            SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
            optimality=CGLS(Data,Options,Subset,Weights,Outputs);
            break;
        case SVM:
            SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
            optimality=L2_SVM_MFN(Data,Options,Weights,Outputs,0);
            break;
        case TSVM:
            SG_SINFO("Transductive L2-SVM (TSVM)\n");
            optimality=TSVM_MFN(Data,Options,Weights,Outputs);
            break;
        case DA_SVM:
            SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
            optimality=DA_S3VM(Data,Options,Weights,Outputs);
            break;
        default:
            SG_SERROR("Algorithm unspecified\n");
    }

    delete[] Subset->vec;
    delete[] Subset;
    return;
}
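
/* Usage sketch (not part of the original file): how a caller might drive
   ssl_train() for plain supervised L2-SVM-MFN training. The struct fields
   and constants are inferred from how this file reads them; ssl.h is not
   shown here, so treat the layout as an assumption, and example_ssl_train
   itself as a hypothetical illustration. */
void example_ssl_train(CSparseFeatures<DREAL>* feats, double* labels,
        int num_feat, int num_vec)
{
    struct data Data;
    Data.features=feats;          // sparse examples; bias modelled as last dim
    Data.Y=labels;                // +1/-1 for labeled examples, 0.0 for unlabeled
    Data.n=num_feat;              // dimensionality (including the bias dimension)
    Data.m=num_vec;               // total number of examples
    Data.l=num_vec;               // labeled examples (all of them, in this sketch)
    Data.u=0;                     // unlabeled examples (TSVM/DA_SVM need u>0)
    Data.C=new double[num_vec];
    for (int i=0; i<num_vec; i++)
        Data.C[i]=1.0/num_vec;    // per-example cost, as CGLS/MFN expect

    struct options Options;
    Options.algo=SVM;             // L2-SVM-MFN branch of ssl_train()
    Options.lambda=1.0;           // regularization strength
    Options.cgitermax=CGITERMAX;  // inner CGLS iteration cap
    Options.epsilon=EPSILON;      // optimality tolerance
    Options.bias=1.0;             // value of the bias feature in the last dim

    struct vector_double Weights, Outputs;
    ssl_train(&Data,&Options,&Weights,&Outputs); // allocates the vec members

    // ... use Weights.vec / Outputs.vec ...
    delete[] Weights.vec;
    delete[] Outputs.vec;
    delete[] Data.C;
}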
int CGLS(const struct data *Data,
        const struct options *Options,
        const struct vector_int *Subset,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    SG_SDEBUG("CGLS starting...");

    /* Disassemble the structures */
    int active = Subset->d;
    int *J = Subset->vec;
    CSparseFeatures<DREAL>* features=Data->features;
    double *Y = Data->Y;
    double *C = Data->C;
    int n  = Data->n;
    double lambda = Options->lambda;
    int cgitermax = Options->cgitermax;
    double epsilon = Options->epsilon;
    double *beta = Weights->vec;
    double *o  = Outputs->vec;
    // initialize z
    double *z = new double[active];
    double *q = new double[active];
    int ii=0;
    for(int i = active ; i-- ;){
        ii=J[i];
        z[i]  = C[ii]*(Y[ii] - o[ii]);
    }
    double *r = new double[n];
    for(int i = n ; i-- ;)
        r[i] = 0.0;
    for(int j=0; j < active; j++)
    {
        ii=J[j];
        INT num_entries=0;
        bool free_vec=false;

        TSparseEntry<DREAL>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
        for (int i=0; i<num_entries; i++)
            r[vec[i].feat_index]+= vec[i].entry*z[j];
        features->free_sparse_feature_vector(vec, num_entries, free_vec);
        r[n-1]+=Options->bias*z[j]; //bias (modelled as last dim)
    }
    double *p = new double[n];
    double omega1 = 0.0;
    for(int i = n ; i-- ;)
    {
        r[i] -= lambda*beta[i];
        p[i] = r[i];
        omega1 += r[i]*r[i];
    }
    double omega_p = omega1;
    double omega_q = 0.0;
    double inv_omega2 = 1/omega1;
    double scale = 0.0;
    double omega_z=0.0;
    double gamma = 0.0;
    int cgiter = 0;
    int optimality = 0;
    double epsilon2 = epsilon*epsilon;
    // iterate
    while(cgiter < cgitermax)
    {
        cgiter++;
        omega_q=0.0;
        double t=0.0;
        int i,j;
        // #pragma omp parallel for private(i,j)
        for(i=0; i < active; i++)
        {
            ii=J[i];
            t=0.0;
            INT num_entries=0;
            bool free_vec=false;
            TSparseEntry<DREAL>* vec=features->get_sparse_feature_vector(ii,
                    num_entries, free_vec);
            for (j=0; j<num_entries; j++)
                t+=vec[j].entry*p[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*p[n-1]; //bias (modelled as last dim)
            q[i]=t;
            omega_q += C[ii]*t*t;
        }
        gamma = omega1/(lambda*omega_p + omega_q);
        inv_omega2 = 1/omega1;
        for(i = n ; i-- ;)
        {
            r[i] = 0.0;
            beta[i] += gamma*p[i];
        }
        omega_z=0.0;
        for(i = active ; i-- ;)
        {
            ii=J[i];
            o[ii] += gamma*q[i];
            z[i] -= gamma*C[ii]*q[i];
            omega_z+=z[i]*z[i];
        }
        for(j=0; j < active; j++)
        {
            ii=J[j];
            t=z[j];
            INT num_entries=0;
            bool free_vec=false;

            TSparseEntry<DREAL>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (i=0; i<num_entries; i++)
                r[vec[i].feat_index]+= vec[i].entry*t;
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            r[n-1]+=Options->bias*t; //bias (modelled as last dim)
        }
        omega1 = 0.0;
        for(i = n ; i-- ;)
        {
            r[i] -= lambda*beta[i];
            omega1 += r[i]*r[i];
        }
        if(omega1 < epsilon2*omega_z)
        {
            optimality=1;
            break;
        }
        omega_p=0.0;
        scale=omega1*inv_omega2;
        for(i = n ; i-- ;)
        {
            p[i] = r[i] + p[i]*scale;
            omega_p += p[i]*p[i];
        }
    }
    SG_SDEBUG("...Done.");
    SG_SINFO("CGLS %s in %d iteration(s)", optimality ? "converged" : "reached cgitermax", cgiter);

    delete[] z;
    delete[] q;
    delete[] r;
    delete[] p;
    return optimality;
}
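
/* What CGLS computes: conjugate gradient on the regularized normal equations
   of the active subset, (lambda*I + X'CX) beta = X'CY, using the same
   quantities (z, r, p, q, gamma, scale) as above. A dense, self-contained toy
   version on a 3x2 problem follows; it is an assumed standalone sketch, not
   SVM-lin API. */
#include <cstdio>

int main()
{
    const int m=3, n=2;                      // 3 examples, 2 features
    double X[m][n]={{1,0},{0,1},{1,1}};
    double Y[m]={1,-1,0.5};
    double C[m]={1,1,1};
    double lambda=0.1;
    double beta[n]={0,0};

    double r[n], p[n], q[m], z[m];
    for (int i=0;i<m;i++) z[i]=C[i]*Y[i];    // z = C*(Y - X*beta), beta = 0
    double omega1=0;
    for (int j=0;j<n;j++)
    {
        r[j]=0;
        for (int i=0;i<m;i++) r[j]+=X[i][j]*z[i];
        r[j]-=lambda*beta[j];                // r = X'z - lambda*beta
        p[j]=r[j];
        omega1+=r[j]*r[j];
    }
    for (int iter=0; iter<100 && omega1>1e-20; iter++)
    {
        double omega_q=0, omega_p=0;
        for (int j=0;j<n;j++) omega_p+=p[j]*p[j];
        for (int i=0;i<m;i++)
        {
            q[i]=0;                          // q = X*p
            for (int j=0;j<n;j++) q[i]+=X[i][j]*p[j];
            omega_q+=C[i]*q[i]*q[i];
        }
        double gamma=omega1/(lambda*omega_p+omega_q); // exact step length
        double omega_old=omega1;
        omega1=0;
        for (int i=0;i<m;i++) z[i]-=gamma*C[i]*q[i];
        for (int j=0;j<n;j++)
        {
            beta[j]+=gamma*p[j];
            r[j]=0;
            for (int i=0;i<m;i++) r[j]+=X[i][j]*z[i];
            r[j]-=lambda*beta[j];
            omega1+=r[j]*r[j];
        }
        double scale=omega1/omega_old;       // Fletcher-Reeves update
        for (int j=0;j<n;j++) p[j]=r[j]+scale*p[j];
    }
    printf("beta = (%f, %f)\n", beta[0], beta[1]);
    return 0;
}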
int L2_SVM_MFN(const struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs,
        int ini)
{
    /* Disassemble the structures */
    CSparseFeatures<DREAL>* features=Data->features;
    double *Y = Data->Y;
    double *C = Data->C;
    int n  = Data->n;
    int m  = Data->m;
    double lambda = Options->lambda;
    double epsilon;
    double *w = Weights->vec;
    double *o = Outputs->vec;
    double F_old = 0.0;
    double F = 0.0;
    double diff=0.0;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0) {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}
    for(int i=0;i<n;i++) F+=w[i]*w[i];
    F=0.5*lambda*F;
    int active=0;
    int inactive=m-1; // l-1
    for(int i=0; i<m ; i++)
    {
        diff=1-Y[i]*o[i];
        if(diff>0)
        {
            ActiveSubset->vec[active]=i;
            active++;
            F+=0.5*C[i]*diff*diff;
        }
        else
        {
            ActiveSubset->vec[inactive]=i;
            inactive--;
        }
    }
    ActiveSubset->d=active;
    int iter=0;
    int opt=0;
    int opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    double *w_bar = new double[n];
    double *o_bar = new double[m];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m;
    double delta=0.0;
    double t=0.0;
    int ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
        for(int i=n; i-- ;)
            w_bar[i]=w[i];
        for(int i=m; i-- ;)
            o_bar[i]=o[i];

        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(int i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=0.0;

            INT num_entries=0;
            bool free_vec=false;

            TSparseEntry<DREAL>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (int j=0; j<num_entries; j++)
                t+=vec[j].entry*w_bar[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for(int i=0;i<m;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
                opt2=(opt2 && (Y[ii]*o_bar[ii]<=1+epsilon));
            else
                opt2=(opt2 && (Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2)
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for(int i=n; i-- ;)
                    w[i]=w_bar[i];
                for(int i=m; i-- ;)
                    o[i]=o_bar[i];
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                SG_SINFO("L2_SVM_MFN converged (optimality) in %d iterations", iter);
                return 1;
            }
        }
        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m);
        SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
        F_old=F;
        F=0.0;
        for(int i=n; i-- ;){
            w[i]+=delta*(w_bar[i]-w[i]);
            F+=w[i]*w[i];
        }
        F=0.5*lambda*F;
        active=0;
        inactive=m-1;
        for(int i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            diff=1-Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=0.5*C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<RELATIVE_STOP_EPS*CMath::abs(F_old))
        {
            // free the work buffers before returning (the other exit paths do)
            delete[] ActiveSubset->vec;
            delete[] ActiveSubset;
            delete[] o_bar;
            delete[] w_bar;
            delete[] Weights_bar;
            delete[] Outputs_bar;
            SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
            return 2;
        }
    }
    delete[] ActiveSubset->vec;
    delete[] ActiveSubset;
    delete[] o_bar;
    delete[] w_bar;
    delete[] Weights_bar;
    delete[] Outputs_bar;
    SG_SINFO("L2_SVM_MFN stopped (max iter exceeded) after %d iterations", iter);
    return 0;
}

double line_search(double *w,
        double *w_bar,
        double lambda,
        double *o,
        double *o_bar,
        double *Y,
        double *C,
        int d, /* data dimensionality -- 'n' */
        int l) /* number of examples */
{
    double omegaL = 0.0;
    double omegaR = 0.0;
    double diff=0.0;
    for(int i=d; i--; )
    {
        diff=w_bar[i]-w[i];
        omegaL+=w[i]*diff;
        omegaR+=w_bar[i]*diff;
    }
    omegaL=lambda*omegaL;
    omegaR=lambda*omegaR;
    double L=0.0;
    double R=0.0;
    int ii=0;
    for(int i=0;i<l;i++)
    {
        if(Y[i]*o[i]<1)
        {
            diff=C[i]*(o_bar[i]-o[i]);
            L+=(o[i]-Y[i])*diff;
            R+=(o_bar[i]-Y[i])*diff;
        }
    }
    L+=omegaL;
    R+=omegaR;
    Delta* deltas=new Delta[l];
    int p=0;
    for(int i=0;i<l;i++)
    {
        diff=Y[i]*(o_bar[i]-o[i]);

        if(Y[i]*o[i]<1)
        {
            if(diff>0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=-1;
                p++;
            }
        }
        else
        {
            if(diff<0)
            {
                deltas[p].delta=(1-Y[i]*o[i])/diff;
                deltas[p].index=i;
                deltas[p].s=1;
                p++;
            }
        }
    }
    std::sort(deltas,deltas+p);
    double delta_prime=0.0;
    for(int i=0;i<p;i++)
    {
        delta_prime = L + deltas[i].delta*(R-L);
        if(delta_prime>=0)
            break;
        ii=deltas[i].index;
        diff=(deltas[i].s)*C[ii]*(o_bar[ii]-o[ii]);
        L+=diff*(o[ii]-Y[ii]);
        R+=diff*(o_bar[ii]-Y[ii]);
    }
    delete [] deltas;
    return (-L/(R-L));
}
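
/* What line_search() computes: along w(delta)=w+delta*(w_bar-w) the L2-SVM
   objective is piecewise quadratic, so its derivative is piecewise linear in
   delta; between two hinge breakpoints it interpolates the slopes L (at
   delta=0) and R (at delta=1), and the minimizer solves L+delta*(R-L)=0,
   i.e. delta=-L/(R-L). An assumed standalone toy check with one weight, one
   example and no breakpoint before the minimizer: */
#include <cstdio>

int main()
{
    double lambda=1.0, C=1.0, y=1.0;
    double w=0.0, w_bar=2.0, o=0.0, o_bar=2.0;

    // slopes of the objective along the ray at delta=0 and delta=1
    double L=lambda*w*(w_bar-w)+C*(o-y)*(o_bar-o);         // = -2
    double R=lambda*w_bar*(w_bar-w)+C*(o_bar-y)*(o_bar-o); // =  6
    double delta=-L/(R-L);                                 // = 0.25

    // brute-force check of f(d)=0.5*lambda*w(d)^2+0.5*C*max(0,1-y*o(d))^2
    double best=0.0, fbest=1e100;
    for (double d=0.0; d<=1.0; d+=1e-4)
    {
        double od=o+d*(o_bar-o), wd=w+d*(w_bar-w);
        double hinge=1.0-y*od; if (hinge<0) hinge=0;
        double f=0.5*lambda*wd*wd+0.5*C*hinge*hinge;
        if (f<fbest) { fbest=f; best=d; }
    }
    printf("closed form delta=%f, brute force delta=%f\n", delta, best);
    return 0;
}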
int TSVM_MFN(const struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    /* Setup labeled-only examples and train L2_SVM_MFN */
    struct data *Data_Labeled = new data[1];
    struct vector_double *Outputs_Labeled = new vector_double[1];
    initialize(Outputs_Labeled,Data->l,0.0);
    SG_SDEBUG("Initializing weights, unknown labels");
    GetLabeledData(Data_Labeled,Data); /* gets labeled data and sets C=1/l */
    L2_SVM_MFN(Data_Labeled, Options, Weights, Outputs_Labeled, 0);
    /* Use this weight vector to classify R*u unlabeled examples as
       positive */
    int p=0,q=0;
    double t=0.0;
    int *JU = new int[Data->u];
    double *ou = new double[Data->u];
    double lambda_0 = TSVM_LAMBDA_SMALL;
    for(int i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {
            t=0.0;
            INT num_entries=0;
            bool free_vec=false;

            TSparseEntry<DREAL>* vec=Data->features->get_sparse_feature_vector(i, num_entries, free_vec);
            for (int j=0; j<num_entries; j++)
                t+=vec[j].entry*Weights->vec[vec[j].feat_index];
            Data->features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*Weights->vec[Data->n-1]; //bias (modelled as last dim)

            Outputs->vec[i]=t;
            Data->C[i]=lambda_0*1.0/Data->u;
            JU[q]=i;
            ou[q]=t;
            q++;
        }
        else
        {
            Outputs->vec[i]=Outputs_Labeled->vec[p];
            Data->C[i]=1.0/Data->l;
            p++;
        }
    }
    /* threshold at the (1-R) quantile of the unlabeled outputs */
    int thresh_pos=int((1-Options->R)*Data->u)-1;
    std::nth_element(ou,ou+thresh_pos,ou+Data->u);
    double thresh=ou[thresh_pos];
    delete [] ou;
    for(int i=0;i<Data->u;i++)
    {
        if(Outputs->vec[JU[i]]>thresh)
            Data->Y[JU[i]]=1.0;
        else
            Data->Y[JU[i]]=-1.0;
    }
    for(int i=0;i<Data->n;i++)
        Weights->vec[i]=0.0;
    for(int i=0;i<Data->m;i++)
        Outputs->vec[i]=0.0;
    L2_SVM_MFN(Data,Options,Weights,Outputs,0);
    int num_switches=0;
    int s=0;
    int last_round=0;
    while(lambda_0 <= Options->lambda_u)
    {
        int iter2=0;
        while(1){
            s=switch_labels(Data->Y,Outputs->vec,JU,Data->u,Options->S);
            if(s==0) break;
            iter2++;
            SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
            SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
            num_switches+=s;
            SG_SDEBUG("Optimizing weights\n");
            L2_SVM_MFN(Data,Options,Weights,Outputs,1);
        }
        if(last_round==1) break;
        lambda_0=TSVM_ANNEALING_RATE*lambda_0;
        if(lambda_0 >= Options->lambda_u) {lambda_0 = Options->lambda_u; last_round=1;}
        for(int i=0;i<Data->u;i++)
            Data->C[JU[i]]=lambda_0*1.0/Data->u;
        SG_SDEBUG("****** lambda_0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
        SG_SDEBUG("Optimizing weights\n");
        L2_SVM_MFN(Data,Options,Weights,Outputs,1);
    }
    SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
    /* reset labels */
    for(int i=0;i<Data->u;i++) Data->Y[JU[i]] = 0.0;
    double F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    SG_SDEBUG("Objective Value = %f\n",F);
    delete [] JU;
    // free the labeled-only scratch structures allocated above
    delete [] Outputs_Labeled->vec;
    delete [] Outputs_Labeled;
    delete [] Data_Labeled;
    return num_switches;
}
int switch_labels(double* Y, double* o, int* JU, int u, int S)
{
    int npos=0;
    int nneg=0;
    for(int i=0;i<u;i++)
    {
        if((Y[JU[i]]>0) && (o[JU[i]]<1.0)) npos++;
        if((Y[JU[i]]<0) && (-o[JU[i]]<1.0)) nneg++;
    }
    Delta* positive=new Delta[npos];
    Delta* negative=new Delta[nneg];
    int p=0;
    int n=0;
    int ii=0;
    for(int i=0;i<u;i++)
    {
        ii=JU[i];
        if((Y[ii]>0.0) && (o[ii]<1.0))
        {
            positive[p].delta=o[ii];
            positive[p].index=ii;
            positive[p].s=0;
            p++;
        }
        if((Y[ii]<0.0) && (-o[ii]<1.0))
        {
            negative[n].delta=-o[ii];
            negative[n].index=ii;
            negative[n].s=0;
            n++;
        }
    }
    std::sort(positive,positive+npos);
    std::sort(negative,negative+nneg);
    int s=-1;
    while(1)
    {
        s++;
        // check bounds before indexing into positive[]/negative[]
        if((s>=S) || (s>=npos) || (s>=nneg) || (positive[s].delta>=-negative[s].delta))
            break;
        Y[positive[s].index]=-1.0;
        Y[negative[s].index]= 1.0;
    }
    delete [] positive;
    delete [] negative;
    return s;
}
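
/* Toy standalone demo of the switching criterion above (an assumed sketch,
   not SVM-lin API): among unlabeled examples inside the margin, pair the
   currently-positive one with the smallest output against the
   currently-negative one with the largest output, and swap their labels while
   that strictly lowers the combined hinge loss (o_pos < o_neg). The real
   function additionally caps the number of swaps per round at S. */
#include <cstdio>
#include <algorithm>

int main()
{
    // outputs o and tentative labels Y for four in-margin unlabeled points
    double o[4]={ 0.2,-0.9, 0.1,-0.3};
    double Y[4]={ 1.0, 1.0,-1.0,-1.0};
    double pos[4], neg[4];            // as in switch_labels: o for +1, -o for -1
    int np=0, nn=0;
    for (int i=0;i<4;i++)
    {
        if (Y[i]>0) pos[np++]=o[i];
        else        neg[nn++]=-o[i];
    }
    std::sort(pos,pos+np);            // most-violating positives first
    std::sort(neg,neg+nn);            // most-violating negatives first
    int s=0;
    while (s<np && s<nn && pos[s] < -neg[s]) s++;
    printf("would switch %d label pair(s)\n", s);  // prints 1 for this data
    return 0;
}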

int DA_S3VM(struct data *Data,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs)
{
    double T = DA_INIT_TEMP*Options->lambda_u;
    int iter1 = 0, iter2 = 0;
    double *p = new double[Data->u];
    double *q = new double[Data->u];
    double *g = new double[Data->u];
    double F,F_min;
    double *w_min = new double[Data->n];
    double *o_min = new double[Data->m];
    double *w = Weights->vec;
    double *o = Outputs->vec;
    double kl_divergence = 1.0;
    /* initialize */
    SG_SDEBUG("Initializing weights, p");
    for(int i=0;i<Data->u; i++)
        p[i] = Options->R;
    /* record which examples are unlabeled */
    int *JU = new int[Data->u];
    int j=0;
    for(int i=0;i<Data->m;i++)
    {
        if(Data->Y[i]==0.0)
        {JU[j]=i;j++;}
    }
    double H = entropy(p,Data->u);
    optimize_w(Data,p,Options,Weights,Outputs,0);
    F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
    F_min = F;
    for(int i=0;i<Weights->d;i++)
        w_min[i]=w[i];
    for(int i=0;i<Outputs->d;i++)
        o_min[i]=o[i];
    while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon))
    {
        iter1++;
        iter2=0;
        kl_divergence=1.0;
        while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon))
        {
            iter2++;
            for(int i=0;i<Data->u;i++)
            {
                q[i]=p[i];
                g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]] < -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]])));
            }
            SG_SDEBUG("Optimizing p.\n");
            optimize_p(g,Data->u,T,Options->R,p);
            kl_divergence=KL(p,q,Data->u);
            SG_SDEBUG("Optimizing weights\n");
            optimize_w(Data,p,Options,Weights,Outputs,1);
            F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u);
            if(F < F_min)
            {
                F_min = F;
                for(int i=0;i<Weights->d;i++)
                    w_min[i]=w[i];
                for(int i=0;i<Outputs->d;i++)
                    o_min[i]=o[i];
            }
            SG_SDEBUG("***** outer_iter = %d  T = %g  inner_iter = %d  kl = %g  cost = %g *****\n",iter1,T,iter2,kl_divergence,F);
        }
        H = entropy(p,Data->u);
        SG_SDEBUG("***** Finished outer_iter = %d T = %g  Entropy = %g ***\n", iter1,T,H);
        T = T/DA_ANNEALING_RATE;
    }
    for(int i=0;i<Weights->d;i++)
        w[i]=w_min[i];
    for(int i=0;i<Outputs->d;i++)
        o[i]=o_min[i];
    /* may want to reset the original Y */
    delete [] p;
    delete [] q;
    delete [] g;
    delete [] JU;
    delete [] w_min;
    delete [] o_min;
    SG_SINFO("(min) Objective Value = %f", F_min);
    return 1;
}
int optimize_w(const struct data *Data,
        const double *p,
        struct options *Options,
        struct vector_double *Weights,
        struct vector_double *Outputs,
        int ini)
{
    int i,j;
    CSparseFeatures<DREAL>* features=Data->features;
    int n  = Data->n;
    int m  = Data->m;
    int u  = Data->u;
    double lambda = Options->lambda;
    double epsilon;
    double *w = Weights->vec;
    double *o = new double[m+u];
    double *Y = new double[m+u];
    double *C = new double[m+u];
    int *labeled_indices = new int[m];
    double F_old = 0.0;
    double F = 0.0;
    double diff=0.0;
    double lambda_u_by_u = Options->lambda_u/u;
    vector_int *ActiveSubset = new vector_int[1];
    ActiveSubset->vec = new int[m];
    ActiveSubset->d = m;
    // initialize
    if(ini==0)
    {
        epsilon=BIG_EPSILON;
        Options->cgitermax=SMALL_CGITERMAX;
        Options->epsilon=BIG_EPSILON;
    }
    else {epsilon = Options->epsilon;}

    for(i=0;i<n;i++) F+=w[i]*w[i];
    F=lambda*F;
    int active=0;
    int inactive=m-1; // l-1
    double temp1;
    double temp2;

    j = 0;
    for(i=0; i<m ; i++)
    {
        o[i]=Outputs->vec[i];
        if(Data->Y[i]==0.0)
        {
            labeled_indices[i]=0;
            o[m+j]=o[i];
            Y[i]=1;
            Y[m+j]=-1;
            C[i]=lambda_u_by_u*p[j];
            C[m+j]=lambda_u_by_u*(1-p[j]);
            ActiveSubset->vec[active]=i;
            active++;
            diff = 1 - CMath::abs(o[i]);
            if(diff>0)
            {
                Data->Y[i] = 2*p[j]-1;
                Data->C[i] = lambda_u_by_u;
                temp1 = (1 - o[i]);
                temp2 = (1 + o[i]);
                F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
            }
            else
            {
                if(o[i]>0)
                {
                    Data->Y[i] = -1.0;
                    Data->C[i] = C[m+j];
                }
                else
                {
                    Data->Y[i] = 1.0;
                    Data->C[i] = C[i];
                }
                temp1 = (1-Data->Y[i]*o[i]);
                F+= Data->C[i]*temp1*temp1;
            }
            j++;
        }
        else
        {
            labeled_indices[i]=1;
            Y[i]=Data->Y[i];
            C[i]=1.0/Data->l;
            Data->C[i]=1.0/Data->l;
            diff=1-Data->Y[i]*o[i];
            if(diff>0)
            {
                ActiveSubset->vec[active]=i;
                active++;
                F+=Data->C[i]*diff*diff;
            }
            else
            {
                ActiveSubset->vec[inactive]=i;
                inactive--;
            }
        }
    }
    F=0.5*F;
    ActiveSubset->d=active;
    int iter=0;
    int opt=0;
    int opt2=0;
    vector_double *Weights_bar = new vector_double[1];
    vector_double *Outputs_bar = new vector_double[1];
    double *w_bar = new double[n];
    double *o_bar = new double[m+u];
    Weights_bar->vec=w_bar;
    Outputs_bar->vec=o_bar;
    Weights_bar->d=n;
    Outputs_bar->d=m; /* read only the top m ; bottom u will be copies */
    double delta=0.0;
    double t=0.0;
    int ii = 0;
    while(iter<MFNITERMAX)
    {
        iter++;
        SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)", iter, active, F);
        for(i=n; i-- ;)
            w_bar[i]=w[i];

        for(i=m+u; i-- ;)
            o_bar[i]=o[i];
        opt=CGLS(Data,Options,ActiveSubset,Weights_bar,Outputs_bar);
        for(i=active; i < m; i++)
        {
            ii=ActiveSubset->vec[i];
            t=0.0;
            INT num_entries=0;
            bool free_vec=false;

            TSparseEntry<DREAL>* vec=features->get_sparse_feature_vector(ii, num_entries, free_vec);
            for (j=0; j<num_entries; j++)
                t+=vec[j].entry*w_bar[vec[j].feat_index];
            features->free_sparse_feature_vector(vec, num_entries, free_vec);
            t+=Options->bias*w_bar[n-1]; //bias (modelled as last dim)

            o_bar[ii]=t;
        }
        // make o_bar consistent in the bottom half
        j=0;
        for(i=0; i<m;i++)
        {
            if(labeled_indices[i]==0)
            {o_bar[m+j]=o_bar[i]; j++;}
        }
        if(ini==0) {Options->cgitermax=CGITERMAX; ini=1;}
        opt2=1;
        for(i=0; i < m ;i++)
        {
            ii=ActiveSubset->vec[i];
            if(i<active)
            {
                if(labeled_indices[ii]==1)
                    opt2=(opt2 && (Data->Y[ii]*o_bar[ii]<=1+epsilon));
                else
                {
                    if(CMath::abs(o[ii])<1)
                        opt2=(opt2 && (CMath::abs(o_bar[ii])<=1+epsilon));
                    else
                        opt2=(opt2 && (CMath::abs(o_bar[ii])>=1-epsilon));
                }
            }
            else
                opt2=(opt2 && (Data->Y[ii]*o_bar[ii]>=1-epsilon));
            if(opt2==0) break;
        }
        if(opt && opt2)
        {
            if(epsilon==BIG_EPSILON)
            {
                epsilon=EPSILON;
                Options->epsilon=EPSILON;
                SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
                continue;
            }
            else
            {
                for(i=n; i-- ;)
                    w[i]=w_bar[i];
                for(i=m; i-- ;)
                    Outputs->vec[i]=o_bar[i];
                for(i=m; i-- ;)
                {
                    if(labeled_indices[i]==0)
                        Data->Y[i]=0.0;
                }
                delete[] ActiveSubset->vec;
                delete[] ActiveSubset;
                delete[] o_bar;
                delete[] w_bar;
                delete[] o;
                delete[] Weights_bar;
                delete[] Outputs_bar;
                delete[] Y;
                delete[] C;
                delete[] labeled_indices;
                SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
                return 1;
            }
        }

        delta=line_search(w,w_bar,lambda,o,o_bar,Y,C,n,m+u);
        SG_SDEBUG("LINE_SEARCH delta = %f", delta);
        F_old=F;
        F=0.0;
        for(i=0;i<n;i++) {w[i]+=delta*(w_bar[i]-w[i]); F+=w[i]*w[i];}
        F=lambda*F;
        j=0;
        active=0;
        inactive=m-1;
        for(i=0; i<m ; i++)
        {
            o[i]+=delta*(o_bar[i]-o[i]);
            if(labeled_indices[i]==0)
            {
                o[m+j]=o[i];
                ActiveSubset->vec[active]=i;
                active++;
                diff = 1 - CMath::abs(o[i]);
                if(diff>0)
                {
                    Data->Y[i] = 2*p[j]-1;
                    Data->C[i] = lambda_u_by_u;
                    temp1 = (1 - o[i]);
                    temp2 = (1 + o[i]);
                    F+=lambda_u_by_u*(p[j]*temp1*temp1 + (1-p[j])*temp2*temp2);
                }
                else
                {
                    if(o[i]>0)
                    {
                        Data->Y[i] = -1;
                        Data->C[i] = C[m+j];
                    }
                    else
                    {
                        Data->Y[i] = +1;
                        Data->C[i] = C[i];
                    }
                    temp1=(1-Data->Y[i]*o[i]);
                    F+= Data->C[i]*temp1*temp1;
                }
                j++;
            }
            else
            {
                diff=1-Data->Y[i]*o[i];
                if(diff>0)
                {
                    ActiveSubset->vec[active]=i;
                    active++;
                    F+=Data->C[i]*diff*diff;
                }
                else
                {
                    ActiveSubset->vec[inactive]=i;
                    inactive--;
                }
            }
        }
        F=0.5*F;
        ActiveSubset->d=active;
        if(CMath::abs(F-F_old)<EPSILON)
            break;
    }
    for(i=m; i-- ;)
    {
        Outputs->vec[i]=o[i];
        if(labeled_indices[i]==0)
            Data->Y[i]=0.0;
    }
    delete[] ActiveSubset->vec;
    delete[] labeled_indices;
    delete[] ActiveSubset;
    delete[] o_bar;
    delete[] w_bar;
    delete[] o;
    delete[] Weights_bar;
    delete[] Outputs_bar;
    delete[] Y;
    delete[] C;
    SG_SINFO("L2_SVM_MFN stopped after %d iterations", iter);
    return 0;
}
void optimize_p(const double* g, int u, double T, double r, double* p)
{
    int iter=0;
    double epsilon=1e-10;
    int maxiter=500;
    double nu_minus=g[0];
    double nu_plus=g[0];
    for(int i=0;i<u;i++)
    {
        if(g[i]<nu_minus) nu_minus=g[i];
        if(g[i]>nu_plus) nu_plus=g[i];
    }

    double b=T*log((1-r)/r);
    nu_minus-=b;
    nu_plus-=b;
    double nu=(nu_plus+nu_minus)/2;
    double Bnu=0.0;
    double BnuPrime=0.0;
    double s=0.0;
    double tmp=0.0;
    for(int i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(!(isinf(s)))
        {
            tmp=1.0/(1.0+s);
            Bnu+=tmp;
            BnuPrime+=s*tmp*tmp;
        }
    }
    Bnu=Bnu/u;
    Bnu-=r;
    BnuPrime=BnuPrime/(T*u);
    double nuHat=0.0;
    while((CMath::abs(Bnu)>epsilon) && (iter < maxiter))
    {
        iter++;
        if(CMath::abs(BnuPrime)>0.0)
            nuHat=nu-Bnu/BnuPrime;
        // bisect when the derivative vanishes or the Newton step
        // leaves the bracketing interval [nu_minus, nu_plus]
        if((CMath::abs(BnuPrime)<=0.0) || (nuHat>nu_plus) || (nuHat<nu_minus))
            nu=(nu_minus+nu_plus)/2.0;
        else
            nu=nuHat;
        Bnu=0.0;
        BnuPrime=0.0;
        for(int i=0;i<u;i++)
        {
            s=exp((g[i]-nu)/T);
            if(!(isinf(s)))
            {
                tmp=1.0/(1.0+s);
                Bnu+=tmp;
                BnuPrime+=s*tmp*tmp;
            }
        }
        Bnu=Bnu/u;
        Bnu-=r;
        BnuPrime=BnuPrime/(T*u);
        if(Bnu<0)
            nu_minus=nu;
        else
            nu_plus=nu;
        if(CMath::abs(nu_minus-nu_plus)<epsilon)
            break;
    }
    if(CMath::abs(Bnu)>epsilon)
        SG_SWARNING("Warning (Root): root not found to required precision\n");

    for(int i=0;i<u;i++)
    {
        s=exp((g[i]-nu)/T);
        if(isinf(s)) p[i]=0.0;
        else p[i]=1.0/(1.0+s);
    }
    SG_SINFO("root (nu) = %f B(nu) = %f", nu, Bnu);
}
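
/* Standalone sketch of the safeguarded root finder used above (assumed toy
   code): Newton steps on a monotone function, falling back to bisection
   whenever the derivative vanishes or the step leaves the bracket -- the same
   policy optimize_p() applies to B(nu)-r. Demo root: x^3 = 2. */
#include <cstdio>
#include <cmath>

int main()
{
    double lo=0.0, hi=2.0;            // bracket with f(lo) < 0 < f(hi)
    double x=(lo+hi)/2, fx, dfx;
    for (int iter=0; iter<100; iter++)
    {
        fx=x*x*x-2.0;
        dfx=3.0*x*x;
        if (fabs(fx)<1e-12) break;
        if (fx<0) lo=x; else hi=x;    // maintain the bracket
        double xhat=(dfx>0.0) ? x-fx/dfx : lo;   // Newton candidate
        if (dfx<=0.0 || xhat<lo || xhat>hi)
            x=(lo+hi)/2;              // safeguard: bisect
        else
            x=xhat;                   // accept the Newton step
    }
    printf("root = %.12f (cbrt(2) = %.12f)\n", x, cbrt(2.0));
    return 0;
}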
double transductive_cost(double normWeights, double *Y, double *Outputs, int m, double lambda, double lambda_u)
{
    double F1=0.0,F2=0.0, o=0.0, y=0.0;
    int u=0,l=0;
    for(int i=0;i<m;i++)
    {
        o=Outputs[i];
        y=Y[i];
        if(y==0.0)
        {F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o)); u++;}
        else
        {F2 += y*o > 1 ? 0 : (1-y*o)*(1-y*o); l++;}
    }
    double F;
    F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
    return F;
}

double entropy(const double *p, int u)
{
    double h=0.0;
    double q=0.0;
    for(int i=0;i<u;i++)
    {
        q=p[i];
        if(q>0 && q<1)
            h+= -(q*CMath::log2(q) + (1-q)*CMath::log2(1-q));
    }
    return h/u;
}
double KL(const double *p, const double *q, int u)
{
    double h=0.0;
    double p1=0.0;
    double q1=0.0;
    double g=0.0;
    for(int i=0;i<u;i++)
    {
        p1=p[i];
        q1=q[i];
        // clamp p1,q1 into (1e-8, 1-1e-8) to keep the logs finite
        if(p1>1-1e-8) p1-=1e-8;
        if(p1<1e-8) p1+=1e-8;
        if(q1>1-1e-8) q1-=1e-8;
        if(q1<1e-8) q1+=1e-8;
        g= (p1*CMath::log2(p1/q1) + (1-p1)*CMath::log2((1-p1)/(1-q1)));
        if(CMath::abs(g)<1e-12 || isnan(g)) g=0.0;
        h+=g;
    }
    return h/u;
}
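
/* Tiny standalone check of the two information measures above (assumed toy
   code; std::log2 stands in for CMath::log2): the per-example binary entropy
   of p, and the binary KL divergence between p and q, both averaged over the
   u unlabeled examples exactly as entropy() and KL() do. */
#include <cstdio>
#include <cmath>

int main()
{
    const int u=3;
    double p[u]={0.5,0.9,0.1};        // strictly inside (0,1): no clamping needed
    double q[u]={0.5,0.5,0.5};
    double H=0, D=0;
    for (int i=0;i<u;i++)
    {
        H+=-(p[i]*std::log2(p[i])+(1-p[i])*std::log2(1-p[i]));
        D+= p[i]*std::log2(p[i]/q[i])+(1-p[i])*std::log2((1-p[i])/(1-q[i]));
    }
    printf("entropy = %f bits, KL = %f bits\n", H/u, D/u);
    return 0;
}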
/********************** UTILITIES ********************/
double norm_square(const vector_double *A)
{
    double x=0.0, t=0.0;
    for(int i=0;i<A->d;i++)
    {
        t=A->vec[i];
        x+=t*t;
    }
    return x;
}
void initialize(struct vector_double *A, int k, double a)
{
    double *vec = new double[k];
    for(int i=0;i<k;i++)
        vec[i]=a;
    A->vec = vec;
    A->d   = k;
    return;
}
void initialize(struct vector_int *A, int k)
{
    int *vec = new int[k];
    for(int i=0;i<k;i++)
        vec[i]=i;
    A->vec = vec;
    A->d   = k;
    return;
}
void GetLabeledData(struct data *D, const struct data *Data)
{
    /* FIXME: the CSR-based implementation below is commented out, so this
       function is currently a no-op stub.
    int *J = new int[Data->l];
    D->C   = new double[Data->l];
    D->Y   = new double[Data->l];
    int nz=0;
    int k=0;
    int rowptrs_=Data->l;
    for(int i=0;i<Data->m;i++)
    {
        if(Data->Y[i]!=0.0)
        {
            J[k]=i;
            D->Y[k]=Data->Y[i];
            D->C[k]=1.0/Data->l;
            nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
            k++;
        }
    }
    D->val    = new double[nz];
    D->colind = new int[nz];
    D->rowptr = new int[rowptrs_+1];
    nz=0;
    for(int i=0;i<Data->l;i++)
    {
        D->rowptr[i]=nz;
        for(int j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
        {
            D->val[nz] = Data->val[j];
            D->colind[nz] = Data->colind[j];
            nz++;
        }
    }
    D->rowptr[rowptrs_]=nz;
    D->nz=nz;
    D->l=Data->l;
    D->m=Data->l;
    D->n=Data->n;
    D->u=0;
    delete [] J; */
}
