KMeans.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Gunnar Raetsch
00008  * Written (W) 2007-2008 Soeren Sonnenburg
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/RealFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018 
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022 
00023 #define MUSRECALC
00024 
00025 #define PAR_THRESH  10
00026 
00027 CKMeans::CKMeans()
00028 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00029     mus(NULL), Weights(NULL)
00030 {
00031 }
00032 
00033 CKMeans::CKMeans(INT k_, CDistance* d)
00034 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00035     mus(NULL), Weights(NULL)
00036 {
00037     set_distance(d);
00038 }
00039 
00040 CKMeans::~CKMeans()
00041 {
00042     delete[] R;
00043     delete[] mus;
00044 }
00045 
00046 bool CKMeans::train()
00047 {
00048     ASSERT(distance);
00049     ASSERT(distance->get_feature_type()==F_DREAL);
00050     ASSERT(distance->get_distance_type()==D_EUCLIDIAN);
00051     CRealFeatures* lhs=(CRealFeatures*) distance->get_lhs();
00052     ASSERT(lhs);
00053     INT num=lhs->get_num_vectors();
00054 
00055     Weights=new DREAL[num];
00056     for (INT i=0; i<num; i++)
00057         Weights[i]=1.0;
00058 
00059     clustknb(false, NULL);
00060     delete[] Weights;
00061     SG_UNREF(lhs);
00062 
00063     return true;
00064 }
00065 
00066 bool CKMeans::load(FILE* srcfile)
00067 {
00068     return false;
00069 }
00070 
00071 bool CKMeans::save(FILE* dstfile)
00072 {
00073     return false;
00074 }
00075 
00076 struct thread_data
00077 {
00078     double* x;
00079     CRealFeatures* y;
00080     double* z;
00081     int n1, n2, m ; 
00082     int js, je ; /* defines the matrix stripe */
00083     int offs;
00084 };
00085 
00086 void *sqdist_thread_func(void * P) 
00087 {
00088     struct thread_data *TD=(struct thread_data*) P;
00089     double* x=TD->x;
00090     CRealFeatures* y=TD->y;
00091     double* z=TD->z;
00092     int n1=TD->n1, 
00093         m=TD->m,
00094         js=TD->js,
00095         je=TD->je,
00096         offs=TD->offs,
00097         j,i,k;
00098 
00099     for (j=js; j<je; j++)
00100     {
00101         INT vlen=0;
00102         bool vfree=false;
00103         double* vec=y->get_feature_vector(j+offs, vlen, vfree);
00104 
00105         for (i=0; i<n1; i++)
00106         {
00107             double sum=0;
00108             for (k=0; k<m; k++) 
00109                 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00110             z[j*n1 + i] = sum;
00111         }
00112 
00113         y->free_feature_vector(vec, j, vfree);
00114     }
00115     return NULL;
00116 } 
00117 
00118 void CKMeans::sqdist(double * x, CRealFeatures* y, double *z,
00119         int n1, int offs, int n2, int m)
00120 {
00121     const int num_threads=parallel.get_num_threads();
00122     int nc, n2_nc = n2/num_threads;
00123     struct thread_data TD[num_threads];
00124     pthread_t tid[num_threads];
00125     void *status;
00126 
00127     /* prepare the structure */
00128     TD[0].x=x ; TD[0].y=y ; TD[0].z=z ; 
00129     TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00130     TD[0].offs=offs;
00131 
00132     if (n2>PAR_THRESH)
00133     {
00134         ASSERT(PAR_THRESH>1);
00135 
00136         /* create the threads */
00137         for (nc=0; nc<num_threads; nc++)
00138         {
00139             TD[nc]=TD[0];
00140             TD[nc].js=nc*n2_nc;
00141             if (nc+1==num_threads)
00142                 TD[nc].je=n2;
00143             else
00144                 TD[nc].je=(nc+1)*n2_nc;
00145 
00146             pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00147         }
00148 
00149         /* wait for finishing all threads */
00150         for (nc=0; nc<num_threads; nc++)
00151             pthread_join(tid[nc], &status);
00152 
00153     }
00154     else
00155     {
00156         /* simply call the ,,thread''-function */
00157         TD[0].js=0 ; TD[0].je=n2;
00158         sqdist_thread_func((void *)&TD[0]);
00159     }
00160 }
00161 
00162 void CKMeans::clustknb(bool use_old_mus, double *mus_start)
00163 {
00164     ASSERT(distance && distance->get_feature_type()==F_DREAL);
00165     CRealFeatures* lhs = (CRealFeatures*) distance->get_lhs();
00166     ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00167     
00168     int XSize=lhs->get_num_vectors();
00169     dimensions=lhs->get_num_features();
00170     int i, changed=1;
00171     const int XDimk=dimensions*k;
00172     int iter=0;
00173 
00174     delete[] R;
00175     R=new DREAL[k];
00176 
00177     delete[] mus;
00178     mus=new DREAL[XDimk];
00179 
00180     int *ClList = (int*) calloc(XSize, sizeof(int));
00181     double *weights_set = (double*) calloc(k, sizeof(double));
00182     double *oldmus = (double*) calloc(XDimk, sizeof(double));
00183     double *dists = (double*) calloc(k*XSize, sizeof(double));
00184 
00185     INT vlen=0;
00186     bool vfree=false;
00187     double* vec=NULL;
00188 
00189     /* ClList=zeros(XSize,1) ; */
00190     for (i=0; i<XSize; i++) ClList[i]=0;
00191     /* weights_set=zeros(k,1) ; */
00192     for (i=0; i<k; i++) weights_set[i]=0;
00193 
00194     /* mus=zeros(dimensions, k) ; */
00195     for (i=0; i<XDimk; i++) mus[i]=0;
00196 
00197     if (!use_old_mus)
00198     {
00199         /* random clustering (select random subsets) */
00200         /*  ks=ceil(rand(1,XSize)*k);
00201          *  for i=1:k,
00202          *  actks= (ks==i);
00203          *  c=sum(actks);
00204          *  weights_set(i)=c;
00205          *
00206          *  ClList(actks)=i*ones(1, c);
00207          *
00208          *  if ~mus_recalc,
00209          *      if c>1
00210          *          mus(:,i) = sum(XData(:,actks)')'/c;
00211          *      elseif c>0
00212          *          mus(:,i) = XData(:,actks);
00213          *      end;
00214          *  end;
00215          *   end ; */
00216 
00217         for (i=0; i<XSize; i++) 
00218         {
00219             const int Cl= (int) ( CMath::random(0, k-1) );
00220             int j;
00221             double weight=Weights[i];
00222 
00223             weights_set[Cl]+=weight;
00224             ClList[i]=Cl;
00225 
00226             vec=lhs->get_feature_vector(i, vlen, vfree);
00227 
00228             for (j=0; j<dimensions; j++)
00229                 mus[Cl*dimensions+j] += weight*vec[j];
00230 
00231             lhs->free_feature_vector(vec, i, vfree);
00232         }
00233         for (i=0; i<k; i++)
00234         {
00235             int j;
00236 
00237             if (weights_set[i]!=0.0)
00238                 for (j=0; j<dimensions; j++)
00239                     mus[i*dimensions+j] /= weights_set[i];
00240         }
00241     }
00242     else 
00243     {
00244         ASSERT(mus_start);
00245 
00246         sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00247 
00248         for (i=0; i<XSize; i++)
00249         {
00250             double mini=dists[i*k];
00251             int Cl = 0, j;
00252 
00253             for (j=1; j<k; j++)
00254             {
00255                 if (dists[i*k+j]<mini)
00256                 {
00257                     Cl=j;
00258                     mini=dists[i*k+j];
00259                 }
00260             }
00261             ClList[i]=Cl;
00262         }
00263 
00264         /* Compute the sum of all points belonging to a cluster 
00265          * and count the points */
00266         for (i=0; i<XSize; i++) 
00267         {
00268             const int Cl = ClList[i];
00269             double weight=Weights[i];
00270             weights_set[Cl]+=weight;
00271 #ifndef MUSRECALC
00272             vec=lhs->get_feature_vector(i, vlen, vfree);
00273 
00274             for (j=0; j<dimensions; j++)
00275                 mus[Cl*dimensions+j] += weight*vec[j];
00276 
00277             lhs->free_feature_vector(vec, i, vfree);
00278 #endif
00279         }
00280 #ifndef MUSRECALC
00281         /* normalization to get the mean */ 
00282         for (i=0; i<k; i++)
00283         {
00284             if (weights_set[i]!=0.0)
00285             {
00286                 int j;
00287                 for (j=0; j<dimensions; j++)
00288                     mus[i*dimensions+j] /= weights_set[i];
00289             }
00290         }
00291 #endif
00292     }
00293 
00294     for (i=0; i<XDimk; i++) oldmus[i]=-1;
00295 
00296     while (changed && (iter<max_iter))
00297     {
00298         iter++;
00299         if (iter==max_iter-1)
00300             SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00301 
00302         if (iter%1000 == 0)
00303             SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00304         changed=0;
00305 
00306 #ifdef MUSRECALC
00307         /* mus=zeros(dimensions, k) ; */
00308         for (i=0; i<XDimk; i++) mus[i]=0;
00309 
00310         for (i=0; i<XSize; i++) 
00311         {
00312             int j;
00313             int Cl=ClList[i];
00314             double weight=Weights[i];
00315 
00316             vec=lhs->get_feature_vector(i, vlen, vfree);
00317 
00318             for (j=0; j<dimensions; j++)
00319                 mus[Cl*dimensions+j] += weight*vec[j];
00320 
00321             lhs->free_feature_vector(vec, i, vfree);
00322         }
00323         for (i=0; i<k; i++)
00324         {
00325             int j;
00326 
00327             if (weights_set[i]!=0.0)
00328                 for (j=0; j<dimensions; j++)
00329                     mus[i*dimensions+j] /= weights_set[i];
00330         }
00331 #endif
00332 
00333         for (i=0; i<XSize; i++)
00334         {
00335             /* ks=ceil(rand(1,XSize)*XSize) ; */
00336             const int Pat= CMath::random(0, XSize-1);
00337             const int ClList_Pat=ClList[Pat];
00338             int imini, j;
00339             double mini, weight;
00340 
00341             weight=Weights[Pat];
00342 
00343             /* compute the distance of this point to all centers */
00344             /* dists=sqdist(mus',XData) ; */
00345             sqdist(mus, lhs, dists, k, Pat, 1, dimensions);
00346 
00347             /* [mini,imini]=min(dists(:,i)) ; */
00348             imini=0 ; mini=dists[0];
00349             for (j=1; j<k; j++)
00350                 if (dists[j]<mini)
00351                 {
00352                     mini=dists[j];
00353                     imini=j;
00354                 }
00355 
00356             if (imini!=ClList_Pat)
00357             {
00358                 changed= changed + 1;
00359 
00360                 /* weights_set(imini) = weights_set(imini) + weight ; */
00361                 weights_set[imini]+= weight;
00362                 /* weights_set(j)     = weights_set(j)     - weight ; */
00363                 weights_set[ClList_Pat]-= weight;
00364 
00365                 /* mu_new=mu_old + (x - mu_old)/(n+1) */
00366                 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */
00367                 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00368 
00369                 for (j=0; j<dimensions; j++)
00370                     mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00371 
00372                 lhs->free_feature_vector(vec, Pat, vfree);
00373 
00374                 /* mu_new = mu_old - (x - mu_old)/(n-1) */
00375                 /* if weights_set(j)~=0 */
00376                 if (weights_set[ClList_Pat]!=0.0)
00377                 {
00378                     /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */
00379                     vec=lhs->get_feature_vector(Pat, vlen, vfree);
00380 
00381                     for (j=0; j<dimensions; j++)
00382                         mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00383                     lhs->free_feature_vector(vec, Pat, vfree);
00384                 }
00385                 else
00386                     /*  mus(:,j)=zeros(dimensions,1) ; */
00387                     for (j=0; j<dimensions; j++)
00388                         mus[ClList_Pat*dimensions+j]=0;
00389 
00390                 /* ClList(i)= imini ; */
00391                 ClList[Pat] = imini;
00392             }
00393         }
00394     }
00395 
00396     /* compute the ,,variances'' of the clusters */
00397     for (i=0; i<k; i++)
00398     {
00399         double rmin1=0;
00400         double rmin2=0;
00401 
00402         bool first_round=true;
00403 
00404         for (INT j=0; j<k; j++) 
00405         {
00406             if (j!=i)
00407             {
00408                 int l;
00409                 double dist = 0;
00410 
00411                 for (l=0; l<dimensions; l++)
00412                     dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00413 
00414                 if (first_round)
00415                 {
00416                     rmin1=dist;
00417                     rmin2=dist;
00418                     first_round=false;
00419                 }
00420                 else
00421                 {
00422                     if ((dist<rmin2) && (dist>=rmin1))
00423                         rmin2=dist;
00424 
00425                     if (dist<rmin1) 
00426                     {
00427                         rmin2=rmin1;
00428                         rmin1=dist;
00429                     }
00430                 }
00431             }
00432         }
00433 
00434         R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00435     }
00436 
00437     free(ClList);
00438     free(weights_set);
00439     free(oldmus);
00440     free(dists);
00441 } 

SHOGUN Machine Learning Toolbox - Documentation