00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/RealFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 CKMeans::CKMeans()
00028 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00029 mus(NULL), Weights(NULL)
00030 {
00031 }
00032
00033 CKMeans::CKMeans(INT k_, CDistance* d)
00034 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00035 mus(NULL), Weights(NULL)
00036 {
00037 set_distance(d);
00038 }
00039
00040 CKMeans::~CKMeans()
00041 {
00042 delete[] R;
00043 delete[] mus;
00044 }
00045
00046 bool CKMeans::train()
00047 {
00048 ASSERT(distance);
00049 ASSERT(distance->get_feature_type()==F_DREAL);
00050 ASSERT(distance->get_distance_type()==D_EUCLIDIAN);
00051 CRealFeatures* lhs=(CRealFeatures*) distance->get_lhs();
00052 ASSERT(lhs);
00053 INT num=lhs->get_num_vectors();
00054
00055 Weights=new DREAL[num];
00056 for (INT i=0; i<num; i++)
00057 Weights[i]=1.0;
00058
00059 clustknb(false, NULL);
00060 delete[] Weights;
00061 SG_UNREF(lhs);
00062
00063 return true;
00064 }
00065
00066 bool CKMeans::load(FILE* srcfile)
00067 {
00068 return false;
00069 }
00070
00071 bool CKMeans::save(FILE* dstfile)
00072 {
00073 return false;
00074 }
00075
00076 struct thread_data
00077 {
00078 double* x;
00079 CRealFeatures* y;
00080 double* z;
00081 int n1, n2, m ;
00082 int js, je ;
00083 int offs;
00084 };
00085
00086 void *sqdist_thread_func(void * P)
00087 {
00088 struct thread_data *TD=(struct thread_data*) P;
00089 double* x=TD->x;
00090 CRealFeatures* y=TD->y;
00091 double* z=TD->z;
00092 int n1=TD->n1,
00093 m=TD->m,
00094 js=TD->js,
00095 je=TD->je,
00096 offs=TD->offs,
00097 j,i,k;
00098
00099 for (j=js; j<je; j++)
00100 {
00101 INT vlen=0;
00102 bool vfree=false;
00103 double* vec=y->get_feature_vector(j+offs, vlen, vfree);
00104
00105 for (i=0; i<n1; i++)
00106 {
00107 double sum=0;
00108 for (k=0; k<m; k++)
00109 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00110 z[j*n1 + i] = sum;
00111 }
00112
00113 y->free_feature_vector(vec, j, vfree);
00114 }
00115 return NULL;
00116 }
00117
00118 void CKMeans::sqdist(double * x, CRealFeatures* y, double *z,
00119 int n1, int offs, int n2, int m)
00120 {
00121 const int num_threads=parallel.get_num_threads();
00122 int nc, n2_nc = n2/num_threads;
00123 struct thread_data TD[num_threads];
00124 pthread_t tid[num_threads];
00125 void *status;
00126
00127
00128 TD[0].x=x ; TD[0].y=y ; TD[0].z=z ;
00129 TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00130 TD[0].offs=offs;
00131
00132 if (n2>PAR_THRESH)
00133 {
00134 ASSERT(PAR_THRESH>1);
00135
00136
00137 for (nc=0; nc<num_threads; nc++)
00138 {
00139 TD[nc]=TD[0];
00140 TD[nc].js=nc*n2_nc;
00141 if (nc+1==num_threads)
00142 TD[nc].je=n2;
00143 else
00144 TD[nc].je=(nc+1)*n2_nc;
00145
00146 pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00147 }
00148
00149
00150 for (nc=0; nc<num_threads; nc++)
00151 pthread_join(tid[nc], &status);
00152
00153 }
00154 else
00155 {
00156
00157 TD[0].js=0 ; TD[0].je=n2;
00158 sqdist_thread_func((void *)&TD[0]);
00159 }
00160 }
00161
00162 void CKMeans::clustknb(bool use_old_mus, double *mus_start)
00163 {
00164 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00165 CRealFeatures* lhs = (CRealFeatures*) distance->get_lhs();
00166 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00167
00168 int XSize=lhs->get_num_vectors();
00169 dimensions=lhs->get_num_features();
00170 int i, changed=1;
00171 const int XDimk=dimensions*k;
00172 int iter=0;
00173
00174 delete[] R;
00175 R=new DREAL[k];
00176
00177 delete[] mus;
00178 mus=new DREAL[XDimk];
00179
00180 int *ClList = (int*) calloc(XSize, sizeof(int));
00181 double *weights_set = (double*) calloc(k, sizeof(double));
00182 double *oldmus = (double*) calloc(XDimk, sizeof(double));
00183 double *dists = (double*) calloc(k*XSize, sizeof(double));
00184
00185 INT vlen=0;
00186 bool vfree=false;
00187 double* vec=NULL;
00188
00189
00190 for (i=0; i<XSize; i++) ClList[i]=0;
00191
00192 for (i=0; i<k; i++) weights_set[i]=0;
00193
00194
00195 for (i=0; i<XDimk; i++) mus[i]=0;
00196
00197 if (!use_old_mus)
00198 {
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217 for (i=0; i<XSize; i++)
00218 {
00219 const int Cl= (int) ( CMath::random(0, k-1) );
00220 int j;
00221 double weight=Weights[i];
00222
00223 weights_set[Cl]+=weight;
00224 ClList[i]=Cl;
00225
00226 vec=lhs->get_feature_vector(i, vlen, vfree);
00227
00228 for (j=0; j<dimensions; j++)
00229 mus[Cl*dimensions+j] += weight*vec[j];
00230
00231 lhs->free_feature_vector(vec, i, vfree);
00232 }
00233 for (i=0; i<k; i++)
00234 {
00235 int j;
00236
00237 if (weights_set[i]!=0.0)
00238 for (j=0; j<dimensions; j++)
00239 mus[i*dimensions+j] /= weights_set[i];
00240 }
00241 }
00242 else
00243 {
00244 ASSERT(mus_start);
00245
00246 sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00247
00248 for (i=0; i<XSize; i++)
00249 {
00250 double mini=dists[i*k];
00251 int Cl = 0, j;
00252
00253 for (j=1; j<k; j++)
00254 {
00255 if (dists[i*k+j]<mini)
00256 {
00257 Cl=j;
00258 mini=dists[i*k+j];
00259 }
00260 }
00261 ClList[i]=Cl;
00262 }
00263
00264
00265
00266 for (i=0; i<XSize; i++)
00267 {
00268 const int Cl = ClList[i];
00269 double weight=Weights[i];
00270 weights_set[Cl]+=weight;
00271 #ifndef MUSRECALC
00272 vec=lhs->get_feature_vector(i, vlen, vfree);
00273
00274 for (j=0; j<dimensions; j++)
00275 mus[Cl*dimensions+j] += weight*vec[j];
00276
00277 lhs->free_feature_vector(vec, i, vfree);
00278 #endif
00279 }
00280 #ifndef MUSRECALC
00281
00282 for (i=0; i<k; i++)
00283 {
00284 if (weights_set[i]!=0.0)
00285 {
00286 int j;
00287 for (j=0; j<dimensions; j++)
00288 mus[i*dimensions+j] /= weights_set[i];
00289 }
00290 }
00291 #endif
00292 }
00293
00294 for (i=0; i<XDimk; i++) oldmus[i]=-1;
00295
00296 while (changed && (iter<max_iter))
00297 {
00298 iter++;
00299 if (iter==max_iter-1)
00300 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00301
00302 if (iter%1000 == 0)
00303 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00304 changed=0;
00305
00306 #ifdef MUSRECALC
00307
00308 for (i=0; i<XDimk; i++) mus[i]=0;
00309
00310 for (i=0; i<XSize; i++)
00311 {
00312 int j;
00313 int Cl=ClList[i];
00314 double weight=Weights[i];
00315
00316 vec=lhs->get_feature_vector(i, vlen, vfree);
00317
00318 for (j=0; j<dimensions; j++)
00319 mus[Cl*dimensions+j] += weight*vec[j];
00320
00321 lhs->free_feature_vector(vec, i, vfree);
00322 }
00323 for (i=0; i<k; i++)
00324 {
00325 int j;
00326
00327 if (weights_set[i]!=0.0)
00328 for (j=0; j<dimensions; j++)
00329 mus[i*dimensions+j] /= weights_set[i];
00330 }
00331 #endif
00332
00333 for (i=0; i<XSize; i++)
00334 {
00335
00336 const int Pat= CMath::random(0, XSize-1);
00337 const int ClList_Pat=ClList[Pat];
00338 int imini, j;
00339 double mini, weight;
00340
00341 weight=Weights[Pat];
00342
00343
00344
00345 sqdist(mus, lhs, dists, k, Pat, 1, dimensions);
00346
00347
00348 imini=0 ; mini=dists[0];
00349 for (j=1; j<k; j++)
00350 if (dists[j]<mini)
00351 {
00352 mini=dists[j];
00353 imini=j;
00354 }
00355
00356 if (imini!=ClList_Pat)
00357 {
00358 changed= changed + 1;
00359
00360
00361 weights_set[imini]+= weight;
00362
00363 weights_set[ClList_Pat]-= weight;
00364
00365
00366
00367 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00368
00369 for (j=0; j<dimensions; j++)
00370 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00371
00372 lhs->free_feature_vector(vec, Pat, vfree);
00373
00374
00375
00376 if (weights_set[ClList_Pat]!=0.0)
00377 {
00378
00379 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00380
00381 for (j=0; j<dimensions; j++)
00382 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00383 lhs->free_feature_vector(vec, Pat, vfree);
00384 }
00385 else
00386
00387 for (j=0; j<dimensions; j++)
00388 mus[ClList_Pat*dimensions+j]=0;
00389
00390
00391 ClList[Pat] = imini;
00392 }
00393 }
00394 }
00395
00396
00397 for (i=0; i<k; i++)
00398 {
00399 double rmin1=0;
00400 double rmin2=0;
00401
00402 bool first_round=true;
00403
00404 for (INT j=0; j<k; j++)
00405 {
00406 if (j!=i)
00407 {
00408 int l;
00409 double dist = 0;
00410
00411 for (l=0; l<dimensions; l++)
00412 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00413
00414 if (first_round)
00415 {
00416 rmin1=dist;
00417 rmin2=dist;
00418 first_round=false;
00419 }
00420 else
00421 {
00422 if ((dist<rmin2) && (dist>=rmin1))
00423 rmin2=dist;
00424
00425 if (dist<rmin1)
00426 {
00427 rmin2=rmin1;
00428 rmin1=dist;
00429 }
00430 }
00431 }
00432 }
00433
00434 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00435 }
00436
00437 free(ClList);
00438 free(weights_set);
00439 free(oldmus);
00440 free(dists);
00441 }