00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/RealFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 CKMeans::CKMeans()
00028 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00029 mus(NULL), Weights(NULL)
00030 {
00031 }
00032
00033 CKMeans::CKMeans(int32_t k_, CDistance* d)
00034 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00035 mus(NULL), Weights(NULL)
00036 {
00037 set_distance(d);
00038 }
00039
00040 CKMeans::~CKMeans()
00041 {
00042 delete[] R;
00043 delete[] mus;
00044 }
00045
00046 bool CKMeans::train()
00047 {
00048 ASSERT(distance);
00049 ASSERT(distance->get_feature_type()==F_DREAL);
00050 ASSERT(distance->get_distance_type()==D_EUCLIDIAN);
00051 CRealFeatures* lhs=(CRealFeatures*) distance->get_lhs();
00052 ASSERT(lhs);
00053 int32_t num=lhs->get_num_vectors();
00054
00055 Weights=new float64_t[num];
00056 for (int32_t i=0; i<num; i++)
00057 Weights[i]=1.0;
00058
00059 clustknb(false, NULL);
00060 delete[] Weights;
00061 SG_UNREF(lhs);
00062
00063 return true;
00064 }
00065
00066 bool CKMeans::load(FILE* srcfile)
00067 {
00068 return false;
00069 }
00070
00071 bool CKMeans::save(FILE* dstfile)
00072 {
00073 return false;
00074 }
00075
00076 struct thread_data
00077 {
00078 float64_t* x;
00079 CRealFeatures* y;
00080 float64_t* z;
00081 int32_t n1, n2, m;
00082 int32_t js, je;
00083 int32_t offs;
00084 };
00085
00086 void *sqdist_thread_func(void * P)
00087 {
00088 struct thread_data *TD=(struct thread_data*) P;
00089 float64_t* x=TD->x;
00090 CRealFeatures* y=TD->y;
00091 float64_t* z=TD->z;
00092 int32_t n1=TD->n1,
00093 m=TD->m,
00094 js=TD->js,
00095 je=TD->je,
00096 offs=TD->offs,
00097 j,i,k;
00098
00099 for (j=js; j<je; j++)
00100 {
00101 int32_t vlen=0;
00102 bool vfree=false;
00103 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00104
00105 for (i=0; i<n1; i++)
00106 {
00107 float64_t sum=0;
00108 for (k=0; k<m; k++)
00109 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00110 z[j*n1 + i] = sum;
00111 }
00112
00113 y->free_feature_vector(vec, j, vfree);
00114 }
00115 return NULL;
00116 }
00117
00118 void CKMeans::sqdist(
00119 float64_t* x, CRealFeatures* y, float64_t* z, int32_t n1, int32_t offs,
00120 int32_t n2, int32_t m)
00121 {
00122 const int32_t num_threads=parallel.get_num_threads();
00123 int32_t nc, n2_nc = n2/num_threads;
00124 struct thread_data TD[num_threads];
00125 pthread_t tid[num_threads];
00126 void *status;
00127
00128
00129 TD[0].x=x ; TD[0].y=y ; TD[0].z=z ;
00130 TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00131 TD[0].offs=offs;
00132
00133 if (n2>PAR_THRESH)
00134 {
00135 ASSERT(PAR_THRESH>1);
00136
00137
00138 for (nc=0; nc<num_threads; nc++)
00139 {
00140 TD[nc]=TD[0];
00141 TD[nc].js=nc*n2_nc;
00142 if (nc+1==num_threads)
00143 TD[nc].je=n2;
00144 else
00145 TD[nc].je=(nc+1)*n2_nc;
00146
00147 pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00148 }
00149
00150
00151 for (nc=0; nc<num_threads; nc++)
00152 pthread_join(tid[nc], &status);
00153
00154 }
00155 else
00156 {
00157
00158 TD[0].js=0 ; TD[0].je=n2;
00159 sqdist_thread_func((void *)&TD[0]);
00160 }
00161 }
00162
00163 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00164 {
00165 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00166 CRealFeatures* lhs = (CRealFeatures*) distance->get_lhs();
00167 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00168
00169 int32_t XSize=lhs->get_num_vectors();
00170 dimensions=lhs->get_num_features();
00171 int32_t i, changed=1;
00172 const int32_t XDimk=dimensions*k;
00173 int32_t iter=0;
00174
00175 delete[] R;
00176 R=new float64_t[k];
00177
00178 delete[] mus;
00179 mus=new float64_t[XDimk];
00180
00181 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00182 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00183 float64_t *oldmus = (float64_t*) calloc(XDimk, sizeof(float64_t));
00184 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00185
00186 int32_t vlen=0;
00187 bool vfree=false;
00188 float64_t* vec=NULL;
00189
00190
00191 for (i=0; i<XSize; i++) ClList[i]=0;
00192
00193 for (i=0; i<k; i++) weights_set[i]=0;
00194
00195
00196 for (i=0; i<XDimk; i++) mus[i]=0;
00197
00198 if (!use_old_mus)
00199 {
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218 for (i=0; i<XSize; i++)
00219 {
00220 const int32_t Cl=CMath::random(0, k-1);
00221 int32_t j;
00222 float64_t weight=Weights[i];
00223
00224 weights_set[Cl]+=weight;
00225 ClList[i]=Cl;
00226
00227 vec=lhs->get_feature_vector(i, vlen, vfree);
00228
00229 for (j=0; j<dimensions; j++)
00230 mus[Cl*dimensions+j] += weight*vec[j];
00231
00232 lhs->free_feature_vector(vec, i, vfree);
00233 }
00234 for (i=0; i<k; i++)
00235 {
00236 int32_t j;
00237
00238 if (weights_set[i]!=0.0)
00239 for (j=0; j<dimensions; j++)
00240 mus[i*dimensions+j] /= weights_set[i];
00241 }
00242 }
00243 else
00244 {
00245 ASSERT(mus_start);
00246
00247 sqdist(mus_start, lhs, dists, k, 0, XSize, dimensions);
00248
00249 for (i=0; i<XSize; i++)
00250 {
00251 float64_t mini=dists[i*k];
00252 int32_t Cl = 0, j;
00253
00254 for (j=1; j<k; j++)
00255 {
00256 if (dists[i*k+j]<mini)
00257 {
00258 Cl=j;
00259 mini=dists[i*k+j];
00260 }
00261 }
00262 ClList[i]=Cl;
00263 }
00264
00265
00266
00267 for (i=0; i<XSize; i++)
00268 {
00269 const int32_t Cl = ClList[i];
00270 float64_t weight=Weights[i];
00271 weights_set[Cl]+=weight;
00272 #ifndef MUSRECALC
00273 vec=lhs->get_feature_vector(i, vlen, vfree);
00274
00275 for (j=0; j<dimensions; j++)
00276 mus[Cl*dimensions+j] += weight*vec[j];
00277
00278 lhs->free_feature_vector(vec, i, vfree);
00279 #endif
00280 }
00281 #ifndef MUSRECALC
00282
00283 for (i=0; i<k; i++)
00284 {
00285 if (weights_set[i]!=0.0)
00286 {
00287 int32_t j;
00288 for (j=0; j<dimensions; j++)
00289 mus[i*dimensions+j] /= weights_set[i];
00290 }
00291 }
00292 #endif
00293 }
00294
00295 for (i=0; i<XDimk; i++) oldmus[i]=-1;
00296
00297 while (changed && (iter<max_iter))
00298 {
00299 iter++;
00300 if (iter==max_iter-1)
00301 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00302
00303 if (iter%1000 == 0)
00304 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00305 changed=0;
00306
00307 #ifdef MUSRECALC
00308
00309 for (i=0; i<XDimk; i++) mus[i]=0;
00310
00311 for (i=0; i<XSize; i++)
00312 {
00313 int32_t j;
00314 int32_t Cl=ClList[i];
00315 float64_t weight=Weights[i];
00316
00317 vec=lhs->get_feature_vector(i, vlen, vfree);
00318
00319 for (j=0; j<dimensions; j++)
00320 mus[Cl*dimensions+j] += weight*vec[j];
00321
00322 lhs->free_feature_vector(vec, i, vfree);
00323 }
00324 for (i=0; i<k; i++)
00325 {
00326 int32_t j;
00327
00328 if (weights_set[i]!=0.0)
00329 for (j=0; j<dimensions; j++)
00330 mus[i*dimensions+j] /= weights_set[i];
00331 }
00332 #endif
00333
00334 for (i=0; i<XSize; i++)
00335 {
00336
00337 const int32_t Pat= CMath::random(0, XSize-1);
00338 const int32_t ClList_Pat=ClList[Pat];
00339 int32_t imini, j;
00340 float64_t mini, weight;
00341
00342 weight=Weights[Pat];
00343
00344
00345
00346 sqdist(mus, lhs, dists, k, Pat, 1, dimensions);
00347
00348
00349 imini=0 ; mini=dists[0];
00350 for (j=1; j<k; j++)
00351 if (dists[j]<mini)
00352 {
00353 mini=dists[j];
00354 imini=j;
00355 }
00356
00357 if (imini!=ClList_Pat)
00358 {
00359 changed= changed + 1;
00360
00361
00362 weights_set[imini]+= weight;
00363
00364 weights_set[ClList_Pat]-= weight;
00365
00366
00367
00368 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00369
00370 for (j=0; j<dimensions; j++)
00371 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00372
00373 lhs->free_feature_vector(vec, Pat, vfree);
00374
00375
00376
00377 if (weights_set[ClList_Pat]!=0.0)
00378 {
00379
00380 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00381
00382 for (j=0; j<dimensions; j++)
00383 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00384 lhs->free_feature_vector(vec, Pat, vfree);
00385 }
00386 else
00387
00388 for (j=0; j<dimensions; j++)
00389 mus[ClList_Pat*dimensions+j]=0;
00390
00391
00392 ClList[Pat] = imini;
00393 }
00394 }
00395 }
00396
00397
00398 for (i=0; i<k; i++)
00399 {
00400 float64_t rmin1=0;
00401 float64_t rmin2=0;
00402
00403 bool first_round=true;
00404
00405 for (int32_t j=0; j<k; j++)
00406 {
00407 if (j!=i)
00408 {
00409 int32_t l;
00410 float64_t dist = 0;
00411
00412 for (l=0; l<dimensions; l++)
00413 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00414
00415 if (first_round)
00416 {
00417 rmin1=dist;
00418 rmin2=dist;
00419 first_round=false;
00420 }
00421 else
00422 {
00423 if ((dist<rmin2) && (dist>=rmin1))
00424 rmin2=dist;
00425
00426 if (dist<rmin1)
00427 {
00428 rmin2=rmin1;
00429 rmin1=dist;
00430 }
00431 }
00432 }
00433 }
00434
00435 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00436 }
00437
00438 free(ClList);
00439 free(weights_set);
00440 free(oldmus);
00441 free(dists);
00442 }