SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
14 #include <shogun/features/Labels.h>
17 #include <shogun/base/Parallel.h>
18 
19 #ifdef HAVE_PTHREAD
20 #include <pthread.h>
21 #endif
22 
23 #define MUSRECALC
24 
25 #define PAR_THRESH 10
26 
27 using namespace shogun;
28 
31 {
32  init();
33 }
34 
35 CKMeans::CKMeans(int32_t k_, CDistance* d)
37 {
38  init();
39  k=k_;
40  set_distance(d);
41 }
42 
44 {
45  R.destroy_vector();
46 }
47 
49 {
51 
52  if (data)
53  distance->init(data, data);
54 
56 
59  ASSERT(lhs);
60  int32_t num=lhs->get_num_vectors();
61  SG_UNREF(lhs);
62 
63  Weights=SGVector<float64_t>(num);
64  for (int32_t i=0; i<num; i++)
65  Weights.vector[i]=1.0;
66 
67  clustknb(false, NULL);
68  Weights.destroy_vector();
69 
70  return true;
71 }
72 
73 bool CKMeans::load(FILE* srcfile)
74 {
77  return false;
78 }
79 
80 bool CKMeans::save(FILE* dstfile)
81 {
84  return false;
85 }
86 
87 
88 void CKMeans::set_k(int32_t p_k)
89 {
90  ASSERT(p_k>0);
91  this->k=p_k;
92 }
93 
94 int32_t CKMeans::get_k()
95 {
96  return k;
97 }
98 
99 void CKMeans::set_max_iter(int32_t iter)
100 {
101  ASSERT(iter>0);
102  max_iter=iter;
103 }
104 
106 {
107  return max_iter;
108 }
109 
111 {
112  return R;
113 }
114 
116 {
117  if (!R.vector)
118  return SGMatrix<float64_t>();
119 
122  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
123  SG_UNREF(lhs);
124  return centers;
125 }
126 
128 {
129  return dimensions;
130 }
131 
132 #ifndef DOXYGEN_SHOULD_SKIP_THIS
133 struct thread_data
134 {
135  float64_t* x;
137  float64_t* z;
138  int32_t n1, n2, m;
139  int32_t js, je; /* defines the matrix stripe */
140  int32_t offs;
141 };
142 #endif // DOXYGEN_SHOULD_SKIP_THIS
143 
144 namespace shogun
145 {
146 void *sqdist_thread_func(void * P)
147 {
148  struct thread_data *TD=(struct thread_data*) P;
149  float64_t* x=TD->x;
151  float64_t* z=TD->z;
152  int32_t n1=TD->n1,
153  m=TD->m,
154  js=TD->js,
155  je=TD->je,
156  offs=TD->offs,
157  j,i,k;
158 
159  for (j=js; j<je; j++)
160  {
161  int32_t vlen=0;
162  bool vfree=false;
163  float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
164 
165  for (i=0; i<n1; i++)
166  {
167  float64_t sum=0;
168  for (k=0; k<m; k++)
169  sum = sum + CMath::sq(x[i*m + k] - vec[k]);
170  z[j*n1 + i] = sum;
171  }
172 
173  y->free_feature_vector(vec, j, vfree);
174  }
175  return NULL;
176 }
177 }
178 
179 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
180 {
183  ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
184 
185  int32_t XSize=lhs->get_num_vectors();
186  dimensions=lhs->get_num_features();
187  int32_t i, changed=1;
188  const int32_t XDimk=dimensions*k;
189  int32_t iter=0;
190 
191  R.destroy_vector();
193 
195 
196  int32_t *ClList=SG_CALLOC(int32_t, XSize);
197  float64_t *weights_set=SG_CALLOC(float64_t, k);
198  float64_t *dists=SG_CALLOC(float64_t, k*XSize);
199 
202  CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
203 
204  int32_t vlen=0;
205  bool vfree=false;
206  float64_t* vec=NULL;
207 
208  /* ClList=zeros(XSize,1) ; */
209  memset(ClList, 0, sizeof(int32_t)*XSize);
210  /* weights_set=zeros(k,1) ; */
211  memset(weights_set, 0, sizeof(float64_t)*k);
212 
213  /* cluster_centers=zeros(dimensions, k) ; */
214  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
215 
216  if (!use_old_mus)
217  {
218  for (i=0; i<XSize; i++)
219  {
220  const int32_t Cl=CMath::random(0, k-1);
221  int32_t j;
222  float64_t weight=Weights.vector[i];
223 
224  weights_set[Cl]+=weight;
225  ClList[i]=Cl;
226 
227  vec=lhs->get_feature_vector(i, vlen, vfree);
228 
229  for (j=0; j<dimensions; j++)
230  mus.matrix[Cl*dimensions+j] += weight*vec[j];
231 
232  lhs->free_feature_vector(vec, i, vfree);
233  }
234  for (i=0; i<k; i++)
235  {
236  int32_t j;
237 
238  if (weights_set[i]!=0.0)
239  for (j=0; j<dimensions; j++)
240  mus.matrix[i*dimensions+j] /= weights_set[i];
241  }
242  }
243  else
244  {
245  ASSERT(mus_start);
246 
248  rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k));
249  float64_t* p_dists=dists;
250 
251  for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
252  distances_rhs(p_dists,0,k,idx);
253  p_dists=NULL;
254 
255  for (i=0; i<XSize; i++)
256  {
257  float64_t mini=dists[i*k];
258  int32_t Cl = 0, j;
259 
260  for (j=1; j<k; j++)
261  {
262  if (dists[i*k+j]<mini)
263  {
264  Cl=j;
265  mini=dists[i*k+j];
266  }
267  }
268  ClList[i]=Cl;
269  }
270 
271  /* Compute the sum of all points belonging to a cluster
272  * and count the points */
273  for (i=0; i<XSize; i++)
274  {
275  const int32_t Cl = ClList[i];
276  float64_t weight=Weights.vector[i];
277  weights_set[Cl]+=weight;
278 #ifndef MUSRECALC
279  vec=lhs->get_feature_vector(i, vlen, vfree);
280 
281  for (j=0; j<dimensions; j++)
282  mus.matrix[Cl*dimensions+j] += weight*vec[j];
283 
284  lhs->free_feature_vector(vec, i, vfree);
285 #endif
286  }
287 #ifndef MUSRECALC
288  /* normalization to get the mean */
289  for (i=0; i<k; i++)
290  {
291  if (weights_set[i]!=0.0)
292  {
293  int32_t j;
294  for (j=0; j<dimensions; j++)
295  mus.matrix[i*dimensions+j] /= weights_set[i];
296  }
297  }
298 #endif
299  }
300 
301 
302 
303  while (changed && (iter<max_iter))
304  {
305  iter++;
306  if (iter==max_iter-1)
307  SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
308 
309  if (iter%1000 == 0)
310  SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
311  changed=0;
312 
313 #ifdef MUSRECALC
314  /* mus=zeros(dimensions, k) ; */
315  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
316 
317  for (i=0; i<XSize; i++)
318  {
319  int32_t j;
320  int32_t Cl=ClList[i];
321  float64_t weight=Weights.vector[i];
322 
323  vec=lhs->get_feature_vector(i, vlen, vfree);
324 
325  for (j=0; j<dimensions; j++)
326  mus.matrix[Cl*dimensions+j] += weight*vec[j];
327 
328  lhs->free_feature_vector(vec, i, vfree);
329  }
330  for (i=0; i<k; i++)
331  {
332  int32_t j;
333 
334  if (weights_set[i]!=0.0)
335  for (j=0; j<dimensions; j++)
336  mus.matrix[i*dimensions+j] /= weights_set[i];
337  }
338 #endif
339 
340  rhs_mus->copy_feature_matrix(mus);
341 
342  for (i=0; i<XSize; i++)
343  {
344  /* ks=ceil(rand(1,XSize)*XSize) ; */
345  const int32_t Pat= CMath::random(0, XSize-1);
346  const int32_t ClList_Pat=ClList[Pat];
347  int32_t imini, j;
348  float64_t mini, weight;
349 
350  weight=Weights.vector[Pat];
351 
352  /* compute the distance of this point to all centers */
353  for(int32_t idx_k=0;idx_k<k;idx_k++)
354  dists[idx_k]=distance->distance(Pat,idx_k);
355 
356  /* [mini,imini]=min(dists(:,i)) ; */
357  imini=0 ; mini=dists[0];
358  for (j=1; j<k; j++)
359  if (dists[j]<mini)
360  {
361  mini=dists[j];
362  imini=j;
363  }
364 
365  if (imini!=ClList_Pat)
366  {
367  changed= changed + 1;
368 
369  /* weights_set(imini) = weights_set(imini) + weight ; */
370  weights_set[imini]+= weight;
371  /* weights_set(j) = weights_set(j) - weight ; */
372  weights_set[ClList_Pat]-= weight;
373 
374  vec=lhs->get_feature_vector(Pat, vlen, vfree);
375 
376  for (j=0; j<dimensions; j++)
377  {
378  mus.matrix[imini*dimensions+j]-=(vec[j]
379  -mus.matrix[imini*dimensions+j])
380  *(weight/weights_set[imini]);
381  }
382 
383  lhs->free_feature_vector(vec, Pat, vfree);
384 
385  /* mu_new = mu_old - (x - mu_old)/(n-1) */
386  /* if weights_set(j)~=0 */
387  if (weights_set[ClList_Pat]!=0.0)
388  {
389  vec=lhs->get_feature_vector(Pat, vlen, vfree);
390 
391  for (j=0; j<dimensions; j++)
392  {
393  mus.matrix[ClList_Pat*dimensions+j]-=
394  (vec[j]
395  -mus.matrix[ClList_Pat
396  *dimensions+j])
397  *(weight/weights_set[ClList_Pat]);
398  }
399  lhs->free_feature_vector(vec, Pat, vfree);
400  }
401  else
402  /* mus(:,j)=zeros(dimensions,1) ; */
403  for (j=0; j<dimensions; j++)
404  mus.matrix[ClList_Pat*dimensions+j]=0;
405 
406  /* ClList(i)= imini ; */
407  ClList[Pat] = imini;
408  }
409  }
410  }
411 
412  /* compute the ,,variances'' of the clusters */
413  for (i=0; i<k; i++)
414  {
415  float64_t rmin1=0;
416  float64_t rmin2=0;
417 
418  bool first_round=true;
419 
420  for (int32_t j=0; j<k; j++)
421  {
422  if (j!=i)
423  {
424  int32_t l;
425  float64_t dist = 0;
426 
427  for (l=0; l<dimensions; l++)
428  {
429  dist+=CMath::sq(
430  mus.matrix[i*dimensions+l]
431  -mus.matrix[j*dimensions+l]);
432  }
433 
434  if (first_round)
435  {
436  rmin1=dist;
437  rmin2=dist;
438  first_round=false;
439  }
440  else
441  {
442  if ((dist<rmin2) && (dist>=rmin1))
443  rmin2=dist;
444 
445  if (dist<rmin1)
446  {
447  rmin2=rmin1;
448  rmin1=dist;
449  }
450  }
451  }
452  }
453 
454  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
455  }
456  distance->replace_rhs(rhs_cache);
457  delete rhs_mus;
458  SG_FREE(ClList);
459  SG_FREE(weights_set);
460  SG_FREE(dists);
461  SG_UNREF(lhs);
462 }
463 
465 {
466  /* set lhs of underlying distance to cluster centers */
468  mus);
469 
470  /* reset mus variable to avoid interference with above features */
471  mus.do_free=false;
472  mus.free_matrix();
473 
474  /* store cluster centers in lhs of distance variable */
475  CFeatures* rhs=distance->get_rhs();
476  distance->init(cluster_centers, rhs);
477  SG_UNREF(rhs);
478 }
479 
480 void CKMeans::init()
481 {
482  max_iter=10000;
483  k=3;
484  dimensions=0;
485 
486  m_parameters->add(&max_iter, "max_iter", "Maximum number of iterations");
487  m_parameters->add(&k, "k", "Parameter k");
488  m_parameters->add(&dimensions, "dimensions", "Dimensions of data");
489  m_parameters->add(&R, "R", "Cluster radiuses");
490 }
491 

SHOGUN Machine Learning Toolbox - Documentation