SHOGUN  v1.1.0
KNN.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */
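
/* CKNN: multiclass k-nearest-neighbour classifier built on CDistanceMachine.
 * Training stores only the integer labels (shifted so the smallest class is 0);
 * all pairwise computations are delegated to the attached CDistance object. */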

#include <shogun/classifier/KNN.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/base/Parameter.h>

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
	init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
	init();

	m_k=k;

	ASSERT(d);
	ASSERT(trainlab);

	set_distance(d);
	set_labels(trainlab);
	train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
	/* do not store model features by default (CDistanceMachine::apply(...) is
	 * overwritten) */
	set_store_model_features(false);

	m_k=3;
	m_q=1.0;
	num_classes=0;

	m_parameters->add(&m_k, "k", "Parameter k");
	m_parameters->add(&m_q, "q", "Parameter q");
	m_parameters->add(&num_classes, "num_classes", "Number of classes");
}

CKNN::~CKNN()
{
	train_labels.destroy_vector();
}

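/* Store the training labels (shifted so the smallest class becomes 0) and, if
 * features are given, initialise the distance with them on both sides. */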
bool CKNN::train_machine(CFeatures* data)
{
	ASSERT(labels);
	ASSERT(distance);

	if (data)
	{
		if (labels->get_num_labels() != data->get_num_vectors())
			SG_ERROR("Number of training vectors does not match number of labels\n");
		distance->init(data, data);
	}

	SGVector<int32_t> lab=labels->get_int_labels();
	train_labels.vector=CMath::clone_vector(lab.vector, lab.vlen);
	train_labels.vlen=lab.vlen;
	lab.free_vector();

	int32_t max_class=train_labels.vector[0];
	int32_t min_class=train_labels.vector[0];

	int32_t i;
	for (i=1; i<train_labels.vlen; i++)
	{
		max_class=CMath::max(max_class, train_labels.vector[i]);
		min_class=CMath::min(min_class, train_labels.vector[i]);
	}

	for (i=0; i<train_labels.vlen; i++)
		train_labels.vector[i]-=min_class;

	min_label=min_class;
	num_classes=max_class-min_class+1;

	SG_INFO("num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
			min_class, max_class, train_labels.vlen);
	return true;
}

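/* Classify every example on the rhs of the distance with the weighted k-NN
 * vote; requires that the distance has already been initialised with the
 * test features (see apply(CFeatures*) below). */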
CLabels* CKNN::apply()
{
	ASSERT(num_classes>0);
	ASSERT(distance);

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	CLabels* output=new CLabels(num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);

	//histogram of class votes of the k nearest neighbours
	float64_t* classes=SG_MALLOC(float64_t, num_classes);

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// distances from train examples 0..vlen-1 (lhs) to test example i (rhs)
		distances_lhs(dists,0,train_labels.vlen-1,i);
		int32_t j;

		for (j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples;
		//train_lab[0..k-1] then holds the classes of the k nearest neighbours
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (j=0; j<num_classes; j++)
			classes[j]=0.0;

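		// weight the vote of each neighbour: the nearest contributes q and the
		// weight is squared after every neighbour (q, q^2, q^4, ...); with the
		// default q=1 this reduces to a plain majority vote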
		float64_t multiplier = m_q;
		for (j=0; j<m_k; j++)
		{
			classes[train_lab[j]]+= multiplier;
			multiplier*= multiplier;
		}

		//choose the class that got 'outputted' most often
		int32_t out_idx=0;
		float64_t out_max=0;

		for (j=0; j<num_classes; j++)
		{
			if (out_max< classes[j])
			{
				out_idx= j;
				out_max= classes[j];
			}
		}
		output->set_label(i, out_idx+min_label);
	}

	SG_FREE(classes);
	SG_FREE(dists);
	SG_FREE(train_lab);

	return output;
}

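/* Classify the given test features: put them on the rhs of the distance and
 * dispatch to the fast 1-NN path or the general k-NN vote. */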
CLabels* CKNN::apply(CFeatures* data)
{
	init_distance(data);

	// redirecting to fast (without sorting) classify if k==1
	if (m_k == 1)
		return classify_NN();

	return apply();
}

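/* Special case k=1: a single linear scan over the training distances per test
 * example, no sorting required. */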
CLabels* CKNN::classify_NN()
{
	ASSERT(distance);
	ASSERT(num_classes>0);

	int32_t num_lab = distance->get_num_vec_rhs();
	ASSERT(num_lab);

	CLabels* output = new CLabels(num_lab);
	float64_t* distances = SG_MALLOC(float64_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);

	// for each test example
	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i,0,num_lab);

		// get distances from i-th test example to 0..num_train_labels-1 train examples
		distances_lhs(distances,0,train_labels.vlen-1,i);
		int32_t j;

		// assuming 0th train example as nearest to i-th test example
		int32_t out_idx = 0;
		float64_t min_dist = distances[0];

		// searching for nearest neighbor by comparing distances
		for (j=0; j<train_labels.vlen; j++)
		{
			if (distances[j]<min_dist)
			{
				min_dist = distances[j];
				out_idx = j;
			}
		}

		// label i-th test example with label of nearest neighbor with out_idx index
		output->set_label(i,train_labels.vector[out_idx]+min_label);
	}

	SG_FREE(distances);
	return output;
}

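/* Classify each test example for every neighbourhood size 1..m_k in one pass
 * and return a num_lab x m_k matrix of predicted labels (unweighted vote). */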
SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
	ASSERT(num_classes>0);
	ASSERT(distance);

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	int32_t* classes=SG_MALLOC(int32_t, num_classes);

	SG_INFO("%d test examples\n", num_lab);

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// distances from train examples 0..vlen-1 (lhs) to test example i (rhs)
		distances_lhs(dists,0,train_labels.vlen-1,i);
		for (int32_t j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples;
		//train_lab[0..k-1] then holds the classes of the k nearest neighbours
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (int32_t j=0; j<num_classes; j++)
			classes[j]=0;

		for (int32_t j=0; j<m_k; j++)
		{
			classes[train_lab[j]]++;

			//choose the class that got 'outputted' most often
			int32_t out_idx=0;
			int32_t out_max=0;

			for (int32_t c=0; c<num_classes; c++)
			{
				if (out_max< classes[c])
				{
					out_idx= c;
					out_max= classes[c];
				}
			}
			output[j*num_lab+i]=out_idx+min_label;
		}
	}

	SG_FREE(dists);
	SG_FREE(train_lab);
	SG_FREE(classes);

	return SGMatrix<int32_t>(output,num_lab,m_k);
}

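/* Keep the stored training features as lhs of the distance and attach the
 * given test features as rhs. */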
void CKNN::init_distance(CFeatures* data)
{
	if (!distance)
		SG_ERROR("No distance assigned!\n");
	CFeatures* lhs=distance->get_lhs();
	if (!lhs || !lhs->get_num_vectors())
	{
		SG_UNREF(lhs);
		SG_ERROR("No vectors on left hand side\n");
	}
	distance->init(lhs, data);
	SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
	return false;
}

bool CKNN::save(FILE* dstfile)
{
	return false;
}

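/* Duplicate the training features held as lhs of the distance so the model
 * stays valid if the caller frees the original features. */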
void CKNN::store_model_features()
{
	CFeatures* d_lhs=distance->get_lhs();
	CFeatures* d_rhs=distance->get_rhs();

	/* copy lhs of underlying distance */
	distance->init(d_lhs->duplicate(), d_rhs);

	SG_UNREF(d_lhs);
	SG_UNREF(d_rhs);
}
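
The listing above only defines the classifier. As a quick orientation, the following is a minimal, hypothetical usage sketch of how CKNN is typically wired together in the Shogun 1.x C++ API; the dense-feature class CSimpleFeatures, the CEuclidianDistance distance, the SGMatrix(float64_t*, rows, cols) constructor and init_shogun()/exit_shogun() are assumptions about this release rather than part of KNN.cpp.

// usage_knn.cpp -- hypothetical example, not part of KNN.cpp
#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/distance/EuclidianDistance.h>
#include <shogun/classifier/KNN.h>

using namespace shogun;

int main()
{
	init_shogun();

	// four 2-d training points stored column-wise: two per class
	float64_t feat[8]={0,0, 0,1, 5,5, 5,6};
	CSimpleFeatures<float64_t>* feats=
		new CSimpleFeatures<float64_t>(SGMatrix<float64_t>(feat, 2, 4));

	// class labels 0,0,1,1 for the four columns above
	CLabels* labels=new CLabels(4);
	labels->set_label(0, 0); labels->set_label(1, 0);
	labels->set_label(2, 1); labels->set_label(3, 1);

	// distance over the training data; CKNN keeps it as its lhs
	CEuclidianDistance* dist=new CEuclidianDistance(feats, feats);

	CKNN* knn=new CKNN(3, dist, labels);
	knn->train();

	// classify the training points themselves as a smoke test
	CLabels* out=knn->apply(feats);
	for (int32_t i=0; i<out->get_num_labels(); i++)
		SG_SPRINT("example %d -> class %.0f\n", i, out->get_label(i));

	SG_UNREF(out);
	SG_UNREF(knn);
	exit_shogun();
	return 0;
}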
