SHOGUN v1.1.0
src/shogun/classifier/KNN.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 2006 Christian Gehl
 * Written (W) 2006-2009 Soeren Sonnenburg
 * Written (W) 2011 Sergey Lisitsyn
 * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
 */

#include <shogun/classifier/KNN.h>
#include <shogun/features/Labels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/lib/Signal.h>
#include <shogun/base/Parameter.h>

using namespace shogun;

CKNN::CKNN()
: CDistanceMachine()
{
	init();
}

CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
: CDistanceMachine()
{
	init();

	m_k=k;

	ASSERT(d);
	ASSERT(trainlab);

	set_distance(d);
	set_labels(trainlab);
	train_labels.vlen=trainlab->get_num_labels();
}

void CKNN::init()
{
	/* do not store model features by default (CDistanceMachine::apply(...) is
	 * overwritten) */
	set_store_model_features(false);

	m_k=3;
	m_q=1.0;
	num_classes=0;

	m_parameters->add(&m_k, "k", "Parameter k");
	m_parameters->add(&m_q, "q", "Parameter q");
	m_parameters->add(&num_classes, "num_classes", "Number of classes");
}

CKNN::~CKNN()
{
	SG_FREE(train_labels.vector);
}

bool CKNN::train_machine(CFeatures* data)
{
	ASSERT(labels);
	ASSERT(distance);

	if (data)
	{
		if (labels->get_num_labels() != data->get_num_vectors())
			SG_ERROR("Number of training vectors does not match number of labels\n");
		distance->init(data, data);
	}

	SGVector<int32_t> lab=labels->get_int_labels();
	train_labels.vlen=lab.vlen;
	train_labels.vector=CMath::clone_vector(lab.vector, lab.vlen);
	lab.free_vector();
	ASSERT(train_labels.vlen>0);

	int32_t max_class=train_labels.vector[0];
	int32_t min_class=train_labels.vector[0];

	int32_t i;
	for (i=1; i<train_labels.vlen; i++)
	{
		max_class=CMath::max(max_class, train_labels.vector[i]);
		min_class=CMath::min(min_class, train_labels.vector[i]);
	}

	// shift the stored labels so classes are numbered 0..num_classes-1
	for (i=0; i<train_labels.vlen; i++)
		train_labels.vector[i]-=min_class;

	min_label=min_class;
	num_classes=max_class-min_class+1;

	SG_INFO("num_classes: %d (%+d to %+d) num_train: %d\n", num_classes,
			min_class, max_class, train_labels.vlen);
	return true;
}
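
For concreteness, a worked example of the label bookkeeping above, using the usual binary labels -1/+1:

	min_class = -1, max_class = +1
	min_label = -1
	num_classes = (+1) - (-1) + 1 = 3

The shifted train_labels then hold only the values 0 and 2; class index 1 simply never receives votes, and apply() maps the winning index back via out_idx + min_label, so the emitted labels are again -1 or +1.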

CLabels* CKNN::apply()
{
	ASSERT(num_classes>0);
	ASSERT(distance);
	ASSERT(distance->get_num_vec_rhs());

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	CLabels* output=new CLabels(num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	float64_t* classes=SG_MALLOC(float64_t, num_classes);

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// lhs idx 1..n and rhs idx i
		distances_lhs(dists, 0, train_labels.vlen-1, i);
		int32_t j;

		for (j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples
		//classes[1..k] then holds the classes for minimum distance
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (j=0; j<num_classes; j++)
			classes[j]=0.0;

		float64_t multiplier=m_q;
		for (j=0; j<m_k; j++)
		{
			classes[train_lab[j]]+=multiplier;
			multiplier*=multiplier;
		}

		//choose the class that got 'outputted' most often
		int32_t out_idx=0;
		float64_t out_max=0;

		for (j=0; j<num_classes; j++)
		{
			if (out_max<classes[j])
			{
				out_idx=j;
				out_max=classes[j];
			}
		}
		output->set_label(i, out_idx+min_label);
	}

	SG_FREE(classes);
	SG_FREE(dists);
	SG_FREE(train_lab);

	return output;
}
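
A note on the voting loop above: the multiplier starts at m_q and is squared after every neighbour, so the j-th nearest neighbour (counting from 0) contributes weight q^(2^j); with the default m_q=1.0 this reduces to plain majority voting. The following standalone sketch, which is not part of KNN.cpp, merely prints that weight sequence for illustrative values of q and k:

// Standalone illustration (not part of KNN.cpp): prints the vote weight the
// loop in CKNN::apply() assigns to the j-th nearest neighbour, i.e. q^(2^j).
#include <cstdio>

int main()
{
	double q=0.5;    // example value; the class default is m_q=1.0
	int k=4;         // example value; the class default is m_k=3
	double multiplier=q;
	for (int j=0; j<k; j++)
	{
		printf("neighbour %d: weight %g\n", j, multiplier);
		multiplier*=multiplier;   // squared each step, as in apply()
	}
	return 0;
}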

CLabels* CKNN::apply(CFeatures* data)
{
	init_distance(data);

	// redirecting to fast (without sorting) classify if k==1
	if (m_k==1)
		return classify_NN();

	return apply();
}

CLabels* CKNN::classify_NN()
{
	ASSERT(distance);
	ASSERT(num_classes>0);

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(num_lab);

	CLabels* output=new CLabels(num_lab);
	float64_t* distances=SG_MALLOC(float64_t, train_labels.vlen);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	// for each test example
	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// get distances from i-th test example to 0..num_train_labels-1 train examples
		distances_lhs(distances, 0, train_labels.vlen-1, i);
		int32_t j;

		// assume the 0th train example is nearest to the i-th test example
		int32_t out_idx=0;
		float64_t min_dist=distances[0];

		// searching for nearest neighbor by comparing distances
		for (j=0; j<train_labels.vlen; j++)
		{
			if (distances[j]<min_dist)
			{
				min_dist=distances[j];
				out_idx=j;
			}
		}

		// label the i-th test example with the label of its nearest neighbour (index out_idx)
		output->set_label(i, train_labels.vector[out_idx]+min_label);
	}

	// allocated with SG_MALLOC above, so release with SG_FREE rather than delete[]
	SG_FREE(distances);
	return output;
}

SGMatrix<int32_t> CKNN::classify_for_multiple_k()
{
	ASSERT(num_classes>0);
	ASSERT(distance);
	ASSERT(distance->get_num_vec_rhs());

	int32_t num_lab=distance->get_num_vec_rhs();
	ASSERT(m_k<=num_lab);

	int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);

	//distances to train data and working buffer of train_labels
	float64_t* dists=SG_MALLOC(float64_t, train_labels.vlen);
	int32_t* train_lab=SG_MALLOC(int32_t, train_labels.vlen);

	int32_t* classes=SG_MALLOC(int32_t, num_classes);

	SG_INFO("%d test examples\n", num_lab);
	CSignal::clear_cancel();

	for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
	{
		SG_PROGRESS(i, 0, num_lab);

		// lhs idx 1..n and rhs idx i
		distances_lhs(dists, 0, train_labels.vlen-1, i);
		for (int32_t j=0; j<train_labels.vlen; j++)
			train_lab[j]=train_labels.vector[j];

		//sort the distance vector for test example i to all train examples
		//classes[1..k] then holds the classes for minimum distance
		CMath::qsort_index(dists, train_lab, train_labels.vlen);

		//compute histogram of class outputs of the first k nearest neighbours
		for (int32_t j=0; j<num_classes; j++)
			classes[j]=0;

		for (int32_t j=0; j<m_k; j++)
		{
			classes[train_lab[j]]++;

			//choose the class that got 'outputted' most often
			int32_t out_idx=0;
			int32_t out_max=0;

			for (int32_t c=0; c<num_classes; c++)
			{
				if (out_max<classes[c])
				{
					out_idx=c;
					out_max=classes[c];
				}
			}
			// prediction using the first j+1 neighbours; stored as entry (i,j)
			// of the returned num_lab x m_k matrix
			output[j*num_lab+i]=out_idx+min_label;
		}
	}

	SG_FREE(dists);
	SG_FREE(train_lab);
	SG_FREE(classes);

	return SGMatrix<int32_t>(output, num_lab, m_k);
}

void CKNN::init_distance(CFeatures* data)
{
	if (!distance)
		SG_ERROR("No distance assigned!\n");
	CFeatures* lhs=distance->get_lhs();
	if (!lhs || !lhs->get_num_vectors())
	{
		SG_UNREF(lhs);
		SG_ERROR("No vectors on left hand side\n");
	}
	distance->init(lhs, data);
	SG_UNREF(lhs);
}

bool CKNN::load(FILE* srcfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

bool CKNN::save(FILE* dstfile)
{
	SG_SET_LOCALE_C;
	SG_RESET_LOCALE;
	return false;
}

void CKNN::store_model_features()
{
	CFeatures* d_lhs=distance->get_lhs();
	CFeatures* d_rhs=distance->get_rhs();

	/* copy lhs of underlying distance */
	distance->init(d_lhs->duplicate(), d_rhs);

	SG_UNREF(d_lhs);
	SG_UNREF(d_rhs);
}
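
For context, a minimal usage sketch of CKNN follows. It is not part of KNN.cpp; the feature and distance class names (CSimpleFeatures<float64_t>, CEuclidianDistance), the SGMatrix constructor used, and the toy data are assumptions based on the Shogun 1.1.0 API, so treat it as illustrative rather than definitive.

// Hypothetical usage sketch; class/constructor names outside CKNN itself are
// assumptions and may need adjusting for the exact Shogun 1.1.0 headers.
#include <cstdio>
#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/distance/EuclidianDistance.h>
#include <shogun/classifier/KNN.h>

using namespace shogun;

int main()
{
	init_shogun();

	// four 2-dimensional training vectors (column-major), two per class
	float64_t train_data[8]={0,0, 0,1, 5,5, 5,6};
	SGMatrix<float64_t> train_mat(train_data, 2, 4);
	CSimpleFeatures<float64_t>* feats=new CSimpleFeatures<float64_t>(train_mat);

	CLabels* labels=new CLabels(4);
	labels->set_label(0, -1); labels->set_label(1, -1);
	labels->set_label(2, +1); labels->set_label(3, +1);

	CDistance* dist=new CEuclidianDistance(feats, feats);

	CKNN* knn=new CKNN(3, dist, labels);   // k=3 neighbours, default q=1
	knn->train();

	CLabels* out=knn->apply(feats);        // classify the training points themselves
	for (int32_t i=0; i<4; i++)
		printf("example %d -> label %g\n", i, out->get_label(i));

	SG_UNREF(out);
	SG_UNREF(knn);
	exit_shogun();
	return 0;
}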