SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
GaussianNaiveBayes.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Sergey Lisitsyn
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
12 #include <shogun/machine/Machine.h>
14 #include <shogun/features/Labels.h>
16 #include <shogun/lib/Signal.h>
17 
18 using namespace shogun;
19 
// Default constructor: initializes the machine with no features, no known
// label range, zero classes/dimensions, and empty parameter vectors
// (means, variances, priors, rates).
// NOTE(review): the constructor signature line (original line 20) is missing
// from this extracted listing -- presumably
// "CGaussianNaiveBayes::CGaussianNaiveBayes() :". Confirm against the
// original source file.
21 CMachine(), m_features(NULL), m_min_label(0),
22 m_num_classes(0), m_dim(0), m_means(),
23 m_variances(), m_label_prob(), m_rates()
24 {
25 
26 };
27 
// Constructor taking training data: verifies that the number of feature
// vectors matches the number of labels, stores the labels, requires the
// features to support dot-products (FP_DOT), and stores them as
// CDotFeatures. Note it does NOT call train() -- training is a separate step.
// NOTE(review): the constructor signature line (original line 28) is missing
// from this extracted listing -- presumably
// "CGaussianNaiveBayes::CGaussianNaiveBayes(CFeatures* train_examples,
// CLabels* train_labels) :". Confirm against the original source file.
29 CMachine(), m_features(NULL), m_min_label(0),
30 m_num_classes(0), m_dim(0), m_means(),
31 m_variances(), m_label_prob(), m_rates()
32 {
33  ASSERT(train_examples->get_num_vectors() == train_labels->get_num_labels());
34  set_labels(train_labels);
35  if (!train_examples->has_property(FP_DOT))
36  SG_ERROR("Specified features are not of type CDotFeatures\n");
37  set_features((CDotFeatures*)train_examples);
38 };
39 
// NOTE(review): this is an incomplete fragment -- the signature (original
// line 40, presumably the destructor) and the body statements (original
// lines 42, 44-47) are missing from this extracted listing. The body most
// likely releases the member vectors (m_means, m_variances, m_label_prob,
// m_rates) and the feature reference -- confirm against the original source.
41 {
43 
48 };
49 
// Trains the Gaussian Naive Bayes model: estimates, per class, the mean and
// variance of every feature dimension plus the class counts (used as a
// priori probabilities). Labels are internally shifted so the smallest label
// maps to index 0 (m_min_label records the offset). Returns true on success.
// NOTE(review): the function signature (original line 50, presumably
// "bool CGaussianNaiveBayes::train(CFeatures* data)") is missing from this
// extracted listing, as are several body lines flagged below. Confirm all
// reconstructed intent against the original source file.
51 {
52  // init features with data if necessary and assure type is correct
53  if (data)
54  {
55  if (!data->has_property(FP_DOT))
56  SG_ERROR("Specified features are not of type CDotFeatures\n");
57  set_features((CDotFeatures*) data);
58  }
59  // get int labels to train_labels and check length equality
60  ASSERT(labels);
61  SGVector<int32_t> train_labels = labels->get_int_labels();
62  ASSERT(m_features->get_num_vectors()==train_labels.vlen);
63 
64  // init min_label, max_label and loop variables
65  int32_t min_label = train_labels.vector[0];
66  int32_t max_label = train_labels.vector[0];
67  int i,j;
68 
69  // find minimal and maximal label
70  for (i=1; i<train_labels.vlen; i++)
71  {
72  min_label = CMath::min(min_label, train_labels.vector[i]);
73  max_label = CMath::max(max_label, train_labels.vector[i]);
74  }
75 
76  // subtract minimal label from all labels
77  for (i=0; i<train_labels.vlen; i++)
78  train_labels.vector[i]-= min_label;
79 
80  // get number of classes, minimal label and dimensionality
81  m_num_classes = max_label-min_label+1;
82  m_min_label = min_label;
// NOTE(review): original line 83 is missing here; m_dim is read below, so it
// is presumably assigned from the features' dimensionality at this point.
84 
85  // allocate memory for distributions' parameters and a priori probability
// NOTE(review): the allocation statements (original lines 86-97) for
// m_means, m_variances and m_label_prob are missing from this listing.
88 
91 
94 
95  // allocate memory for label rates
// NOTE(review): the m_rates allocation (original lines 96-98) is missing.
98 
99  // assure that memory is allocated
// NOTE(review): the allocation ASSERTs (original lines 100-104) are missing.
104 
105  // make arrays filled by zeros before using
106  for (i=0;i<m_num_classes*m_dim;i++)
107  {
108  m_means.vector[i] = 0.0;
109  m_variances.vector[i] = 0.0;
110  }
111  for (i=0;i<m_num_classes;i++)
112  {
113  m_label_prob.vector[i] = 0.0;
114  m_rates.vector[i] = 0.0;
115  }
116 
// NOTE(review): original line 117 is missing; feature_matrix is used below
// without a visible declaration, so it is presumably fetched from m_features
// (as a dense dot-feature matrix) at this point.
118 
119  // get sum of features among labels
120  for (i=0; i<train_labels.vlen; i++)
121  {
122  for (j=0; j<m_dim; j++)
123  m_means.vector[m_dim*train_labels.vector[i]+j]+=feature_matrix.matrix[i*m_dim+j];
124 
// m_label_prob accumulates raw per-class counts here; it is used as the
// divisor for means/variances below and (presumably) normalized to a
// probability at original line 153.
125  m_label_prob.vector[train_labels.vector[i]]+=1.0;
126  }
127 
128  // get means of features of labels
129  for (i=0; i<m_num_classes; i++)
130  {
131  for (j=0; j<m_dim; j++)
132  m_means.vector[m_dim*i+j] /= m_label_prob.vector[i];
133  }
134 
135  // compute squared residuals with means available
136  for (i=0; i<train_labels.vlen; i++)
137  {
138  for (j=0; j<m_dim; j++)
139  m_variances.vector[m_dim*train_labels.vector[i]+j]+=
140  CMath::sq(feature_matrix.matrix[i*m_dim+j]-m_means.vector[m_dim*train_labels.vector[i]+j]);
141  }
142 
143  // get variance of features of labels
// Unbiased (n-1) variance when the class has more than one sample; the
// ternary guards against division by zero for singleton classes.
144  for (i=0; i<m_num_classes; i++)
145  {
146  for (j=0; j<m_dim; j++)
147  m_variances.vector[m_dim*i+j] /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
148  }
149 
150  // get a priori probabilities of labels
// NOTE(review): the loop body (original line 153) is missing; it presumably
// divides m_label_prob.vector[i] by the number of training vectors.
151  for (i=0; i<m_num_classes; i++)
152  {
154  }
155 
156  train_labels.free_vector();
157 
158  return true;
159 }
160 
// Classifies every vector of the currently-set features by delegating to the
// single-vector apply(i) overload, collecting the predictions into a newly
// allocated CLabels object that the caller owns.
// NOTE(review): the function signature (original lines 160-161, presumably
// "CLabels* CGaussianNaiveBayes::apply()") is missing from this extracted
// listing. Confirm against the original source file.
162 {
163  // init number of vectors
164  int32_t n = m_features->get_num_vectors();
165 
166  // init result labels
167  CLabels* result = new CLabels(n);
168 
169  // classify each example of data
170  for (int i=0; i<n; i++)
171  result->set_label(i,apply(i));
172 
173  return result;
174 };
175 
// Classifies the given feature set: validates that data is non-NULL and
// supports dot-products (FP_DOT), installs it as the current features, and
// delegates to the no-argument apply() overload.
// NOTE(review): the function signature (original line 176, presumably
// "CLabels* CGaussianNaiveBayes::apply(CFeatures* data)") is missing from
// this extracted listing. Confirm against the original source file.
177 {
178  // check data correctness
179  if (!data)
180  SG_ERROR("No features specified\n");
181  if (!data->has_property(FP_DOT))
182  SG_ERROR("Specified features are not of type CDotFeatures\n");
183 
184  // set features to classify
185  set_features((CDotFeatures*)data);
186 
187  // classify using features
188  return apply();
189 };
190 
// Classifies a single feature vector: for each class, computes a rate
// proportional to prior * product over dimensions of the Gaussian likelihood
// (normal_exp(...) divided by sqrt of the per-class, per-dimension variance;
// the shared 1/sqrt(2*pi) factor cancels in the argmax), then returns the
// label with the highest rate, shifted back by m_min_label.
// NOTE(review): the function signature (original line 191, presumably
// "float64_t CGaussianNaiveBayes::apply(int32_t idx)") is missing from this
// extracted listing. Confirm against the original source file.
192 {
193  // get [idx] feature vector
// NOTE(review): original line 194 is missing; feature_vector is used below
// without a visible declaration, so it is presumably fetched from m_features
// for index idx at this point.
195 
196  // init loop variables
197  int i,k;
198 
199  // rate all labels
200  for (i=0; i<m_num_classes; i++)
201  {
202  // set rate to 0.0 if a priori probability is 0.0 and continue
203  if (m_label_prob.vector[i]==0.0)
204  {
205  m_rates.vector[i] = 0.0;
206  continue;
207  }
208  else
// NOTE(review): the else-branch statement (original line 209) is missing; it
// presumably seeds m_rates.vector[i] with the class prior
// m_label_prob.vector[i] before the product below multiplies into it.
210 
211  // product all conditional gaussian probabilities
212  for (k=0; k<m_dim; k++)
213  m_rates.vector[i]*= normal_exp(feature_vector.vector[k],i,k)/CMath::sqrt(m_variances.vector[i*m_dim+k]);
214  }
215 
216  // find label with maximum rate
217  int32_t max_label_idx = 0;
218 
219  for (i=0; i<m_num_classes; i++)
220  {
221  if (m_rates.vector[i]>m_rates.vector[max_label_idx])
222  max_label_idx = i;
223  }
224 
// map the internal zero-based class index back to the original label range
225  return max_label_idx+m_min_label;
226 };

SHOGUN Machine Learning Toolbox - Documentation