SHOGUN v1.1.0
PCA.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2008 Gunnar Raetsch
 * Written (W) 1999-2008,2011 Soeren Sonnenburg
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 * Copyright (C) 2011 Berlin Institute of Technology
 */
#include <shogun/preprocessor/PCA.h>
#ifdef HAVE_LAPACK
#include <shogun/mathematics/lapack.h>
#include <shogun/lib/config.h>
#include <shogun/mathematics/Math.h>
#include <string.h>
#include <stdlib.h>
#include <shogun/lib/common.h>
#include <shogun/preprocessor/DimensionReductionPreprocessor.h>
#include <shogun/features/Features.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/io/SGIO.h>

using namespace shogun;

CPCA::CPCA(bool do_whitening_, EPCAMode mode_, float64_t thresh_)
: CDimensionReductionPreprocessor(), num_dim(0), m_initialized(false),
  m_whitening(do_whitening_), m_mode(mode_), thresh(thresh_)
{
    init();
}

void CPCA::init()
{
    m_transformation_matrix = SGMatrix<float64_t>(NULL,0,0,true);
    m_mean_vector = SGVector<float64_t>(NULL,0,true);
    m_eigenvalues_vector = SGVector<float64_t>(NULL,0,true);

    m_parameters->add(&m_transformation_matrix,
        "transformation matrix", "Transformation matrix (Eigenvectors of covariance matrix).");
    m_parameters->add(&m_mean_vector,
        "mean vector", "Mean Vector.");
    m_parameters->add(&m_eigenvalues_vector,
        "eigenvalues vector", "Vector with Eigenvalues.");
    m_parameters->add(&m_initialized,
        "initialized", "True when initialized.");
    m_parameters->add(&m_whitening,
        "whitening", "Whether data shall be whitened.");
    m_parameters->add((machine_int_t*) &m_mode, "mode",
        "PCA Mode.");
    m_parameters->add(&thresh,
        "thresh", "Cutoff threshold.");
}

CPCA::~CPCA()
{
    m_transformation_matrix.destroy_matrix();
    m_mean_vector.destroy_vector();
    m_eigenvalues_vector.destroy_vector();
}

bool CPCA::init(CFeatures* features)
{
    if (!m_initialized)
    {
        // loop variables
        int32_t i,j,k;

        ASSERT(features->get_feature_class()==C_SIMPLE);
        ASSERT(features->get_feature_type()==F_DREAL);

        int32_t num_vectors=((CSimpleFeatures<float64_t>*)features)->get_num_vectors();
        int32_t num_features=((CSimpleFeatures<float64_t>*)features)->get_num_features();
        SG_INFO("num_examples: %d num_features: %d\n", num_vectors, num_features);

        m_mean_vector.vlen = num_features;
        m_mean_vector.vector = SG_CALLOC(float64_t, num_features);

        // sum up all feature vectors ...
        SGMatrix<float64_t> feature_matrix = ((CSimpleFeatures<float64_t>*)features)->get_feature_matrix();
        for (i=0; i<num_vectors; i++)
        {
            for (j=0; j<num_features; j++)
                m_mean_vector.vector[j] += feature_matrix.matrix[i*num_features+j];
        }

        // ... and divide by their number to obtain the mean
        for (i=0; i<num_features; i++)
            m_mean_vector.vector[i] /= num_vectors;

        float64_t* cov = SG_CALLOC(float64_t, num_features*num_features);
        float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

        // accumulate the covariance matrix as a sum of rank-one updates
        // (x - mean)(x - mean)' via cblas_dger
        for (i=0; i<num_vectors; i++)
        {
            for (k=0; k<num_features; k++)
                sub_mean[k]=feature_matrix.matrix[i*num_features+k]-m_mean_vector.vector[k];

            cblas_dger(CblasColMajor,
                       num_features, num_features,
                       1.0, sub_mean, 1,
                       sub_mean, 1,
                       cov, num_features);
        }

        SG_FREE(sub_mean);

        // normalize by (n-1) to get the unbiased sample covariance
        for (i=0; i<num_features; i++)
        {
            for (j=0; j<num_features; j++)
                cov[i*num_features+j] /= (num_vectors-1);
        }

        SG_INFO("Computing Eigenvalues ... ");

        // compute_eigenvectors overwrites cov with the eigenvectors and
        // returns the eigenvalues in ascending order
        m_eigenvalues_vector.vector = CMath::compute_eigenvectors(cov,num_features,num_features);
        m_eigenvalues_vector.vlen = num_features;
        num_dim=0;

        if (m_mode == FIXED_NUMBER)
        {
            ASSERT(m_target_dim <= num_features);
            num_dim = m_target_dim;
        }
        if (m_mode == VARIANCE_EXPLAINED)
        {
            float64_t eig_sum = 0;
            for (i=0; i<num_features; i++)
                eig_sum += m_eigenvalues_vector.vector[i];

            // eigenvalues come in ascending order, so accumulate from the
            // largest until the requested fraction of variance is covered
            float64_t com_sum = 0;
            for (i=num_features-1; i>-1; i--)
            {
                num_dim++;
                com_sum += m_eigenvalues_vector.vector[i];
                if (com_sum/eig_sum>=thresh)
                    break;
            }
        }
        if (m_mode == THRESHOLD)
        {
            // count the eigenvalues above the cutoff, largest first
            for (i=num_features-1; i>-1; i--)
            {
                if (m_eigenvalues_vector.vector[i] > thresh)
                    num_dim++;
                else
                    break;
            }
        }

        SG_INFO("Done\nReducing from %i to %i features..", num_features, num_dim);

        m_transformation_matrix = SGMatrix<float64_t>(num_dim,num_features);
        num_old_dim = num_features;

        // copy the num_dim leading eigenvectors (the last columns of cov)
        // into the transformation matrix; with whitening each one is scaled
        // by 1/sqrt(eigenvalue) so projected dimensions get unit variance
        int32_t offs=0;
        for (i=num_features-num_dim; i<num_features; i++)
        {
            for (k=0; k<num_features; k++)
            {
                if (m_whitening)
                    m_transformation_matrix.matrix[offs+k*num_dim] =
                        cov[num_features*i+k] / sqrt(m_eigenvalues_vector.vector[i]);
                else
                    m_transformation_matrix.matrix[offs+k*num_dim] =
                        cov[num_features*i+k];
            }
            offs++;
        }

        SG_FREE(cov);
        m_initialized = true;
        return true;
    }

    return false;
}

void CPCA::cleanup()
{
    m_transformation_matrix.destroy_matrix();
    m_mean_vector.destroy_vector();
    m_eigenvalues_vector.destroy_vector();
}

SGMatrix<float64_t> CPCA::apply_to_feature_matrix(CFeatures* features)
{
    ASSERT(m_initialized);
    SGMatrix<float64_t> m = ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix();
    int32_t num_vectors = m.num_cols;
    int32_t num_features = m.num_rows;
    SG_INFO("get Feature matrix: %ix%i\n", num_vectors, num_features);

    if (m.matrix)
    {
        SG_INFO("Preprocessing feature matrix\n");
        float64_t* res = SG_MALLOC(float64_t, num_dim);
        float64_t* sub_mean = SG_MALLOC(float64_t, num_features);

        for (int32_t vec=0; vec<num_vectors; vec++)
        {
            int32_t i;

            // center the vector ...
            for (i=0; i<num_features; i++)
                sub_mean[i] = m.matrix[num_features*vec+i] - m_mean_vector.vector[i];

            // ... and project it onto the principal components:
            // res = m_transformation_matrix * sub_mean
            cblas_dgemv(CblasColMajor, CblasNoTrans,
                        num_dim, num_features,
                        1.0, m_transformation_matrix.matrix, num_dim,
                        sub_mean, 1,
                        0.0, res, 1);

            // overwrite the old vector in place with the projected one
            float64_t* m_transformed = &m.matrix[num_dim*vec];
            for (i=0; i<num_dim; i++)
                m_transformed[i] = res[i];
        }
        SG_FREE(res);
        SG_FREE(sub_mean);

        ((CSimpleFeatures<float64_t>*) features)->set_num_features(num_dim);
        // refresh num_features/num_vectors after the resize
        ((CSimpleFeatures<float64_t>*) features)->get_feature_matrix(num_features, num_vectors);
        SG_INFO("new Feature matrix: %ix%i\n", num_vectors, num_features);
    }

    return m;
}

SGVector<float64_t> CPCA::apply_to_feature_vector(SGVector<float64_t> vector)
{
    float64_t* result = SG_MALLOC(float64_t, num_dim);
    float64_t* sub_mean = SG_MALLOC(float64_t, vector.vlen);

    // center the vector
    for (int32_t i=0; i<vector.vlen; i++)
        sub_mean[i] = vector.vector[i] - m_mean_vector.vector[i];

    // project it: result = m_transformation_matrix * sub_mean
    cblas_dgemv(CblasColMajor, CblasNoTrans,
                num_dim, vector.vlen,
                1.0, m_transformation_matrix.matrix, num_dim,
                sub_mean, 1,
                0.0, result, 1);

    SG_FREE(sub_mean);
    return SGVector<float64_t>(result, num_dim);
}

SGMatrix<float64_t> CPCA::get_transformation_matrix()
{
    return m_transformation_matrix;
}

SGVector<float64_t> CPCA::get_eigenvalues()
{
    return m_eigenvalues_vector;
}

SGVector<float64_t> CPCA::get_mean()
{
    return m_mean_vector;
}

#endif /* HAVE_LAPACK */
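Example (not part of PCA.cpp): a minimal sketch of driving CPCA from user code under the Shogun 1.1 API. The init_shogun()/exit_shogun() calls, the SG_REF/SG_UNREF macros, and the CSimpleFeatures constructor taking an SGMatrix are assumed from that API; the toy data values are made up for illustration.

#include <shogun/base/init.h>
#include <shogun/features/SimpleFeatures.h>
#include <shogun/preprocessor/PCA.h>

using namespace shogun;

int main()
{
    init_shogun();

    // toy data: 3 features x 4 vectors, stored column-major
    // (one example per column), values chosen arbitrarily
    SGMatrix<float64_t> data(3, 4);
    for (int32_t i=0; i<12; i++)
        data.matrix[i] = (i*7) % 5;

    CSimpleFeatures<float64_t>* feats = new CSimpleFeatures<float64_t>(data);
    SG_REF(feats);

    // keep as many components as needed to explain 95% of the
    // variance, without whitening
    CPCA* pca = new CPCA(false, VARIANCE_EXPLAINED, 0.95);
    SG_REF(pca);

    pca->init(feats);                    // mean, covariance, eigendecomposition
    pca->apply_to_feature_matrix(feats); // in-place projection to num_dim rows

    SG_UNREF(pca);
    SG_UNREF(feats);
    exit_shogun();
    return 0;
}

As the init(CFeatures*) code above shows, the meaning of the third constructor argument depends on the mode: with VARIANCE_EXPLAINED it is the cumulative-variance cutoff, with THRESHOLD it is compared against individual eigenvalues, and with FIXED_NUMBER it is ignored in favour of m_target_dim.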
