SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
RegulatoryModulesStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2009 Sebastian J. Schultheiss and Soeren Sonnenburg
8  * Copyright (C) 2009 Max-Planck-Society
9  */
10 
11 #include <shogun/lib/common.h>
15 #include <shogun/io/SGIO.h>
16 
17 using namespace shogun;
18 
20 : CStringKernel<char>(0), width(0.0), degree(0), shift(0), window(0),
21  motif_positions_lhs(NULL), motif_positions_rhs(NULL),
22  position_weights(NULL), weights(NULL)
23 {
25 }
26 
28  int32_t size, float64_t w, int32_t d, int32_t s, int32_t wl)
29 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
30  motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
31 {
33 }
34 
37  float64_t w, int32_t d, int32_t s, int32_t wl, int32_t size)
38 : CStringKernel<char>(size), width(w), degree(d), shift(s), window(wl),
39  motif_positions_lhs(NULL), motif_positions_rhs(NULL), position_weights(NULL), weights(NULL)
40 {
41  set_motif_positions(lpos, rpos);
42  init(lstr,rstr);
44 }
45 
47 {
50 }
51 
52 bool CRegulatoryModulesStringKernel::init(CFeatures* l, CFeatures* r)
53 {
56 
58  SG_ERROR("Number of vectors does not agree (LHS: %d, Motif LHS: %d).\n",
61  SG_ERROR("Number of vectors does not agree (RHS: %d, Motif RHS: %d).\n",
63 
66  return init_normalizer();
67 }
68 
70  CSimpleFeatures<uint16_t>* positions_lhs, CSimpleFeatures<uint16_t>* positions_rhs)
71 {
72  ASSERT(positions_lhs);
73  ASSERT(positions_rhs);
76  if (positions_lhs->get_num_features() != positions_rhs->get_num_features())
77  SG_ERROR("Number of dimensions does not agree.\n");
78 
79  motif_positions_lhs=positions_lhs;
80  motif_positions_rhs=positions_rhs;
81  SG_REF(positions_lhs);
82  SG_REF(positions_rhs);
83 }
84 
86 {
89 
90  bool free_avec, free_bvec;
91  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
92  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
93 
94  int32_t alen_pos, blen_pos;
95  bool afree_pos, bfree_pos;
96  uint16_t* positions_a = motif_positions_lhs->get_feature_vector(idx_a, alen_pos, afree_pos);
97  uint16_t* positions_b = motif_positions_rhs->get_feature_vector(idx_b, blen_pos, bfree_pos);
98  ASSERT(alen_pos==blen_pos);
99  int32_t num_pos=alen_pos;
100 
101 
102  float64_t result_rbf=0;
103  float64_t result_wds=0;
104 
105  for (int32_t p=0; p<num_pos; p++)
106  {
107  result_rbf+=CMath::sq(positions_a[p]-positions_b[p]);
108 
109  for (int32_t p2=0; p2<num_pos; p2++) //p+1 and below * 2
110  result_rbf+=CMath::sq( (positions_a[p]-positions_a[p2]) - (positions_b[p]-positions_b[p2]) );
111 
112  int32_t limit = window;
113  if (window + positions_a[p] > alen)
114  limit = alen - positions_a[p];
115 
116  if (window + positions_b[p] > blen)
117  limit = CMath::min(limit, blen - positions_b[p]);
118 
119  result_wds+=compute_wds(&avec[positions_a[p]], &bvec[positions_b[p]],
120  limit);
121  }
122 
123  float64_t result=exp(-result_rbf/width)+result_wds;
124 
125  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
126  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
127  ((CSimpleFeatures<uint16_t>*) lhs)->free_feature_vector(positions_a, idx_a, afree_pos);
128  ((CSimpleFeatures<uint16_t>*) rhs)->free_feature_vector(positions_b, idx_b, bfree_pos);
129 
130  return result;
131 }
132 
134  char* avec, char* bvec, int32_t len)
135 {
136  float64_t* max_shift_vec = SG_MALLOC(float64_t, shift);
137  float64_t sum0=0 ;
138  for (int32_t i=0; i<shift; i++)
139  max_shift_vec[i]=0 ;
140 
141  // no shift
142  for (int32_t i=0; i<len; i++)
143  {
144  if ((position_weights!=NULL) && (position_weights[i]==0.0))
145  continue ;
146 
147  float64_t sumi = 0.0 ;
148  for (int32_t j=0; (j<degree) && (i+j<len); j++)
149  {
150  if (avec[i+j]!=bvec[i+j])
151  break ;
152  sumi += weights[j];
153  }
154  if (position_weights!=NULL)
155  sum0 += position_weights[i]*sumi ;
156  else
157  sum0 += sumi ;
158  } ;
159 
160  for (int32_t i=0; i<len; i++)
161  {
162  for (int32_t k=1; (k<=shift) && (i+k<len); k++)
163  {
164  if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
165  continue ;
166 
167  float64_t sumi1 = 0.0 ;
168  // shift in sequence a
169  for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
170  {
171  if (avec[i+j+k]!=bvec[i+j])
172  break ;
173  sumi1 += weights[j];
174  }
175  float64_t sumi2 = 0.0 ;
176  // shift in sequence b
177  for (int32_t j=0; (j<degree) && (i+j+k<len); j++)
178  {
179  if (avec[i+j]!=bvec[i+j+k])
180  break ;
181  sumi2 += weights[j];
182  }
183  if (position_weights!=NULL)
184  max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
185  else
186  max_shift_vec[k-1] += sumi1 + sumi2 ;
187  } ;
188  }
189 
190  float64_t result = sum0 ;
191  for (int32_t i=0; i<shift; i++)
192  result += max_shift_vec[i]/(2*(i+1)) ;
193 
194  SG_FREE(max_shift_vec);
195  return result ;
196 }
197 
199 {
200  ASSERT(degree>0);
201 
202  SG_FREE(weights);
204 
205  int32_t i;
206  float64_t sum=0;
207  for (i=0; i<degree; i++)
208  {
209  weights[i]=degree-i;
210  sum+=weights[i];
211  }
212 
213  for (i=0; i<degree; i++)
214  weights[i]/=sum;
215 }
216 
218 {
219  m_parameters->add(&width, "width", "the width of Gaussian kernel part");
220  m_parameters->add(&degree, "degree", "the degree of weighted degree kernel part");
221  m_parameters->add(&shift, "shift", "the shift of weighted degree with shifts kernel part");
222  m_parameters->add(&window, "window", "the size of window around motifs");
223  m_parameters->add_vector((CSGObject***)&motif_positions_lhs, &alen, "motif_positions_lhs", "the matrix of motif positions from sequences left-hand side");
224  m_parameters->add_vector((CSGObject***)&motif_positions_rhs, &blen, "motif_positions_rhs", "the matrix of motif positions from sequences right-hand side");
225  m_parameters->add_vector(&position_weights, &degree, "position_weights", "scaling weights in window");
226  m_parameters->add_vector(&weights, &degree, "weights", "weights of WD kernel");
227 }

SHOGUN Machine Learning Toolbox - Documentation