SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringFeatures.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Subset support written (W) 2011 Heiko Strathmann
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef _CSTRINGFEATURES__H__
14 #define _CSTRINGFEATURES__H__
15 
16 #include <shogun/lib/common.h>
17 #include <shogun/lib/Cache.h>
19 #include <shogun/lib/Compressor.h>
20 #include <shogun/io/File.h>
21 
24 
25 namespace shogun
26 {
27 class CAlphabet;
28 template <class T> class CDynamicArray;
29 class CFile;
30 template <class T> class SGString;
31 
32 #ifndef DOXYGEN_SHOULD_SKIP_THIS
/** Plain aggregate of three int fields: two feature values (feature1,
 * feature2) and a group value.
 *
 * Declared inside the DOXYGEN_SHOULD_SKIP_THIS guard, so it is hidden from
 * the generated documentation.
 * NOTE(review): the meaning of the fields and of "group" is not visible in
 * this header -- confirm against the code that uses this struct.
 */
33 struct SSKDoubleFeature
34 {
35  int feature1;
36  int feature2;
37  int group;
38 };
39 
40 struct SSKTripleFeature
41 {
42  int feature1;
43  int feature2;
44  int feature3;
45  int group;
46 };
47 #endif
48 
/** Class template CStringFeatures: a CFeatures subclass parameterised on the
 * symbol type ST.
 *
 * Only declarations are visible here; apart from get_name(), no method body
 * is part of this listing. The embedded line numbers have gaps, which
 * suggests that lines (presumably the original documentation comments, and
 * possibly some declarations) were dropped when the listing was produced --
 * TODO confirm against the real header before relying on this view.
 */
71 template <class ST> class CStringFeatures : public CFeatures
72 {
73  public:
78 
84 
    /** Construct from a list of strings and an alphabet given as an
     * EAlphabet value.
     *
     * @param string_list strings to use as features (passed by value)
     * @param alpha alphabet identifier
     */
89  CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
90 
    /** Construct from a list of strings and an alphabet object.
     *
     * @param string_list strings to use as features (passed by value)
     * @param alpha pointer to the alphabet object
     */
95  CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
96 
    /** Construct from an alphabet object only.
     *
     * @param alpha pointer to the alphabet object
     */
101  CStringFeatures(CAlphabet* alpha);
102 
    /** Copy constructor.
     *
     * @param orig instance to copy from
     */
104  CStringFeatures(const CStringFeatures & orig);
105 
    /** Construct from a CFile loader.
     *
     * @param loader file object to read from
     * @param alpha alphabet identifier; defaults to DNA
     */
111  CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
112 
    /** Virtual destructor. */
113  virtual ~CStringFeatures();
114 
    /** Clean-up hook.
     * NOTE(review): presumably releases the stored strings -- confirm in the
     * implementation.
     */
120  virtual void cleanup();
121 
    /** Clean up a single feature vector.
     *
     * @param num index of the vector
     */
128  virtual void cleanup_feature_vector(int32_t num);
129 
    /** Clean up a range of feature vectors.
     *
     * @param start first index of the range
     * @param stop last index of the range
     * NOTE(review): whether stop is inclusive is not visible here -- confirm.
     */
137  virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
138 
144 
    /** @return the feature type of this object as an EFeatureType value */
149  virtual EFeatureType get_feature_type();
150 
156 
    /** @return a CFeatures pointer to a duplicate of this object
     * NOTE(review): ownership of the returned pointer is not visible here.
     */
161  virtual CFeatures* duplicate() const;
162 
    /** Get a feature vector as an SGVector.
     *
     * @param num index of the vector
     * @return the vector as SGVector<ST>
     */
169  SGVector<ST> get_feature_vector(int32_t num);
170 
    /** Set a feature vector from an SGVector.
     *
     * @param vector new contents
     * @param num index of the vector to set
     */
178  void set_feature_vector(SGVector<ST> vector, int32_t num);
179 
183 
188 
    /** Get a feature vector as a raw pointer.
     *
     * @param num index of the vector
     * @param len non-const reference; presumably receives the vector length
     * @param dofree non-const reference; presumably tells the caller whether
     *        the result must be released via free_feature_vector() -- confirm
     * @return pointer to the symbols of the vector
     */
199  ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
200 
208 
    /** Get a transposed view of the strings.
     *
     * @param num_feat non-const reference; presumably an output count
     * @param num_vec non-const reference; presumably an output count
     * @return pointer to SGString<ST> data
     * NOTE(review): ownership of the returned array is not visible here.
     */
222  SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
223 
    /** Counterpart of the raw-pointer get_feature_vector() overload.
     *
     * @param feat_vec pointer previously obtained for vector num
     * @param num index of the vector
     * @param dofree flag matching the dofree parameter of get_feature_vector()
     */
232  void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
233 
    /** Counterpart of the SGVector get_feature_vector() overload.
     *
     * @param feat_vec vector previously obtained for index num
     * @param num index of the vector
     */
241  void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
242 
    /** Get a single symbol.
     *
     * @param vec_num index of the vector
     * @param feat_num index of the symbol within that vector
     * @return the symbol, of type ST
     */
251  virtual ST get_feature(int32_t vec_num, int32_t feat_num);
252 
    /** @param vec_num index of the vector
     * @return length of that vector
     */
260  virtual int32_t get_vector_length(int32_t vec_num);
261 
    /** @return maximum vector length */
268  virtual int32_t get_max_vector_length();
269 
    /** @return number of vectors */
271  virtual int32_t get_num_vectors() const;
272 
280 
289 
290  // these functions are necessary to find out about a former conversion process
291 
297 
    /** @return the order (see the protected member `order`) */
302  int32_t get_order();
303 
    /** Apply a mask to a symbol.
     *
     * @param symbol input symbol
     * @param mask 8-bit mask
     * @return resulting symbol
     * NOTE(review): exact masking semantics are not visible here.
     */
311  ST get_masked_symbols(ST symbol, uint8_t mask);
312 
    /** Shift an offset value.
     *
     * @param offset value to shift
     * @param amount shift amount
     * @return shifted value
     * NOTE(review): shift direction and units are not visible here.
     */
319  ST shift_offset(ST offset, int32_t amount);
320 
    /** Shift a symbol value.
     *
     * @param symbol value to shift
     * @param amount shift amount
     * @return shifted value
     * NOTE(review): shift direction and units are not visible here.
     */
327  ST shift_symbol(ST symbol, int32_t amount);
328 
    /** Load features through a CFile object.
     *
     * @param loader file object to read from
     */
333  virtual void load(CFile* loader);
334 
    /** Load features from an ASCII file.
     *
     * @param fname file name
     * @param remap_to_bin defaults to true
     * @param ascii_alphabet defaults to DNA
     * @param binary_alphabet defaults to RAWDNA
     */
345  void load_ascii_file(char* fname, bool remap_to_bin=true,
346  EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
347 
    /** Load features from a FASTA file.
     *
     * @param fname file name
     * @param ignore_invalid defaults to false
     * @return bool; presumably true on success -- confirm
     */
356  bool load_fasta_file(const char* fname, bool ignore_invalid=false);
357 
    /** Load features from a FASTQ file.
     *
     * @param fname file name
     * @param ignore_invalid defaults to false
     * @param bitremap_in_single_string defaults to false
     * @return bool; presumably true on success -- confirm
     */
367  bool load_fastq_file(const char* fname,
368  bool ignore_invalid=false, bool bitremap_in_single_string=false);
369 
    /** Load features from a directory.
     *
     * @param dirname directory name
     * @return bool; presumably true on success -- confirm
     */
377  bool load_from_directory(char* dirname);
378 
    /** Set the features from a string list.
     *
     * @param feats list of strings (passed by value)
     */
384  void set_features(SGStringList<ST> feats);
385 
    /** Set the features from a raw SGString array.
     *
     * @param p_features pointer to the strings
     * @param p_num_vectors number of strings
     * @param p_max_string_length maximum string length
     * @return bool; presumably true on success -- confirm
     * NOTE(review): whether ownership of p_features is taken is not visible.
     */
395  bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
396  int32_t p_max_string_length);
397 
407 
    /** Append a raw SGString array to the features.
     *
     * @param p_features pointer to the strings
     * @param p_num_vectors number of strings
     * @param p_max_string_length maximum string length
     * @return bool; presumably true on success -- confirm
     * NOTE(review): whether ownership of p_features is taken is not visible.
     */
420  bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
421  int32_t p_max_string_length);
422 
427 
    /** Get the features as a raw SGString array.
     *
     * @param num_str non-const reference; presumably receives the string count
     * @param max_str_len non-const reference; presumably receives the
     *        maximum string length
     * @return pointer to the strings
     */
436  virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
437 
    /** Same signature as get_features(); the name indicates a copy is
     * returned.
     * NOTE(review): confirm copy depth and who frees the result.
     *
     * @param num_str non-const reference; presumably receives the string count
     * @param max_str_len non-const reference; presumably receives the
     *        maximum string length
     * @return pointer to the strings
     */
446  virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
447 
    /** Get the features through pointer out-parameters.
     *
     * @param dst destination pointer for the string array
     * @param num_str destination pointer for the string count
     */
455  virtual void get_features(SGString<ST>** dst, int32_t* num_str);
456 
    /** Save features through a CFile object.
     *
     * @param writer file object to write to
     */
463  virtual void save(CFile* writer);
464 
    /** Load features from a compressed source.
     *
     * @param src source name
     * @param decompress flag
     * @return bool; presumably true on success -- confirm
     * NOTE(review): the effect of decompress=false is not visible here.
     */
473  virtual bool load_compressed(char* src, bool decompress);
474 
    /** Save features in compressed form.
     *
     * @param dest destination name
     * @param compression compression type (E_COMPRESSION_TYPE)
     * @param level compression level
     * @return bool; presumably true on success -- confirm
     */
484  virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
485 
    /** @return size as int32_t
     * NOTE(review): presumably the size of one symbol -- confirm.
     */
490  virtual int32_t get_size();
491 
    /** Apply preprocessing.
     *
     * @param force_preprocessing defaults to false
     * @return bool; presumably true on success -- confirm
     */
497  virtual bool apply_preprocessor(bool force_preprocessing=false);
498 
    /** Obtain features using a sliding window.
     *
     * @param window_size window size
     * @param step_size step between windows
     * @param skip defaults to 0
     * @return int32_t; presumably the number of resulting vectors -- confirm
     */
511  int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
512 
    /** Obtain features using windows at listed positions.
     *
     * @param window_size window size
     * @param positions dynamic array of positions
     * @param skip defaults to 0
     * @return int32_t; presumably the number of resulting vectors -- confirm
     */
523  int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
524  int32_t skip=0);
525 
    /** Obtain features from char string features.
     *
     * @param sf source char features
     * @param start start value
     * @param p_order order
     * @param gap gap
     * @param rev flag
     * @return bool; presumably true on success -- confirm
     * NOTE(review): the precise meaning of start/gap/rev is not visible here.
     */
539  bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
540  int32_t p_order, int32_t gap, bool rev);
541 
    /** Member template with the same parameters as obtain_from_char(), but
     * taking source features of an arbitrary symbol type CT.
     *
     * @param sf source features of symbol type CT
     * @param start start value
     * @param p_order order
     * @param gap gap
     * @param rev flag
     * @return bool; presumably true on success -- confirm
     */
553  template <class CT>
554  bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
555  int32_t p_order, int32_t gap, bool rev);
556 
    /** Check string lengths.
     *
     * @param len defaults to -1
     * @return bool result of the check
     * NOTE(review): presumably -1 means "compare the strings to each other"
     * -- confirm.
     */
566  bool have_same_length(int32_t len=-1);
567 
    /** Embed the features.
     *
     * @param p_order order
     */
573  void embed_features(int32_t p_order);
574 
    /** Compute the symbol mask table.
     *
     * @param max_val maximum value
     */
581  void compute_symbol_mask_table(int64_t max_val);
582 
    /** Unembed a word into a byte sequence.
     *
     * @param word input word
     * @param seq output buffer
     * @param len length
     * NOTE(review): presumably the inverse of embed_word() -- confirm.
     */
589  void unembed_word(ST word, uint8_t* seq, int32_t len);
590 
    /** Embed a sequence into a single word.
     *
     * @param seq input sequence
     * @param len length of the sequence
     * @return the resulting word, of type ST
     */
596  ST embed_word(ST* seq, int32_t len);
597 
603 
612 
    /** Set a feature vector from a raw pointer and length.
     *
     * @param num index of the vector to set
     * @param string pointer to the symbols
     * @param len number of symbols
     * NOTE(review): whether the data is copied is not visible here.
     */
621  virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
622 
    /** Get a histogram through pointer out-parameters.
     *
     * @param hist destination pointer for the histogram data
     * @param rows destination pointer for the row count
     * @param cols destination pointer for the column count
     * @param normalize defaults to true
     */
627  virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
628  bool normalize=true);
629 
    /** Create random features from a histogram.
     *
     * @param hist histogram data
     * @param rows number of rows of hist
     * @param cols number of columns of hist
     * @param num_vec number of vectors
     */
634  virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
635  int32_t num_vec);
636 
    /** Copy a subset of the features.
     *
     * @param indices indices selecting the subset
     * @return a CFeatures pointer
     * NOTE(review): ownership of the returned pointer is not visible here.
     */
645  virtual CFeatures* copy_subset(SGVector<index_t> indices);
646 
    /** @return the string literal "StringFeatures" */
648  inline virtual const char* get_name() const { return "StringFeatures"; }
649 
    /** Hook whose name indicates it runs after a subset change.
     * NOTE(review): who calls it is not visible here.
     */
651  virtual void subset_changed_post();
652 
653  protected:
    /** Compute a feature vector.
     *
     * @param num index of the vector
     * @param len non-const reference; presumably receives the vector length
     * @return pointer to the symbols of the vector
     */
664  virtual ST* compute_feature_vector(int32_t num, int32_t& len);
665 
666  private:
    /** Private initialisation helper; its body is not part of this listing. */
667  void init();
668 
669  protected:
670 
673 
    /** Number of vectors. */
675  int32_t num_vectors;
676 
679 
682 
685 
688 
691 
694 
    /** Order; presumably the value returned by get_order() -- confirm. */
696  int32_t order;
697 
700 
703 
706 };
707 }
708 #endif // _CSTRINGFEATURES__H__

SHOGUN Machine Learning Toolbox - Documentation