SHOGUN
v1.1.0
|
File based string features.
File-based string features: underneath, memory-mapped files are used. The class is derived from CStringFeatures, thus transparently enabling all of the CStringFeatures functionality.
The supported file format contains one string per line; lines of variable length are supported and must be separated by '\n'.
Definition at line 34 of file StringFileFeatures.h.
Public Member Functions | |
CStringFileFeatures () | |
CStringFileFeatures (const char *fname, EAlphabet alpha) | |
virtual | ~CStringFileFeatures () |
Public Member Functions inherited from CStringFeatures< ST > | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (SGStringList< ST > string_list, EAlphabet alpha) | |
CStringFeatures (SGStringList< ST > string_list, CAlphabet *alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (CFile *loader, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup_feature_vectors (int32_t start, int32_t stop) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
SGVector< ST > | get_feature_vector (int32_t num) |
void | set_feature_vector (SGVector< ST > vector, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
CStringFeatures< ST > * | get_transposed () |
SGString< ST > * | get_transposed (int32_t &num_feat, int32_t &num_vec) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
void | free_feature_vector (SGVector< ST > feat_vec, int32_t num) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () const |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual void | load (CFile *loader) |
void | load_ascii_file (char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
void | set_features (SGStringList< ST > feats) |
bool | set_features (SGString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
bool | append_features (CStringFeatures< ST > *sf) |
bool | append_features (SGString< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
SGStringList< ST > | get_features () |
virtual SGString< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual SGString< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (SGString< ST > **dst, int32_t *num_str) |
virtual void | save (CFile *writer) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preprocessor (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual void | get_histogram (float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true) |
virtual void | create_random (float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec) |
virtual CFeatures * | copy_subset (SGVector< index_t > indices) |
virtual const char * | get_name () const |
virtual void | subset_changed_post () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
EFeatureType | get_feature_type () |
template<> | |
bool | get_masked_symbols (bool symbol, uint8_t mask) |
template<> | |
float32_t | get_masked_symbols (float32_t symbol, uint8_t mask) |
template<> | |
float64_t | get_masked_symbols (float64_t symbol, uint8_t mask) |
template<> | |
floatmax_t | get_masked_symbols (floatmax_t symbol, uint8_t mask) |
template<> | |
bool | shift_offset (bool symbol, int32_t amount) |
template<> | |
float32_t | shift_offset (float32_t symbol, int32_t amount) |
template<> | |
float64_t | shift_offset (float64_t symbol, int32_t amount) |
template<> | |
floatmax_t | shift_offset (floatmax_t symbol, int32_t amount) |
template<> | |
bool | shift_symbol (bool symbol, int32_t amount) |
template<> | |
float32_t | shift_symbol (float32_t symbol, int32_t amount) |
template<> | |
float64_t | shift_symbol (float64_t symbol, int32_t amount) |
template<> | |
floatmax_t | shift_symbol (floatmax_t symbol, int32_t amount) |
template<> | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<> | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<> | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<> | |
void | embed_features (int32_t p_order) |
template<> | |
void | embed_features (int32_t p_order) |
template<> | |
void | embed_features (int32_t p_order) |
template<> | |
void | compute_symbol_mask_table (int64_t max_val) |
template<> | |
void | compute_symbol_mask_table (int64_t max_val) |
template<> | |
void | compute_symbol_mask_table (int64_t max_val) |
template<> | |
float32_t | embed_word (float32_t *seq, int32_t len) |
template<> | |
float64_t | embed_word (float64_t *seq, int32_t len) |
template<> | |
floatmax_t | embed_word (floatmax_t *seq, int32_t len) |
template<> | |
void | unembed_word (float32_t word, uint8_t *seq, int32_t len) |
template<> | |
void | unembed_word (float64_t word, uint8_t *seq, int32_t len) |
template<> | |
void | unembed_word (floatmax_t word, uint8_t *seq, int32_t len) |
Public Member Functions inherited from CFeatures | |
CFeatures (int32_t size=0) | |
CFeatures (const CFeatures &orig) | |
CFeatures (CFile *loader) | |
virtual | ~CFeatures () |
virtual int32_t | add_preprocessor (CPreprocessor *p) |
set preprocessor | |
virtual CPreprocessor * | del_preprocessor (int32_t num) |
del current preprocessor | |
CPreprocessor * | get_preprocessor (int32_t num) |
get current preprocessor | |
void | set_preprocessed (int32_t num) |
bool | is_preprocessed (int32_t num) |
int32_t | get_num_preprocessed () |
get whether specified preprocessor (or all if num=1) was/were already applied | |
int32_t | get_num_preprocessors () const |
void | clean_preprocessors () |
int32_t | get_cache_size () |
virtual bool | reshape (int32_t num_features, int32_t num_vectors) |
void | list_feature_obj () |
bool | check_feature_compatibility (CFeatures *f) |
bool | has_property (EFeatureProperty p) |
void | set_property (EFeatureProperty p) |
void | unset_property (EFeatureProperty p) |
virtual void | set_subset (CSubset *subset) |
virtual void | remove_subset () |
index_t | subset_idx_conversion (index_t idx) const |
bool | has_subset () const |
Public Member Functions inherited from CSGObject | |
CSGObject () | |
CSGObject (const CSGObject &orig) | |
virtual | ~CSGObject () |
virtual bool | is_generic (EPrimitiveType *generic) const |
template<class T > | |
void | set_generic () |
void | unset_generic () |
virtual void | print_serializable (const char *prefix="") |
virtual bool | save_serializable (CSerializableFile *file, const char *prefix="") |
virtual bool | load_serializable (CSerializableFile *file, const char *prefix="") |
void | set_global_io (SGIO *io) |
SGIO * | get_global_io () |
void | set_global_parallel (Parallel *parallel) |
Parallel * | get_global_parallel () |
void | set_global_version (Version *version) |
Version * | get_global_version () |
SGVector< char * > | get_modelsel_names () |
char * | get_modsel_param_descr (const char *param_name) |
index_t | get_modsel_param_index (const char *param_name) |
Protected Member Functions | |
ST * | get_line (uint64_t &len, uint64_t &offs, int32_t &line_nr, uint64_t file_length) |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
void | fetch_meta_info_from_file (int32_t granularity=1048576) |
Protected Member Functions inherited from CStringFeatures< ST > | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
Protected Attributes | |
CMemoryMappedFile< ST > * | file |
Protected Attributes inherited from CStringFeatures< ST > | |
CAlphabet * | alphabet |
int32_t | num_vectors |
SGString< ST > * | features |
ST * | single_string |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table used in higher order mapping | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
Protected Attributes inherited from CFeatures | |
CSubset * | m_subset |
Additional Inherited Members | |
Static Public Member Functions inherited from CStringFeatures< ST > | |
static ST * | get_zero_terminated_string_copy (SGString< ST > str) |
Public Attributes inherited from CSGObject | |
SGIO * | io |
Parallel * | parallel |
Version * | version |
Parameter * | m_parameters |
Parameter * | m_model_selection_parameters |
default constructor
Definition at line 6 of file StringFileFeatures.cpp.
CStringFileFeatures | ( | const char * | fname, |
EAlphabet | alpha | ||
) |
constructor
fname | filename of the file containing line based features |
alpha | alphabet (type) to use for string features |
Definition at line 10 of file StringFileFeatures.cpp.
|
virtual |
default destructor
Definition at line 17 of file StringFileFeatures.cpp.
|
protectedvirtual |
cleanup string features
Reimplemented from CStringFeatures< ST >.
Definition at line 53 of file StringFileFeatures.cpp.
|
protectedvirtual |
cleanup a single feature vector
Reimplemented from CStringFeatures< ST >.
Definition at line 71 of file StringFileFeatures.cpp.
|
protected |
obtain meta information from file
i.e., determine number of strings and their lengths
Definition at line 77 of file StringFileFeatures.cpp.
|
protected |
get next line from file
The returned line may be modified if the file was opened read/write. It is otherwise read-only.
len | length of line (returned via reference) |
offs | offset to be passed for reading next line, should be 0 initially (returned via reference) |
line_nr | used to indicate errors, should be 0 initially (returned via reference) |
file_length | total length of the file (for error checking) |
Definition at line 23 of file StringFileFeatures.cpp.
|
protected |
memory mapped file
Definition at line 86 of file StringFileFeatures.h.