15 sparse_feature_matrix(NULL), feature_cache(NULL)
21 int32_t num_feat, int32_t num_vec,
bool copy)
23 sparse_feature_matrix(NULL), feature_cache(NULL)
33 for (int32_t i=0; i< num_vec; i++)
44 sparse_feature_matrix(NULL), feature_cache(NULL)
53 sparse_feature_matrix(NULL), feature_cache(NULL)
62 num_features(orig.num_features),
63 sparse_feature_matrix(orig.sparse_feature_matrix),
64 feature_cache(orig.feature_cache)
85 sparse_feature_matrix(NULL), feature_cache(NULL)
94 free_sparse_features();
98 clean_tsparse(sparse_feature_matrix, num_vectors);
99 sparse_feature_matrix = NULL;
106 free_sparse_feature_matrix();
107 delete feature_cache;
108 feature_cache = NULL;
117 ASSERT(index>=0 && index<num_features) ;
118 ASSERT(num>=0 && num<get_num_vectors()) ;
131 free_sparse_feature_vector(sv, num);
148 for (i=0; i<num_features; i++)
155 free_sparse_feature_vector(sv, num);
162 if (num>=num_vectors)
164 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
165 "requested %d)\n", num_vectors, num);
175 dense.
vlen=num_features;
177 memset(dense.
vector, 0,
sizeof(ST)*num_features);
185 free_sparse_feature_vector(sv, num);
194 free_sparse_feature_vector(sv, num);
200 ASSERT(num<get_num_vectors());
202 index_t real_num=subset_idx_conversion(num);
206 if (sparse_feature_matrix)
208 result=sparse_feature_matrix[real_num];
218 result.
features=feature_cache->lock_entry(num);
224 result.
features=feature_cache->set_entry(num);
231 result.
features=compute_sparse_feature_vector(num,
235 if (get_num_preprocessors())
241 for (int32_t i=0; i<get_num_preprocessors(); i++)
247 tmp_feat_before=tmp_feat_after;
250 memcpy(result.
features, tmp_feat_after,
272 for (int32_t i=0; i<alen; i++)
276 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
279 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
289 for (int32_t i=0; i<blen; i++)
293 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
296 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
313 ASSERT(dim==num_features);
327 free_sparse_feature_vector(sv, num);
334 if (dim!=num_features)
336 SG_ERROR(
"dimension of vec (=%d) does not match number of features (=%d)\n",
362 free_sparse_feature_vector(sv, num);
368 feature_cache->unlock_entry(subset_idx_conversion(num));
376 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
378 num_feat=num_features;
381 return sparse_feature_matrix;
387 SG_ERROR(
"get_sparse_feature_matrix() not allowed with subset\n");
398 for (int32_t i=0; i<num_vec; i++)
415 num_feat=get_num_vectors();
416 num_vec=num_features;
418 int32_t* hist=
SG_MALLOC(int32_t, num_features);
419 memset(hist, 0,
sizeof(int32_t)*num_features);
422 for (int32_t v=0; v<num_feat; v++)
429 free_sparse_feature_vector(sv, v);
434 for (int32_t v=0; v<num_vec; v++)
442 memset(hist,0,
sizeof(int32_t)*num_features);
443 for (int32_t v=0; v<num_feat; v++)
456 free_sparse_feature_vector(sv, v);
466 SG_ERROR(
"set_sparse_feature_matrix() not allowed with subset\n");
469 free_sparse_feature_matrix();
481 SG_INFO(
"converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
487 memset(full.
matrix, 0,
size_t(num_features)*
size_t(get_num_vectors())*
sizeof(ST));
489 for (int32_t v=0; v<full.
num_cols; v++)
492 sparse_feature_matrix[subset_idx_conversion(v)];
496 int64_t offs=(current.
vec_index*num_features)
514 free_sparse_feature_matrix();
516 num_features=num_feat;
519 SG_INFO(
"converting dense feature matrix to sparse one\n");
520 int32_t* num_feat_entries=
SG_MALLOC(
int, num_vectors);
522 if (num_feat_entries)
524 int64_t num_total_entries=0;
527 for (int32_t i=0; i< num_vec; i++)
529 num_feat_entries[i]=0;
530 for (int32_t j=0; j< num_feat; j++)
532 if (src[i*((int64_t) num_feat) + j] != 0)
533 num_feat_entries[i]++;
541 if (sparse_feature_matrix)
543 for (int32_t i=0; i< num_vec; i++)
545 sparse_feature_matrix[i].vec_index=i;
546 sparse_feature_matrix[i].num_feat_entries=0;
547 sparse_feature_matrix[i].features= NULL;
549 if (num_feat_entries[i]>0)
553 if (!sparse_feature_matrix[i].features)
555 SG_INFO(
"allocation of features failed\n");
559 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
560 int32_t sparse_feat_idx=0;
562 for (int32_t j=0; j< num_feat; j++)
564 int64_t pos= i*((int64_t) num_feat) + j; // widen before multiplying so the flat index cannot overflow 32-bit arithmetic (same form as the counting pass above)
568 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
569 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
579 SG_ERROR(
"allocation of sparse feature matrix failed\n");
583 SG_INFO(
"sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
584 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
588 SG_ERROR(
"huh ? zero size matrix given ?\n");
598 SG_INFO(
"force: %d\n", force_preprocessing);
600 if ( sparse_feature_matrix && get_num_preprocessors() )
602 for (int32_t i=0; i<get_num_preprocessors(); i++)
604 if ( (!is_preprocessed(i) || force_preprocessing) )
607 SG_INFO(
"preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
617 SG_WARNING(
"no sparse feature matrix available or features already preprocessed - skipping.\n");
632 return set_full_feature_matrix(fm);
637 return m_subset ? m_subset->get_size() : num_vectors;
647 int32_t n=num_features;
661 feature_cache->unlock_entry(subset_idx_conversion(num));
669 index_t num_vec=get_num_vectors();
670 for (int32_t i=0; i<num_vec; i++)
671 num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
680 index_t num_vec=get_num_vectors();
681 for (int32_t i=0; i<num_vec; i++)
689 free_feature_vector(vec, i);
708 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
756 bool do_sort_features)
762 size_t blocksize=1024*1024;
763 size_t required_blocksize=blocksize;
764 uint8_t* dummy=
SG_MALLOC(uint8_t, blocksize);
765 FILE* f=fopen(fname,
"r"); // "ro" is not a standard fopen mode string; plain "r" is the portable read-only mode
769 free_sparse_feature_matrix();
773 SG_INFO(
"counting line numbers in file %s\n", fname);
776 size_t old_block_offs=0;
777 fseek(f, 0, SEEK_END);
778 size_t fsize=ftell(f);
781 while (sz == blocksize)
783 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
784 for (
size_t i=0; i<sz; i++)
787 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
790 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs+1);
791 old_block_offs=block_offs;
794 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
797 SG_INFO(
"found %d feature vectors\n", num_vectors);
799 blocksize=required_blocksize;
808 while (sz == blocksize)
810 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
813 for (
size_t i=0; i<sz; i++)
815 if (i==sz-1 && dummy[i]!=
'\n' && sz==blocksize)
817 size_t len=i-old_sz+1;
818 uint8_t* data=&dummy[old_sz];
820 for (
size_t j=0; j<len; j++)
823 sz=fread(dummy+len,
sizeof(uint8_t), blocksize-len, f);
829 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
833 uint8_t* data=&dummy[old_sz];
836 for (
size_t j=0; j<len; j++)
844 SG_ERROR(
"Error in line %d - number of"
845 " dimensions is %d line is %d characters"
846 " long\n line_content:'%.*s'\n", lines,
847 dims, len, len, (
const char*) data);
858 lab->
set_label(lines, atof((
const char*) data));
865 uint8_t* start=&data[j];
872 feat[d].
feat_index=(int32_t) atoi((
const char*) start)-1;
873 num_features=
CMath::max(num_features, feat[d].feat_index+1);
879 if (data[j]==
' ' || data[j]==
'\n')
882 feat[d].
entry=(ST) atof((
const char*) start);
891 feat[dims-1].
entry=(ST) atof((
const char*) start);
899 sparse_feature_matrix[lines].vec_index=lines;
900 sparse_feature_matrix[lines].num_feat_entries=dims;
901 sparse_feature_matrix[lines].features=feat;
905 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t");
909 SG_INFO(
"file successfully read\n");
915 if (do_sort_features)
924 SG_ERROR(
"sort_features() not allowed with subset\n");
926 ASSERT(get_num_preprocessors()==0);
928 if (!sparse_feature_matrix)
929 SG_ERROR(
"Requires sparse feature matrix to be available in-memory\n");
931 for (int32_t i=0; i<num_vectors; i++)
933 int32_t len=sparse_feature_matrix[i].num_feat_entries;
939 int32_t* feat_idx=
SG_MALLOC(int32_t, len);
940 int32_t* orig_idx=
SG_MALLOC(int32_t, len);
942 for (
int j=0; j<len; j++)
951 for (
int j=0; j<len; j++)
952 sf_new[j]=sf_orig[orig_idx[j]];
954 sparse_feature_matrix[i].features=sf_new;
957 for (
int j=0; j<len-1; j++)
958 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
970 SG_ERROR(
"write_svmlight_file() not allowed with subset\n");
977 FILE* f=fopen(fname,
"wb");
981 for (int32_t i=0; i<num; i++)
986 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
988 for (int32_t j=0; j<num_feat; j++)
991 fprintf(f,
"%d:%f ", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
993 fprintf(f,
"%d:%f\n", (int32_t) vec[j].feat_index+1, (
double) vec[j].entry);
1005 return num_features;
1022 free_sparse_feature_vector(avec, vec_idx1);
1023 sf->free_sparse_feature_vector(bvec, vec_idx2);
1030 if (vec2_len!=num_features)
1032 SG_ERROR(
"dimension of vec2 (=%d) does not match number of features (=%d)\n",
1033 vec2_len, num_features);
1045 free_sparse_feature_vector(sv, vec_idx1);
1052 if (vector_index>=get_num_vectors())
1054 SG_ERROR(
"Index out of bounds (number of vectors %d, you "
1055 "requested %d)\n", get_num_vectors(), vector_index);
1058 if (!sparse_feature_matrix)
1059 SG_ERROR(
"Requires a in-memory feature matrix\n");
1061 sparse_feature_iterator* it=
SG_MALLOC(sparse_feature_iterator, 1);
1062 it->sv=get_sparse_feature_vector(vector_index);
1070 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1071 if (!it || it->index>=it->sv.num_feat_entries)
1074 int32_t i=it->index++;
1076 index=it->sv.features[i].feat_index;
1077 value=(
float64_t) it->sv.features[i].entry;
1087 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1088 free_sparse_feature_vector(it->sv, it->sv.vec_index);
1095 get_dim_feature_space());
1111 free_sparse_feature_vector(current, index);
1130 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
1131 "sparse_feature_matrix",
1132 "Array of sparse vectors.");
1133 m_parameters->add(&num_features,
"num_features",
1134 "Total number of features.");
1137 #define GET_FEATURE_TYPE(sg_type, f_type) \
1138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
1155 #undef GET_FEATURE_TYPE
1157 #define LOAD(fname, sg_type) \
1158 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
1163 SGSparseVector<sg_type>* matrix=NULL; \
1164 int32_t num_feat=0; \
1165 int32_t num_vec=0; \
1166 loader->fname(matrix, num_feat, num_vec); \
1167 set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
// One LOAD() expansion per element type. Each expansion emits the
// CSparseFeatures<sg_type>::load(CFile*) specialization; the first macro
// argument is the CFile method that load() calls to obtain the sparse matrix,
// the second is the element type being specialized.
1170 LOAD(get_sparse_matrix,
bool)
1171 LOAD(get_sparse_matrix,
char)
1172 LOAD(get_sparse_matrix, uint8_t)
1173 LOAD(get_int8_sparsematrix, int8_t)
1174 LOAD(get_sparse_matrix, int16_t)
1175 LOAD(get_sparse_matrix, uint16_t)
1176 LOAD(get_sparse_matrix, int32_t)
1177 LOAD(get_uint_sparsematrix, uint32_t)
1178 LOAD(get_long_sparsematrix, int64_t)
1179 LOAD(get_ulong_sparsematrix, uint64_t)
1180 LOAD(get_sparse_matrix, float32_t)
1181 LOAD(get_sparse_matrix, float64_t)
1182 LOAD(get_longreal_sparsematrix, floatmax_t)
1185 #define WRITE(fname, sg_type) \
1186 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
1189 SG_ERROR("save() not allowed with subset\n"); \
1192 writer->fname(sparse_feature_matrix, num_features, num_vectors); \
// One WRITE() expansion per element type. Each expansion emits the
// CSparseFeatures<sg_type>::save(CFile*) specialization; the first macro
// argument is the CFile method that save() calls with the sparse matrix,
// num_features and num_vectors, the second is the element type.
1195 WRITE(set_sparse_matrix,
bool)
1196 WRITE(set_sparse_matrix,
char)
1197 WRITE(set_sparse_matrix, uint8_t)
1198 WRITE(set_int8_sparsematrix, int8_t)
1199 WRITE(set_sparse_matrix, int16_t)
1200 WRITE(set_sparse_matrix, uint16_t)
1201 WRITE(set_sparse_matrix, int32_t)
1202 WRITE(set_uint_sparsematrix, uint32_t)
1203 WRITE(set_long_sparsematrix, int64_t)
1204 WRITE(set_ulong_sparsematrix, uint64_t)
1205 WRITE(set_sparse_matrix, float32_t)
1206 WRITE(set_sparse_matrix, float64_t)
1207 WRITE(set_longreal_sparsematrix, floatmax_t)
// Explicit instantiations of CSparseFeatures, one per element type listed
// below, so the template's members are compiled in this translation unit.
1210 template class CSparseFeatures<bool>;
1211 template class CSparseFeatures<char>;
1212 template class CSparseFeatures<int8_t>;
1213 template class CSparseFeatures<uint8_t>;
1214 template class CSparseFeatures<int16_t>;
1215 template class CSparseFeatures<uint16_t>;
1216 template class CSparseFeatures<int32_t>;
1217 template class CSparseFeatures<uint32_t>;
1218 template class CSparseFeatures<int64_t>;
1219 template class CSparseFeatures<uint64_t>;
1220 template class CSparseFeatures<float32_t>;
1221 template class CSparseFeatures<float64_t>;
1222 template class CSparseFeatures<floatmax_t>;