00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/SimpleFeatures.h"
00027 #include "features/RealFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029
00030 template <class ST> class CSparsePreProc;
00031
/** template struct TSparseEntry: a single non-zero element of a sparse
 * vector, stored as an (index, value) pair.
 */
template <class ST> struct TSparseEntry
{
	/** index of the (dense) feature dimension this entry belongs to */
	int32_t feat_index;
	/** value of the feature at feat_index */
	ST entry;
};
00040
/** template struct TSparse: one sparse feature vector, i.e. an array of
 * TSparseEntry pairs together with its position in the feature matrix.
 */
template <class ST> struct TSparse
{
public:
	/** index of this vector within the feature matrix */
	int32_t vec_index;
	/** number of non-zero entries stored in features */
	int32_t num_feat_entries;
	/** array of num_feat_entries (index, value) pairs */
	TSparseEntry<ST>* features;
};
00052
00065 template <class ST> class CSparseFeatures : public CFeatures
00066 {
00067 public:
00072 CSparseFeatures(int32_t size=0)
00073 : CFeatures(size), num_vectors(0), num_features(0),
00074 sparse_feature_matrix(NULL), feature_cache(NULL)
00075 {}
00076
00078 CSparseFeatures(const CSparseFeatures & orig)
00079 : CFeatures(orig), num_vectors(orig.num_vectors),
00080 num_features(orig.num_features),
00081 sparse_feature_matrix(orig.sparse_feature_matrix),
00082 feature_cache(orig.feature_cache)
00083 {
00084 if (orig.sparse_feature_matrix)
00085 {
00086 free_sparse_feature_matrix();
00087 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00088 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00089 for (int32_t i=0; i< num_vectors; i++)
00090 {
00091 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00092 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00093
00094 }
00095 }
00096 }
00097
00102 CSparseFeatures(char* fname)
00103 : CFeatures(fname), num_vectors(0), num_features(0),
00104 sparse_feature_matrix(NULL), feature_cache(NULL)
00105 {}
00106
00107 virtual ~CSparseFeatures()
00108 {
00109 free_sparse_features();
00110 }
00111
00115 void free_sparse_feature_matrix()
00116 {
00117 clean_tsparse(sparse_feature_matrix, num_vectors);
00118 sparse_feature_matrix = NULL;
00119 num_vectors=0;
00120 num_features=0;
00121 }
00122
00126 void free_sparse_features()
00127 {
00128 free_sparse_feature_matrix();
00129 delete feature_cache;
00130 feature_cache = NULL;
00131 }
00132
00137 virtual CFeatures* duplicate() const
00138 {
00139 return new CSparseFeatures<ST>(*this);
00140 }
00141
00150 ST* get_full_feature_vector(int32_t num, int32_t& len)
00151 {
00152 bool vfree;
00153 int32_t num_feat;
00154 int32_t i;
00155 len=0;
00156 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00157 ST* fv=NULL;
00158
00159 if (sv)
00160 {
00161 len=num_features;
00162 fv=new ST[num_features];
00163
00164 for (i=0; i<num_features; i++)
00165 fv[i]=0;
00166
00167 for (i=0; i<num_feat; i++)
00168 fv[sv[i].feat_index]= sv[i].entry;
00169 }
00170
00171 free_sparse_feature_vector(sv, num, vfree);
00172
00173 return fv;
00174 }
00175
00176
00182 inline int32_t get_num_sparse_vec_features(int32_t num)
00183 {
00184 bool vfree;
00185 int32_t len;
00186 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00187 free_sparse_feature_vector(sv, num, vfree);
00188 return len;
00189 }
00190
00201 TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
00202 {
00203 ASSERT(num<num_vectors);
00204
00205 if (sparse_feature_matrix)
00206 {
00207 len= sparse_feature_matrix[num].num_feat_entries;
00208 vfree=false ;
00209 return sparse_feature_matrix[num].features;
00210 }
00211 else
00212 {
00213 TSparseEntry<ST>* feat=NULL;
00214 vfree=false;
00215
00216 if (feature_cache)
00217 {
00218 feat=feature_cache->lock_entry(num);
00219
00220 if (feat)
00221 return feat;
00222 else
00223 {
00224 feat=feature_cache->set_entry(num);
00225 }
00226 }
00227
00228 if (!feat)
00229 vfree=true;
00230
00231 feat=compute_sparse_feature_vector(num, len, feat);
00232
00233
00234 if (get_num_preproc())
00235 {
00236 int32_t tmp_len=len;
00237 TSparseEntry<ST>* tmp_feat_before = feat;
00238 TSparseEntry<ST>* tmp_feat_after = NULL;
00239
00240 for (int32_t i=0; i<get_num_preproc(); i++)
00241 {
00242
00243
00244 if (i!=0)
00245 delete[] tmp_feat_before;
00246 tmp_feat_before=tmp_feat_after;
00247 }
00248
00249 memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
00250 delete[] tmp_feat_after;
00251 len=tmp_len ;
00252 SG_DEBUG( "len: %d len2: %d\n", len, num_features);
00253 }
00254 return feat ;
00255 }
00256 }
00257
00258
00269 ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00270 {
00271 ST result=0;
00272
00273
00274 if (avec && bvec)
00275 {
00276 if (alen<=blen)
00277 {
00278 int32_t j=0;
00279 for (int32_t i=0; i<alen; i++)
00280 {
00281 int32_t a_feat_idx=avec[i].feat_index;
00282
00283 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00284 j++;
00285
00286 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00287 {
00288 result+= avec[i].entry * bvec[j].entry;
00289 j++;
00290 }
00291 }
00292 }
00293 else
00294 {
00295 int32_t j=0;
00296 for (int32_t i=0; i<blen; i++)
00297 {
00298 int32_t b_feat_idx=bvec[i].feat_index;
00299
00300 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00301 j++;
00302
00303 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00304 {
00305 result+= bvec[i].entry * avec[j].entry;
00306 j++;
00307 }
00308 }
00309 }
00310
00311 result*=alpha;
00312 }
00313
00314 return result;
00315 }
00316
00328 void dense_dot_range(ST* output, int32_t start, int32_t stop, ST* alphas, ST* vec, int32_t dim, ST b)
00329 {
00330 ASSERT(output);
00331 ASSERT(start>=0);
00332 ASSERT(stop<=num_vectors);
00333
00334 for (int32_t i=start; i<stop; i++)
00335 output[i]=dense_dot(alphas[i], i, vec, dim, b);
00336 }
00337
00348 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00349 {
00350 ASSERT(vec);
00351 ASSERT(dim==num_features);
00352 ST result=b;
00353
00354 bool vfree;
00355 int32_t num_feat;
00356 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00357
00358 if (sv)
00359 {
00360 for (int32_t i=0; i<num_feat; i++)
00361 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00362 }
00363
00364 free_sparse_feature_vector(sv, num, vfree);
00365 return result;
00366 }
00367
00377 void add_to_dense_vec(ST alpha, int32_t num, ST* vec, int32_t dim, bool abs_val=false)
00378 {
00379 ASSERT(vec);
00380 ASSERT(dim==num_features);
00381
00382 bool vfree;
00383 int32_t num_feat;
00384 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00385
00386 if (sv)
00387 {
00388 if (abs_val)
00389 {
00390 for (int32_t i=0; i<num_feat; i++)
00391 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00392 }
00393 else
00394 {
00395 for (int32_t i=0; i<num_feat; i++)
00396 vec[sv[i].feat_index]+= alpha*sv[i].entry;
00397 }
00398 }
00399
00400 free_sparse_feature_vector(sv, num, vfree);
00401 }
00402
00409 void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00410 {
00411 if (feature_cache)
00412 feature_cache->unlock_entry(num);
00413
00414 if (free)
00415 delete[] feat_vec ;
00416 }
00417
00425 TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00426 {
00427 num_feat=num_features;
00428 num_vec=num_vectors;
00429
00430 return sparse_feature_matrix;
00431 }
00432
00438 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00439 {
00440 if (sfm)
00441 {
00442 for (int32_t i=0; i<num_vec; i++)
00443 delete[] sfm[i].features;
00444
00445 delete[] sfm;
00446 }
00447 }
00448
00458 TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
00459 {
00460 num_feat=num_vectors;
00461 num_vec=num_features;
00462
00463 int32_t* hist=new int32_t[num_features];
00464 memset(hist, 0, sizeof(int32_t)*num_features);
00465
00466
00467 for (int32_t v=0; v<num_vectors; v++)
00468 {
00469 int32_t vlen;
00470 bool vfree;
00471 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00472
00473 for (int32_t i=0; i<vlen; i++)
00474 hist[sv[i].feat_index]++;
00475
00476 free_sparse_feature_vector(sv, v, vfree);
00477 }
00478
00479
00480 TSparse<ST>* sfm=new TSparse<ST>[num_vec];
00481 for (int32_t v=0; v<num_vec; v++)
00482 {
00483 sfm[v].features= new TSparseEntry<ST>[hist[v]];
00484 sfm[v].num_feat_entries=hist[v];
00485 sfm[v].vec_index=v;
00486 }
00487
00488
00489 memset(hist,0,sizeof(int32_t)*num_features);
00490 for (int32_t v=0; v<num_vectors; v++)
00491 {
00492 int32_t vlen;
00493 bool vfree;
00494 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);
00495
00496 for (int32_t i=0; i<vlen; i++)
00497 {
00498 int32_t vidx=sv[i].feat_index;
00499 int32_t fidx=v;
00500 sfm[vidx].features[hist[vidx]].feat_index=fidx;
00501 sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
00502 hist[vidx]++;
00503 }
00504
00505 free_sparse_feature_vector(sv, v, vfree);
00506 }
00507
00508 delete[] hist;
00509 return sfm;
00510 }
00511
00521 virtual void set_sparse_feature_matrix(TSparse<ST>* sfm, int32_t num_feat, int32_t num_vec)
00522 {
00523 free_sparse_feature_matrix();
00524
00525 sparse_feature_matrix=sfm;
00526 num_features=num_feat;
00527 num_vectors=num_vec;
00528 }
00529
00537 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00538 {
00539 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00540 num_feat=num_features;
00541 num_vec=num_vectors;
00542
00543 ST* fm=new ST[num_feat*num_vec];
00544
00545 if (fm)
00546 {
00547 for (int64_t i=0; i<num_feat*num_vec; i++)
00548 fm[i]=0;
00549
00550 for (int32_t v=0; v<num_vec; v++)
00551 {
00552 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00553 {
00554 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00555 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00556 }
00557 }
00558 }
00559 else
00560 SG_ERROR( "error allocating memory for dense feature matrix\n");
00561
00562 return fm;
00563 }
00564
00574 virtual bool set_full_feature_matrix(ST* ffm, int32_t num_feat, int32_t num_vec)
00575 {
00576 free_sparse_feature_matrix();
00577 bool result=true;
00578 num_features=num_feat;
00579 num_vectors=num_vec;
00580
00581 SG_INFO("converting dense feature matrix to sparse one\n");
00582 int32_t* num_feat_entries=new int[num_vectors];
00583
00584 if (num_feat_entries)
00585 {
00586 int32_t num_total_entries=0;
00587
00588
00589 for (int32_t i=0; i< num_vec; i++)
00590 {
00591 num_feat_entries[i]=0;
00592 for (int32_t j=0; j< num_feat; j++)
00593 {
00594 if (ffm[i*((int64_t) num_feat) + j] != 0)
00595 num_feat_entries[i]++;
00596 }
00597 }
00598
00599 if (num_vec>0)
00600 {
00601 sparse_feature_matrix=new TSparse<ST>[num_vec];
00602
00603 if (sparse_feature_matrix)
00604 {
00605 for (int32_t i=0; i< num_vec; i++)
00606 {
00607 sparse_feature_matrix[i].vec_index=i;
00608 sparse_feature_matrix[i].num_feat_entries=0;
00609 sparse_feature_matrix[i].features= NULL;
00610
00611 if (num_feat_entries[i]>0)
00612 {
00613 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00614
00615 if (!sparse_feature_matrix[i].features)
00616 {
00617 SG_INFO( "allocation of features failed\n");
00618 return false;
00619 }
00620
00621 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00622 int32_t sparse_feat_idx=0;
00623
00624 for (int32_t j=0; j< num_feat; j++)
00625 {
00626 int64_t pos= i*num_feat + j;
00627
00628 if (ffm[pos] != 0)
00629 {
00630 sparse_feature_matrix[i].features[sparse_feat_idx].entry=ffm[pos];
00631 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00632 sparse_feat_idx++;
00633 num_total_entries++;
00634 }
00635 }
00636 }
00637 }
00638 }
00639 else
00640 {
00641 SG_ERROR( "allocation of sparse feature matrix failed\n");
00642 result=false;
00643 }
00644
00645 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00646 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00647 }
00648 else
00649 {
00650 SG_ERROR( "huh ? zero size matrix given ?\n");
00651 result=false;
00652 }
00653 }
00654 delete[] num_feat_entries;
00655 return result;
00656 }
00657
00663 virtual bool apply_preproc(bool force_preprocessing=false)
00664 {
00665 SG_INFO( "force: %d\n", force_preprocessing);
00666
00667 if ( sparse_feature_matrix && get_num_preproc() )
00668 {
00669 for (int32_t i=0; i<get_num_preproc(); i++)
00670 {
00671 if ( (!is_preprocessed(i) || force_preprocessing) )
00672 {
00673 set_preprocessed(i);
00674 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00675 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00676 return false;
00677 }
00678 return true;
00679 }
00680 return true;
00681 }
00682 else
00683 {
00684 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00685 return false;
00686 }
00687 }
00688
00693 virtual int32_t get_size() { return sizeof(ST); }
00694
00700 bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00701 {
00702 int32_t num_feat=0;
00703 int32_t num_vec=0;
00704 ST* fm=sf->get_feature_matrix(num_feat, num_vec);
00705 ASSERT(fm && num_feat>0 && num_vec>0);
00706
00707 return set_full_feature_matrix(fm, num_feat, num_vec);
00708 }
00709
00714 virtual inline int32_t get_num_vectors() { return num_vectors; }
00715
00720 inline int32_t get_num_features() { return num_features; }
00721
00733 inline int32_t set_num_features(int32_t num)
00734 {
00735 int32_t n=num_features;
00736 ASSERT(n<=num);
00737 num_features=num;
00738 return num_features;
00739 }
00740
00745 inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00746
00751 inline virtual EFeatureType get_feature_type();
00752
00759 void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00760 {
00761 if (feature_cache)
00762 feature_cache->unlock_entry(num);
00763
00764 if (free)
00765 delete[] feat_vec ;
00766 }
00767
00772 int64_t get_num_nonzero_entries()
00773 {
00774 int64_t num=0;
00775 for (int32_t i=0; i<num_vectors; i++)
00776 num+=sparse_feature_matrix[i].num_feat_entries;
00777
00778 return num;
00779 }
00780
00786 float64_t* compute_squared(float64_t* sq)
00787 {
00788 ASSERT(sq);
00789
00790 int32_t len=0;
00791 bool do_free=false;
00792
00793 for (int32_t i=0; i<this->get_num_vectors(); i++)
00794 {
00795 sq[i]=0;
00796 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);
00797
00798 for (int32_t j=0; j<len; j++)
00799 sq[i] += vec[j].entry * vec[j].entry;
00800
00801 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
00802 }
00803
00804 return sq;
00805 }
00806
00819 float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
00820 {
00821 int32_t i,j;
00822 int32_t alen, blen;
00823 bool afree, bfree;
00824 ASSERT(lhs);
00825 ASSERT(rhs);
00826
00827 TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
00828 TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
00829 ASSERT(avec);
00830 ASSERT(bvec);
00831
00832 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
00833
00834 if (alen<=blen)
00835 {
00836 j=0;
00837 for (i=0; i<alen; i++)
00838 {
00839 int32_t a_feat_idx=avec[i].feat_index;
00840
00841 while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
00842 j++;
00843
00844 if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
00845 {
00846 result-=2*(avec[i].entry*bvec[j].entry);
00847 j++;
00848 }
00849 }
00850 }
00851 else
00852 {
00853 j=0;
00854 for (i=0; i<blen; i++)
00855 {
00856 int32_t b_feat_idx=bvec[i].feat_index;
00857
00858 while ((j<alen) && (avec[j].feat_index<b_feat_idx))
00859 j++;
00860
00861 if ((j<alen) && (avec[j].feat_index == b_feat_idx))
00862 {
00863 result-=2*(bvec[i].entry*avec[j].entry);
00864 j++;
00865 }
00866 }
00867 }
00868
00869 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
00870 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);
00871
00872 return CMath::abs(result);
00873 }
00874
00880 CLabels* load_svmlight_file(char* fname)
00881 {
00882 CLabels* lab=NULL;
00883
00884 size_t blocksize=1024*1024;
00885 size_t required_blocksize=blocksize;
00886 uint8_t* dummy=new uint8_t[blocksize];
00887 FILE* f=fopen(fname, "ro");
00888
00889 if (f)
00890 {
00891 free_sparse_feature_matrix();
00892 num_vectors=0;
00893 num_features=0;
00894
00895 SG_INFO("counting line numbers in file %s\n", fname);
00896 size_t sz=blocksize;
00897 size_t block_offs=0;
00898 size_t old_block_offs=0;
00899 fseek(f, 0, SEEK_END);
00900 size_t fsize=ftell(f);
00901 rewind(f);
00902
00903 while (sz == blocksize)
00904 {
00905 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00906 bool contains_cr=false;
00907 for (size_t i=0; i<sz; i++)
00908 {
00909 block_offs++;
00910 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00911 {
00912 num_vectors++;
00913 contains_cr=true;
00914 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00915 old_block_offs=block_offs;
00916 }
00917 }
00918 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00919 }
00920
00921 SG_INFO("found %d feature vectors\n", num_vectors);
00922 delete[] dummy;
00923 blocksize=required_blocksize;
00924 dummy = new uint8_t[blocksize+1];
00925
00926 lab=new CLabels(num_vectors);
00927 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00928
00929 rewind(f);
00930 sz=blocksize;
00931 int32_t lines=0;
00932 while (sz == blocksize)
00933 {
00934 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00935
00936 size_t old_sz=0;
00937 for (size_t i=0; i<sz; i++)
00938 {
00939 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00940 {
00941 size_t len=i-old_sz+1;
00942 uint8_t* data=&dummy[old_sz];
00943
00944 for (int32_t j=0; j<len; j++)
00945 dummy[j]=data[j];
00946
00947 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
00948 i=0;
00949 old_sz=0;
00950 sz+=len;
00951 }
00952
00953 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00954 {
00955
00956 size_t len=i-old_sz;
00957 uint8_t* data=&dummy[old_sz];
00958
00959 int32_t dims=0;
00960 for (int32_t j=0; j<len; j++)
00961 {
00962 if (data[j]==':')
00963 dims++;
00964 }
00965
00966 if (dims<=0)
00967 {
00968 SG_ERROR("Error in line %d - number of"
00969 " dimensions is %d line is %d characters"
00970 " long\n line_content:'%.*s'\n", lines,
00971 dims, len, len, (const char*) data);
00972 }
00973
00974 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
00975 int32_t j=0;
00976 for (; j<len; j++)
00977 {
00978 if (data[j]==' ')
00979 {
00980 data[j]='\0';
00981
00982 lab->set_label(lines, atof((const char*) data));
00983 break;
00984 }
00985 }
00986
00987 int32_t d=0;
00988 j++;
00989 uint8_t* start=&data[j];
00990 for (; j<len; j++)
00991 {
00992 if (data[j]==':')
00993 {
00994 data[j]='\0';
00995
00996 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00997 num_features=CMath::max(num_features, feat[d].feat_index+1);
00998
00999 j++;
01000 start=&data[j];
01001 for (; j<len; j++)
01002 {
01003 if (data[j]==' ' || data[j]=='\n')
01004 {
01005 data[j]='\0';
01006 feat[d].entry=(ST) atof((const char*) start);
01007 d++;
01008 break;
01009 }
01010 }
01011
01012 if (j==len)
01013 {
01014 data[j]='\0';
01015 feat[dims-1].entry=(ST) atof((const char*) start);
01016 }
01017
01018 j++;
01019 start=&data[j];
01020 }
01021 }
01022
01023 sparse_feature_matrix[lines].vec_index=lines;
01024 sparse_feature_matrix[lines].num_feat_entries=dims;
01025 sparse_feature_matrix[lines].features=feat;
01026
01027 old_sz=i+1;
01028 lines++;
01029 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01030 }
01031 }
01032 }
01033 SG_INFO("file successfully read\n");
01034 fclose(f);
01035 }
01036
01037 delete[] dummy;
01038
01039 return lab;
01040 }
01041
01048 bool write_svmlight_file(char* fname, CLabels* label)
01049 {
01050 ASSERT(label);
01051 int32_t num=label->get_num_labels();
01052 ASSERT(num>0);
01053 ASSERT(num==num_vectors);
01054
01055 FILE* f=fopen(fname, "wb");
01056
01057 if (f)
01058 {
01059 for (int32_t i=0; i<num; i++)
01060 {
01061 fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01062
01063 TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01064 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01065
01066 for (int32_t j=0; j<num_feat; j++)
01067 {
01068 if (j<num_feat-1)
01069 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01070 else
01071 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01072 }
01073 }
01074
01075 fclose(f);
01076 return true;
01077 }
01078 return false;
01079 }
01080
01081 protected:
01092 virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
01093 {
01094 len=0;
01095 return NULL;
01096 }
01097
01098 protected:
01099
01101 int32_t num_vectors;
01102
01104 int32_t num_features;
01105
01107 TSparse<ST>* sparse_feature_matrix;
01108
01110 CCache< TSparseEntry<ST> >* feature_cache;
01111 };
01112
01113
/** sparse char features are of type F_CHAR */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
	return F_CHAR;
}
01122
/** sparse uint8_t features are of type F_BYTE */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
	return F_BYTE;
}
01131
/** sparse int16_t features are of type F_SHORT */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
	return F_SHORT;
}
01140
/** sparse uint16_t features are of type F_WORD */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
	return F_WORD;
}
01149
/** sparse int32_t features are of type F_INT */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
	return F_INT;
}
01158
/** sparse uint32_t features are of type F_UINT */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
	return F_UINT;
}
01167
/** sparse int64_t features are of type F_LONG */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
	return F_LONG;
}
01176
/** sparse uint64_t features are of type F_ULONG */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
	return F_ULONG;
}
01185
/** sparse float64_t features are of type F_DREAL */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
	return F_DREAL;
}
01194
/** sparse float32_t features are of type F_SHORTREAL */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
	return F_SHORTREAL;
}
01203
/** sparse float128_t features are of type F_LONGREAL */
template<> inline EFeatureType CSparseFeatures<float128_t>::get_feature_type()
{
	return F_LONGREAL;
}
01212 #endif