00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/SimpleFeatures.h"
00027 #include "features/RealFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029
00030
00031
00032
00033 template <class ST> class CSparsePreProc;
00034
/** template struct TSparseEntry — a single non-zero element of a sparse
 * feature vector, stored as a (feature index, value) pair. */
template <class ST> struct TSparseEntry
{
	/** index of the feature this entry belongs to */
	INT feat_index;
	/** value of the feature at feat_index */
	ST entry;
};
00043
/** template struct TSparse — one sparse feature vector: its index within
 * the feature matrix plus an array of its non-zero entries. */
template <class ST> struct TSparse
{
	public:
		/** index of this vector in the feature matrix */
		INT vec_index;
		/** number of elements in the features array */
		INT num_feat_entries;
		/** the non-zero entries of this vector */
		TSparseEntry<ST>* features;
};
00055
00057 template <class ST> class CSparseFeatures : public CFeatures
00058 {
00059 public:
/** default constructor — creates an empty sparse feature object.
 * @param size forwarded to CFeatures (presumably a cache size — TODO confirm)
 */
CSparseFeatures(INT size=0)
: CFeatures(size), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{}
00068
00070 CSparseFeatures(const CSparseFeatures & orig)
00071 : CFeatures(orig), num_vectors(orig.num_vectors),
00072 num_features(orig.num_features),
00073 sparse_feature_matrix(orig.sparse_feature_matrix),
00074 feature_cache(orig.feature_cache)
00075 {
00076 if (orig.sparse_feature_matrix)
00077 {
00078 free_sparse_feature_matrix();
00079 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00080 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00081 for (INT i=0; i< num_vectors; i++)
00082 {
00083 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00084 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00085
00086 }
00087 }
00088 }
00089
/** constructor loading features from a file (loading delegated to CFeatures).
 * @param fname filename to load from
 */
CSparseFeatures(CHAR* fname)
: CFeatures(fname), num_vectors(0), num_features(0),
	sparse_feature_matrix(NULL), feature_cache(NULL)
{}
00098
/** destructor — releases the sparse matrix and the feature cache */
virtual ~CSparseFeatures()
{
	free_sparse_features();
}
00103
/** free the sparse feature matrix (all vectors and the array itself)
 * and reset the object to an empty state */
void free_sparse_feature_matrix()
{
	clean_tsparse(sparse_feature_matrix, num_vectors);
	sparse_feature_matrix = NULL;
	num_vectors=0;
	num_features=0;
}
00114
/** free the sparse feature matrix and the feature cache */
void free_sparse_features()
{
	free_sparse_feature_matrix();
	delete feature_cache;
	feature_cache = NULL;
}
00124
/** duplicate this feature object via the copy constructor
 * @return newly allocated copy (caller owns it)
 */
virtual CFeatures* duplicate() const
{
	return new CSparseFeatures<ST>(*this);
}
00133
00142 ST* get_full_feature_vector(INT num, INT& len)
00143 {
00144 bool vfree;
00145 INT num_feat;
00146 INT i;
00147 len=0;
00148 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00149 ST* fv=NULL;
00150
00151 if (sv)
00152 {
00153 len=num_features;
00154 fv=new ST[num_features];
00155
00156 for (i=0; i<num_features; i++)
00157 fv[i]=0;
00158
00159 for (i=0; i<num_feat; i++)
00160 fv[sv[i].feat_index]= sv[i].entry;
00161 }
00162
00163 free_sparse_feature_vector(sv, num, vfree);
00164
00165 return fv;
00166 }
00167
00168
00174 inline INT get_num_sparse_vec_features(INT num)
00175 {
00176 bool vfree;
00177 INT len;
00178 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00179 free_sparse_feature_vector(sv, num, vfree);
00180 return len;
00181 }
00182
/** get the sparse feature vector for sample num.
 * If a sparse matrix is set the stored vector is returned directly
 * (caller must NOT free it); otherwise the vector is taken from the cache
 * or computed on the fly via compute_sparse_feature_vector().
 * @param num vector index (must be < num_vectors)
 * @param len out: number of entries in the returned vector
 * @param vfree out: true iff the caller must free the vector
 *        (via free_sparse_feature_vector)
 * @return pointer to the entries
 */
TSparseEntry<ST>* get_sparse_feature_vector(INT num, INT& len, bool& vfree)
{
	ASSERT(num<num_vectors);

	if (sparse_feature_matrix)
	{
		len= sparse_feature_matrix[num].num_feat_entries;
		vfree=false ;
		return sparse_feature_matrix[num].features;
	}
	else
	{
		TSparseEntry<ST>* feat=NULL;
		vfree=false;

		if (feature_cache)
		{
			feat=feature_cache->lock_entry(num);

			if (feat)
				return feat;
			else
			{
				// not cached yet: reserve a cache slot to compute into
				feat=feature_cache->set_entry(num);
			}
		}

		// no cache slot available: computed vector is owned by the caller
		if (!feat)
			vfree=true;

		feat=compute_sparse_feature_vector(num, len, feat);


		if (get_num_preproc())
		{
			INT tmp_len=len;
			TSparseEntry<ST>* tmp_feat_before = feat;
			TSparseEntry<ST>* tmp_feat_after = NULL;

			// NOTE(review): tmp_feat_after is never assigned inside this
			// loop (the preprocessor application appears to have been
			// stripped), so the memcpy below reads from NULL whenever a
			// preprocessor is attached — confirm against the original
			// implementation before relying on this path.
			for (INT i=0; i<get_num_preproc(); i++)
			{


				if (i!=0)
					delete[] tmp_feat_before;
				tmp_feat_before=tmp_feat_after;
			}

			memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
			delete[] tmp_feat_after;
			len=tmp_len ;
			SG_DEBUG( "len: %d len2: %d\n", len, num_features);
		}
		return feat ;
	}
}
00249
00250
00261 ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, INT alen, TSparseEntry<ST>* bvec, INT blen)
00262 {
00263 ST result=0;
00264
00265
00266 if (avec && bvec)
00267 {
00268 if (alen<=blen)
00269 {
00270 INT j=0;
00271 for (INT i=0; i<alen; i++)
00272 {
00273 INT a_feat_idx=avec[i].feat_index;
00274
00275 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00276 j++;
00277
00278 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00279 {
00280 result+= avec[i].entry * bvec[j].entry;
00281 j++;
00282 }
00283 }
00284 }
00285 else
00286 {
00287 INT j=0;
00288 for (INT i=0; i<blen; i++)
00289 {
00290 INT b_feat_idx=bvec[i].feat_index;
00291
00292 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00293 j++;
00294
00295 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00296 {
00297 result+= bvec[i].entry * avec[j].entry;
00298 j++;
00299 }
00300 }
00301 }
00302
00303 result*=alpha;
00304 }
00305
00306 return result;
00307 }
00308
00320 void dense_dot_range(ST* output, INT start, INT stop, ST* alphas, ST* vec, INT dim, ST b)
00321 {
00322 ASSERT(output);
00323 ASSERT(start>=0);
00324 ASSERT(stop<=num_vectors);
00325
00326 for (INT i=start; i<stop; i++)
00327 output[i]=dense_dot(alphas[i], i, vec, dim, b);
00328 }
00329
00340 ST dense_dot(ST alpha, INT num, ST* vec, INT dim, ST b)
00341 {
00342 ASSERT(vec);
00343 ASSERT(dim==num_features);
00344 ST result=b;
00345
00346 bool vfree;
00347 INT num_feat;
00348 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00349
00350 if (sv)
00351 {
00352 for (INT i=0; i<num_feat; i++)
00353 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00354 }
00355
00356 free_sparse_feature_vector(sv, num, vfree);
00357 return result;
00358 }
00359
00369 void add_to_dense_vec(ST alpha, INT num, ST* vec, INT dim, bool abs_val=false)
00370 {
00371 ASSERT(vec);
00372 ASSERT(dim==num_features);
00373
00374 bool vfree;
00375 INT num_feat;
00376 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00377
00378 if (sv)
00379 {
00380 if (abs_val)
00381 {
00382 for (INT i=0; i<num_feat; i++)
00383 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00384 }
00385 else
00386 {
00387 for (INT i=0; i<num_feat; i++)
00388 vec[sv[i].feat_index]+= alpha*sv[i].entry;
00389 }
00390 }
00391
00392 free_sparse_feature_vector(sv, num, vfree);
00393 }
00394
/** release a vector obtained from get_sparse_feature_vector: unlock its
 * cache entry and, if the caller owns it, delete[] the array.
 * NOTE: identical to free_feature_vector further below.
 * @param feat_vec the vector to release
 * @param num index the vector was obtained for (cache unlock key)
 * @param free whether the array must be delete[]d (vfree flag)
 */
void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, INT num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}
00409
/** get the internal sparse feature matrix (not a copy — this object keeps
 * ownership).
 * @param num_feat out: number of features
 * @param num_vec out: number of vectors
 * @return pointer to the stored matrix (may be NULL)
 */
TSparse<ST>* get_sparse_feature_matrix(INT &num_feat, INT &num_vec)
{
	num_feat=num_features;
	num_vec=num_vectors;

	return sparse_feature_matrix;
}
00424
00430 void clean_tsparse(TSparse<ST>* sfm, INT num_vec)
00431 {
00432 if (sfm)
00433 {
00434 for (INT i=0; i<num_vec; i++)
00435 delete[] sfm[i].features;
00436
00437 delete[] sfm;
00438 }
00439 }
00440
/** compute the transpose of the sparse feature matrix in two passes:
 * pass 1 histograms how often each feature index occurs (sizes of the
 * transposed vectors), pass 2 fills the transposed entries. The caller
 * owns the returned matrix (free via clean_tsparse).
 * @param num_feat out: feature count of the transpose (= num_vectors)
 * @param num_vec out: vector count of the transpose (= num_features)
 * @return newly allocated transposed matrix
 */
TSparse<ST>* get_transposed(INT &num_feat, INT &num_vec)
{
	num_feat=num_vectors;
	num_vec=num_features;

	INT* hist=new INT[num_features];
	memset(hist, 0, sizeof(INT)*num_features);

	// pass 1: count occurrences of every feature index
	for (INT v=0; v<num_vectors; v++)
	{
		INT vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (INT i=0; i<vlen; i++)
			hist[sv[i].feat_index]++;

		free_sparse_feature_vector(sv, v, vfree);
	}

	// allocate the transposed vectors with their exact sizes
	TSparse<ST>* sfm=new TSparse<ST>[num_vec];
	for (INT v=0; v<num_vec; v++)
	{
		sfm[v].features= new TSparseEntry<ST>[hist[v]];
		sfm[v].num_feat_entries=hist[v];
		sfm[v].vec_index=v;
	}

	// pass 2: fill entries; hist is reused as a per-row write cursor
	memset(hist,0,sizeof(INT)*num_features);
	for (INT v=0; v<num_vectors; v++)
	{
		INT vlen;
		bool vfree;
		TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

		for (INT i=0; i<vlen; i++)
		{
			INT vidx=sv[i].feat_index;
			INT fidx=v;
			sfm[vidx].features[hist[vidx]].feat_index=fidx;
			sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
			hist[vidx]++;
		}

		free_sparse_feature_vector(sv, v, vfree);
	}

	delete[] hist;
	return sfm;
}
00503
/** set the sparse feature matrix, taking ownership of sfm (it will be
 * delete[]d by this object — see clean_tsparse). Any previously stored
 * matrix is freed first.
 * @param sfm new sparse feature matrix
 * @param num_feat number of features
 * @param num_vec number of vectors
 */
virtual void set_sparse_feature_matrix(TSparse<ST>* sfm, INT num_feat, INT num_vec)
{
	free_sparse_feature_matrix();

	sparse_feature_matrix=sfm;
	num_features=num_feat;
	num_vectors=num_vec;
}
00521
00529 ST* get_full_feature_matrix(INT &num_feat, INT &num_vec)
00530 {
00531 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00532 num_feat=num_features;
00533 num_vec=num_vectors;
00534
00535 ST* fm=new ST[num_feat*num_vec];
00536
00537 if (fm)
00538 {
00539 for (LONG i=0; i<num_feat*num_vec; i++)
00540 fm[i]=0;
00541
00542 for (INT v=0; v<num_vec; v++)
00543 {
00544 for (INT f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00545 {
00546 LONG offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00547 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00548 }
00549 }
00550 }
00551 else
00552 SG_ERROR( "error allocating memory for dense feature matrix\n");
00553
00554 return fm;
00555 }
00556
00566 virtual bool set_full_feature_matrix(ST* ffm, INT num_feat, INT num_vec)
00567 {
00568 free_sparse_feature_matrix();
00569 bool result=true;
00570 num_features=num_feat;
00571 num_vectors=num_vec;
00572
00573 SG_INFO("converting dense feature matrix to sparse one\n");
00574 INT* num_feat_entries=new int[num_vectors];
00575
00576 if (num_feat_entries)
00577 {
00578 INT num_total_entries=0;
00579
00580
00581 for (INT i=0; i< num_vec; i++)
00582 {
00583 num_feat_entries[i]=0;
00584 for (INT j=0; j< num_feat; j++)
00585 {
00586 if (ffm[i*((LONG) num_feat) + j] != 0)
00587 num_feat_entries[i]++;
00588 }
00589 }
00590
00591 if (num_vec>0)
00592 {
00593 sparse_feature_matrix=new TSparse<ST>[num_vec];
00594
00595 if (sparse_feature_matrix)
00596 {
00597 for (INT i=0; i< num_vec; i++)
00598 {
00599 sparse_feature_matrix[i].vec_index=i;
00600 sparse_feature_matrix[i].num_feat_entries=0;
00601 sparse_feature_matrix[i].features= NULL;
00602
00603 if (num_feat_entries[i]>0)
00604 {
00605 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00606
00607 if (!sparse_feature_matrix[i].features)
00608 {
00609 SG_INFO( "allocation of features failed\n");
00610 return false;
00611 }
00612
00613 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00614 INT sparse_feat_idx=0;
00615
00616 for (INT j=0; j< num_feat; j++)
00617 {
00618 LONG pos= i*num_feat + j;
00619
00620 if (ffm[pos] != 0)
00621 {
00622 sparse_feature_matrix[i].features[sparse_feat_idx].entry=ffm[pos];
00623 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00624 sparse_feat_idx++;
00625 num_total_entries++;
00626 }
00627 }
00628 }
00629 }
00630 }
00631 else
00632 {
00633 SG_ERROR( "allocation of sparse feature matrix failed\n");
00634 result=false;
00635 }
00636
00637 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00638 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00639 }
00640 else
00641 {
00642 SG_ERROR( "huh ? zero size matrix given ?\n");
00643 result=false;
00644 }
00645 }
00646 delete[] num_feat_entries;
00647 return result;
00648 }
00649
00655 virtual bool apply_preproc(bool force_preprocessing=false)
00656 {
00657 SG_INFO( "force: %d\n", force_preprocessing);
00658
00659 if ( sparse_feature_matrix && get_num_preproc() )
00660 {
00661 for (INT i=0; i<get_num_preproc(); i++)
00662 {
00663 if ( (!is_preprocessed(i) || force_preprocessing) )
00664 {
00665 set_preprocessed(i);
00666 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00667 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00668 return false;
00669 }
00670 return true;
00671 }
00672 return true;
00673 }
00674 else
00675 {
00676 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00677 return false;
00678 }
00679 }
00680
/** @return size (in bytes) of a single feature value of type ST */
virtual INT get_size() { return sizeof(ST); }
00686
/** obtain features by converting a dense (simple) feature object.
 * @param sf dense features to convert (its matrix is read, not freed here)
 * @return true on success (see set_full_feature_matrix)
 */
bool obtain_from_simple(CSimpleFeatures<ST>* sf)
{
	INT num_feat=0;
	INT num_vec=0;
	ST* fm=sf->get_feature_matrix(num_feat, num_vec);
	ASSERT(fm && num_feat>0 && num_vec>0);

	return set_full_feature_matrix(fm, num_feat, num_vec);
}
00701
/** @return number of feature vectors */
virtual inline INT get_num_vectors() { return num_vectors; }

/** @return dimensionality of the feature space */
inline INT get_num_features() { return num_features; }

/** enlarge the number of features (shrinking is rejected by the assert).
 * @param num new number of features (must be >= current num_features)
 * @return the new number of features
 */
inline INT set_num_features(INT num)
{
	INT n=num_features;
	ASSERT(n<=num);
	num_features=num;
	return num_features;
}

/** @return feature class C_SPARSE */
inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

/** @return feature type of ST (see the specializations at file end) */
inline virtual EFeatureType get_feature_type();
00744
/** release a feature vector: unlock its cache entry and, if owned by the
 * caller, delete[] it.
 * NOTE: duplicate of free_sparse_feature_vector above — candidate for
 * consolidation.
 * @param feat_vec the vector to release
 * @param num index the vector was obtained for (cache unlock key)
 * @param free whether the array must be delete[]d
 */
void free_feature_vector(TSparseEntry<ST>* feat_vec, INT num, bool free)
{
	if (feature_cache)
		feature_cache->unlock_entry(num);

	if (free)
		delete[] feat_vec ;
}
00759
00764 LONG get_num_nonzero_entries()
00765 {
00766 LONG num=0;
00767 for (INT i=0; i<num_vectors; i++)
00768 num+=sparse_feature_matrix[i].num_feat_entries;
00769
00770 return num;
00771 }
00772
/** compute the squared L2 norm <x_i, x_i> of every feature vector.
 * NOTE(review): `this` is cast to CSparseFeatures<DREAL>* regardless of ST —
 * this looks only safe when ST is DREAL; confirm callers before reuse.
 * @param sq preallocated array of get_num_vectors() entries (not NULL)
 * @return sq, filled with the squared norms
 */
DREAL* compute_squared(DREAL* sq)
{
	ASSERT(sq);

	INT len=0;
	bool do_free=false;

	for (INT i=0; i<this->get_num_vectors(); i++)
	{
		sq[i]=0;
		TSparseEntry<DREAL>* vec = ((CSparseFeatures<DREAL>*) this)->get_sparse_feature_vector(i, len, do_free);

		for (INT j=0; j<len; j++)
			sq[i] += vec[j].entry * vec[j].entry;

		((CSparseFeatures<DREAL>*) this)->free_feature_vector(vec, i, do_free);
	}

	return sq;
}
00798
/** compute the squared euclidean distance ||a-b||^2 between vector idx_a of
 * lhs and vector idx_b of rhs as ||a||^2 + ||b||^2 - 2<a,b>, using the
 * precomputed squared norms sq_lhs/sq_rhs (see compute_squared). The dot
 * product is evaluated with a merge-style scan, so entries must be sorted
 * by feat_index.
 * @param lhs left-hand features
 * @param sq_lhs precomputed squared norms of lhs
 * @param idx_a vector index into lhs
 * @param rhs right-hand features
 * @param sq_rhs precomputed squared norms of rhs
 * @param idx_b vector index into rhs
 * @return |distance^2| (abs guards against negative rounding residue)
 */
DREAL compute_squared_norm(CSparseFeatures<DREAL>* lhs, DREAL* sq_lhs, INT idx_a, CSparseFeatures<DREAL>* rhs, DREAL* sq_rhs, INT idx_b)
{
	INT i,j;
	INT alen, blen;
	bool afree, bfree;
	ASSERT(lhs);
	ASSERT(rhs);

	TSparseEntry<DREAL>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
	TSparseEntry<DREAL>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
	ASSERT(avec);
	ASSERT(bvec);

	DREAL result=sq_lhs[idx_a]+sq_rhs[idx_b];

	// subtract 2<a,b>: iterate the shorter vector, scan the longer one
	if (alen<=blen)
	{
		j=0;
		for (i=0; i<alen; i++)
		{
			INT a_feat_idx=avec[i].feat_index;

			while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
				j++;

			if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
			{
				result-=2*(avec[i].entry*bvec[j].entry);
				j++;
			}
		}
	}
	else
	{
		j=0;
		for (i=0; i<blen; i++)
		{
			INT b_feat_idx=bvec[i].feat_index;

			while ((j<alen) && (avec[j].feat_index<b_feat_idx))
				j++;

			if ((j<alen) && (avec[j].feat_index == b_feat_idx))
			{
				result-=2*(bvec[i].entry*avec[j].entry);
				j++;
			}
		}
	}

	((CSparseFeatures<DREAL>*) lhs)->free_feature_vector(avec, idx_a, afree);
	((CSparseFeatures<DREAL>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

	return CMath::abs(result);
}
00866
00872 CLabels* load_svmlight_file(CHAR* fname)
00873 {
00874 CLabels* lab=NULL;
00875
00876 size_t blocksize=1024*1024;
00877 size_t required_blocksize=blocksize;
00878 BYTE* dummy=new BYTE[blocksize];
00879 FILE* f=fopen(fname, "ro");
00880
00881 if (f)
00882 {
00883 free_sparse_feature_matrix();
00884 num_vectors=0;
00885 num_features=0;
00886
00887 SG_INFO("counting line numbers in file %s\n", fname);
00888 size_t sz=blocksize;
00889 size_t block_offs=0;
00890 size_t old_block_offs=0;
00891 fseek(f, 0, SEEK_END);
00892 size_t fsize=ftell(f);
00893 rewind(f);
00894
00895 while (sz == blocksize)
00896 {
00897 sz=fread(dummy, sizeof(BYTE), blocksize, f);
00898 bool contains_cr=false;
00899 for (size_t i=0; i<sz; i++)
00900 {
00901 block_offs++;
00902 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00903 {
00904 num_vectors++;
00905 contains_cr=true;
00906 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00907 old_block_offs=block_offs;
00908 }
00909 }
00910 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00911 }
00912
00913 SG_INFO("found %d feature vectors\n", num_vectors);
00914 delete[] dummy;
00915 blocksize=required_blocksize;
00916 dummy = new BYTE[blocksize+1];
00917
00918 lab=new CLabels(num_vectors);
00919 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00920
00921 rewind(f);
00922 sz=blocksize;
00923 INT lines=0;
00924 while (sz == blocksize)
00925 {
00926 sz=fread(dummy, sizeof(BYTE), blocksize, f);
00927
00928 size_t old_sz=0;
00929 for (size_t i=0; i<sz; i++)
00930 {
00931 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00932 {
00933 size_t len=i-old_sz+1;
00934 BYTE* data=&dummy[old_sz];
00935
00936 for (INT j=0; j<len; j++)
00937 dummy[j]=data[j];
00938
00939 sz=fread(dummy+len, sizeof(BYTE), blocksize-len, f);
00940 i=0;
00941 old_sz=0;
00942 sz+=len;
00943 }
00944
00945 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00946 {
00947
00948 size_t len=i-old_sz;
00949 BYTE* data=&dummy[old_sz];
00950
00951 INT dims=0;
00952 for (INT j=0; j<len; j++)
00953 {
00954 if (data[j]==':')
00955 dims++;
00956 }
00957
00958 if (dims<=0)
00959 {
00960 SG_ERROR("Error in line %d - number of"
00961 " dimensions is %d line is %d characters"
00962 " long\n line_content:'%.*s'\n", lines,
00963 dims, len, len, (const char*) data);
00964 }
00965
00966 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
00967 INT j=0;
00968 for (; j<len; j++)
00969 {
00970 if (data[j]==' ')
00971 {
00972 data[j]='\0';
00973
00974 lab->set_label(lines, atof((const char*) data));
00975 break;
00976 }
00977 }
00978
00979 INT d=0;
00980 j++;
00981 BYTE* start=&data[j];
00982 for (; j<len; j++)
00983 {
00984 if (data[j]==':')
00985 {
00986 data[j]='\0';
00987
00988 feat[d].feat_index=(INT) atoi((const char*) start)-1;
00989 num_features=CMath::max(num_features, feat[d].feat_index+1);
00990
00991 j++;
00992 start=&data[j];
00993 for (; j<len; j++)
00994 {
00995 if (data[j]==' ' || data[j]=='\n')
00996 {
00997 data[j]='\0';
00998 feat[d].entry=(ST) atof((const char*) start);
00999 d++;
01000 break;
01001 }
01002 }
01003
01004 if (j==len)
01005 {
01006 data[j]='\0';
01007 feat[dims-1].entry=(ST) atof((const char*) start);
01008 }
01009
01010 j++;
01011 start=&data[j];
01012 }
01013 }
01014
01015 sparse_feature_matrix[lines].vec_index=lines;
01016 sparse_feature_matrix[lines].num_feat_entries=dims;
01017 sparse_feature_matrix[lines].features=feat;
01018
01019 old_sz=i+1;
01020 lines++;
01021 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01022 }
01023 }
01024 }
01025 SG_INFO("file successfully read\n");
01026 fclose(f);
01027 }
01028
01029 delete[] dummy;
01030
01031 return lab;
01032 }
01033
01040 bool write_svmlight_file(CHAR* fname, CLabels* label)
01041 {
01042 ASSERT(label);
01043 INT num=label->get_num_labels();
01044 ASSERT(num>0);
01045 ASSERT(num==num_vectors);
01046
01047 FILE* f=fopen(fname, "wb");
01048
01049 if (f)
01050 {
01051 for (INT i=0; i<num; i++)
01052 {
01053 fprintf(f, "%d ", (INT) label->get_int_label(i));
01054
01055 TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01056 INT num_feat = sparse_feature_matrix[i].num_feat_entries;
01057
01058 for (INT j=0; j<num_feat; j++)
01059 {
01060 if (j<num_feat-1)
01061 fprintf(f, "%d:%f ", (INT) vec[j].feat_index+1, (double) vec[j].entry);
01062 else
01063 fprintf(f, "%d:%f\n", (INT) vec[j].feat_index+1, (double) vec[j].entry);
01064 }
01065 }
01066
01067 fclose(f);
01068 return true;
01069 }
01070 return false;
01071 }
01072
01073 protected:
/** compute the sparse feature vector for sample num on the fly, optionally
 * into a preallocated buffer (get_sparse_feature_vector passes a cache slot
 * as target). Base-class stub: reports an empty vector; subclasses that
 * compute features on demand override this.
 * @param num vector index
 * @param len out: number of entries produced (0 here)
 * @param target optional preallocated destination buffer
 * @return the computed vector, or NULL (here)
 */
virtual TSparseEntry<ST>* compute_sparse_feature_vector(INT num, INT& len, TSparseEntry<ST>* target=NULL)
{
	len=0;
	return NULL;
}
01089
01090 protected:
01091
/// number of feature vectors
INT num_vectors;

/// dimensionality of the feature space
INT num_features;

/// array of num_vectors sparse vectors (owned; NULL when computed on the fly)
TSparse<ST>* sparse_feature_matrix;

/// cache for on-the-fly computed vectors (owned; deleted in free_sparse_features)
CCache< TSparseEntry<ST> >* feature_cache;
01103 };
01104
01105
/** get_feature_type() specializations — map each supported template type ST
 * to its EFeatureType enum value. */
template<> inline EFeatureType CSparseFeatures<CHAR>::get_feature_type()
{
	return F_CHAR;
}

template<> inline EFeatureType CSparseFeatures<BYTE>::get_feature_type()
{
	return F_BYTE;
}

template<> inline EFeatureType CSparseFeatures<SHORT>::get_feature_type()
{
	return F_SHORT;
}

template<> inline EFeatureType CSparseFeatures<WORD>::get_feature_type()
{
	return F_WORD;
}

template<> inline EFeatureType CSparseFeatures<INT>::get_feature_type()
{
	return F_INT;
}

template<> inline EFeatureType CSparseFeatures<UINT>::get_feature_type()
{
	return F_UINT;
}

template<> inline EFeatureType CSparseFeatures<LONG>::get_feature_type()
{
	return F_LONG;
}

template<> inline EFeatureType CSparseFeatures<ULONG>::get_feature_type()
{
	return F_ULONG;
}

template<> inline EFeatureType CSparseFeatures<DREAL>::get_feature_type()
{
	return F_DREAL;
}

template<> inline EFeatureType CSparseFeatures<SHORTREAL>::get_feature_type()
{
	return F_SHORTREAL;
}

template<> inline EFeatureType CSparseFeatures<LONGREAL>::get_feature_type()
{
	return F_LONGREAL;
}
01204 #endif