SparseFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023 
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/SimpleFeatures.h"
00027 #include "features/RealFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029 
00030 template <class ST> class CSparsePreProc;
00031 
/** template class TSparseEntry: one non-zero element of a sparse vector */
template <class ST> struct TSparseEntry
{
    /** index of the feature this entry belongs to (0-based) */
    int32_t feat_index;
    /** value stored at feat_index */
    ST entry;
};
00040 
/** template class TSparse: one sparse feature vector (an array of
 * TSparseEntry elements plus bookkeeping) */
template <class ST> struct TSparse
{
    public:
        /** index of this vector in the feature matrix */
        int32_t vec_index;
        /** number of entries in the features array */
        int32_t num_feat_entries;
        /** array of non-zero entries; owned by the matrix (see clean_tsparse) */
        TSparseEntry<ST>* features;
};
00052 
/** @brief Template class CSparseFeatures implements sparse feature matrices.
 *
 * Vectors are stored as arrays of TSparseEntry (index/value pairs); the
 * matrix is either held explicitly in sparse_feature_matrix or computed on
 * demand (optionally through feature_cache) via
 * compute_sparse_feature_vector().
 */
template <class ST> class CSparseFeatures : public CFeatures
{
    public:
        /** default constructor
         *
         * @param size cache size (forwarded to CFeatures)
         */
        CSparseFeatures(int32_t size=0)
        : CFeatures(size), num_vectors(0), num_features(0),
            sparse_feature_matrix(NULL), feature_cache(NULL)
        {}
00076 
00078         CSparseFeatures(const CSparseFeatures & orig)
00079         : CFeatures(orig), num_vectors(orig.num_vectors),
00080             num_features(orig.num_features),
00081             sparse_feature_matrix(orig.sparse_feature_matrix),
00082             feature_cache(orig.feature_cache)
00083         {
00084             if (orig.sparse_feature_matrix)
00085             {
00086                 free_sparse_feature_matrix();
00087                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00088                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00089                 for (int32_t i=0; i< num_vectors; i++)
00090                 {
00091                     sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00092                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00093 
00094                 }
00095             }
00096         }
00097 
        /** constructor loading features from a file
         *
         * @param fname filename to load features from (forwarded to CFeatures)
         */
        CSparseFeatures(char* fname)
        : CFeatures(fname), num_vectors(0), num_features(0),
            sparse_feature_matrix(NULL), feature_cache(NULL)
        {}

        /** destructor: releases the sparse matrix and the feature cache */
        virtual ~CSparseFeatures()
        {
            free_sparse_features();
        }
00111 
        /** free the sparse feature matrix (all vectors and their entries)
         * and reset num_vectors/num_features to 0 */
        void free_sparse_feature_matrix()
        {
            clean_tsparse(sparse_feature_matrix, num_vectors);
            sparse_feature_matrix = NULL;
            num_vectors=0;
            num_features=0;
        }

        /** free the sparse feature matrix and the feature cache */
        void free_sparse_features()
        {
            free_sparse_feature_matrix();
            delete feature_cache;
            feature_cache = NULL;
        }

        /** duplicate this feature object via the copy constructor
         *
         * @return newly allocated copy (caller owns it)
         */
        virtual CFeatures* duplicate() const
        {
            return new CSparseFeatures<ST>(*this);
        }
00141 
00150         ST* get_full_feature_vector(int32_t num, int32_t& len)
00151         {
00152             bool vfree;
00153             int32_t num_feat;
00154             int32_t i;
00155             len=0;
00156             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00157             ST* fv=NULL;
00158 
00159             if (sv)
00160             {
00161                 len=num_features;
00162                 fv=new ST[num_features];
00163 
00164                 for (i=0; i<num_features; i++)
00165                     fv[i]=0;
00166 
00167                 for (i=0; i<num_feat; i++)
00168                     fv[sv[i].feat_index]= sv[i].entry;
00169             }
00170 
00171             free_sparse_feature_vector(sv, num, vfree);
00172 
00173             return fv;
00174         }
00175 
00176 
00182         inline int32_t get_num_sparse_vec_features(int32_t num)
00183         {
00184             bool vfree;
00185             int32_t len;
00186             TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00187             free_sparse_feature_vector(sv, num, vfree);
00188             return len;
00189         }
00190 
        /** get sparse feature vector num
         *
         * If an explicit matrix is present, returns a pointer into it
         * (vfree=false, do not delete). Otherwise the vector is taken from
         * the cache or computed via compute_sparse_feature_vector(); vfree
         * tells the caller whether to free it via
         * free_sparse_feature_vector().
         *
         * @param num index of the vector
         * @param len output: number of entries
         * @param vfree output: whether the returned pointer must be freed
         * @return pointer to the entries (may be NULL)
         */
        TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
        {
            ASSERT(num<num_vectors);

            if (sparse_feature_matrix)
            {
                // direct view into the stored matrix; caller must not free
                len= sparse_feature_matrix[num].num_feat_entries;
                vfree=false ;
                return sparse_feature_matrix[num].features;
            } 
            else
            {
                TSparseEntry<ST>* feat=NULL;
                vfree=false;

                if (feature_cache)
                {
                    feat=feature_cache->lock_entry(num);

                    if (feat)
                        return feat;
                    else
                    {
                        // cache miss: reserve a slot to compute into
                        feat=feature_cache->set_entry(num);
                    }
                }

                // no cache slot available -> compute into a fresh allocation
                // that the caller has to free
                if (!feat)
                    vfree=true;

                feat=compute_sparse_feature_vector(num, len, feat);


                if (get_num_preproc())
                {
                    int32_t tmp_len=len;
                    TSparseEntry<ST>* tmp_feat_before = feat;
                    TSparseEntry<ST>* tmp_feat_after = NULL;

                    for (int32_t i=0; i<get_num_preproc(); i++)
                    {
                        //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

                        // delete feature vector, except for the first one,
                        // i.e., feat
                        if (i!=0)
                            delete[] tmp_feat_before;
                        tmp_feat_before=tmp_feat_after;
                    }

                    // NOTE(review): the apply_to_feature_vector call above is
                    // commented out, so tmp_feat_after stays NULL and this
                    // memcpy would read from NULL whenever a preprocessor is
                    // attached — this path looks dead/broken; confirm.
                    memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
                    delete[] tmp_feat_after;
                    len=tmp_len ;
                    SG_DEBUG( "len: %d len2: %d\n", len, num_features);
                }
                return feat ;
            }
        }
00257 
00258 
00269         ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00270         {
00271             ST result=0;
00272 
00273             //result remains zero when one of the vectors is non existent
00274             if (avec && bvec)
00275             {
00276                 if (alen<=blen)
00277                 {
00278                     int32_t j=0;
00279                     for (int32_t i=0; i<alen; i++)
00280                     {
00281                         int32_t a_feat_idx=avec[i].feat_index;
00282 
00283                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00284                             j++;
00285 
00286                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00287                         {
00288                             result+= avec[i].entry * bvec[j].entry;
00289                             j++;
00290                         }
00291                     }
00292                 }
00293                 else
00294                 {
00295                     int32_t j=0;
00296                     for (int32_t i=0; i<blen; i++)
00297                     {
00298                         int32_t b_feat_idx=bvec[i].feat_index;
00299 
00300                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00301                             j++;
00302 
00303                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00304                         {
00305                             result+= bvec[i].entry * avec[j].entry;
00306                             j++;
00307                         }
00308                     }
00309                 }
00310 
00311                 result*=alpha;
00312             }
00313 
00314             return result;
00315         }
00316 
00328         void dense_dot_range(ST* output, int32_t start, int32_t stop, ST* alphas, ST* vec, int32_t dim, ST b)
00329         {
00330             ASSERT(output);
00331             ASSERT(start>=0);
00332             ASSERT(stop<=num_vectors);
00333 
00334             for (int32_t i=start; i<stop; i++)
00335                 output[i]=dense_dot(alphas[i], i, vec, dim, b);
00336         }
00337 
00348         ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00349         {
00350             ASSERT(vec);
00351             ASSERT(dim==num_features);
00352             ST result=b;
00353 
00354             bool vfree;
00355             int32_t num_feat;
00356             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00357 
00358             if (sv)
00359             {
00360                 for (int32_t i=0; i<num_feat; i++)
00361                     result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00362             }
00363 
00364             free_sparse_feature_vector(sv, num, vfree);
00365             return result;
00366         }
00367 
00377         void add_to_dense_vec(ST alpha, int32_t num, ST* vec, int32_t dim, bool abs_val=false)
00378         {
00379             ASSERT(vec);
00380             ASSERT(dim==num_features);
00381 
00382             bool vfree;
00383             int32_t num_feat;
00384             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00385 
00386             if (sv)
00387             {
00388                 if (abs_val)
00389                 {
00390                     for (int32_t i=0; i<num_feat; i++)
00391                         vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00392                 }
00393                 else
00394                 {
00395                     for (int32_t i=0; i<num_feat; i++)
00396                         vec[sv[i].feat_index]+= alpha*sv[i].entry;
00397                 }
00398             }
00399 
00400             free_sparse_feature_vector(sv, num, vfree);
00401         }
00402 
        /** release a vector obtained from get_sparse_feature_vector()
         *
         * Unlocks the cache entry (if any) and deletes the array when the
         * vector was computed on the fly (free==true).
         * NOTE(review): identical to free_feature_vector() further below —
         * the two look like accidental duplicates; confirm.
         *
         * @param feat_vec pointer returned by get_sparse_feature_vector()
         * @param num      index of the vector
         * @param free     the vfree flag from get_sparse_feature_vector()
         */
        void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
        {
            if (feature_cache)
                feature_cache->unlock_entry(num);

            if (free)
                delete[] feat_vec ;
        } 
00417 
        /** get the raw sparse feature matrix (no copy; still owned by this
         * object)
         *
         * @param num_feat output: number of features
         * @param num_vec  output: number of vectors
         * @return pointer to the internal matrix (may be NULL)
         */
        TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
        {
            num_feat=num_features;
            num_vec=num_vectors;

            return sparse_feature_matrix;
        }
00432 
00438         void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00439         {
00440             if (sfm)
00441             {
00442                 for (int32_t i=0; i<num_vec; i++)
00443                     delete[] sfm[i].features;
00444 
00445                 delete[] sfm;
00446             }
00447         }
00448 
        /** compute and return the transposed sparse feature matrix
         *
         * Two passes over all vectors: the first builds a histogram of how
         * many entries each transposed vector will hold, the second fills
         * them in. Caller owns the returned matrix (free via clean_tsparse).
         *
         * @param num_feat output: number of features of the transpose
         *                 (== this object's num_vectors)
         * @param num_vec  output: number of vectors of the transpose
         *                 (== this object's num_features)
         * @return newly allocated transposed matrix
         */
        TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
        {
            num_feat=num_vectors;
            num_vec=num_features;

            int32_t* hist=new int32_t[num_features];
            memset(hist, 0, sizeof(int32_t)*num_features);

            // pass 1: count the lengths of the future feature vectors
            for (int32_t v=0; v<num_vectors; v++)
            {
                int32_t vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (int32_t i=0; i<vlen; i++)
                    hist[sv[i].feat_index]++;

                free_sparse_feature_vector(sv, v, vfree);
            }

            // allocate room for future feature vectors
            TSparse<ST>* sfm=new TSparse<ST>[num_vec];
            for (int32_t v=0; v<num_vec; v++)
            {
                sfm[v].features= new TSparseEntry<ST>[hist[v]];
                sfm[v].num_feat_entries=hist[v];
                sfm[v].vec_index=v;
            }

            // pass 2: fill the future feature vectors with content; hist is
            // reused as a per-row write cursor
            memset(hist,0,sizeof(int32_t)*num_features);
            for (int32_t v=0; v<num_vectors; v++)
            {
                int32_t vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (int32_t i=0; i<vlen; i++)
                {
                    int32_t vidx=sv[i].feat_index;
                    int32_t fidx=v;
                    sfm[vidx].features[hist[vidx]].feat_index=fidx;
                    sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
                    hist[vidx]++;
                }

                free_sparse_feature_vector(sv, v, vfree);
            }

            delete[] hist;
            return sfm;
        }
00511 
        /** take ownership of a sparse feature matrix, replacing (and
         * freeing) the current one
         *
         * @param sfm      matrix to adopt (this object will free it)
         * @param num_feat number of features
         * @param num_vec  number of vectors
         */
        virtual void set_sparse_feature_matrix(TSparse<ST>* sfm, int32_t num_feat, int32_t num_vec)
        {
            free_sparse_feature_matrix();

            sparse_feature_matrix=sfm;
            num_features=num_feat;
            num_vectors=num_vec;
        }
00529 
00537         ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00538         {
00539             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00540             num_feat=num_features;
00541             num_vec=num_vectors;
00542 
00543             ST* fm=new ST[num_feat*num_vec];
00544 
00545             if (fm)
00546             {
00547                 for (int64_t i=0; i<num_feat*num_vec; i++)
00548                     fm[i]=0;
00549 
00550                 for (int32_t v=0; v<num_vec; v++)
00551                 {
00552                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00553                     {
00554                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00555                         fm[offs]= sparse_feature_matrix[v].features[f].entry;
00556                     }
00557                 }
00558             }
00559             else
00560                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00561 
00562             return fm;
00563         }
00564 
00574         virtual bool set_full_feature_matrix(ST* ffm, int32_t num_feat, int32_t num_vec)
00575         {
00576             free_sparse_feature_matrix();
00577             bool result=true;
00578             num_features=num_feat;
00579             num_vectors=num_vec;
00580 
00581             SG_INFO("converting dense feature matrix to sparse one\n");
00582             int32_t* num_feat_entries=new int[num_vectors];
00583 
00584             if (num_feat_entries)
00585             {
00586                 int32_t num_total_entries=0;
00587 
00588                 // count nr of non sparse features
00589                 for (int32_t i=0; i< num_vec; i++)
00590                 {
00591                     num_feat_entries[i]=0;
00592                     for (int32_t j=0; j< num_feat; j++)
00593                     {
00594                         if (ffm[i*((int64_t) num_feat) + j] != 0)
00595                             num_feat_entries[i]++;
00596                     }
00597                 }
00598 
00599                 if (num_vec>0)
00600                 {
00601                     sparse_feature_matrix=new TSparse<ST>[num_vec];
00602 
00603                     if (sparse_feature_matrix)
00604                     {
00605                         for (int32_t i=0; i< num_vec; i++)
00606                         {
00607                             sparse_feature_matrix[i].vec_index=i;
00608                             sparse_feature_matrix[i].num_feat_entries=0;
00609                             sparse_feature_matrix[i].features= NULL;
00610 
00611                             if (num_feat_entries[i]>0)
00612                             {
00613                                 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00614 
00615                                 if (!sparse_feature_matrix[i].features)
00616                                 {
00617                                     SG_INFO( "allocation of features failed\n");
00618                                     return false;
00619                                 }
00620 
00621                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00622                                 int32_t sparse_feat_idx=0;
00623 
00624                                 for (int32_t j=0; j< num_feat; j++)
00625                                 {
00626                                     int64_t pos= i*num_feat + j;
00627 
00628                                     if (ffm[pos] != 0)
00629                                     {
00630                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=ffm[pos];
00631                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00632                                         sparse_feat_idx++;
00633                                         num_total_entries++;
00634                                     }
00635                                 }
00636                             }
00637                         }
00638                     }
00639                     else
00640                     {
00641                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00642                         result=false;
00643                     }
00644 
00645                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00646                             num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00647                 }
00648                 else
00649                 {
00650                     SG_ERROR( "huh ? zero size matrix given ?\n");
00651                     result=false;
00652                 }
00653             }
00654             delete[] num_feat_entries;
00655             return result;
00656         }
00657 
00663         virtual bool apply_preproc(bool force_preprocessing=false)
00664         {
00665             SG_INFO( "force: %d\n", force_preprocessing);
00666 
00667             if ( sparse_feature_matrix && get_num_preproc() )
00668             {
00669                 for (int32_t i=0; i<get_num_preproc(); i++)
00670                 {
00671                     if ( (!is_preprocessed(i) || force_preprocessing) )
00672                     {
00673                         set_preprocessed(i);
00674                         SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00675                         if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00676                             return false;
00677                     }
00678                     return true;
00679                 }
00680                 return true;
00681             }
00682             else
00683             {
00684                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00685                 return false;
00686             }
00687         }
00688 
        /** @return size of a single feature element in bytes */
        virtual int32_t get_size() { return sizeof(ST); }

        /** convert a dense (simple) feature object into this sparse one
         *
         * @param sf simple features to convert from
         * @return true on success
         */
        bool obtain_from_simple(CSimpleFeatures<ST>* sf)
        {
            int32_t num_feat=0;
            int32_t num_vec=0;
            ST* fm=sf->get_feature_matrix(num_feat, num_vec);
            ASSERT(fm && num_feat>0 && num_vec>0);

            return set_full_feature_matrix(fm, num_feat, num_vec);
        }
00709 
        /** @return number of feature vectors */
        virtual inline int32_t  get_num_vectors() { return num_vectors; }

        /** @return number of features (dimensionality) */
        inline int32_t  get_num_features() { return num_features; }

        /** enlarge the feature space to num dimensions; shrinking is not
         * allowed (asserted)
         *
         * @param num new number of features (>= current)
         * @return the new number of features
         */
        inline int32_t set_num_features(int32_t num)
        {
            int32_t n=num_features;
            ASSERT(n<=num);
            num_features=num;
            return num_features;
        }

        /** @return feature class C_SPARSE */
        inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

        /** @return feature type (defined per ST via the template
         * specializations at the end of this file) */
        inline virtual EFeatureType get_feature_type();
00752 
        /** release a vector obtained from get_sparse_feature_vector():
         * unlock the cache entry and delete the array if free is set
         * NOTE(review): byte-for-byte identical to
         * free_sparse_feature_vector() above — looks like an accidental
         * duplicate; confirm before removing either.
         *
         * @param feat_vec pointer returned by get_sparse_feature_vector()
         * @param num      index of the vector
         * @param free     whether the array was heap-allocated for the caller
         */
        void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
        {
            if (feature_cache)
                feature_cache->unlock_entry(num);

            if (free)
                delete[] feat_vec ;
        }
00767 
00772         int64_t get_num_nonzero_entries()
00773         {
00774             int64_t num=0;
00775             for (int32_t i=0; i<num_vectors; i++)
00776                 num+=sparse_feature_matrix[i].num_feat_entries;
00777 
00778             return num;
00779         }
00780 
        /** fill sq with the squared L2 norm of every feature vector
         *
         * NOTE(review): `this` is unconditionally cast to
         * CSparseFeatures<float64_t>, so this only appears valid when
         * ST==float64_t — confirm callers never invoke it on other
         * instantiations.
         *
         * @param sq preallocated array of length get_num_vectors()
         * @return sq, with sq[i] = ||vector i||^2
         */
        float64_t* compute_squared(float64_t* sq)
        {
            ASSERT(sq);

            int32_t len=0;
            bool do_free=false;

            for (int32_t i=0; i<this->get_num_vectors(); i++)
            {
                sq[i]=0;
                TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

                // sum of squares over the stored (non-zero) entries
                for (int32_t j=0; j<len; j++)
                    sq[i] += vec[j].entry * vec[j].entry;

                ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
            }

            return sq;
        }
00806 
        /** compute ||a - b||^2 between vector idx_a of lhs and idx_b of rhs
         * using precomputed squared norms:
         * ||a||^2 + ||b||^2 - 2*(a . b), via a merge over the sorted sparse
         * entries (shorter vector drives the scan)
         *
         * @param lhs    left-hand feature object
         * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
         * @param idx_a  vector index into lhs
         * @param rhs    right-hand feature object
         * @param sq_rhs precomputed squared norms of rhs
         * @param idx_b  vector index into rhs
         * @return absolute value of the squared distance
         */
        float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
        {
            int32_t i,j;
            int32_t alen, blen;
            bool afree, bfree;
            ASSERT(lhs);
            ASSERT(rhs);

            TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
            TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
            ASSERT(avec);
            ASSERT(bvec);

            float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

            if (alen<=blen)
            {
                // a is shorter: for each a-entry, advance through b
                j=0;
                for (i=0; i<alen; i++)
                {
                    int32_t a_feat_idx=avec[i].feat_index;

                    while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
                        j++;

                    if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
                    {
                        result-=2*(avec[i].entry*bvec[j].entry);
                        j++;
                    }
                }
            }
            else
            {
                // symmetric case: b is shorter
                j=0;
                for (i=0; i<blen; i++)
                {
                    int32_t b_feat_idx=bvec[i].feat_index;

                    while ((j<alen) && (avec[j].feat_index<b_feat_idx))
                        j++;

                    if ((j<alen) && (avec[j].feat_index == b_feat_idx))
                    {
                        result-=2*(bvec[i].entry*avec[j].entry);
                        j++;
                    }
                }
            }

            ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
            ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

            return CMath::abs(result);
        }
00874 
        /** load features and labels from a file in svmlight format
         * ("label idx:val idx:val ...\n" per line; file indices are 1-based
         * and stored 0-based). Two passes: first count lines (and the
         * longest line) to size buffers, then parse each line in place.
         *
         * @param fname filename to load from
         * @return new CLabels object with the parsed labels, or NULL if the
         *         file could not be opened (caller owns the result)
         */
        CLabels* load_svmlight_file(char* fname)
        {
            CLabels* lab=NULL;

            size_t blocksize=1024*1024;
            size_t required_blocksize=blocksize;
            uint8_t* dummy=new uint8_t[blocksize];
            // NOTE(review): "ro" is not a standard fopen() mode string; glibc
            // ignores the 'o' so this behaves like "r" there, but it should
            // probably be "r" — confirm.
            FILE* f=fopen(fname, "ro");

            if (f)
            {
                free_sparse_feature_matrix();
                num_vectors=0;
                num_features=0;

                // pass 1: count lines and track the longest line so the
                // read buffer can be resized to hold any full line
                SG_INFO("counting line numbers in file %s\n", fname);
                size_t sz=blocksize;
                size_t block_offs=0;
                size_t old_block_offs=0;
                fseek(f, 0, SEEK_END);
                size_t fsize=ftell(f);
                rewind(f);

                while (sz == blocksize)
                {
                    sz=fread(dummy, sizeof(uint8_t), blocksize, f);
                    bool contains_cr=false;
                    for (size_t i=0; i<sz; i++)
                    {
                        block_offs++;
                        // a line ends at '\n' or at EOF (last short read)
                        if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
                        {
                            num_vectors++;
                            contains_cr=true;
                            required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
                            old_block_offs=block_offs;
                        }
                    }
                    SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
                }

                SG_INFO("found %d feature vectors\n", num_vectors);
                delete[] dummy;
                blocksize=required_blocksize;
                dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL

                lab=new CLabels(num_vectors);
                sparse_feature_matrix=new TSparse<ST>[num_vectors];

                // pass 2: re-read and parse line by line
                rewind(f);
                sz=blocksize;
                int32_t lines=0;
                while (sz == blocksize)
                {
                    sz=fread(dummy, sizeof(uint8_t), blocksize, f);

                    size_t old_sz=0;
                    for (size_t i=0; i<sz; i++)
                    {
                        // line straddles the buffer end: shift the partial
                        // line to the front and refill the remainder
                        if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
                        {
                            size_t len=i-old_sz+1;
                            uint8_t* data=&dummy[old_sz];

                            for (int32_t j=0; j<len; j++)
                                dummy[j]=data[j];

                            sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
                            i=0;
                            old_sz=0;
                            sz+=len;
                        }

                        if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
                        {

                            size_t len=i-old_sz;
                            uint8_t* data=&dummy[old_sz];

                            // count ':' separators => number of entries
                            int32_t dims=0;
                            for (int32_t j=0; j<len; j++)
                            {
                                if (data[j]==':')
                                    dims++;
                            }

                            if (dims<=0)
                            {
                                SG_ERROR("Error in line %d - number of"
                                        " dimensions is %d line is %d characters"
                                        " long\n line_content:'%.*s'\n", lines,
                                        dims, len, len, (const char*) data);
                            }

                            TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];

                            // the token before the first space is the label
                            int32_t j=0;
                            for (; j<len; j++)
                            {
                                if (data[j]==' ')
                                {
                                    data[j]='\0';

                                    lab->set_label(lines, atof((const char*) data));
                                    break;
                                }
                            }

                            // parse "index:value" tokens; indices in the
                            // file are 1-based, stored 0-based
                            int32_t d=0;
                            j++;
                            uint8_t* start=&data[j];
                            for (; j<len; j++)
                            {
                                if (data[j]==':')
                                {
                                    data[j]='\0';

                                    feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
                                    num_features=CMath::max(num_features, feat[d].feat_index+1);

                                    j++;
                                    start=&data[j];
                                    for (; j<len; j++)
                                    {
                                        if (data[j]==' ' || data[j]=='\n')
                                        {
                                            data[j]='\0';
                                            feat[d].entry=(ST) atof((const char*) start);
                                            d++;
                                            break;
                                        }
                                    }

                                    // last value on the line has no trailing
                                    // separator
                                    if (j==len)
                                    {
                                        data[j]='\0';
                                        feat[dims-1].entry=(ST) atof((const char*) start);
                                    }

                                    j++;
                                    start=&data[j];
                                }
                            }

                            sparse_feature_matrix[lines].vec_index=lines;
                            sparse_feature_matrix[lines].num_feat_entries=dims;
                            sparse_feature_matrix[lines].features=feat;

                            old_sz=i+1;
                            lines++;
                            SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
                        }
                    }
                }
                SG_INFO("file successfully read\n");
                fclose(f);
            }

            delete[] dummy;

            return lab;
        }
01041 
01048         bool write_svmlight_file(char* fname, CLabels* label)
01049         {
01050             ASSERT(label);
01051             int32_t num=label->get_num_labels();
01052             ASSERT(num>0);
01053             ASSERT(num==num_vectors);
01054 
01055             FILE* f=fopen(fname, "wb");
01056 
01057             if (f)
01058             {
01059                 for (int32_t i=0; i<num; i++)
01060                 {
01061                     fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01062 
01063                     TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01064                     int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01065 
01066                     for (int32_t j=0; j<num_feat; j++)
01067                     {
01068                         if (j<num_feat-1)
01069                             fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01070                         else
01071                             fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01072                     }
01073                 }
01074 
01075                 fclose(f);
01076                 return true;
01077             }
01078             return false;
01079         }
01080 
01081     protected:
    protected:
        /** compute feature vector num on the fly (writing into target if
         * given); this base implementation has no way to compute anything
         * and returns NULL with len=0 — subclasses without an explicit
         * matrix are expected to override it
         *
         * @param num    index of the vector to compute
         * @param len    output: number of entries
         * @param target optional preallocated buffer (e.g. a cache slot)
         * @return the computed entries or NULL
         */
        virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
        {
            len=0;
            return NULL;
        }

    protected:

        /// number of feature vectors
        int32_t num_vectors;

        /// number of features (dimensionality)
        int32_t num_features;

        /// array of num_vectors sparse vectors (may be NULL when vectors
        /// are computed on demand)
        TSparse<ST>* sparse_feature_matrix;

        /// cache for on-the-fly computed vectors (may be NULL)
        CCache< TSparseEntry<ST> >* feature_cache;
};
01112 
01113 
/** get_feature_type specializations: map each supported element type ST to
 * its EFeatureType tag */

/** @return feature type F_CHAR for char features */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
    return F_CHAR;
}

/** @return feature type F_BYTE for uint8_t features */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
    return F_BYTE;
}

/** @return feature type F_SHORT for int16_t features */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
    return F_SHORT;
}

/** @return feature type F_WORD for uint16_t features */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
    return F_WORD;
}

/** @return feature type F_INT for int32_t features */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
    return F_INT;
}

/** @return feature type F_UINT for uint32_t features */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
    return F_UINT;
}

/** @return feature type F_LONG for int64_t features */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
    return F_LONG;
}

/** @return feature type F_ULONG for uint64_t features */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
    return F_ULONG;
}

/** @return feature type F_DREAL for float64_t features */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
    return F_DREAL;
}

/** @return feature type F_SHORTREAL for float32_t features */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
    return F_SHORTREAL;
}

/** @return feature type F_LONGREAL for float128_t features */
template<> inline EFeatureType CSparseFeatures<float128_t>::get_feature_type()
{
    return F_LONGREAL;
}
01212 #endif /* _SPARSEFEATURES__H__ */

SHOGUN Machine Learning Toolbox - Documentation