SparseFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023 
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/SimpleFeatures.h"
00027 #include "features/RealFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029 
00030 //features are an array of TSparse, sorted w.r.t. vec_index (increasing) and
00031 //withing same vec_index w.r.t. feat_index (increasing);
00032 
00033 template <class ST> class CSparsePreProc;
00034 
/** template struct TSparseEntry: a single (feature index, value) pair of a
 * sparse feature vector
 */
template <class ST> struct TSparseEntry
{
    /** index of the (non-zero) feature this entry belongs to */
    INT feat_index;
    /** value of that feature */
    ST entry;
};
00043 
/** template struct TSparse: one sparse feature vector — an array of
 * TSparseEntry sorted by feat_index (increasing), see file header note
 */
template <class ST> struct TSparse
{
    public:
        /** index of this vector within the feature matrix */
        INT vec_index;
        /** number of entries in the features array */
        INT num_feat_entries;
        /** array of num_feat_entries (feat_index, entry) pairs */
        TSparseEntry<ST>* features;
};
00055 
00057 template <class ST> class CSparseFeatures : public CFeatures
00058 {
00059     public:
        /** default constructor
         *
         * no sparse matrix is allocated yet; dimensions start at zero
         *
         * @param size cache size in bytes (forwarded to CFeatures)
         */
        CSparseFeatures(INT size=0)
        : CFeatures(size), num_vectors(0), num_features(0),
            sparse_feature_matrix(NULL), feature_cache(NULL)
        {}
00068 
00070         CSparseFeatures(const CSparseFeatures & orig)
00071         : CFeatures(orig), num_vectors(orig.num_vectors),
00072             num_features(orig.num_features),
00073             sparse_feature_matrix(orig.sparse_feature_matrix),
00074             feature_cache(orig.feature_cache)
00075         {
00076             if (orig.sparse_feature_matrix)
00077             {
00078                 free_sparse_feature_matrix();
00079                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00080                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00081                 for (INT i=0; i< num_vectors; i++)
00082                 {
00083                     sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00084                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00085 
00086                 }
00087             }
00088         }
00089 
        /** constructor loading features from a file
         *
         * only forwards the filename to CFeatures; no sparse matrix is
         * allocated here
         *
         * @param fname filename to load features from
         */
        CSparseFeatures(CHAR* fname)
        : CFeatures(fname), num_vectors(0), num_features(0),
            sparse_feature_matrix(NULL), feature_cache(NULL)
        {}
00098 
        /** destructor — frees the sparse matrix and the feature cache */
        virtual ~CSparseFeatures()
        {
            free_sparse_features();
        }
00103 
        /** free the sparse feature matrix (all vectors and their entries,
         * via clean_tsparse) and reset the matrix pointer and dimensions
         */
        void free_sparse_feature_matrix()
        {
            clean_tsparse(sparse_feature_matrix, num_vectors);
            sparse_feature_matrix = NULL;
            num_vectors=0;
            num_features=0;
        }
00114 
        /** free the sparse feature matrix and the feature cache */
        void free_sparse_features()
        {
            free_sparse_feature_matrix();
            delete feature_cache;
            feature_cache = NULL;
        }
00124 
        /** duplicate this feature object (deep copy via copy constructor)
         *
         * @return newly allocated copy; caller takes ownership
         */
        virtual CFeatures* duplicate() const
        {
            return new CSparseFeatures<ST>(*this);
        }
00133 
00142         ST* get_full_feature_vector(INT num, INT& len)
00143         {
00144             bool vfree;
00145             INT num_feat;
00146             INT i;
00147             len=0;
00148             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00149             ST* fv=NULL;
00150 
00151             if (sv)
00152             {
00153                 len=num_features;
00154                 fv=new ST[num_features];
00155 
00156                 for (i=0; i<num_features; i++)
00157                     fv[i]=0;
00158 
00159                 for (i=0; i<num_feat; i++)
00160                     fv[sv[i].feat_index]= sv[i].entry;
00161             }
00162 
00163             free_sparse_feature_vector(sv, num, vfree);
00164 
00165             return fv;
00166         }
00167 
00168 
        /** get number of non-zero entries of sparse feature vector num
         *
         * @param num index of the feature vector
         * @return number of sparse entries in that vector
         */
        inline INT get_num_sparse_vec_features(INT num)
        {
            bool vfree;
            INT len;
            TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
            free_sparse_feature_vector(sv, num, vfree);
            return len;
        }
00182 
        /** get sparse feature vector num — from the in-memory matrix if
         * present, otherwise from the cache or computed on the fly via
         * compute_sparse_feature_vector()
         *
         * @param num index of the feature vector
         * @param len out: number of sparse entries (only set on the
         *        compute-on-the-fly path or from the stored matrix)
         * @param vfree out: true iff the caller must release the vector via
         *        free_sparse_feature_vector()
         * @return the sparse feature vector
         */
        TSparseEntry<ST>* get_sparse_feature_vector(INT num, INT& len, bool& vfree)
        {
            ASSERT(num<num_vectors);

            if (sparse_feature_matrix)
            {
                // matrix held in memory: hand out a borrowed pointer
                len= sparse_feature_matrix[num].num_feat_entries;
                vfree=false ;
                return sparse_feature_matrix[num].features;
            } 
            else
            {
                TSparseEntry<ST>* feat=NULL;
                vfree=false;

                if (feature_cache)
                {
                    feat=feature_cache->lock_entry(num);

                    if (feat)
                        return feat;
                    else
                    {
                        // not cached yet: reserve a cache slot to fill below
                        feat=feature_cache->set_entry(num);
                    }
                }

                // no cache slot available: the computed vector belongs to the caller
                if (!feat)
                    vfree=true;

                feat=compute_sparse_feature_vector(num, len, feat);


                if (get_num_preproc())
                {
                    INT tmp_len=len;
                    TSparseEntry<ST>* tmp_feat_before = feat;
                    TSparseEntry<ST>* tmp_feat_after = NULL;

                    for (INT i=0; i<get_num_preproc(); i++)
                    {
                        // NOTE(review): the apply call is commented out, so
                        // tmp_feat_after stays NULL and the memcpy below would
                        // read from a NULL pointer whenever a preprocessor is
                        // attached on this path — confirm before relying on it
                        //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

                        if (i!=0)   // delete feature vector, except for the the first one, i.e., feat
                            delete[] tmp_feat_before;
                        tmp_feat_before=tmp_feat_after;
                    }

                    memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
                    delete[] tmp_feat_after;
                    len=tmp_len ;
                    SG_DEBUG( "len: %d len2: %d\n", len, num_features);
                }
                return feat ;
            }
        }
00249 
00250 
00261         ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, INT alen, TSparseEntry<ST>* bvec, INT blen)
00262         {
00263             ST result=0;
00264 
00265             //result remains zero when one of the vectors is non existent
00266             if (avec && bvec)
00267             {
00268                 if (alen<=blen)
00269                 {
00270                     INT j=0;
00271                     for (INT i=0; i<alen; i++)
00272                     {
00273                         INT a_feat_idx=avec[i].feat_index;
00274 
00275                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00276                             j++;
00277 
00278                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00279                         {
00280                             result+= avec[i].entry * bvec[j].entry;
00281                             j++;
00282                         }
00283                     }
00284                 }
00285                 else
00286                 {
00287                     INT j=0;
00288                     for (INT i=0; i<blen; i++)
00289                     {
00290                         INT b_feat_idx=bvec[i].feat_index;
00291 
00292                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00293                             j++;
00294 
00295                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00296                         {
00297                             result+= bvec[i].entry * avec[j].entry;
00298                             j++;
00299                         }
00300                     }
00301                 }
00302 
00303                 result*=alpha;
00304             }
00305 
00306             return result;
00307         }
00308 
00320         void dense_dot_range(ST* output, INT start, INT stop, ST* alphas, ST* vec, INT dim, ST b)
00321         {
00322             ASSERT(output);
00323             ASSERT(start>=0);
00324             ASSERT(stop<=num_vectors);
00325 
00326             for (INT i=start; i<stop; i++)
00327                 output[i]=dense_dot(alphas[i], i, vec, dim, b);
00328         }
00329 
00340         ST dense_dot(ST alpha, INT num, ST* vec, INT dim, ST b)
00341         {
00342             ASSERT(vec);
00343             ASSERT(dim==num_features);
00344             ST result=b;
00345 
00346             bool vfree;
00347             INT num_feat;
00348             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00349 
00350             if (sv)
00351             {
00352                 for (INT i=0; i<num_feat; i++)
00353                     result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00354             }
00355 
00356             free_sparse_feature_vector(sv, num, vfree);
00357             return result;
00358         }
00359 
00369         void add_to_dense_vec(ST alpha, INT num, ST* vec, INT dim, bool abs_val=false)
00370         {
00371             ASSERT(vec);
00372             ASSERT(dim==num_features);
00373 
00374             bool vfree;
00375             INT num_feat;
00376             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00377 
00378             if (sv)
00379             {
00380                 if (abs_val)
00381                 {
00382                     for (INT i=0; i<num_feat; i++)
00383                         vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00384                 }
00385                 else
00386                 {
00387                     for (INT i=0; i<num_feat; i++)
00388                         vec[sv[i].feat_index]+= alpha*sv[i].entry;
00389                 }
00390             }
00391 
00392             free_sparse_feature_vector(sv, num, vfree);
00393         }
00394 
        /** release a vector obtained from get_sparse_feature_vector():
         * unlocks the cache entry and deletes the array if it was computed
         * on the fly
         *
         * @param feat_vec vector to free
         * @param num index of that vector
         * @param free the vfree flag returned by get_sparse_feature_vector()
         */
        void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, INT num, bool free)
        {
            if (feature_cache)
                feature_cache->unlock_entry(num);

            if (free)
                delete[] feat_vec ;
        } 
00409 
        /** get the internal sparse feature matrix (borrowed pointer — this
         * object keeps ownership)
         *
         * @param num_feat out: number of features
         * @param num_vec out: number of vectors
         * @return the sparse feature matrix (may be NULL)
         */
        TSparse<ST>* get_sparse_feature_matrix(INT &num_feat, INT &num_vec)
        {
            num_feat=num_features;
            num_vec=num_vectors;

            return sparse_feature_matrix;
        }
00424 
00430         void clean_tsparse(TSparse<ST>* sfm, INT num_vec)
00431         {
00432             if (sfm)
00433             {
00434                 for (INT i=0; i<num_vec; i++)
00435                     delete[] sfm[i].features;
00436 
00437                 delete[] sfm;
00438             }
00439         }
00440 
        /** compute and return the transposed sparse feature matrix (features
         * and vectors swapped); the caller must release the result via
         * clean_tsparse()
         *
         * @param num_feat out: features of the transpose (== num_vectors here)
         * @param num_vec out: vectors of the transpose (== num_features here)
         * @return newly allocated transposed matrix
         */
        TSparse<ST>* get_transposed(INT &num_feat, INT &num_vec)
        {
            num_feat=num_vectors;
            num_vec=num_features;

            // histogram: number of entries each transposed vector will hold
            INT* hist=new INT[num_features];
            memset(hist, 0, sizeof(INT)*num_features);

            // count how lengths of future feature vectors
            for (INT v=0; v<num_vectors; v++)
            {
                INT vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (INT i=0; i<vlen; i++)
                    hist[sv[i].feat_index]++;

                free_sparse_feature_vector(sv, v, vfree);
            }

            // allocate room for future feature vectors
            TSparse<ST>* sfm=new TSparse<ST>[num_vec];
            for (INT v=0; v<num_vec; v++)
            {
                sfm[v].features= new TSparseEntry<ST>[hist[v]];
                sfm[v].num_feat_entries=hist[v];
                sfm[v].vec_index=v;
            }

            // fill future feature vectors with content; hist is reused as the
            // current write position inside each transposed vector
            memset(hist,0,sizeof(INT)*num_features);
            for (INT v=0; v<num_vectors; v++)
            {
                INT vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (INT i=0; i<vlen; i++)
                {
                    INT vidx=sv[i].feat_index;
                    INT fidx=v;
                    sfm[vidx].features[hist[vidx]].feat_index=fidx;
                    sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
                    hist[vidx]++;
                }

                free_sparse_feature_vector(sv, v, vfree);
            }

            delete[] hist;
            return sfm;
        }
00503 
        /** set the sparse feature matrix; any previous matrix is freed and
         * ownership of sfm is taken over by this object
         *
         * @param sfm new sparse feature matrix
         * @param num_feat number of features
         * @param num_vec number of vectors
         */
        virtual void set_sparse_feature_matrix(TSparse<ST>* sfm, INT num_feat, INT num_vec)
        {
            free_sparse_feature_matrix();

            sparse_feature_matrix=sfm;
            num_features=num_feat;
            num_vectors=num_vec;
        }
00521 
00529         ST* get_full_feature_matrix(INT &num_feat, INT &num_vec)
00530         {
00531             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00532             num_feat=num_features;
00533             num_vec=num_vectors;
00534 
00535             ST* fm=new ST[num_feat*num_vec];
00536 
00537             if (fm)
00538             {
00539                 for (LONG i=0; i<num_feat*num_vec; i++)
00540                     fm[i]=0;
00541 
00542                 for (INT v=0; v<num_vec; v++)
00543                 {
00544                     for (INT f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00545                     {
00546                         LONG offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00547                         fm[offs]= sparse_feature_matrix[v].features[f].entry;
00548                     }
00549                 }
00550             }
00551             else
00552                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00553 
00554             return fm;
00555         }
00556 
00566         virtual bool set_full_feature_matrix(ST* ffm, INT num_feat, INT num_vec)
00567         {
00568             free_sparse_feature_matrix();
00569             bool result=true;
00570             num_features=num_feat;
00571             num_vectors=num_vec;
00572 
00573             SG_INFO("converting dense feature matrix to sparse one\n");
00574             INT* num_feat_entries=new int[num_vectors];
00575 
00576             if (num_feat_entries)
00577             {
00578                 INT num_total_entries=0;
00579 
00580                 // count nr of non sparse features
00581                 for (INT i=0; i< num_vec; i++)
00582                 {
00583                     num_feat_entries[i]=0;
00584                     for (INT j=0; j< num_feat; j++)
00585                     {
00586                         if (ffm[i*((LONG) num_feat) + j] != 0)
00587                             num_feat_entries[i]++;
00588                     }
00589                 }
00590 
00591                 if (num_vec>0)
00592                 {
00593                     sparse_feature_matrix=new TSparse<ST>[num_vec];
00594 
00595                     if (sparse_feature_matrix)
00596                     {
00597                         for (INT i=0; i< num_vec; i++)
00598                         {
00599                             sparse_feature_matrix[i].vec_index=i;
00600                             sparse_feature_matrix[i].num_feat_entries=0;
00601                             sparse_feature_matrix[i].features= NULL;
00602 
00603                             if (num_feat_entries[i]>0)
00604                             {
00605                                 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00606 
00607                                 if (!sparse_feature_matrix[i].features)
00608                                 {
00609                                     SG_INFO( "allocation of features failed\n");
00610                                     return false;
00611                                 }
00612 
00613                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00614                                 INT sparse_feat_idx=0;
00615 
00616                                 for (INT j=0; j< num_feat; j++)
00617                                 {
00618                                     LONG pos= i*num_feat + j;
00619 
00620                                     if (ffm[pos] != 0)
00621                                     {
00622                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=ffm[pos];
00623                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00624                                         sparse_feat_idx++;
00625                                         num_total_entries++;
00626                                     }
00627                                 }
00628                             }
00629                         }
00630                     }
00631                     else
00632                     {
00633                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00634                         result=false;
00635                     }
00636 
00637                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00638                             num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00639                 }
00640                 else
00641                 {
00642                     SG_ERROR( "huh ? zero size matrix given ?\n");
00643                     result=false;
00644                 }
00645             }
00646             delete[] num_feat_entries;
00647             return result;
00648         }
00649 
00655         virtual bool apply_preproc(bool force_preprocessing=false)
00656         {
00657             SG_INFO( "force: %d\n", force_preprocessing);
00658 
00659             if ( sparse_feature_matrix && get_num_preproc() )
00660             {
00661                 for (INT i=0; i<get_num_preproc(); i++)
00662                 {
00663                     if ( (!is_preprocessed(i) || force_preprocessing) )
00664                     {
00665                         set_preprocessed(i);
00666                         SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00667                         if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00668                             return false;
00669                     }
00670                     return true;
00671                 }
00672                 return true;
00673             }
00674             else
00675             {
00676                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00677                 return false;
00678             }
00679         }
00680 
00685         virtual INT get_size() { return sizeof(ST); }
00686 
        /** build this sparse feature object from dense (simple) features
         *
         * @param sf simple features to convert from
         * @return true on success
         *
         * NOTE(review): the dense matrix returned by get_feature_matrix() is
         * not freed here — presumably still owned by sf; confirm against
         * CSimpleFeatures
         */
        bool obtain_from_simple(CSimpleFeatures<ST>* sf)
        {
            INT num_feat=0;
            INT num_vec=0;
            ST* fm=sf->get_feature_matrix(num_feat, num_vec);
            ASSERT(fm && num_feat>0 && num_vec>0);

            return set_full_feature_matrix(fm, num_feat, num_vec);
        }
00701 
        /** get number of feature vectors
         * @return number of feature vectors
         */
        virtual inline INT  get_num_vectors() { return num_vectors; }

        /** get number of features (dimensionality)
         * @return number of features
         */
        inline INT  get_num_features() { return num_features; }

        /** set the number of features; may only grow the dimensionality
         * (ASSERT enforces num >= current) — useful when e.g. a test set
         * contains larger feature indices than the train set
         *
         * @param num new number of features
         * @return the updated number of features
         */
        inline INT set_num_features(INT num)
        {
            INT n=num_features;
            ASSERT(n<=num);
            num_features=num;
            return num_features;
        }

        /** get feature class
         * @return feature class C_SPARSE
         */
        inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

        /** get feature type; specialized per ST at the bottom of this file
         * @return templated feature type
         */
        inline virtual EFeatureType get_feature_type();
00744 
        /** release a feature vector: unlock its cache entry and delete the
         * array if it was computed on the fly
         *
         * NOTE(review): identical in behavior to free_sparse_feature_vector()
         * above — looks like a duplicate kept for interface reasons; confirm
         *
         * @param feat_vec vector to free
         * @param num index of that vector
         * @param free whether the array itself must be deleted
         */
        void free_feature_vector(TSparseEntry<ST>* feat_vec, INT num, bool free)
        {
            if (feature_cache)
                feature_cache->unlock_entry(num);

            if (free)
                delete[] feat_vec ;
        }
00759 
00764         LONG get_num_nonzero_entries()
00765         {
00766             LONG num=0;
00767             for (INT i=0; i<num_vectors; i++)
00768                 num+=sparse_feature_matrix[i].num_feat_entries;
00769 
00770             return num;
00771         }
00772 
        /** fill sq[i] with the squared euclidean norm of every feature
         * vector i
         *
         * NOTE(review): "this" is cast to CSparseFeatures<DREAL>*, i.e. this
         * method is only meaningful when ST==DREAL — confirm callers
         *
         * @param sq preallocated array of get_num_vectors() DREALs (filled in)
         * @return sq
         */
        DREAL* compute_squared(DREAL* sq)
        {
            ASSERT(sq);

            INT len=0;
            bool do_free=false;

            for (INT i=0; i<this->get_num_vectors(); i++)
            {
                sq[i]=0;
                TSparseEntry<DREAL>* vec = ((CSparseFeatures<DREAL>*) this)->get_sparse_feature_vector(i, len, do_free);

                // sum of squared entries; zeros contribute nothing
                for (INT j=0; j<len; j++)
                    sq[i] += vec[j].entry * vec[j].entry;

                ((CSparseFeatures<DREAL>*) this)->free_feature_vector(vec, i, do_free);
            }

            return sq;
        }
00798 
        /** compute the squared euclidean distance between vector idx_a of
         * lhs and vector idx_b of rhs using precomputed squared norms:
         * ||a-b||^2 = ||a||^2 + ||b||^2 - 2<a,b>
         *
         * NOTE(review): operates on DREAL features regardless of ST — only
         * meaningful when ST==DREAL; confirm callers
         *
         * @param lhs left-hand side features
         * @param sq_lhs squared norms of lhs vectors (see compute_squared)
         * @param idx_a vector index in lhs
         * @param rhs right-hand side features
         * @param sq_rhs squared norms of rhs vectors
         * @param idx_b vector index in rhs
         * @return squared distance (abs() guards against tiny negative
         *         values from floating point cancellation)
         */
        DREAL compute_squared_norm(CSparseFeatures<DREAL>* lhs, DREAL* sq_lhs, INT idx_a, CSparseFeatures<DREAL>* rhs, DREAL* sq_rhs, INT idx_b)
        {
            INT i,j;
            INT alen, blen;
            bool afree, bfree;
            ASSERT(lhs);
            ASSERT(rhs);

            TSparseEntry<DREAL>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
            TSparseEntry<DREAL>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
            ASSERT(avec);
            ASSERT(bvec);

            DREAL result=sq_lhs[idx_a]+sq_rhs[idx_b];

            // merge-walk the sorted index lists, iterating over the shorter
            // vector and scanning the longer one, subtracting 2*a_i*b_i for
            // every common feature index
            if (alen<=blen)
            {
                j=0;
                for (i=0; i<alen; i++)
                {
                    INT a_feat_idx=avec[i].feat_index;

                    while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
                        j++;

                    if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
                    {
                        result-=2*(avec[i].entry*bvec[j].entry);
                        j++;
                    }
                }
            }
            else
            {
                j=0;
                for (i=0; i<blen; i++)
                {
                    INT b_feat_idx=bvec[i].feat_index;

                    while ((j<alen) && (avec[j].feat_index<b_feat_idx))
                        j++;

                    if ((j<alen) && (avec[j].feat_index == b_feat_idx))
                    {
                        result-=2*(bvec[i].entry*avec[j].entry);
                        j++;
                    }
                }
            }

            ((CSparseFeatures<DREAL>*) lhs)->free_feature_vector(avec, idx_a, afree);
            ((CSparseFeatures<DREAL>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

            return CMath::abs(result);
        }
00866 
00872         CLabels* load_svmlight_file(CHAR* fname)
00873         {
00874             CLabels* lab=NULL;
00875 
00876             size_t blocksize=1024*1024;
00877             size_t required_blocksize=blocksize;
00878             BYTE* dummy=new BYTE[blocksize];
00879             FILE* f=fopen(fname, "ro");
00880 
00881             if (f)
00882             {
00883                 free_sparse_feature_matrix();
00884                 num_vectors=0;
00885                 num_features=0;
00886 
00887                 SG_INFO("counting line numbers in file %s\n", fname);
00888                 size_t sz=blocksize;
00889                 size_t block_offs=0;
00890                 size_t old_block_offs=0;
00891                 fseek(f, 0, SEEK_END);
00892                 size_t fsize=ftell(f);
00893                 rewind(f);
00894 
00895                 while (sz == blocksize)
00896                 {
00897                     sz=fread(dummy, sizeof(BYTE), blocksize, f);
00898                     bool contains_cr=false;
00899                     for (size_t i=0; i<sz; i++)
00900                     {
00901                         block_offs++;
00902                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00903                         {
00904                             num_vectors++;
00905                             contains_cr=true;
00906                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00907                             old_block_offs=block_offs;
00908                         }
00909                     }
00910                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00911                 }
00912 
00913                 SG_INFO("found %d feature vectors\n", num_vectors);
00914                 delete[] dummy;
00915                 blocksize=required_blocksize;
00916                 dummy = new BYTE[blocksize+1]; //allow setting of '\0' at EOL
00917 
00918                 lab=new CLabels(num_vectors);
00919                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00920 
00921                 rewind(f);
00922                 sz=blocksize;
00923                 INT lines=0;
00924                 while (sz == blocksize)
00925                 {
00926                     sz=fread(dummy, sizeof(BYTE), blocksize, f);
00927 
00928                     size_t old_sz=0;
00929                     for (size_t i=0; i<sz; i++)
00930                     {
00931                         if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00932                         {
00933                             size_t len=i-old_sz+1;
00934                             BYTE* data=&dummy[old_sz];
00935 
00936                             for (INT j=0; j<len; j++)
00937                                 dummy[j]=data[j];
00938 
00939                             sz=fread(dummy+len, sizeof(BYTE), blocksize-len, f);
00940                             i=0;
00941                             old_sz=0;
00942                             sz+=len;
00943                         }
00944 
00945                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00946                         {
00947 
00948                             size_t len=i-old_sz;
00949                             BYTE* data=&dummy[old_sz];
00950 
00951                             INT dims=0;
00952                             for (INT j=0; j<len; j++)
00953                             {
00954                                 if (data[j]==':')
00955                                     dims++;
00956                             }
00957 
00958                             if (dims<=0)
00959                             {
00960                                 SG_ERROR("Error in line %d - number of"
00961                                         " dimensions is %d line is %d characters"
00962                                         " long\n line_content:'%.*s'\n", lines,
00963                                         dims, len, len, (const char*) data);
00964                             }
00965 
00966                             TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
00967                             INT j=0;
00968                             for (; j<len; j++)
00969                             {
00970                                 if (data[j]==' ')
00971                                 {
00972                                     data[j]='\0';
00973 
00974                                     lab->set_label(lines, atof((const char*) data));
00975                                     break;
00976                                 }
00977                             }
00978 
00979                             INT d=0;
00980                             j++;
00981                             BYTE* start=&data[j];
00982                             for (; j<len; j++)
00983                             {
00984                                 if (data[j]==':')
00985                                 {
00986                                     data[j]='\0';
00987 
00988                                     feat[d].feat_index=(INT) atoi((const char*) start)-1;
00989                                     num_features=CMath::max(num_features, feat[d].feat_index+1);
00990 
00991                                     j++;
00992                                     start=&data[j];
00993                                     for (; j<len; j++)
00994                                     {
00995                                         if (data[j]==' ' || data[j]=='\n')
00996                                         {
00997                                             data[j]='\0';
00998                                             feat[d].entry=(ST) atof((const char*) start);
00999                                             d++;
01000                                             break;
01001                                         }
01002                                     }
01003 
01004                                     if (j==len)
01005                                     {
01006                                         data[j]='\0';
01007                                         feat[dims-1].entry=(ST) atof((const char*) start);
01008                                     }
01009 
01010                                     j++;
01011                                     start=&data[j];
01012                                 }
01013                             }
01014 
01015                             sparse_feature_matrix[lines].vec_index=lines;
01016                             sparse_feature_matrix[lines].num_feat_entries=dims;
01017                             sparse_feature_matrix[lines].features=feat;
01018 
01019                             old_sz=i+1;
01020                             lines++;
01021                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01022                         }
01023                     }
01024                 }
01025                 SG_INFO("file successfully read\n");
01026                 fclose(f);
01027             }
01028 
01029             delete[] dummy;
01030 
01031             return lab;
01032         }
01033 
01040         bool write_svmlight_file(CHAR* fname, CLabels* label)
01041         {
01042             ASSERT(label);
01043             INT num=label->get_num_labels();
01044             ASSERT(num>0);
01045             ASSERT(num==num_vectors);
01046 
01047             FILE* f=fopen(fname, "wb");
01048 
01049             if (f)
01050             {
01051                 for (INT i=0; i<num; i++)
01052                 {
01053                     fprintf(f, "%d ", (INT) label->get_int_label(i));
01054 
01055                     TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01056                     INT num_feat = sparse_feature_matrix[i].num_feat_entries;
01057 
01058                     for (INT j=0; j<num_feat; j++)
01059                     {
01060                         if (j<num_feat-1)
01061                             fprintf(f, "%d:%f ", (INT) vec[j].feat_index+1, (double) vec[j].entry);
01062                         else
01063                             fprintf(f, "%d:%f\n", (INT) vec[j].feat_index+1, (double) vec[j].entry);
01064                     }
01065                 }
01066 
01067                 fclose(f);
01068                 return true;
01069             }
01070             return false;
01071         }
01072 
01073     protected:
        /** compute feature vector num on the fly; this base implementation
         * returns no features (len=0, NULL) — subclasses that compute
         * vectors on demand override it
         *
         * @param num index of the vector to compute
         * @param len out: number of entries in the computed vector
         * @param target optional preallocated target buffer (e.g. a cache
         *        slot) to write into
         * @return the computed vector (NULL here)
         */
        virtual TSparseEntry<ST>* compute_sparse_feature_vector(INT num, INT& len, TSparseEntry<ST>* target=NULL)
        {
            len=0;
            return NULL;
        }
01089 
01090     protected:
01091 
        /// total number of feature vectors
        INT num_vectors;

        /// total number of features per vector
        INT num_features;

        /// array of num_vectors sparse vectors (may be NULL when vectors
        /// are computed on the fly, see get_sparse_feature_vector)
        TSparse<ST>* sparse_feature_matrix;

        /// cache for feature vectors computed on the fly
        CCache< TSparseEntry<ST> >* feature_cache;
01103 };
01104 
01105 
/** @return feature type F_CHAR for CSparseFeatures<CHAR> */
template<> inline EFeatureType CSparseFeatures<CHAR>::get_feature_type()
{
    return F_CHAR;
}

/** @return feature type F_BYTE for CSparseFeatures<BYTE> */
template<> inline EFeatureType CSparseFeatures<BYTE>::get_feature_type()
{
    return F_BYTE;
}

/** @return feature type F_SHORT for CSparseFeatures<SHORT> */
template<> inline EFeatureType CSparseFeatures<SHORT>::get_feature_type()
{
    return F_SHORT;
}

/** @return feature type F_WORD for CSparseFeatures<WORD> */
template<> inline EFeatureType CSparseFeatures<WORD>::get_feature_type()
{
    return F_WORD;
}

/** @return feature type F_INT for CSparseFeatures<INT> */
template<> inline EFeatureType CSparseFeatures<INT>::get_feature_type()
{
    return F_INT;
}

/** @return feature type F_UINT for CSparseFeatures<UINT> */
template<> inline EFeatureType CSparseFeatures<UINT>::get_feature_type()
{
    return F_UINT;
}

/** @return feature type F_LONG for CSparseFeatures<LONG> */
template<> inline EFeatureType CSparseFeatures<LONG>::get_feature_type()
{
    return F_LONG;
}

/** @return feature type F_ULONG for CSparseFeatures<ULONG> */
template<> inline EFeatureType CSparseFeatures<ULONG>::get_feature_type()
{
    return F_ULONG;
}

/** @return feature type F_DREAL for CSparseFeatures<DREAL> */
template<> inline EFeatureType CSparseFeatures<DREAL>::get_feature_type()
{
    return F_DREAL;
}

/** @return feature type F_SHORTREAL for CSparseFeatures<SHORTREAL> */
template<> inline EFeatureType CSparseFeatures<SHORTREAL>::get_feature_type()
{
    return F_SHORTREAL;
}

/** @return feature type F_LONGREAL for CSparseFeatures<LONGREAL> */
template<> inline EFeatureType CSparseFeatures<LONGREAL>::get_feature_type()
{
    return F_LONGREAL;
}
01204 #endif /* _SPARSEFEATURES__H__ */

SHOGUN Machine Learning Toolbox - Documentation