File.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2008 Soeren Sonnenburg
00008  * Copyright (C) 1999-2008 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016 
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019 
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022 
00023 
00024 CFile::CFile(FILE* f)
00025 : CSGObject()
00026 {
00027     file=f;
00028     filename=NULL;
00029     expected_type=F_UNKNOWN;
00030 }
00031 
00032 CFile::CFile(CHAR* fname, CHAR rw, EFeatureType typ, CHAR file_fourcc[4])
00033 : CSGObject()
00034 {
00035     status=false;
00036     task=rw;
00037     expected_type=typ;
00038     filename=strdup(fname);
00039     CHAR mode[2];
00040     mode[0]=rw;
00041     mode[1]='\0';
00042 
00043 
00044     if (rw=='r' || rw == 'w')
00045     {
00046         if (filename)
00047         {
00048             if ((file=fopen((const CHAR*) filename, (const CHAR*) mode)))
00049                 status=true;
00050         }
00051     }
00052     else
00053         SG_ERROR("unknown mode '%c'\n", mode[0]);
00054 
00055     if (file_fourcc)
00056     {
00057         if (rw=='r')
00058             status=read_header();
00059         else if (rw=='w')
00060             status=write_header();
00061 
00062         if (!status)
00063             fclose(file);
00064 
00065         file=NULL;
00066     }
00067 }
00068 
00069 CFile::~CFile()
00070 {
00071     free(filename);
00072     if (file)
00073       fclose(file);
00074     filename=NULL;
00075     file=NULL;
00076 }
00077 
00078 INT* CFile::load_int_data(INT* target, LONG& num)
00079 {
00080     ASSERT(expected_type==F_INT);
00081     CSimpleFile<INT> f(filename, file);
00082     target=f.load(target, num);
00083     status=(target!=NULL);
00084     return target;
00085 }
00086 
00087 bool CFile::save_int_data(INT* src, LONG num)
00088 {
00089     ASSERT(expected_type==F_INT);
00090     CSimpleFile<INT> f(filename, file);
00091     status=f.save(src, num);
00092     return status;
00093 }
00094 
00095 DREAL* CFile::load_real_data(DREAL* target, LONG& num)
00096 {
00097     ASSERT(expected_type==F_DREAL);
00098     CSimpleFile<DREAL> f(filename, file);
00099     target=f.load(target, num);
00100     status=(target!=NULL);
00101     return target;
00102 }
00103 
00104 SHORTREAL* CFile::load_shortreal_data(SHORTREAL* target, LONG& num)
00105 {
00106     ASSERT(expected_type==F_SHORTREAL);
00107     CSimpleFile<SHORTREAL> f(filename, file);
00108     target=f.load(target, num);
00109     status=(target!=NULL);
00110     return target;
00111 }
00112 
00113 bool CFile::save_real_data(DREAL* src, LONG num)
00114 {
00115     ASSERT(expected_type==F_DREAL);
00116     CSimpleFile<DREAL> f(filename, file);
00117     status=f.save(src, num);
00118     return status;
00119 }
00120 
00121 bool CFile::save_shortreal_data(SHORTREAL* src, LONG num)
00122 {
00123     ASSERT(expected_type==F_SHORTREAL);
00124     CSimpleFile<SHORTREAL> f(filename, file);
00125     status=f.save(src, num);
00126     return status;
00127 }
00128 
00129 CHAR* CFile::load_char_data(CHAR* target, LONG& num)
00130 {
00131     ASSERT(expected_type==F_CHAR);
00132     CSimpleFile<CHAR> f(filename, file);
00133     target=f.load(target, num);
00134     status=(target!=NULL);
00135     return target;
00136 }
00137 
00138 bool CFile::save_char_data(CHAR* src, LONG num)
00139 {
00140     ASSERT(expected_type==F_CHAR);
00141     CSimpleFile<CHAR> f(filename, file);
00142     status=f.save(src, num);
00143     return status;
00144 }
00145 
00146 BYTE* CFile::load_byte_data(BYTE* target, LONG& num)
00147 {
00148     ASSERT(expected_type==F_BYTE);
00149     CSimpleFile<BYTE> f(filename, file);
00150     target=f.load(target, num);
00151     status=(target!=NULL);
00152     return target;
00153 }
00154 
00155 bool CFile::save_byte_data(BYTE* src, LONG num)
00156 {
00157     ASSERT(expected_type==F_BYTE);
00158     CSimpleFile<BYTE> f(filename, file);
00159     status=f.save(src, num);
00160     return status;
00161 }
00162 
00163 WORD* CFile::load_word_data(WORD* target, LONG& num)
00164 {
00165     ASSERT(expected_type==F_WORD);
00166     CSimpleFile<WORD> f(filename, file);
00167     target=f.load(target, num);
00168     status=(target!=NULL);
00169     return target;
00170 }
00171 
00172 bool CFile::save_word_data(WORD* src, LONG num)
00173 {
00174     ASSERT(expected_type==F_WORD);
00175     CSimpleFile<WORD> f(filename, file);
00176     status=f.save(src, num);
00177     return status;
00178 }
00179 
00180 SHORT* CFile::load_short_data(SHORT* target, LONG& num)
00181 {
00182     ASSERT(expected_type==F_SHORT);
00183     CSimpleFile<SHORT> f(filename, file);
00184     target=f.load(target, num);
00185     status=(target!=NULL);
00186     return target;
00187 }
00188 
00189 bool CFile::save_short_data(SHORT* src, LONG num)
00190 {
00191     ASSERT(expected_type==F_SHORT);
00192     CSimpleFile<SHORT> f(filename, file);
00193     status=f.save(src, num);
00194     return status;
00195 }
00196 
00197 INT CFile::parse_first_header(EFeatureType &type)
00198 {
00199     return -1;
00200 }
00201 
00202 INT CFile::parse_next_header(EFeatureType &type)
00203 {
00204     return -1;
00205 }
00206 
00207 
00208 bool CFile::read_header()
00209 {
00210     ASSERT(file);
00211     UINT intlen=0;
00212     UINT endian=0;
00213     UINT file_fourcc=0;
00214     UINT doublelen=0;
00215 
00216     if ( (fread(&intlen, sizeof(BYTE), 1, file)==1) &&
00217             (fread(&doublelen, sizeof(BYTE), 1, file)==1) &&
00218             (fread(&endian, (UINT) intlen, 1, file)== 1) &&
00219             (fread(&file_fourcc, (UINT) intlen, 1, file)==1))
00220         return true;
00221     else
00222         return false;
00223 }
00224 
00225 bool CFile::write_header()
00226 {
00227     BYTE intlen=sizeof(UINT);
00228     BYTE doublelen=sizeof(double);
00229     UINT endian=0x12345678;
00230 
00231     if ((fwrite(&intlen, sizeof(BYTE), 1, file)==1) &&
00232             (fwrite(&doublelen, sizeof(BYTE), 1, file)==1) &&
00233             (fwrite(&endian, sizeof(UINT), 1, file)==1) &&
00234             (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00235         return true;
00236     else
00237         return false;
00238 }
00239 
00240 template <class T> void CFile::append_item(CDynamicArray<T>* items, CHAR* ptr_data, CHAR* ptr_item)
00241 {
00242     size_t len=(ptr_data-ptr_item)/sizeof(CHAR);
00243     CHAR* item=new CHAR[len+1];
00244     memset(item, 0, sizeof(CHAR)*(len+1));
00245     item=strncpy(item, ptr_item, len);
00246 
00247     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00248     items->append_element(item);
00249 }
00250 
00251 bool CFile::read_real_valued_dense(DREAL*& matrix, INT& num_feat, INT& num_vec)
00252 {
00253     ASSERT(expected_type==F_DREAL);
00254 
00255     struct stat stats;
00256     if (stat(filename, &stats)!=0)
00257         SG_ERROR("Could not get file statistics.\n");
00258 
00259     CHAR* data=new CHAR[stats.st_size+1];
00260     memset(data, 0, sizeof(CHAR)*(stats.st_size+1));
00261     size_t nread=fread(data, sizeof(CHAR), stats.st_size, file);
00262     if (nread<=0)
00263         SG_ERROR("Could not read data from %s.\n");
00264 
00265     SG_DEBUG("data read from file:\n%s\n", data);
00266 
00267     // determine num_feat and num_vec, populate dynamic array
00268     INT nf=0;
00269     num_feat=0;
00270     num_vec=0;
00271     CHAR* ptr_item=NULL;
00272     CHAR* ptr_data=data;
00273     CDynamicArray<CHAR*>* items=new CDynamicArray<CHAR*>();
00274 
00275     while (*ptr_data)
00276     {
00277         if (*ptr_data=='\n')
00278         {
00279             if (ptr_item)
00280                 nf++;
00281 
00282             if (num_feat!=0 && nf!=num_feat)
00283                 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00284 
00285             append_item(items, ptr_data, ptr_item);
00286             num_feat=nf;
00287             num_vec++;
00288             nf=0;
00289             ptr_item=NULL;
00290         }
00291         else if (!isblank(*ptr_data) && !ptr_item)
00292         {
00293             ptr_item=ptr_data;
00294         }
00295         else if (isblank(*ptr_data) && ptr_item)
00296         {
00297             append_item(items, ptr_data, ptr_item);
00298             ptr_item=NULL;
00299             nf++;
00300         }
00301 
00302         ptr_data++;
00303     }
00304 
00305     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00306     delete[] data;
00307 
00308     // now copy data into matrix
00309     matrix=new DREAL[num_vec*num_feat];
00310     for (INT i=0; i<num_vec; i++)
00311     {
00312         for (INT j=0; j<num_feat; j++)
00313         {
00314             CHAR* item=items->get_element(i*num_feat+j);
00315             matrix[i*num_feat+j]=atof(item);
00316             delete[] item;
00317         }
00318     }
00319     delete items;
00320 
00321     //CMath::display_matrix(matrix, num_feat, num_vec);
00322     return true;
00323 }
00324 
00325 bool CFile::write_real_valued_dense(const DREAL* matrix, INT num_feat, INT num_vec)
00326 {
00327     if (!(file && matrix))
00328         SG_ERROR("File or matrix invalid.\n");
00329 
00330     for (INT j=0; j<num_vec; j++)
00331     {
00332         for (INT i=0; i<num_feat; i++)
00333         {
00334             DREAL v=matrix[num_feat*j+i];
00335             if (i==num_feat-1)
00336                 fprintf(file, "%f\n", v);
00337             else
00338                 fprintf(file, "%f ", v);
00339         }
00340     }
00341 
00342     return true;
00343 }
00344 
00345 bool CFile::read_real_valued_sparse(TSparse<DREAL>*& matrix, INT& num_feat, INT& num_vec)
00346 {
00347     size_t blocksize=1024*1024;
00348     size_t required_blocksize=blocksize;
00349     BYTE* dummy=new BYTE[blocksize];
00350 
00351     if (file)
00352     {
00353         num_vec=0;
00354         num_feat=0;
00355 
00356         SG_INFO("counting line numbers in file %s\n", filename);
00357         size_t sz=blocksize;
00358         size_t block_offs=0;
00359         size_t old_block_offs=0;
00360         fseek(file, 0, SEEK_END);
00361         size_t fsize=ftell(file);
00362         rewind(file);
00363 
00364         while (sz == blocksize)
00365         {
00366             sz=fread(dummy, sizeof(BYTE), blocksize, file);
00367             bool contains_cr=false;
00368             for (size_t i=0; i<sz; i++)
00369             {
00370                 block_offs++;
00371                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00372                 {
00373                     num_vec++;
00374                     contains_cr=true;
00375                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00376                     old_block_offs=block_offs;
00377                 }
00378             }
00379             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00380         }
00381 
00382         SG_INFO("found %d feature vectors\n", num_vec);
00383         delete[] dummy;
00384         blocksize=required_blocksize;
00385         dummy = new BYTE[blocksize+1]; //allow setting of '\0' at EOL
00386         matrix=new TSparse<DREAL>[num_vec];
00387 
00388         rewind(file);
00389         sz=blocksize;
00390         INT lines=0;
00391         while (sz == blocksize)
00392         {
00393             sz=fread(dummy, sizeof(BYTE), blocksize, file);
00394 
00395             size_t old_sz=0;
00396             for (size_t i=0; i<sz; i++)
00397             {
00398                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00399                 {
00400                     size_t len=i-old_sz+1;
00401                     BYTE* data=&dummy[old_sz];
00402 
00403                     for (size_t j=0; j<len; j++)
00404                         dummy[j]=data[j];
00405 
00406                     sz=fread(dummy+len, sizeof(BYTE), blocksize-len, file);
00407                     i=0;
00408                     old_sz=0;
00409                     sz+=len;
00410                 }
00411 
00412                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00413                 {
00414 
00415                     size_t len=i-old_sz;
00416                     BYTE* data=&dummy[old_sz];
00417 
00418                     INT dims=0;
00419                     for (size_t j=0; j<len; j++)
00420                     {
00421                         if (data[j]==':')
00422                             dims++;
00423                     }
00424 
00425                     if (dims<=0)
00426                     {
00427                         SG_ERROR("Error in line %d - number of"
00428                                 " dimensions is %d line is %d characters"
00429                                 " long\n line_content:'%.*s'\n", lines,
00430                                 dims, len, len, (const char*) data);
00431                     }
00432 
00433                     TSparseEntry<DREAL>* feat=new TSparseEntry<DREAL>[dims];
00434 
00435                     //skip label part
00436                     size_t j=0;
00437                     for (; j<len; j++)
00438                     {
00439                         if (data[j]==':')
00440                         {
00441                             j=-1; //file without label
00442                             break;
00443                         }
00444 
00445                         if (data[j]==' ')
00446                         {
00447                             data[j]='\0';
00448 
00449                             //skip label part
00450                             break;
00451                         }
00452                     }
00453 
00454                     INT d=0;
00455                     j++;
00456                     BYTE* start=&data[j];
00457                     for (; j<len; j++)
00458                     {
00459                         if (data[j]==':')
00460                         {
00461                             data[j]='\0';
00462 
00463                             feat[d].feat_index=(INT) atoi((const char*) start)-1;
00464                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00465 
00466                             j++;
00467                             start=&data[j];
00468                             for (; j<len; j++)
00469                             {
00470                                 if (data[j]==' ' || data[j]=='\n')
00471                                 {
00472                                     data[j]='\0';
00473                                     feat[d].entry=(DREAL) atof((const char*) start);
00474                                     d++;
00475                                     break;
00476                                 }
00477                             }
00478 
00479                             if (j==len)
00480                             {
00481                                 data[j]='\0';
00482                                 feat[dims-1].entry=(DREAL) atof((const char*) start);
00483                             }
00484 
00485                             j++;
00486                             start=&data[j];
00487                         }
00488                     }
00489 
00490                     matrix[lines].vec_index=lines;
00491                     matrix[lines].num_feat_entries=dims;
00492                     matrix[lines].features=feat;
00493 
00494                     old_sz=i+1;
00495                     lines++;
00496                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00497                 }
00498             }
00499         }
00500 
00501         SG_INFO("file successfully read\n");
00502     }
00503 
00504     delete[] dummy;
00505     return true;
00506 }
00507 
00508 bool CFile::write_real_valued_sparse(const TSparse<DREAL>* matrix, INT num_feat, INT num_vec)
00509 {
00510     if (!(file && matrix))
00511         SG_ERROR("File or matrix invalid.\n");
00512 
00513     for (INT i=0; i<num_vec; i++)
00514     {
00515         TSparseEntry<DREAL>* vec = matrix[i].features;
00516         INT len=matrix[i].num_feat_entries;
00517 
00518         for (INT j=0; j<len; j++)
00519         {
00520             if (j<len-1)
00521                 fprintf(file, "%d:%f ", (INT) vec[j].feat_index+1, (double) vec[j].entry);
00522             else
00523                 fprintf(file, "%d:%f\n", (INT) vec[j].feat_index+1, (double) vec[j].entry);
00524         }
00525     }
00526 
00527     return true;
00528 }
00529 
00530 
00531 bool CFile::read_char_valued_strings(T_STRING<CHAR>*& strings, INT& num_str, INT& max_string_len)
00532 {
00533     bool result=false;
00534 
00535     size_t blocksize=1024*1024;
00536     size_t required_blocksize=0;
00537     CHAR* dummy=new CHAR[blocksize];
00538     CHAR* overflow=NULL;
00539     INT overflow_len=0;
00540 
00541     if (file)
00542     {
00543         num_str=0;
00544         max_string_len=0;
00545 
00546         SG_INFO("counting line numbers in file %s\n", filename);
00547         size_t sz=blocksize;
00548         size_t block_offs=0;
00549         size_t old_block_offs=0;
00550         fseek(file, 0, SEEK_END);
00551         size_t fsize=ftell(file);
00552         rewind(file);
00553 
00554         while (sz == blocksize)
00555         {
00556             sz=fread(dummy, sizeof(CHAR), blocksize, file);
00557             bool contains_cr=false;
00558             for (size_t i=0; i<sz; i++)
00559             {
00560                 block_offs++;
00561                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00562                 {
00563                     num_str++;
00564                     contains_cr=true;
00565                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00566                     old_block_offs=block_offs;
00567                 }
00568             }
00569             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00570         }
00571 
00572         SG_INFO("found %d strings\n", num_str);
00573         SG_DEBUG("block_size=%d\n", required_blocksize);
00574         delete[] dummy;
00575         blocksize=required_blocksize;
00576         dummy=new CHAR[blocksize];
00577         overflow=new CHAR[blocksize];
00578         strings=new T_STRING<CHAR>[num_str];
00579 
00580         rewind(file);
00581         sz=blocksize;
00582         INT lines=0;
00583         size_t old_sz=0;
00584         while (sz == blocksize)
00585         {
00586             sz=fread(dummy, sizeof(CHAR), blocksize, file);
00587 
00588             old_sz=0;
00589             for (size_t i=0; i<sz; i++)
00590             {
00591                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00592                 {
00593                     INT len=i-old_sz;
00594                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00595 
00596                     strings[lines].length=len+overflow_len;
00597                     strings[lines].string=new CHAR[len+overflow_len];
00598 
00599                     for (INT j=0; j<overflow_len; j++)
00600                         strings[lines].string[j]=overflow[j];
00601                     for (INT j=0; j<len; j++)
00602                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00603 
00604                     // clear overflow
00605                     overflow_len=0;
00606 
00607                     //CMath::display_vector(strings[lines].string, len);
00608                     old_sz=i+1;
00609                     lines++;
00610                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00611                 }
00612             }
00613 
00614             for (size_t i=old_sz; i<sz; i++)
00615                 overflow[i-old_sz]=dummy[i];
00616 
00617             overflow_len=sz-old_sz;
00618         }
00619         result=true;
00620         SG_INFO("file successfully read\n");
00621         SG_INFO("max_string_length=%d\n", max_string_len);
00622         SG_INFO("num_strings=%d\n", num_str);
00623     }
00624 
00625     delete[] dummy;
00626     delete[] overflow;
00627 
00628     return result;
00629 }
00630 
00631 bool CFile::write_char_valued_strings(const T_STRING<CHAR>* strings, INT num_str)
00632 {
00633     if (!(file && strings))
00634         SG_ERROR("File or strings invalid.\n");
00635 
00636     for (INT i=0; i<num_str; i++)
00637     {
00638         INT len = strings[i].length;
00639         fwrite(strings[i].string, sizeof(CHAR), len, file);
00640         fprintf(file, "\n");
00641     }
00642 
00643     return true;
00644 }
00645 
00646 

SHOGUN Machine Learning Toolbox - Documentation