00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022
00023
00024 CFile::CFile(FILE* f)
00025 : CSGObject()
00026 {
00027 file=f;
00028 filename=NULL;
00029 expected_type=F_UNKNOWN;
00030 }
00031
00032 CFile::CFile(CHAR* fname, CHAR rw, EFeatureType typ, CHAR file_fourcc[4])
00033 : CSGObject()
00034 {
00035 status=false;
00036 task=rw;
00037 expected_type=typ;
00038 filename=strdup(fname);
00039 CHAR mode[2];
00040 mode[0]=rw;
00041 mode[1]='\0';
00042
00043
00044 if (rw=='r' || rw == 'w')
00045 {
00046 if (filename)
00047 {
00048 if ((file=fopen((const CHAR*) filename, (const CHAR*) mode)))
00049 status=true;
00050 }
00051 }
00052 else
00053 SG_ERROR("unknown mode '%c'\n", mode[0]);
00054
00055 if (file_fourcc)
00056 {
00057 if (rw=='r')
00058 status=read_header();
00059 else if (rw=='w')
00060 status=write_header();
00061
00062 if (!status)
00063 fclose(file);
00064
00065 file=NULL;
00066 }
00067 }
00068
00069 CFile::~CFile()
00070 {
00071 free(filename);
00072 if (file)
00073 fclose(file);
00074 filename=NULL;
00075 file=NULL;
00076 }
00077
00078 INT* CFile::load_int_data(INT* target, LONG& num)
00079 {
00080 ASSERT(expected_type==F_INT);
00081 CSimpleFile<INT> f(filename, file);
00082 target=f.load(target, num);
00083 status=(target!=NULL);
00084 return target;
00085 }
00086
00087 bool CFile::save_int_data(INT* src, LONG num)
00088 {
00089 ASSERT(expected_type==F_INT);
00090 CSimpleFile<INT> f(filename, file);
00091 status=f.save(src, num);
00092 return status;
00093 }
00094
00095 DREAL* CFile::load_real_data(DREAL* target, LONG& num)
00096 {
00097 ASSERT(expected_type==F_DREAL);
00098 CSimpleFile<DREAL> f(filename, file);
00099 target=f.load(target, num);
00100 status=(target!=NULL);
00101 return target;
00102 }
00103
00104 SHORTREAL* CFile::load_shortreal_data(SHORTREAL* target, LONG& num)
00105 {
00106 ASSERT(expected_type==F_SHORTREAL);
00107 CSimpleFile<SHORTREAL> f(filename, file);
00108 target=f.load(target, num);
00109 status=(target!=NULL);
00110 return target;
00111 }
00112
00113 bool CFile::save_real_data(DREAL* src, LONG num)
00114 {
00115 ASSERT(expected_type==F_DREAL);
00116 CSimpleFile<DREAL> f(filename, file);
00117 status=f.save(src, num);
00118 return status;
00119 }
00120
00121 bool CFile::save_shortreal_data(SHORTREAL* src, LONG num)
00122 {
00123 ASSERT(expected_type==F_SHORTREAL);
00124 CSimpleFile<SHORTREAL> f(filename, file);
00125 status=f.save(src, num);
00126 return status;
00127 }
00128
00129 CHAR* CFile::load_char_data(CHAR* target, LONG& num)
00130 {
00131 ASSERT(expected_type==F_CHAR);
00132 CSimpleFile<CHAR> f(filename, file);
00133 target=f.load(target, num);
00134 status=(target!=NULL);
00135 return target;
00136 }
00137
00138 bool CFile::save_char_data(CHAR* src, LONG num)
00139 {
00140 ASSERT(expected_type==F_CHAR);
00141 CSimpleFile<CHAR> f(filename, file);
00142 status=f.save(src, num);
00143 return status;
00144 }
00145
00146 BYTE* CFile::load_byte_data(BYTE* target, LONG& num)
00147 {
00148 ASSERT(expected_type==F_BYTE);
00149 CSimpleFile<BYTE> f(filename, file);
00150 target=f.load(target, num);
00151 status=(target!=NULL);
00152 return target;
00153 }
00154
00155 bool CFile::save_byte_data(BYTE* src, LONG num)
00156 {
00157 ASSERT(expected_type==F_BYTE);
00158 CSimpleFile<BYTE> f(filename, file);
00159 status=f.save(src, num);
00160 return status;
00161 }
00162
00163 WORD* CFile::load_word_data(WORD* target, LONG& num)
00164 {
00165 ASSERT(expected_type==F_WORD);
00166 CSimpleFile<WORD> f(filename, file);
00167 target=f.load(target, num);
00168 status=(target!=NULL);
00169 return target;
00170 }
00171
00172 bool CFile::save_word_data(WORD* src, LONG num)
00173 {
00174 ASSERT(expected_type==F_WORD);
00175 CSimpleFile<WORD> f(filename, file);
00176 status=f.save(src, num);
00177 return status;
00178 }
00179
00180 SHORT* CFile::load_short_data(SHORT* target, LONG& num)
00181 {
00182 ASSERT(expected_type==F_SHORT);
00183 CSimpleFile<SHORT> f(filename, file);
00184 target=f.load(target, num);
00185 status=(target!=NULL);
00186 return target;
00187 }
00188
00189 bool CFile::save_short_data(SHORT* src, LONG num)
00190 {
00191 ASSERT(expected_type==F_SHORT);
00192 CSimpleFile<SHORT> f(filename, file);
00193 status=f.save(src, num);
00194 return status;
00195 }
00196
00197 INT CFile::parse_first_header(EFeatureType &type)
00198 {
00199 return -1;
00200 }
00201
00202 INT CFile::parse_next_header(EFeatureType &type)
00203 {
00204 return -1;
00205 }
00206
00207
00208 bool CFile::read_header()
00209 {
00210 ASSERT(file);
00211 UINT intlen=0;
00212 UINT endian=0;
00213 UINT file_fourcc=0;
00214 UINT doublelen=0;
00215
00216 if ( (fread(&intlen, sizeof(BYTE), 1, file)==1) &&
00217 (fread(&doublelen, sizeof(BYTE), 1, file)==1) &&
00218 (fread(&endian, (UINT) intlen, 1, file)== 1) &&
00219 (fread(&file_fourcc, (UINT) intlen, 1, file)==1))
00220 return true;
00221 else
00222 return false;
00223 }
00224
00225 bool CFile::write_header()
00226 {
00227 BYTE intlen=sizeof(UINT);
00228 BYTE doublelen=sizeof(double);
00229 UINT endian=0x12345678;
00230
00231 if ((fwrite(&intlen, sizeof(BYTE), 1, file)==1) &&
00232 (fwrite(&doublelen, sizeof(BYTE), 1, file)==1) &&
00233 (fwrite(&endian, sizeof(UINT), 1, file)==1) &&
00234 (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00235 return true;
00236 else
00237 return false;
00238 }
00239
00240 template <class T> void CFile::append_item(CDynamicArray<T>* items, CHAR* ptr_data, CHAR* ptr_item)
00241 {
00242 size_t len=(ptr_data-ptr_item)/sizeof(CHAR);
00243 CHAR* item=new CHAR[len+1];
00244 memset(item, 0, sizeof(CHAR)*(len+1));
00245 item=strncpy(item, ptr_item, len);
00246
00247 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00248 items->append_element(item);
00249 }
00250
00251 bool CFile::read_real_valued_dense(DREAL*& matrix, INT& num_feat, INT& num_vec)
00252 {
00253 ASSERT(expected_type==F_DREAL);
00254
00255 struct stat stats;
00256 if (stat(filename, &stats)!=0)
00257 SG_ERROR("Could not get file statistics.\n");
00258
00259 CHAR* data=new CHAR[stats.st_size+1];
00260 memset(data, 0, sizeof(CHAR)*(stats.st_size+1));
00261 size_t nread=fread(data, sizeof(CHAR), stats.st_size, file);
00262 if (nread<=0)
00263 SG_ERROR("Could not read data from %s.\n");
00264
00265 SG_DEBUG("data read from file:\n%s\n", data);
00266
00267
00268 INT nf=0;
00269 num_feat=0;
00270 num_vec=0;
00271 CHAR* ptr_item=NULL;
00272 CHAR* ptr_data=data;
00273 CDynamicArray<CHAR*>* items=new CDynamicArray<CHAR*>();
00274
00275 while (*ptr_data)
00276 {
00277 if (*ptr_data=='\n')
00278 {
00279 if (ptr_item)
00280 nf++;
00281
00282 if (num_feat!=0 && nf!=num_feat)
00283 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00284
00285 append_item(items, ptr_data, ptr_item);
00286 num_feat=nf;
00287 num_vec++;
00288 nf=0;
00289 ptr_item=NULL;
00290 }
00291 else if (!isblank(*ptr_data) && !ptr_item)
00292 {
00293 ptr_item=ptr_data;
00294 }
00295 else if (isblank(*ptr_data) && ptr_item)
00296 {
00297 append_item(items, ptr_data, ptr_item);
00298 ptr_item=NULL;
00299 nf++;
00300 }
00301
00302 ptr_data++;
00303 }
00304
00305 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00306 delete[] data;
00307
00308
00309 matrix=new DREAL[num_vec*num_feat];
00310 for (INT i=0; i<num_vec; i++)
00311 {
00312 for (INT j=0; j<num_feat; j++)
00313 {
00314 CHAR* item=items->get_element(i*num_feat+j);
00315 matrix[i*num_feat+j]=atof(item);
00316 delete[] item;
00317 }
00318 }
00319 delete items;
00320
00321
00322 return true;
00323 }
00324
00325 bool CFile::write_real_valued_dense(const DREAL* matrix, INT num_feat, INT num_vec)
00326 {
00327 if (!(file && matrix))
00328 SG_ERROR("File or matrix invalid.\n");
00329
00330 for (INT j=0; j<num_vec; j++)
00331 {
00332 for (INT i=0; i<num_feat; i++)
00333 {
00334 DREAL v=matrix[num_feat*j+i];
00335 if (i==num_feat-1)
00336 fprintf(file, "%f\n", v);
00337 else
00338 fprintf(file, "%f ", v);
00339 }
00340 }
00341
00342 return true;
00343 }
00344
00345 bool CFile::read_real_valued_sparse(TSparse<DREAL>*& matrix, INT& num_feat, INT& num_vec)
00346 {
00347 size_t blocksize=1024*1024;
00348 size_t required_blocksize=blocksize;
00349 BYTE* dummy=new BYTE[blocksize];
00350
00351 if (file)
00352 {
00353 num_vec=0;
00354 num_feat=0;
00355
00356 SG_INFO("counting line numbers in file %s\n", filename);
00357 size_t sz=blocksize;
00358 size_t block_offs=0;
00359 size_t old_block_offs=0;
00360 fseek(file, 0, SEEK_END);
00361 size_t fsize=ftell(file);
00362 rewind(file);
00363
00364 while (sz == blocksize)
00365 {
00366 sz=fread(dummy, sizeof(BYTE), blocksize, file);
00367 bool contains_cr=false;
00368 for (size_t i=0; i<sz; i++)
00369 {
00370 block_offs++;
00371 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00372 {
00373 num_vec++;
00374 contains_cr=true;
00375 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00376 old_block_offs=block_offs;
00377 }
00378 }
00379 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00380 }
00381
00382 SG_INFO("found %d feature vectors\n", num_vec);
00383 delete[] dummy;
00384 blocksize=required_blocksize;
00385 dummy = new BYTE[blocksize+1];
00386 matrix=new TSparse<DREAL>[num_vec];
00387
00388 rewind(file);
00389 sz=blocksize;
00390 INT lines=0;
00391 while (sz == blocksize)
00392 {
00393 sz=fread(dummy, sizeof(BYTE), blocksize, file);
00394
00395 size_t old_sz=0;
00396 for (size_t i=0; i<sz; i++)
00397 {
00398 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00399 {
00400 size_t len=i-old_sz+1;
00401 BYTE* data=&dummy[old_sz];
00402
00403 for (size_t j=0; j<len; j++)
00404 dummy[j]=data[j];
00405
00406 sz=fread(dummy+len, sizeof(BYTE), blocksize-len, file);
00407 i=0;
00408 old_sz=0;
00409 sz+=len;
00410 }
00411
00412 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00413 {
00414
00415 size_t len=i-old_sz;
00416 BYTE* data=&dummy[old_sz];
00417
00418 INT dims=0;
00419 for (size_t j=0; j<len; j++)
00420 {
00421 if (data[j]==':')
00422 dims++;
00423 }
00424
00425 if (dims<=0)
00426 {
00427 SG_ERROR("Error in line %d - number of"
00428 " dimensions is %d line is %d characters"
00429 " long\n line_content:'%.*s'\n", lines,
00430 dims, len, len, (const char*) data);
00431 }
00432
00433 TSparseEntry<DREAL>* feat=new TSparseEntry<DREAL>[dims];
00434
00435
00436 size_t j=0;
00437 for (; j<len; j++)
00438 {
00439 if (data[j]==':')
00440 {
00441 j=-1;
00442 break;
00443 }
00444
00445 if (data[j]==' ')
00446 {
00447 data[j]='\0';
00448
00449
00450 break;
00451 }
00452 }
00453
00454 INT d=0;
00455 j++;
00456 BYTE* start=&data[j];
00457 for (; j<len; j++)
00458 {
00459 if (data[j]==':')
00460 {
00461 data[j]='\0';
00462
00463 feat[d].feat_index=(INT) atoi((const char*) start)-1;
00464 num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00465
00466 j++;
00467 start=&data[j];
00468 for (; j<len; j++)
00469 {
00470 if (data[j]==' ' || data[j]=='\n')
00471 {
00472 data[j]='\0';
00473 feat[d].entry=(DREAL) atof((const char*) start);
00474 d++;
00475 break;
00476 }
00477 }
00478
00479 if (j==len)
00480 {
00481 data[j]='\0';
00482 feat[dims-1].entry=(DREAL) atof((const char*) start);
00483 }
00484
00485 j++;
00486 start=&data[j];
00487 }
00488 }
00489
00490 matrix[lines].vec_index=lines;
00491 matrix[lines].num_feat_entries=dims;
00492 matrix[lines].features=feat;
00493
00494 old_sz=i+1;
00495 lines++;
00496 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00497 }
00498 }
00499 }
00500
00501 SG_INFO("file successfully read\n");
00502 }
00503
00504 delete[] dummy;
00505 return true;
00506 }
00507
00508 bool CFile::write_real_valued_sparse(const TSparse<DREAL>* matrix, INT num_feat, INT num_vec)
00509 {
00510 if (!(file && matrix))
00511 SG_ERROR("File or matrix invalid.\n");
00512
00513 for (INT i=0; i<num_vec; i++)
00514 {
00515 TSparseEntry<DREAL>* vec = matrix[i].features;
00516 INT len=matrix[i].num_feat_entries;
00517
00518 for (INT j=0; j<len; j++)
00519 {
00520 if (j<len-1)
00521 fprintf(file, "%d:%f ", (INT) vec[j].feat_index+1, (double) vec[j].entry);
00522 else
00523 fprintf(file, "%d:%f\n", (INT) vec[j].feat_index+1, (double) vec[j].entry);
00524 }
00525 }
00526
00527 return true;
00528 }
00529
00530
00531 bool CFile::read_char_valued_strings(T_STRING<CHAR>*& strings, INT& num_str, INT& max_string_len)
00532 {
00533 bool result=false;
00534
00535 size_t blocksize=1024*1024;
00536 size_t required_blocksize=0;
00537 CHAR* dummy=new CHAR[blocksize];
00538 CHAR* overflow=NULL;
00539 INT overflow_len=0;
00540
00541 if (file)
00542 {
00543 num_str=0;
00544 max_string_len=0;
00545
00546 SG_INFO("counting line numbers in file %s\n", filename);
00547 size_t sz=blocksize;
00548 size_t block_offs=0;
00549 size_t old_block_offs=0;
00550 fseek(file, 0, SEEK_END);
00551 size_t fsize=ftell(file);
00552 rewind(file);
00553
00554 while (sz == blocksize)
00555 {
00556 sz=fread(dummy, sizeof(CHAR), blocksize, file);
00557 bool contains_cr=false;
00558 for (size_t i=0; i<sz; i++)
00559 {
00560 block_offs++;
00561 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00562 {
00563 num_str++;
00564 contains_cr=true;
00565 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00566 old_block_offs=block_offs;
00567 }
00568 }
00569 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00570 }
00571
00572 SG_INFO("found %d strings\n", num_str);
00573 SG_DEBUG("block_size=%d\n", required_blocksize);
00574 delete[] dummy;
00575 blocksize=required_blocksize;
00576 dummy=new CHAR[blocksize];
00577 overflow=new CHAR[blocksize];
00578 strings=new T_STRING<CHAR>[num_str];
00579
00580 rewind(file);
00581 sz=blocksize;
00582 INT lines=0;
00583 size_t old_sz=0;
00584 while (sz == blocksize)
00585 {
00586 sz=fread(dummy, sizeof(CHAR), blocksize, file);
00587
00588 old_sz=0;
00589 for (size_t i=0; i<sz; i++)
00590 {
00591 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00592 {
00593 INT len=i-old_sz;
00594 max_string_len=CMath::max(max_string_len, len+overflow_len);
00595
00596 strings[lines].length=len+overflow_len;
00597 strings[lines].string=new CHAR[len+overflow_len];
00598
00599 for (INT j=0; j<overflow_len; j++)
00600 strings[lines].string[j]=overflow[j];
00601 for (INT j=0; j<len; j++)
00602 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00603
00604
00605 overflow_len=0;
00606
00607
00608 old_sz=i+1;
00609 lines++;
00610 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00611 }
00612 }
00613
00614 for (size_t i=old_sz; i<sz; i++)
00615 overflow[i-old_sz]=dummy[i];
00616
00617 overflow_len=sz-old_sz;
00618 }
00619 result=true;
00620 SG_INFO("file successfully read\n");
00621 SG_INFO("max_string_length=%d\n", max_string_len);
00622 SG_INFO("num_strings=%d\n", num_str);
00623 }
00624
00625 delete[] dummy;
00626 delete[] overflow;
00627
00628 return result;
00629 }
00630
00631 bool CFile::write_char_valued_strings(const T_STRING<CHAR>* strings, INT num_str)
00632 {
00633 if (!(file && strings))
00634 SG_ERROR("File or strings invalid.\n");
00635
00636 for (INT i=0; i<num_str; i++)
00637 {
00638 INT len = strings[i].length;
00639 fwrite(strings[i].string, sizeof(CHAR), len, file);
00640 fprintf(file, "\n");
00641 }
00642
00643 return true;
00644 }
00645
00646