LLVM API Documentation

Compressor.cpp

Go to the documentation of this file.
00001 //===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file was developed by Reid Spencer and is distributed under the
00006 // University of Illinois Open Source License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements the llvm::Compressor class, an abstraction for memory
00011 // block compression.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "llvm/Config/config.h"
00016 #include "llvm/Support/Compressor.h"
00017 #include "llvm/ADT/StringExtras.h"
00018 #include <cassert>
00019 #include <string>
00020 #include <ostream>
00021 #include "bzip2/bzlib.h"
00022 using namespace llvm;
00023 
00024 enum CompressionTypes {
00025   COMP_TYPE_NONE  = '0',
00026   COMP_TYPE_BZIP2 = '2'
00027 };
00028 
00029 static int getdata(char*& buffer, size_t &size,
00030                    llvm::Compressor::OutputDataCallback* cb, void* context) {
00031   buffer = 0;
00032   size = 0;
00033   int result = (*cb)(buffer, size, context);
00034   assert(buffer != 0 && "Invalid result from Compressor callback");
00035   assert(size != 0 && "Invalid result from Compressor callback");
00036   return result;
00037 }
00038 
00039 static int getdata_uns(char*& buffer, unsigned &size,
00040                        llvm::Compressor::OutputDataCallback* cb, void* context)
00041 {
00042   size_t SizeOut;
00043   int Res = getdata(buffer, SizeOut, cb, context);
00044   size = SizeOut;
00045   return Res;
00046 }
00047 
00048 //===----------------------------------------------------------------------===//
00049 //=== NULLCOMP - a compression like set of routines that just copies data
00050 //===            without doing any compression. This is provided so that if the
00051 //===            configured environment doesn't have a compression library the
00052 //===            program can still work, albeit using more data/memory.
00053 //===----------------------------------------------------------------------===//
00054 
00055 struct NULLCOMP_stream {
00056   // User provided fields
00057   char*  next_in;
00058   size_t avail_in;
00059   char*  next_out;
00060   size_t avail_out;
00061 
00062   // Information fields
00063   size_t output_count; // Total count of output bytes
00064 };
00065 
00066 static void NULLCOMP_init(NULLCOMP_stream* s) {
00067   s->output_count = 0;
00068 }
00069 
00070 static bool NULLCOMP_compress(NULLCOMP_stream* s) {
00071   assert(s && "Invalid NULLCOMP_stream");
00072   assert(s->next_in != 0);
00073   assert(s->next_out != 0);
00074   assert(s->avail_in >= 1);
00075   assert(s->avail_out >= 1);
00076 
00077   if (s->avail_out >= s->avail_in) {
00078     ::memcpy(s->next_out, s->next_in, s->avail_in);
00079     s->output_count += s->avail_in;
00080     s->avail_out -= s->avail_in;
00081     s->next_in += s->avail_in;
00082     s->avail_in = 0;
00083     return true;
00084   } else {
00085     ::memcpy(s->next_out, s->next_in, s->avail_out);
00086     s->output_count += s->avail_out;
00087     s->avail_in -= s->avail_out;
00088     s->next_in += s->avail_out;
00089     s->avail_out = 0;
00090     return false;
00091   }
00092 }
00093 
00094 static bool NULLCOMP_decompress(NULLCOMP_stream* s) {
00095   assert(s && "Invalid NULLCOMP_stream");
00096   assert(s->next_in != 0);
00097   assert(s->next_out != 0);
00098   assert(s->avail_in >= 1);
00099   assert(s->avail_out >= 1);
00100 
00101   if (s->avail_out >= s->avail_in) {
00102     ::memcpy(s->next_out, s->next_in, s->avail_in);
00103     s->output_count += s->avail_in;
00104     s->avail_out -= s->avail_in;
00105     s->next_in += s->avail_in;
00106     s->avail_in = 0;
00107     return true;
00108   } else {
00109     ::memcpy(s->next_out, s->next_in, s->avail_out);
00110     s->output_count += s->avail_out;
00111     s->avail_in -= s->avail_out;
00112     s->next_in += s->avail_out;
00113     s->avail_out = 0;
00114     return false;
00115   }
00116 }
00117 
00118 static void NULLCOMP_end(NULLCOMP_stream* strm) {
00119 }
00120 
00121 namespace {
00122 
00123 /// This structure is only used when a bytecode file is compressed.
00124 /// As bytecode is being decompressed, the memory buffer might need
00125 /// to be reallocated. The buffer allocation is handled in a callback
00126 /// and this structure is needed to retain information across calls
00127 /// to the callback.
00128 /// @brief An internal buffer object used for handling decompression
00129 struct BufferContext {
00130   char* buff;
00131   size_t size;
00132   BufferContext(size_t compressedSize) {
00133     // Null to indicate malloc of a new block
00134     buff = 0;
00135 
00136     // Compute the initial length of the uncompression buffer. Note that this
00137     // is twice the length of the compressed buffer and will be doubled again
00138     // in the callback for an initial allocation of 4x compressedSize.  This
00139     // calculation is based on the typical compression ratio of bzip2 on LLVM
00140     // bytecode files which typically ranges in the 50%-75% range.   Since we
00141     // typically get at least 50%, doubling is insufficient. By using a 4x
00142     // multiplier on the first allocation, we minimize the impact of having to
00143     // copy the buffer on reallocation.
00144     size = compressedSize*2;
00145   }
00146 
00147   /// trimTo - Reduce the size of the buffer down to the specified amount.  This
00148   /// is useful after have read in the bytecode file to discard extra unused
00149   /// memory.
00150   ///
00151   void trimTo(size_t NewSize) {
00152     buff = (char*)::realloc(buff, NewSize);
00153     size = NewSize;
00154   }
00155 
00156   /// This function handles allocation of the buffer used for decompression of
00157   /// compressed bytecode files. It is called by Compressor::decompress which is
00158   /// called by BytecodeReader::ParseBytecode.
00159   static size_t callback(char*&buff, size_t &sz, void* ctxt){
00160     // Case the context variable to our BufferContext
00161     BufferContext* bc = reinterpret_cast<BufferContext*>(ctxt);
00162 
00163     // Compute the new, doubled, size of the block
00164     size_t new_size = bc->size * 2;
00165 
00166     // Extend or allocate the block (realloc(0,n) == malloc(n))
00167     char* new_buff = (char*) ::realloc(bc->buff, new_size);
00168 
00169     // Figure out what to return to the Compressor. If this is the first call,
00170     // then bc->buff will be null. In this case we want to return the entire
00171     // buffer because there was no previous allocation.  Otherwise, when the
00172     // buffer is reallocated, we save the new base pointer in the
00173     // BufferContext.buff field but return the address of only the extension,
00174     // mid-way through the buffer (since its size was doubled). Furthermore,
00175     // the sz result must be 1/2 the total size of the buffer.
00176     if (bc->buff == 0 ) {
00177       buff = bc->buff = new_buff;
00178       sz = new_size;
00179     } else {
00180       bc->buff = new_buff;
00181       buff = new_buff + bc->size;
00182       sz = bc->size;
00183     }
00184 
00185     // Retain the size of the allocated block
00186     bc->size = new_size;
00187 
00188     // Make sure we fail (return 1) if we didn't get any memory.
00189     return (bc->buff == 0 ? 1 : 0);
00190   }
00191 };
00192 
00193 } // end anonymous namespace
00194 
00195 
00196 namespace {
00197 
00198 // This structure retains the context when compressing the bytecode file. The
00199 // WriteCompressedData function below uses it to keep track of the previously
00200 // filled chunk of memory (which it writes) and how many bytes have been
00201 // written.
00202 struct WriterContext {
00203   // Initialize the context
00204   WriterContext(std::ostream*OS, size_t CS)
00205     : chunk(0), sz(0), written(0), compSize(CS), Out(OS) {}
00206 
00207   // Make sure we clean up memory
00208   ~WriterContext() {
00209     if (chunk)
00210       delete [] chunk;
00211   }
00212 
00213   // Write the chunk
00214   void write(size_t size = 0) {
00215     size_t write_size = (size == 0 ? sz : size);
00216     Out->write(chunk,write_size);
00217     written += write_size;
00218     delete [] chunk;
00219     chunk = 0;
00220     sz = 0;
00221   }
00222 
00223   // This function is a callback used by the Compressor::compress function to
00224   // allocate memory for the compression buffer. This function fulfills that
00225   // responsibility but also writes the previous (now filled) buffer out to the
00226   // stream.
00227   static size_t callback(char*& buffer, size_t &size, void* context) {
00228     // Cast the context to the structure it must point to.
00229     WriterContext* ctxt = reinterpret_cast<WriterContext*>(context);
00230 
00231     // If there's a previously allocated chunk, it must now be filled with
00232     // compressed data, so we write it out and deallocate it.
00233     if (ctxt->chunk != 0 && ctxt->sz > 0 ) {
00234       ctxt->write();
00235     }
00236 
00237     // Compute the size of the next chunk to allocate. We attempt to allocate
00238     // enough memory to handle the compression in a single memory allocation. In
00239     // general, the worst we do on compression of bytecode is about 50% so we
00240     // conservatively estimate compSize / 2 as the size needed for the
00241     // compression buffer. compSize is the size of the compressed data, provided
00242     // by WriteBytecodeToFile.
00243     size = ctxt->sz = ctxt->compSize / 2;
00244 
00245     // Allocate the chunks
00246     buffer = ctxt->chunk = new char [size];
00247 
00248     // We must return 1 if the allocation failed so that the Compressor knows
00249     // not to use the buffer pointer.
00250     return (ctxt->chunk == 0 ? 1 : 0);
00251   }
00252 
00253   char* chunk;       // pointer to the chunk of memory filled by compression
00254   size_t sz;         // size of chunk
00255   size_t written;    // aggregate total of bytes written in all chunks
00256   size_t compSize;   // size of the uncompressed buffer
00257   std::ostream* Out; // The stream we write the data to.
00258 };
00259 
00260 }  // end anonymous namespace
00261 
00262 // Compress in one of three ways
00263 size_t Compressor::compress(const char* in, size_t size,
00264                             OutputDataCallback* cb, void* context,
00265                             std::string* error ) {
00266   assert(in && "Can't compress null buffer");
00267   assert(size && "Can't compress empty buffer");
00268   assert(cb && "Can't compress without a callback function");
00269 
00270   size_t result = 0;
00271 
00272   // For small files, we just don't bother compressing. bzip2 isn't very good
00273   // with tiny files and can actually make the file larger, so we just avoid
00274   // it altogether.
00275   if (size > 64*1024) {
00276     // Set up the bz_stream
00277     bz_stream bzdata;
00278     bzdata.bzalloc = 0;
00279     bzdata.bzfree = 0;
00280     bzdata.opaque = 0;
00281     bzdata.next_in = (char*)in;
00282     bzdata.avail_in = size;
00283     bzdata.next_out = 0;
00284     bzdata.avail_out = 0;
00285     switch ( BZ2_bzCompressInit(&bzdata, 5, 0, 100) ) {
00286       case BZ_CONFIG_ERROR: 
00287         if (error)
00288           *error = "bzip2 library mis-compiled";
00289         return result;
00290       case BZ_PARAM_ERROR:  
00291         if (error)
00292           *error = "Compressor internal error";
00293         return result;
00294       case BZ_MEM_ERROR:    
00295         if (error)
00296           *error = "Out of memory";
00297         return result;
00298       case BZ_OK:
00299       default:
00300         break;
00301     }
00302 
00303     // Get a block of memory
00304     if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00305       BZ2_bzCompressEnd(&bzdata);
00306       if (error)
00307         *error = "Can't allocate output buffer";
00308       return result;
00309     }
00310 
00311     // Put compression code in first byte
00312     (*bzdata.next_out++) = COMP_TYPE_BZIP2;
00313     bzdata.avail_out--;
00314 
00315     // Compress it
00316     int bzerr = BZ_FINISH_OK;
00317     while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
00318       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00319         BZ2_bzCompressEnd(&bzdata);
00320         if (error)
00321           *error = "Can't allocate output buffer";
00322         return result;
00323       }
00324     }
00325     switch (bzerr) {
00326       case BZ_SEQUENCE_ERROR:
00327       case BZ_PARAM_ERROR: 
00328         if (error)
00329           *error = "Param/Sequence error";
00330         return result;
00331       case BZ_FINISH_OK:
00332       case BZ_STREAM_END: break;
00333       default:
00334         if (error)
00335           *error = "BZip2 Error: " + utostr(unsigned(bzerr));
00336         return result;
00337     }
00338 
00339     // Finish
00340     result = bzdata.total_out_lo32 + 1;
00341     if (sizeof(size_t) == sizeof(uint64_t))
00342       result |= static_cast<uint64_t>(bzdata.total_out_hi32) << 32;
00343 
00344     BZ2_bzCompressEnd(&bzdata);
00345   } else {
00346     // Do null compression, for small files
00347     NULLCOMP_stream sdata;
00348     sdata.next_in = (char*)in;
00349     sdata.avail_in = size;
00350     NULLCOMP_init(&sdata);
00351 
00352     if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00353       if (error)
00354         *error = "Can't allocate output buffer";
00355       return result;
00356     }
00357 
00358     *(sdata.next_out++) = COMP_TYPE_NONE;
00359     sdata.avail_out--;
00360 
00361     while (!NULLCOMP_compress(&sdata)) {
00362       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00363         if (error)
00364           *error = "Can't allocate output buffer";
00365         return result;
00366       }
00367     }
00368 
00369     result = sdata.output_count + 1;
00370     NULLCOMP_end(&sdata);
00371   }
00372   return result;
00373 }
00374 
00375 size_t Compressor::compressToNewBuffer(const char* in, size_t size, char*&out,
00376                                        std::string* error) {
00377   BufferContext bc(size);
00378   size_t result = compress(in,size,BufferContext::callback,(void*)&bc,error);
00379   bc.trimTo(result);
00380   out = bc.buff;
00381   return result;
00382 }
00383 
00384 size_t
00385 Compressor::compressToStream(const char*in, size_t size, std::ostream& out,
00386                              std::string* error) {
00387   // Set up the context and writer
00388   WriterContext ctxt(&out, size / 2);
00389 
00390   // Compress everything after the magic number (which we'll alter).
00391   size_t zipSize = Compressor::compress(in,size,
00392     WriterContext::callback, (void*)&ctxt,error);
00393 
00394   if (zipSize && ctxt.chunk) {
00395     ctxt.write(zipSize - ctxt.written);
00396   }
00397   return zipSize;
00398 }
00399 
00400 // Decompress in one of three ways
00401 size_t Compressor::decompress(const char *in, size_t size,
00402                               OutputDataCallback* cb, void* context,
00403                               std::string* error) {
00404   assert(in && "Can't decompress null buffer");
00405   assert(size > 1 && "Can't decompress empty buffer");
00406   assert(cb && "Can't decompress without a callback function");
00407 
00408   size_t result = 0;
00409 
00410   switch (*in++) {
00411     case COMP_TYPE_BZIP2: {
00412       // Set up the bz_stream
00413       bz_stream bzdata;
00414       bzdata.bzalloc = 0;
00415       bzdata.bzfree = 0;
00416       bzdata.opaque = 0;
00417       bzdata.next_in = (char*)in;
00418       bzdata.avail_in = size - 1;
00419       bzdata.next_out = 0;
00420       bzdata.avail_out = 0;
00421       switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
00422         case BZ_CONFIG_ERROR: 
00423           if (error)
00424             *error = "bzip2 library mis-compiled";
00425           return result;
00426         case BZ_PARAM_ERROR:  
00427           if (error)
00428             *error = "Compressor internal error";
00429           return result;
00430         case BZ_MEM_ERROR:    
00431           if (error)
00432             *error = "Out of memory";
00433           return result;
00434         case BZ_OK:
00435         default:
00436           break;
00437       }
00438 
00439       // Get a block of memory
00440       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00441         BZ2_bzDecompressEnd(&bzdata);
00442         if (error)
00443           *error = "Can't allocate output buffer";
00444         return result;
00445       }
00446 
00447       // Decompress it
00448       int bzerr = BZ_OK;
00449       while ( BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata)) &&
00450               bzdata.avail_in != 0 ) {
00451         if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00452           BZ2_bzDecompressEnd(&bzdata);
00453           if (error)
00454             *error = "Can't allocate output buffer";
00455           return result;
00456         }
00457       }
00458 
00459       switch (bzerr) {
00460           BZ2_bzDecompressEnd(&bzdata);
00461         case BZ_PARAM_ERROR:  
00462           if (error)
00463             *error = "Compressor internal error";
00464           return result;
00465         case BZ_MEM_ERROR:    
00466           BZ2_bzDecompressEnd(&bzdata);
00467           if (error)
00468             *error = "Out of memory";
00469           return result;
00470         case BZ_DATA_ERROR:   
00471           BZ2_bzDecompressEnd(&bzdata);
00472           if (error)
00473             *error = "Data integrity error";
00474           return result;
00475         case BZ_DATA_ERROR_MAGIC:
00476           BZ2_bzDecompressEnd(&bzdata);
00477           if (error)
00478             *error = "Data is not BZIP2";
00479           return result;
00480         case BZ_OK:           
00481           BZ2_bzDecompressEnd(&bzdata);
00482           if (error)
00483             *error = "Insufficient input for bzip2";
00484           return result;
00485         case BZ_STREAM_END: break;
00486         default: 
00487           BZ2_bzDecompressEnd(&bzdata);
00488           if (error)
00489             *error = "Unknown result code from bzDecompress";
00490           return result;
00491       }
00492 
00493       // Finish
00494       result = bzdata.total_out_lo32;
00495       if (sizeof(size_t) == sizeof(uint64_t))
00496         result |= (static_cast<uint64_t>(bzdata.total_out_hi32) << 32);
00497       BZ2_bzDecompressEnd(&bzdata);
00498       break;
00499     }
00500 
00501     case COMP_TYPE_NONE: {
00502       NULLCOMP_stream sdata;
00503       sdata.next_in = (char*)in;
00504       sdata.avail_in = size - 1;
00505       NULLCOMP_init(&sdata);
00506 
00507       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00508         if (error)
00509           *error = "Can't allocate output buffer";
00510         return result;
00511       }
00512 
00513       while (!NULLCOMP_decompress(&sdata)) {
00514         if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00515           if (error)
00516             *error = "Can't allocate output buffer";
00517           return result;
00518         }
00519       }
00520 
00521       result = sdata.output_count;
00522       NULLCOMP_end(&sdata);
00523       break;
00524     }
00525 
00526     default:
00527       if (error)
00528         *error = "Unknown type of compressed data";
00529       return result;
00530   }
00531 
00532   return result;
00533 }
00534 
00535 size_t
00536 Compressor::decompressToNewBuffer(const char* in, size_t size, char*&out,
00537                                   std::string* error) {
00538   BufferContext bc(size);
00539   size_t result = decompress(in,size,BufferContext::callback,(void*)&bc,error);
00540   out = bc.buff;
00541   return result;
00542 }
00543 
00544 size_t
00545 Compressor::decompressToStream(const char*in, size_t size, std::ostream& out,
00546                                std::string* error) {
00547   // Set up the context and writer
00548   WriterContext ctxt(&out,size / 2);
00549 
00550   // Decompress everything after the magic number (which we'll alter)
00551   size_t zipSize = Compressor::decompress(in,size,
00552     WriterContext::callback, (void*)&ctxt,error);
00553 
00554   if (zipSize && ctxt.chunk) {
00555     ctxt.write(zipSize - ctxt.written);
00556   }
00557   return zipSize;
00558 }