LLVM API Documentation

Compressor.cpp

Go to the documentation of this file.
00001 //===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file was developed by Reid Spencer and is distributed under the
00006 // University of Illinois Open Source License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements the llvm::Compressor class, an abstraction for memory
00011 // block compression.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
#include "llvm/Config/config.h"
#include "llvm/Support/Compressor.h"
#include "llvm/ADT/StringExtras.h"
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <new>
#include <ostream>
#include <string>
#include "bzip2/bzlib.h"
00022 using namespace llvm;
00023 
/// One-byte tags written as the first byte of every buffer produced by
/// Compressor::compress. Compressor::decompress switches on this byte to pick
/// the matching decoder.
enum CompressionTypes {
  COMP_TYPE_NONE  = '0',  ///< Payload is a verbatim copy of the input
  COMP_TYPE_BZIP2 = '2'   ///< Payload is a bzip2 stream
};
00028 
00029 static int getdata(char*& buffer, size_t &size,
00030                    llvm::Compressor::OutputDataCallback* cb, void* context) {
00031   buffer = 0;
00032   size = 0;
00033   int result = (*cb)(buffer, size, context);
00034   assert(buffer != 0 && "Invalid result from Compressor callback");
00035   assert(size != 0 && "Invalid result from Compressor callback");
00036   return result;
00037 }
00038 
00039 static int getdata_uns(char*& buffer, unsigned &size,
00040                        llvm::Compressor::OutputDataCallback* cb, void* context)
00041 {
00042   size_t SizeOut;
00043   int Res = getdata(buffer, SizeOut, cb, context);
00044   size = SizeOut;
00045   return Res;
00046 }
00047 
00048 //===----------------------------------------------------------------------===//
00049 //=== NULLCOMP - a compression like set of routines that just copies data
00050 //===            without doing any compression. This is provided so that if the
00051 //===            configured environment doesn't have a compression library the
00052 //===            program can still work, albeit using more data/memory.
00053 //===----------------------------------------------------------------------===//
00054 
/// Bookkeeping for one pass-through "compression" run. The caller fills in the
/// input and output cursors; the NULLCOMP_* routines advance them as bytes are
/// copied.
struct NULLCOMP_stream {
  // User provided fields
  char*  next_in;      // Next input byte to consume
  size_t avail_in;     // Number of input bytes left at next_in
  char*  next_out;     // Where the next output byte will be stored
  size_t avail_out;    // Free space left at next_out

  // Information fields
  size_t output_count; // Total count of output bytes
};

/// NULLCOMP_init - Reset the running output total. The cursor fields are the
/// caller's responsibility and are left untouched.
static void NULLCOMP_init(NULLCOMP_stream* s) {
  s->output_count = 0;
}

/// NULLCOMP_copy - Move as many bytes as currently fit from the input cursor
/// to the output cursor. Shared by NULLCOMP_compress and NULLCOMP_decompress,
/// which are the same operation for a format that stores data verbatim.
///
/// Both cursors and both counters are advanced by the number of bytes copied,
/// so the call can be repeated on the same output block without overwriting
/// what was already stored there.
///
/// @returns true once all input has been consumed, false if the output block
///          filled up first and the caller must supply another one.
static bool NULLCOMP_copy(NULLCOMP_stream* s) {
  assert(s && "Invalid NULLCOMP_stream");
  assert(s->next_in != 0);
  assert(s->next_out != 0);
  assert(s->avail_in >= 1);
  assert(s->avail_out >= 1);

  const bool AllInputFits = s->avail_out >= s->avail_in;
  const size_t Count = AllInputFits ? s->avail_in : s->avail_out;

  std::memcpy(s->next_out, s->next_in, Count);
  s->next_in      += Count;
  s->avail_in     -= Count;
  s->next_out     += Count;
  s->avail_out    -= Count;
  s->output_count += Count;
  return AllInputFits;
}

/// NULLCOMP_compress - Copy input to output unchanged.
/// @returns true when the input is exhausted, false when more output space is
///          needed.
static bool NULLCOMP_compress(NULLCOMP_stream* s) {
  return NULLCOMP_copy(s);
}

/// NULLCOMP_decompress - Copy input to output unchanged.
/// @returns true when the input is exhausted, false when more output space is
///          needed.
static bool NULLCOMP_decompress(NULLCOMP_stream* s) {
  return NULLCOMP_copy(s);
}

/// NULLCOMP_end - Nothing to release; present so the call sequence mirrors
/// the init/compress/end shape used for bzip2.
static void NULLCOMP_end(NULLCOMP_stream*) {
}
00120 
00121 namespace {
00122 
/// Growable malloc-style output buffer used by the *ToNewBuffer entry points.
/// The block may have to be reallocated while data is being produced; the
/// allocation is done in a callback, and this structure retains the base
/// pointer and size across calls to that callback.
///
/// There is deliberately no destructor: the creator hands \c buff to its own
/// caller when it is done, so the creator must release the block itself if it
/// abandons the operation.
/// @brief An internal buffer object used for handling (de)compression
struct BufferContext {
  char* buff;   ///< Base of the block, or null until the first callback
  size_t size;  ///< Current block size (before the first callback: half of
                ///< the size that callback will allocate)

  explicit BufferContext(size_t compressedSize) {
    // Null to indicate malloc of a new block
    buff = 0;

    // Start at twice the input length. The callback doubles this again, so the
    // first allocation is 4x compressedSize. The original rationale: bzip2
    // typically achieves at least 50% on LLVM bytecode, so plain doubling is
    // not enough, and a 4x first allocation minimizes the copying caused by
    // reallocation.
    //
    // If the doubling would overflow, store the maximum value instead; the
    // callback then reports failure rather than allocating a wrapped size.
    const size_t MaxSize = static_cast<size_t>(-1);
    size = compressedSize > MaxSize / 2 ? MaxSize : compressedSize * 2;
  }

  /// trimTo - Reduce the size of the buffer down to the specified amount. This
  /// is useful once the final length is known, to discard extra unused memory.
  ///
  /// If the reallocation fails, the existing (larger) block and its recorded
  /// size are kept, so \c buff never becomes null here.
  void trimTo(size_t NewSize) {
    // realloc(p, 0) is implementation-defined and may free the block, so
    // always request at least one byte.
    char* Trimmed =
      static_cast<char*>(::realloc(buff, NewSize != 0 ? NewSize : 1));
    if (Trimmed == 0)
      return;
    buff = Trimmed;
    size = NewSize;
  }

  /// Output callback that grows the buffer. Each call doubles the block.
  ///
  /// @param buff Set to the start of the newly available region: the whole
  ///             block on the first call, only the new second half afterwards.
  /// @param sz   Set to the length of that region.
  /// @param ctxt The BufferContext to grow.
  /// @returns 0 on success. Returns 1 if no memory could be obtained; in that
  ///          case \p buff, \p sz and the context are left unmodified, so any
  ///          previously allocated block is still owned by the context.
  static size_t callback(char*&buff, size_t &sz, void* ctxt){
    // Cast the context variable to our BufferContext
    BufferContext* bc = static_cast<BufferContext*>(ctxt);

    // Refuse sizes that cannot be doubled: zero would request an empty block
    // and anything above half the maximum would wrap around.
    const size_t MaxSize = static_cast<size_t>(-1);
    if (bc->size == 0 || bc->size > MaxSize / 2)
      return 1;

    // Compute the new, doubled, size of the block
    const size_t old_size = bc->size;
    const size_t new_size = old_size * 2;

    // Extend or allocate the block (realloc(0,n) == malloc(n)). On failure
    // realloc leaves the old block intact, so keep pointing at it.
    char* new_buff = static_cast<char*>(::realloc(bc->buff, new_size));
    if (new_buff == 0)
      return 1;

    // On the first call there was no previous allocation, so the whole block
    // is handed out. On later calls the first half already holds data; only
    // the extension, which starts mid-way through the doubled block and is
    // half its total size, is handed out.
    if (bc->buff == 0) {
      buff = new_buff;
      sz = new_size;
    } else {
      buff = new_buff + old_size;
      sz = old_size;
    }

    // Retain the base pointer and size of the allocated block
    bc->buff = new_buff;
    bc->size = new_size;
    return 0;
  }
};
00192 
00193 } // end anonymous namespace
00194 
00195 
00196 namespace {
00197 
// This structure retains the context when (de)compressing to a stream. The
// callback below uses it to keep track of the previously filled chunk of
// memory (which it writes) and how many bytes have been written.
struct WriterContext {
  // Smallest chunk ever handed to the Compressor. Without a floor, a small
  // size hint would yield a zero-length chunk, which the Compressor cannot
  // use.
  enum { MinChunkSize = 64 };

  // Initialize the context. CS is a size hint; every chunk is half of it
  // (but never less than MinChunkSize).
  WriterContext(std::ostream*OS, size_t CS)
    : chunk(0), sz(0), written(0), compSize(CS), Out(OS) {}

  // Make sure we clean up memory (delete [] on a null pointer is a no-op)
  ~WriterContext() {
    delete [] chunk;
  }

  // Write the whole current chunk and release it.
  void write() {
    write(sz);
  }

  // Write the first `size` bytes of the current chunk and release it. A size
  // of zero writes nothing; it still releases the chunk.
  void write(size_t size) {
    assert(size <= sz && "Can't write more than the chunk holds");
    if (size != 0)
      Out->write(chunk, size);
    written += size;
    delete [] chunk;
    chunk = 0;
    sz = 0;
  }

  // This function is a callback used by the Compressor to obtain memory for
  // its output. It fulfills that responsibility but also writes the previous
  // (now filled) chunk out to the stream.
  //
  // Returns 0 on success. Returns 1 if the allocation failed; buffer and size
  // are then left unmodified and the context holds no chunk.
  static size_t callback(char*& buffer, size_t &size, void* context) {
    // Cast the context to the structure it must point to.
    WriterContext* ctxt = static_cast<WriterContext*>(context);

    // If there's a previously allocated chunk, it must now be filled with
    // output data, so we write it out and deallocate it.
    if (ctxt->chunk != 0)
      ctxt->write();

    // Compute the size of the next chunk: half of the size hint, with a floor
    // so that the chunk is never empty.
    size_t ChunkSize = ctxt->compSize / 2;
    if (ChunkSize < static_cast<size_t>(MinChunkSize))
      ChunkSize = static_cast<size_t>(MinChunkSize);

    // Allocate the chunk. Plain new would throw rather than return null, so
    // use the nothrow form to make the failure return below reachable.
    char* Fresh = new (std::nothrow) char [ChunkSize];
    if (Fresh == 0)
      return 1;

    buffer = ctxt->chunk = Fresh;
    size = ctxt->sz = ChunkSize;
    return 0;
  }

  char* chunk;       // pointer to the chunk of memory being filled
  size_t sz;         // size of chunk
  size_t written;    // aggregate total of bytes written in all chunks
  size_t compSize;   // size hint given at construction; chunks are half of it
  std::ostream* Out; // The stream we write the data to.

private:
  // The context owns `chunk`, so copying it would lead to a double delete.
  WriterContext(const WriterContext&);            // not implemented
  WriterContext& operator=(const WriterContext&); // not implemented
};
00259 
00260 }  // end anonymous namespace
00261 
00262 // Compress in one of three ways
00263 size_t Compressor::compress(const char* in, size_t size,
00264                             OutputDataCallback* cb, void* context) {
00265   assert(in && "Can't compress null buffer");
00266   assert(size && "Can't compress empty buffer");
00267   assert(cb && "Can't compress without a callback function");
00268 
00269   size_t result = 0;
00270 
00271   // For small files, we just don't bother compressing. bzip2 isn't very good
00272   // with tiny files and can actually make the file larger, so we just avoid
00273   // it altogether.
00274   if (size > 64*1024) {
00275     // Set up the bz_stream
00276     bz_stream bzdata;
00277     bzdata.bzalloc = 0;
00278     bzdata.bzfree = 0;
00279     bzdata.opaque = 0;
00280     bzdata.next_in = (char*)in;
00281     bzdata.avail_in = size;
00282     bzdata.next_out = 0;
00283     bzdata.avail_out = 0;
00284     switch ( BZ2_bzCompressInit(&bzdata, 5, 0, 100) ) {
00285       case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
00286       case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00287       case BZ_MEM_ERROR:    throw std::string("Out of memory");
00288       case BZ_OK:
00289       default:
00290         break;
00291     }
00292 
00293     // Get a block of memory
00294     if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00295       BZ2_bzCompressEnd(&bzdata);
00296       throw std::string("Can't allocate output buffer");
00297     }
00298 
00299     // Put compression code in first byte
00300     (*bzdata.next_out++) = COMP_TYPE_BZIP2;
00301     bzdata.avail_out--;
00302 
00303     // Compress it
00304     int bzerr = BZ_FINISH_OK;
00305     while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
00306       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00307         BZ2_bzCompressEnd(&bzdata);
00308         throw std::string("Can't allocate output buffer");
00309       }
00310     }
00311     switch (bzerr) {
00312       case BZ_SEQUENCE_ERROR:
00313       case BZ_PARAM_ERROR: throw std::string("Param/Sequence error");
00314       case BZ_FINISH_OK:
00315       case BZ_STREAM_END: break;
00316       default: throw std::string("Oops: ") + utostr(unsigned(bzerr));
00317     }
00318 
00319     // Finish
00320     result = bzdata.total_out_lo32 + 1;
00321     if (sizeof(size_t) == sizeof(uint64_t))
00322       result |= static_cast<uint64_t>(bzdata.total_out_hi32) << 32;
00323 
00324     BZ2_bzCompressEnd(&bzdata);
00325   } else {
00326     // Do null compression, for small files
00327     NULLCOMP_stream sdata;
00328     sdata.next_in = (char*)in;
00329     sdata.avail_in = size;
00330     NULLCOMP_init(&sdata);
00331 
00332     if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00333       throw std::string("Can't allocate output buffer");
00334     }
00335 
00336     *(sdata.next_out++) = COMP_TYPE_NONE;
00337     sdata.avail_out--;
00338 
00339     while (!NULLCOMP_compress(&sdata)) {
00340       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00341         throw std::string("Can't allocate output buffer");
00342       }
00343     }
00344 
00345     result = sdata.output_count + 1;
00346     NULLCOMP_end(&sdata);
00347   }
00348   return result;
00349 }
00350 
00351 size_t Compressor::compressToNewBuffer(const char* in, size_t size, char*&out) {
00352   BufferContext bc(size);
00353   size_t result = compress(in,size,BufferContext::callback,(void*)&bc);
00354   bc.trimTo(result);
00355   out = bc.buff;
00356   return result;
00357 }
00358 
00359 size_t
00360 Compressor::compressToStream(const char*in, size_t size, std::ostream& out) {
00361   // Set up the context and writer
00362   WriterContext ctxt(&out, size / 2);
00363 
00364   // Compress everything after the magic number (which we'll alter).
00365   size_t zipSize = Compressor::compress(in,size,
00366     WriterContext::callback, (void*)&ctxt);
00367 
00368   if (ctxt.chunk) {
00369     ctxt.write(zipSize - ctxt.written);
00370   }
00371   return zipSize;
00372 }
00373 
00374 // Decompress in one of three ways
00375 size_t Compressor::decompress(const char *in, size_t size,
00376                               OutputDataCallback* cb, void* context) {
00377   assert(in && "Can't decompress null buffer");
00378   assert(size > 1 && "Can't decompress empty buffer");
00379   assert(cb && "Can't decompress without a callback function");
00380 
00381   size_t result = 0;
00382 
00383   switch (*in++) {
00384     case COMP_TYPE_BZIP2: {
00385       // Set up the bz_stream
00386       bz_stream bzdata;
00387       bzdata.bzalloc = 0;
00388       bzdata.bzfree = 0;
00389       bzdata.opaque = 0;
00390       bzdata.next_in = (char*)in;
00391       bzdata.avail_in = size - 1;
00392       bzdata.next_out = 0;
00393       bzdata.avail_out = 0;
00394       switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
00395         case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
00396         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00397         case BZ_MEM_ERROR:    throw std::string("Out of memory");
00398         case BZ_OK:
00399         default:
00400           break;
00401       }
00402 
00403       // Get a block of memory
00404       if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00405         BZ2_bzDecompressEnd(&bzdata);
00406         throw std::string("Can't allocate output buffer");
00407       }
00408 
00409       // Decompress it
00410       int bzerr = BZ_OK;
00411       while ( BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata)) &&
00412               bzdata.avail_in != 0 ) {
00413         if (0 != getdata_uns(bzdata.next_out, bzdata.avail_out,cb,context)) {
00414           BZ2_bzDecompressEnd(&bzdata);
00415           throw std::string("Can't allocate output buffer");
00416         }
00417       }
00418 
00419       switch (bzerr) {
00420         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00421         case BZ_MEM_ERROR:    throw std::string("Out of memory");
00422         case BZ_DATA_ERROR:   throw std::string("Data integrity error");
00423         case BZ_DATA_ERROR_MAGIC:throw std::string("Data is not BZIP2");
00424         case BZ_OK:           throw std::string("Insufficient input for bzip2");
00425         case BZ_STREAM_END: break;
00426         default: throw("Ooops");
00427       }
00428 
00429 
00430       // Finish
00431       result = bzdata.total_out_lo32;
00432       if (sizeof(size_t) == sizeof(uint64_t))
00433         result |= (static_cast<uint64_t>(bzdata.total_out_hi32) << 32);
00434       BZ2_bzDecompressEnd(&bzdata);
00435       break;
00436     }
00437 
00438     case COMP_TYPE_NONE: {
00439       NULLCOMP_stream sdata;
00440       sdata.next_in = (char*)in;
00441       sdata.avail_in = size - 1;
00442       NULLCOMP_init(&sdata);
00443 
00444       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00445         throw std::string("Can't allocate output buffer");
00446       }
00447 
00448       while (!NULLCOMP_decompress(&sdata)) {
00449         if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00450           throw std::string("Can't allocate output buffer");
00451         }
00452       }
00453 
00454       result = sdata.output_count;
00455       NULLCOMP_end(&sdata);
00456       break;
00457     }
00458 
00459     default:
00460       throw std::string("Unknown type of compressed data");
00461   }
00462 
00463   return result;
00464 }
00465 
00466 size_t
00467 Compressor::decompressToNewBuffer(const char* in, size_t size, char*&out) {
00468   BufferContext bc(size);
00469   size_t result = decompress(in,size,BufferContext::callback,(void*)&bc);
00470   out = bc.buff;
00471   return result;
00472 }
00473 
00474 size_t
00475 Compressor::decompressToStream(const char*in, size_t size, std::ostream& out){
00476   // Set up the context and writer
00477   WriterContext ctxt(&out,size / 2);
00478 
00479   // Decompress everything after the magic number (which we'll alter)
00480   size_t zipSize = Compressor::decompress(in,size,
00481     WriterContext::callback, (void*)&ctxt);
00482 
00483   if (ctxt.chunk) {
00484     ctxt.write(zipSize - ctxt.written);
00485   }
00486   return zipSize;
00487 }
00488 
00489 // vim: sw=2 ai