LLVM API Documentation

Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

Compressor.cpp

Go to the documentation of this file.
00001 //===- lib/Support/Compressor.cpp -------------------------------*- C++ -*-===//
00002 // 
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file was developed by Reid Spencer and is distributed under the 
00006 // University of Illinois Open Source License. See LICENSE.TXT for details.
00007 // 
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements the llvm::Compressor class, an abstraction for memory
00011 // block compression.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
#include "llvm/Config/config.h"
#include "llvm/Support/Compressor.h"
#include "llvm/ADT/StringExtras.h"
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <new>
#include <ostream>
#include <string>
#include "bzip2/bzlib.h"
00021 
00022 namespace {
00023 
/// Tag values identifying how a block was encoded. Compressor::compress
/// stores one of these in the first output byte and Compressor::decompress
/// switches on it to pick the matching decoder.
enum CompressionTypes {
  COMP_TYPE_NONE  = '0',  ///< Payload was copied verbatim (NULLCOMP)
  COMP_TYPE_BZIP2 = '2'   ///< Payload is a bzip2 stream
};
00028 
00029 inline int getdata(char*& buffer, unsigned& size, 
00030                    llvm::Compressor::OutputDataCallback* cb, void* context) {
00031   buffer = 0;
00032   size = 0;
00033   int result = (*cb)(buffer, size, context);
00034   assert(buffer != 0 && "Invalid result from Compressor callback");
00035   assert(size != 0 && "Invalid result from Compressor callback");
00036   return result;
00037 }
00038 
00039 //===----------------------------------------------------------------------===//
00040 //=== NULLCOMP - a compression like set of routines that just copies data 
00041 //===            without doing any compression. This is provided so that if the
00042 //===            configured environment doesn't have a compression library the
00043 //===            program can still work, albeit using more data/memory.
00044 //===----------------------------------------------------------------------===//
00045 
/// Stream state for the NULLCOMP routines. The caller fills in the four
/// "user provided" fields before each call; the routines advance them as
/// bytes are moved from the input block to the output block.
struct NULLCOMP_stream {
  // User provided fields
  char* next_in;       // Next input byte to be copied
  unsigned avail_in;   // Number of bytes readable at next_in
  char* next_out;      // Where the next output byte will be stored
  unsigned avail_out;  // Number of bytes writable at next_out

  // Information fields
  uint64_t output_count; // Total count of output bytes
};

/// Reset the running output counter. The user provided fields are left
/// untouched, so they may be set either before or after this call.
void NULLCOMP_init(NULLCOMP_stream* s) {
  assert(s && "Invalid NULLCOMP_stream");
  s->output_count = 0;
}

/// Shared implementation of NULLCOMP_compress and NULLCOMP_decompress, which
/// are the same operation because the data is never transformed: copy as many
/// bytes as both the input and the output block allow.
///
/// All four user provided fields are advanced by the number of bytes copied,
/// so the stream always describes the remaining input and the remaining
/// output space.
/// @returns true when all input has been consumed, false when the output
///          block filled up first and another one is needed.
bool NULLCOMP_copy(NULLCOMP_stream* s) {
  assert(s && "Invalid NULLCOMP_stream");
  assert(s->next_in != 0);
  assert(s->next_out != 0);
  assert(s->avail_in >= 1);
  assert(s->avail_out >= 1);

  const unsigned count =
    (s->avail_in <= s->avail_out ? s->avail_in : s->avail_out);

  ::memcpy(s->next_out, s->next_in, count);
  s->output_count += count;
  s->next_in += count;
  s->avail_in -= count;
  s->next_out += count;
  s->avail_out -= count;

  return s->avail_in == 0;
}

/// "Compress" by copying input to output verbatim.
/// @returns true once the input is exhausted, false if more output space is
///          required (avail_out is then zero).
bool NULLCOMP_compress(NULLCOMP_stream* s) {
  return NULLCOMP_copy(s);
}

/// "Decompress" by copying input to output verbatim.
/// @returns true once the input is exhausted, false if more output space is
///          required (avail_out is then zero).
bool NULLCOMP_decompress(NULLCOMP_stream* s) {
  return NULLCOMP_copy(s);
}

/// Counterpart of NULLCOMP_init. The stream owns no resources, so there is
/// nothing to release.
void NULLCOMP_end(NULLCOMP_stream*) {
}
00111 
/// This structure backs Compressor::compressToNewBuffer and
/// Compressor::decompressToNewBuffer. As data is produced, the memory buffer
/// might need to be reallocated. The buffer allocation is handled in a
/// callback and this structure is needed to retain information across calls
/// to the callback.
///
/// The structure never frees \c buff itself: the block is obtained with
/// realloc and whoever ends up holding the pointer must release it with free.
/// @brief An internal buffer object used for handling (de)compression
struct BufferContext {
  char* buff;     ///< Base of the growing block; null until the first callback
  unsigned size;  ///< Size the next callback will double (see constructor)

  explicit BufferContext(unsigned compressedSize)
    // Null to indicate malloc of a new block
    : buff(0),
    // Compute the initial length of the uncompression buffer. Note that this
    // is twice the length of the compressed buffer and will be doubled again
    // in the callback for an initial allocation of 4x compressedSize.  This
    // calculation is based on the typical compression ratio of bzip2 on LLVM
    // bytecode files which typically ranges in the 50%-75% range.   Since we
    // typically get at least 50%, doubling is insufficient. By using a 4x
    // multiplier on the first allocation, we minimize the impact of having to
    // copy the buffer on reallocation.
      size(compressedSize*2) {
  }

  /// This function handles allocation of the buffer. Each call doubles the
  /// block and hands back the part that has not been given out before.
  ///
  /// @param buff Receives the start of the newly available region, or null
  ///             on failure.
  /// @param sz   Receives the length of that region, or zero on failure.
  /// @param ctxt Must point to the BufferContext being grown.
  /// @returns 0 on success, 1 if no memory could be provided. On failure the
  ///          context is left exactly as it was, so a previously allocated
  ///          block is still reachable through BufferContext::buff.
  static unsigned callback(char*&buff, unsigned& sz, void* ctxt){
    // Cast the context variable to our BufferContext
    BufferContext* bc = static_cast<BufferContext*>(ctxt);

    // Compute the new, doubled, size of the block
    const unsigned old_size = bc->size;
    const unsigned new_size = old_size * 2;

    // Refuse to continue if the doubling wrapped around (new_size < old_size)
    // or if there is nothing to double (both are zero): either way no usable
    // region could be returned.
    if (new_size <= old_size) {
      buff = 0;
      sz = 0;
      return 1;
    }

    // Extend or allocate the block (realloc(0,n) == malloc(n))
    char* new_buff = static_cast<char*>(::realloc(bc->buff, new_size));

    // A failed realloc leaves the old block intact, so keep pointing at it
    // rather than overwriting the only reference with null.
    if (new_buff == 0) {
      buff = 0;
      sz = 0;
      return 1;
    }

    // Figure out what to return to the Compressor. If this is the first call,
    // then bc->buff will be null. In this case we want to return the entire
    // buffer because there was no previous allocation.  Otherwise, when the
    // buffer is reallocated, we save the new base pointer in the
    // BufferContext.buff field but return the address of only the extension,
    // mid-way through the buffer (since its size was doubled). Furthermore,
    // the sz result must be 1/2 the total size of the buffer.
    if (bc->buff == 0) {
      buff = new_buff;
      sz = new_size;
    } else {
      buff = new_buff + old_size;
      sz = old_size;
    }

    // Retain the base and size of the allocated block
    bc->buff = new_buff;
    bc->size = new_size;
    return 0;
  }
};
00172 
// This structure retains the context when (de)compressing to a stream. Its
// callback hands out one heap chunk at a time; each time a new chunk is
// requested the previous one is assumed to be full and is written to the
// stream. The creator is responsible for writing the final, partially filled
// chunk via write(n).
struct WriterContext {
  // Smallest chunk the callback will hand out. compSize / 2 can be zero for
  // very small inputs, and a zero-byte chunk cannot hold any output.
  enum { MinChunkSize = 64 };

  // Initialize the context. CS is the size hint from which the chunk size is
  // derived (each chunk is CS / 2 bytes, but never below MinChunkSize).
  WriterContext(std::ostream*OS, unsigned CS)
    : chunk(0), sz(0), written(0), compSize(CS), Out(OS) {}

  // Make sure we clean up memory (delete [] of a null pointer is a no-op)
  ~WriterContext() {
    delete [] chunk;
  }

  // Write the current chunk to the stream and release it. A size of 0 means
  // "the whole chunk"; any other value writes only that many leading bytes.
  // Does nothing if there is no current chunk.
  void write(unsigned size = 0) {
    if (chunk == 0)
      return;
    unsigned write_size = (size == 0 ? sz : size);
    assert(write_size <= sz && "Can't write more than the chunk holds");
    Out->write(chunk,write_size);
    written += write_size;
    delete [] chunk;
    chunk = 0;
    sz = 0;
  }

  // This function is a callback used by the Compressor::compress and
  // Compressor::decompress functions to allocate memory for the output
  // buffer. This function fulfills that responsibility but also writes the
  // previous (now filled) buffer out to the stream.
  //
  // Returns 0 on success. Returns 1, with buffer set to null and size set to
  // zero, if the chunk could not be allocated.
  static unsigned callback(char*& buffer, unsigned& size, void* context) {
    // Cast the context to the structure it must point to.
    WriterContext* ctxt = static_cast<WriterContext*>(context);

    // If there's a previously allocated chunk, it must now be filled with
    // output data, so we write it out and deallocate it.
    if (ctxt->chunk != 0) {
      ctxt->write();
    }

    // Compute the size of the next chunk to allocate. We attempt to allocate
    // enough memory to handle the work in a single memory allocation, using
    // half of the size hint given to the constructor, but never less than
    // MinChunkSize so that the chunk can always hold some output.
    unsigned wanted = ctxt->compSize / 2;
    if (wanted < static_cast<unsigned>(MinChunkSize))
      wanted = static_cast<unsigned>(MinChunkSize);

    // Allocate the chunk. The nothrow form is required for the failure
    // return below to be reachable: plain new[] throws instead of returning
    // null.
    char* block = new (std::nothrow) char [wanted];

    // We must return 1 if the allocation failed so that the Compressor knows
    // not to use the buffer pointer.
    if (block == 0) {
      buffer = 0;
      size = 0;
      return 1;
    }

    buffer = ctxt->chunk = block;
    size = ctxt->sz = wanted;
    return 0;
  }

  char* chunk;       // pointer to the chunk of memory being filled
  unsigned sz;       // size of chunk
  unsigned written;  // aggregate total of bytes written in all chunks
  unsigned compSize; // size hint from the creator; chunks are half of this
  std::ostream* Out; // The stream we write the data to.

private:
  // The context owns `chunk`, so copying it would lead to a double delete.
  // Declared but intentionally not defined.
  WriterContext(const WriterContext&);
  WriterContext& operator=(const WriterContext&);
};
00235 
00236 }
00237 
00238 namespace llvm {
00239 
00240 // Compress in one of three ways
00241 uint64_t Compressor::compress(const char* in, unsigned size, 
00242     OutputDataCallback* cb, void* context ) {
00243   assert(in && "Can't compress null buffer");
00244   assert(size && "Can't compress empty buffer");
00245   assert(cb && "Can't compress without a callback function");
00246 
00247   uint64_t result = 0;
00248 
00249   // For small files, we just don't bother compressing. bzip2 isn't very good
00250   // with tiny files and can actually make the file larger, so we just avoid
00251   // it altogether.
00252   if (size > 64*1024) {
00253     // Set up the bz_stream
00254     bz_stream bzdata;
00255     bzdata.bzalloc = 0;
00256     bzdata.bzfree = 0;
00257     bzdata.opaque = 0;
00258     bzdata.next_in = (char*)in;
00259     bzdata.avail_in = size;
00260     bzdata.next_out = 0;
00261     bzdata.avail_out = 0;
00262     switch ( BZ2_bzCompressInit(&bzdata, 5, 0, 100) ) {
00263       case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
00264       case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00265       case BZ_MEM_ERROR:    throw std::string("Out of memory");
00266       case BZ_OK:
00267       default:
00268         break;
00269     }
00270 
00271     // Get a block of memory
00272     if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb,context)) {
00273       BZ2_bzCompressEnd(&bzdata);
00274       throw std::string("Can't allocate output buffer");
00275     }
00276 
00277     // Put compression code in first byte
00278     (*bzdata.next_out++) = COMP_TYPE_BZIP2;
00279     bzdata.avail_out--;
00280 
00281     // Compress it
00282     int bzerr = BZ_FINISH_OK;
00283     while (BZ_FINISH_OK == (bzerr = BZ2_bzCompress(&bzdata, BZ_FINISH))) {
00284       if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb,context)) {
00285         BZ2_bzCompressEnd(&bzdata);
00286         throw std::string("Can't allocate output buffer");
00287       }
00288     }
00289     switch (bzerr) {
00290       case BZ_SEQUENCE_ERROR:
00291       case BZ_PARAM_ERROR: throw std::string("Param/Sequence error");
00292       case BZ_FINISH_OK:
00293       case BZ_STREAM_END: break;
00294       default: throw std::string("Oops: ") + utostr(unsigned(bzerr));
00295     }
00296 
00297     // Finish
00298     result = (static_cast<uint64_t>(bzdata.total_out_hi32) << 32) |
00299         bzdata.total_out_lo32 + 1;
00300 
00301     BZ2_bzCompressEnd(&bzdata);
00302   } else {
00303     // Do null compression, for small files
00304     NULLCOMP_stream sdata;
00305     sdata.next_in = (char*)in;
00306     sdata.avail_in = size;
00307     NULLCOMP_init(&sdata);
00308 
00309     if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00310       throw std::string("Can't allocate output buffer");
00311     }
00312 
00313     *(sdata.next_out++) = COMP_TYPE_NONE;
00314     sdata.avail_out--;
00315 
00316     while (!NULLCOMP_compress(&sdata)) {
00317       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00318         throw std::string("Can't allocate output buffer");
00319       }
00320     }
00321 
00322     result = sdata.output_count + 1;
00323     NULLCOMP_end(&sdata);
00324   }
00325   return result;
00326 }
00327 
00328 uint64_t 
00329 Compressor::compressToNewBuffer(const char* in, unsigned size, char*&out) {
00330   BufferContext bc(size);
00331   unsigned result = compress(in,size,BufferContext::callback,(void*)&bc);
00332   out = bc.buff;
00333   return result;
00334 }
00335 
00336 uint64_t 
00337 Compressor::compressToStream(const char*in, unsigned size, std::ostream& out) {
00338   // Set up the context and writer
00339   WriterContext ctxt(&out,size / 2);
00340 
00341   // Compress everything after the magic number (which we'll alter)
00342   uint64_t zipSize = Compressor::compress(in,size,
00343     WriterContext::callback, (void*)&ctxt);
00344 
00345   if (ctxt.chunk) {
00346     ctxt.write(zipSize - ctxt.written);
00347   }
00348   return zipSize;
00349 }
00350 
00351 // Decompress in one of three ways
00352 uint64_t Compressor::decompress(const char *in, unsigned size,
00353                                 OutputDataCallback* cb, void* context) {
00354   assert(in && "Can't decompress null buffer");
00355   assert(size > 1 && "Can't decompress empty buffer");
00356   assert(cb && "Can't decompress without a callback function");
00357 
00358   uint64_t result = 0;
00359 
00360   switch (*in++) {
00361     case COMP_TYPE_BZIP2: {
00362       // Set up the bz_stream
00363       bz_stream bzdata;
00364       bzdata.bzalloc = 0;
00365       bzdata.bzfree = 0;
00366       bzdata.opaque = 0;
00367       bzdata.next_in = (char*)in;
00368       bzdata.avail_in = size - 1;
00369       bzdata.next_out = 0;
00370       bzdata.avail_out = 0;
00371       switch ( BZ2_bzDecompressInit(&bzdata, 0, 0) ) {
00372         case BZ_CONFIG_ERROR: throw std::string("bzip2 library mis-compiled");
00373         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00374         case BZ_MEM_ERROR:    throw std::string("Out of memory");
00375         case BZ_OK:
00376         default:
00377           break;
00378       }
00379 
00380       // Get a block of memory
00381       if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb,context)) {
00382         BZ2_bzDecompressEnd(&bzdata);
00383         throw std::string("Can't allocate output buffer");
00384       }
00385 
00386       // Decompress it
00387       int bzerr = BZ_OK;
00388       while (BZ_OK == (bzerr = BZ2_bzDecompress(&bzdata))) {
00389         if (0 != getdata(bzdata.next_out, bzdata.avail_out,cb,context)) {
00390           BZ2_bzDecompressEnd(&bzdata);
00391           throw std::string("Can't allocate output buffer");
00392         }
00393       }
00394 
00395       switch (bzerr) {
00396         case BZ_PARAM_ERROR:  throw std::string("Compressor internal error");
00397         case BZ_MEM_ERROR:    throw std::string("Out of memory");
00398         case BZ_DATA_ERROR:   throw std::string("Data integrity error");
00399         case BZ_DATA_ERROR_MAGIC:throw std::string("Data is not BZIP2");
00400         default: throw("Ooops");
00401         case BZ_STREAM_END:
00402           break;
00403       }
00404 
00405       // Finish
00406       result = (static_cast<uint64_t>(bzdata.total_out_hi32) << 32) |
00407         bzdata.total_out_lo32;
00408       BZ2_bzDecompressEnd(&bzdata);
00409       break;
00410     }
00411 
00412     case COMP_TYPE_NONE: {
00413       NULLCOMP_stream sdata;
00414       sdata.next_in = (char*)in;
00415       sdata.avail_in = size - 1;
00416       NULLCOMP_init(&sdata);
00417 
00418       if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00419         throw std::string("Can't allocate output buffer");
00420       }
00421 
00422       while (!NULLCOMP_decompress(&sdata)) {
00423         if (0 != getdata(sdata.next_out, sdata.avail_out,cb,context)) {
00424           throw std::string("Can't allocate output buffer");
00425         }
00426       }
00427 
00428       result = sdata.output_count;
00429       NULLCOMP_end(&sdata);
00430       break;
00431     }
00432 
00433     default:
00434       throw std::string("Unknown type of compressed data");
00435   }
00436 
00437   return result;
00438 }
00439 
00440 uint64_t 
00441 Compressor::decompressToNewBuffer(const char* in, unsigned size, char*&out) {
00442   BufferContext bc(size);
00443   unsigned result = decompress(in,size,BufferContext::callback,(void*)&bc);
00444   out = bc.buff;
00445   return result;
00446 }
00447                                                                                                                                             
00448 uint64_t 
00449 Compressor::decompressToStream(const char*in, unsigned size, std::ostream& out){
00450   // Set up the context and writer
00451   WriterContext ctxt(&out,size / 2);
00452 
00453   // Compress everything after the magic number (which we'll alter)
00454   uint64_t zipSize = Compressor::decompress(in,size,
00455     WriterContext::callback, (void*)&ctxt);
00456 
00457   if (ctxt.chunk) {
00458     ctxt.write(zipSize - ctxt.written);
00459   }
00460   return zipSize;
00461 }
00462 
00463 }
00464 
00465 // vim: sw=2 ai