archival/libunarchive/decompress_bunzip2.c

   1 /* vi: set sw=4 ts=4: */
   2 /*      Small bzip2 deflate implementation, by Rob Landley (rob@landley.net).
   3
   4         Based on bzip2 decompression code by Julian R Seward (jseward@acm.org),
   5         which also acknowledges contributions by Mike Burrows, David Wheeler,
   6         Peter Fenwick, Alistair Moffat, Radford Neal, Ian H. Witten,
   7         Robert Sedgewick, and Jon L. Bentley.
   8
   9         This code is licensed under the LGPLv2:
  10                 LGPL http://www.gnu.org/copyleft/lgpl.html
  11 */
  12
  13 /*
  14         Size and speed optimizations by Manuel Novoa III  (mjn3@codepoet.org).
  15
  16         More efficient reading of Huffman codes, a streamlined read_bunzip()
  17         function, and various other tweaks.  In (limited) tests, approximately
  18         20% faster than bzcat on x86 and about 10% faster on arm.
  19
  20         Note that about 2/3 of the time is spent in read_unzip() reversing
  21         the Burrows-Wheeler transformation.  Much of that time is delay
  22         resulting from cache misses.
  23
  24         I would ask that anyone benefiting from this work, especially those
  25         using it in commercial products, consider making a donation to my local
  26         non-profit hospice organization (www.hospiceacadiana.com) in the name of
  27         the woman I loved, Toni W. Hagan, who passed away Feb. 12, 2003.
  28
  29         Manuel
  30  */
  31
  32 #include <setjmp.h>
  33 #include <stdio.h>
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <unistd.h>
  37 #include <limits.h>
  38
  39 #include "libbb.h"
  40
  41 /* Constants for Huffman coding */
  42 #define MAX_GROUPS                      6
  43 #define GROUP_SIZE                      50              /* 64 would have been more efficient */
  44 #define MAX_HUFCODE_BITS        20              /* Longest Huffman code allowed */
  45 #define MAX_SYMBOLS                     258             /* 256 literals + RUNA + RUNB */
  46 #define SYMBOL_RUNA                     0
  47 #define SYMBOL_RUNB                     1
  48
  49 /* Status return values */
  50 #define RETVAL_OK                                               0
  51 #define RETVAL_LAST_BLOCK                               (-1)
  52 #define RETVAL_NOT_BZIP_DATA                    (-2)
  53 #define RETVAL_UNEXPECTED_INPUT_EOF             (-3)
  54 #define RETVAL_UNEXPECTED_OUTPUT_EOF    (-4)
  55 #define RETVAL_DATA_ERROR                               (-5)
  56 #define RETVAL_OUT_OF_MEMORY                    (-6)
  57 #define RETVAL_OBSOLETE_INPUT                   (-7)
  58
  59 /* Other housekeeping constants */
  60 #define IOBUF_SIZE                      4096
  61
  62 /* This is what we know about each Huffman coding group */
  63 struct group_data {
  64         /* We have an extra slot at the end of limit[] for a sentinal value. */
  65         int limit[MAX_HUFCODE_BITS+1],base[MAX_HUFCODE_BITS],permute[MAX_SYMBOLS];
  66         int minLen, maxLen;
  67 };
  68
  69 /* Structure holding all the housekeeping data, including IO buffers and
  70    memory that persists between calls to bunzip */
  71
  72 typedef struct {
  73         /* State for interrupting output loop */
  74
  75         int writeCopies,writePos,writeRunCountdown,writeCount,writeCurrent;
  76
  77         /* I/O tracking data (file handles, buffers, positions, etc.) */
  78
  79         int in_fd,out_fd,inbufCount,inbufPos /*,outbufPos*/;
  80         unsigned char *inbuf /*,*outbuf*/;
  81         unsigned int inbufBitCount, inbufBits;
  82
  83         /* The CRC values stored in the block header and calculated from the data */
  84
  85         unsigned int crc32Table[256],headerCRC, totalCRC, writeCRC;
  86
  87         /* Intermediate buffer and its size (in bytes) */
  88
  89         unsigned int *dbuf, dbufSize;
  90
  91         /* These things are a bit too big to go on the stack */
  92
  93         unsigned char selectors[32768];                 /* nSelectors=15 bits */
  94         struct group_data groups[MAX_GROUPS];   /* Huffman coding tables */
  95
  96         /* For I/O error handling */
  97
  98         jmp_buf jmpbuf;
  99 } bunzip_data;
 100
 101 /* Return the next nnn bits of input.  All reads from the compressed input
 102    are done through this function.  All reads are big endian */
 103
 104 static unsigned int get_bits(bunzip_data *bd, char bits_wanted)
 105 {
 106         unsigned int bits=0;
 107
 108         /* If we need to get more data from the byte buffer, do so.  (Loop getting
 109            one byte at a time to enforce endianness and avoid unaligned access.) */
 110
 111         while (bd->inbufBitCount<bits_wanted) {
 112
 113                 /* If we need to read more data from file into byte buffer, do so */
 114
 115                 if(bd->inbufPos==bd->inbufCount) {
 116                         if((bd->inbufCount = read(bd->in_fd, bd->inbuf, IOBUF_SIZE)) <= 0)
 117                                 longjmp(bd->jmpbuf,RETVAL_UNEXPECTED_INPUT_EOF);
 118                         bd->inbufPos=0;
 119                 }
 120
 121                 /* Avoid 32-bit overflow (dump bit buffer to top of output) */
 122
 123                 if(bd->inbufBitCount>=24) {
 124                         bits=bd->inbufBits&((1<<bd->inbufBitCount)-1);
 125                         bits_wanted-=bd->inbufBitCount;
 126                         bits<<=bits_wanted;
 127                         bd->inbufBitCount=0;
 128                 }
 129
 130                 /* Grab next 8 bits of input from buffer. */
 131
 132                 bd->inbufBits=(bd->inbufBits<<8)|bd->inbuf[bd->inbufPos++];
 133                 bd->inbufBitCount+=8;
 134         }
 135
 136         /* Calculate result */
 137
 138         bd->inbufBitCount-=bits_wanted;
 139         bits|=(bd->inbufBits>>bd->inbufBitCount)&((1<<bits_wanted)-1);
 140
 141         return bits;
 142 }
 143
 144 /* Unpacks the next block and sets up for the inverse burrows-wheeler step. */
 145
 146 static int get_next_block(bunzip_data *bd)
 147 {
 148         struct group_data *hufGroup;
 149         int dbufCount,nextSym,dbufSize,groupCount,*base,*limit,selector,
 150                 i,j,k,t,runPos,symCount,symTotal,nSelectors,byteCount[256];
 151         unsigned char uc, symToByte[256], mtfSymbol[256], *selectors;
 152         unsigned int *dbuf,origPtr;
 153
 154         dbuf=bd->dbuf;
 155         dbufSize=bd->dbufSize;
 156         selectors=bd->selectors;
 157
 158         /* Reset longjmp I/O error handling */
 159
 160         i=setjmp(bd->jmpbuf);
 161         if(i) return i;
 162
 163         /* Read in header signature and CRC, then validate signature.
 164            (last block signature means CRC is for whole file, return now) */
 165
 166         i = get_bits(bd,24);
 167         j = get_bits(bd,24);
 168         bd->headerCRC=get_bits(bd,32);
 169         if ((i == 0x177245) && (j == 0x385090)) return RETVAL_LAST_BLOCK;
 170         if ((i != 0x314159) || (j != 0x265359)) return RETVAL_NOT_BZIP_DATA;
 171
 172         /* We can add support for blockRandomised if anybody complains.  There was
 173            some code for this in busybox 1.0.0-pre3, but nobody ever noticed that
 174            it didn't actually work. */
 175
 176         if(get_bits(bd,1)) return RETVAL_OBSOLETE_INPUT;
 177         if((origPtr=get_bits(bd,24)) > dbufSize) return RETVAL_DATA_ERROR;
 178
 179         /* mapping table: if some byte values are never used (encoding things
 180            like ascii text), the compression code removes the gaps to have fewer
 181            symbols to deal with, and writes a sparse bitfield indicating which
 182            values were present.  We make a translation table to convert the symbols
 183            back to the corresponding bytes. */
 184
 185         t=get_bits(bd, 16);
 186         symTotal=0;
 187         for (i=0;i<16;i++) {
 188                 if(t&(1<<(15-i))) {
 189                         k=get_bits(bd,16);
 190                         for(j=0;j<16;j++)
 191                                 if(k&(1<<(15-j))) symToByte[symTotal++]=(16*i)+j;
 192                 }
 193         }
 194
 195         /* How many different Huffman coding groups does this block use? */
 196
 197         groupCount=get_bits(bd,3);
 198         if (groupCount<2 || groupCount>MAX_GROUPS) return RETVAL_DATA_ERROR;
 199
 200         /* nSelectors: Every GROUP_SIZE many symbols we select a new Huffman coding
 201            group.  Read in the group selector list, which is stored as MTF encoded
 202            bit runs.  (MTF=Move To Front, as each value is used it's moved to the
 203            start of the list.) */
 204
 205         if(!(nSelectors=get_bits(bd, 15))) return RETVAL_DATA_ERROR;
 206         for(i=0; i<groupCount; i++) mtfSymbol[i] = i;
 207         for(i=0; i<nSelectors; i++) {
 208
 209                 /* Get next value */
 210
 211                 for(j=0;get_bits(bd,1);j++) if (j>=groupCount) return RETVAL_DATA_ERROR;
 212
 213                 /* Decode MTF to get the next selector */
 214
 215                 uc = mtfSymbol[j];
 216                 for(;j;j--) mtfSymbol[j] = mtfSymbol[j-1];
 217                 mtfSymbol[0]=selectors[i]=uc;
 218         }
 219
 220         /* Read the Huffman coding tables for each group, which code for symTotal
 221            literal symbols, plus two run symbols (RUNA, RUNB) */
 222
 223         symCount=symTotal+2;
 224         for (j=0; j<groupCount; j++) {
 225                 unsigned char length[MAX_SYMBOLS],temp[MAX_HUFCODE_BITS+1];
 226                 int     minLen, maxLen, pp;
 227
 228                 /* Read Huffman code lengths for each symbol.  They're stored in
 229                    a way similar to mtf; record a starting value for the first symbol,
 230                    and an offset from the previous value for everys symbol after that.
 231                    (Subtracting 1 before the loop and then adding it back at the end is
 232                    an optimization that makes the test inside the loop simpler: symbol
 233                    length 0 becomes negative, so an unsigned inequality catches it.) */
 234
 235                 t=get_bits(bd, 5)-1;
 236                 for (i = 0; i < symCount; i++) {
 237                         for(;;) {
 238                                 if (((unsigned)t) > (MAX_HUFCODE_BITS-1))
 239                                         return RETVAL_DATA_ERROR;
 240
 241                                 /* If first bit is 0, stop.  Else second bit indicates whether
 242                                    to increment or decrement the value.  Optimization: grab 2
 243                                    bits and unget the second if the first was 0. */
 244
 245                                 k = get_bits(bd,2);
 246                                 if (k < 2) {
 247                                         bd->inbufBitCount++;
 248                                         break;
 249                                 }
 250
 251                                 /* Add one if second bit 1, else subtract 1.  Avoids if/else */
 252
 253                                 t+=(((k+1)&2)-1);
 254                         }
 255
 256                         /* Correct for the initial -1, to get the final symbol length */
 257
 258                         length[i]=t+1;
 259                 }
 260
 261                 /* Find largest and smallest lengths in this group */
 262
 263                 minLen=maxLen=length[0];
 264                 for(i = 1; i < symCount; i++) {
 265                         if(length[i] > maxLen) maxLen = length[i];
 266                         else if(length[i] < minLen) minLen = length[i];
 267                 }
 268
 269                 /* Calculate permute[], base[], and limit[] tables from length[].
 270                  *
 271                  * permute[] is the lookup table for converting Huffman coded symbols
 272                  * into decoded symbols.  base[] is the amount to subtract from the
 273                  * value of a Huffman symbol of a given length when using permute[].
 274                  *
 275                  * limit[] indicates the largest numerical value a symbol with a given
 276                  * number of bits can have.  This is how the Huffman codes can vary in
 277                  * length: each code with a value>limit[length] needs another bit.
 278                  */
 279
 280                 hufGroup=bd->groups+j;
 281                 hufGroup->minLen = minLen;
 282                 hufGroup->maxLen = maxLen;
 283
 284                 /* Note that minLen can't be smaller than 1, so we adjust the base
 285                    and limit array pointers so we're not always wasting the first
 286                    entry.  We do this again when using them (during symbol decoding).*/
 287
 288                 base=hufGroup->base-1;
 289                 limit=hufGroup->limit-1;
 290
 291                 /* Calculate permute[].  Concurently, initialize temp[] and limit[]. */
 292
 293                 pp=0;
 294                 for(i=minLen;i<=maxLen;i++) {
 295                         temp[i]=limit[i]=0;
 296                         for(t=0;t<symCount;t++)
 297                                 if(length[t]==i) hufGroup->permute[pp++] = t;
 298                 }
 299
 300                 /* Count symbols coded for at each bit length */
 301
 302                 for (i=0;i<symCount;i++) temp[length[i]]++;
 303
 304                 /* Calculate limit[] (the largest symbol-coding value at each bit
 305                  * length, which is (previous limit<<1)+symbols at this level), and
 306                  * base[] (number of symbols to ignore at each bit length, which is
 307                  * limit minus the cumulative count of symbols coded for already). */
 308
 309                 pp=t=0;
 310                 for (i=minLen; i<maxLen; i++) {
 311                         pp+=temp[i];
 312
 313                         /* We read the largest possible symbol size and then unget bits
 314                            after determining how many we need, and those extra bits could
 315                            be set to anything.  (They're noise from future symbols.)  At
 316                            each level we're really only interested in the first few bits,
 317                            so here we set all the trailing to-be-ignored bits to 1 so they
 318                            don't affect the value>limit[length] comparison. */
 319
 320                         limit[i]= (pp << (maxLen - i)) - 1;
 321                         pp<<=1;
 322                         base[i+1]=pp-(t+=temp[i]);
 323                 }
 324                 limit[maxLen+1] = INT_MAX; /* Sentinal value for reading next sym. */
 325                 limit[maxLen]=pp+temp[maxLen]-1;
 326                 base[minLen]=0;
 327         }
 328
 329         /* We've finished reading and digesting the block header.  Now read this
 330            block's Huffman coded symbols from the file and undo the Huffman coding
 331            and run length encoding, saving the result into dbuf[dbufCount++]=uc */
 332
 333         /* Initialize symbol occurrence counters and symbol Move To Front table */
 334
 335         for(i=0;i<256;i++) {
 336                 byteCount[i] = 0;
 337                 mtfSymbol[i]=(unsigned char)i;
 338         }
 339
 340         /* Loop through compressed symbols. */
 341
 342         runPos=dbufCount=selector=0;
 343         for(;;) {
 344
 345                 /* fetch next Huffman coding group from list. */
 346
 347                 symCount=GROUP_SIZE-1;
 348                 if(selector>=nSelectors) return RETVAL_DATA_ERROR;
 349                 hufGroup=bd->groups+selectors[selector++];
 350                 base=hufGroup->base-1;
 351                 limit=hufGroup->limit-1;
 352 continue_this_group:
 353
 354                 /* Read next Huffman-coded symbol. */
 355
 356                 /* Note: It is far cheaper to read maxLen bits and back up than it is
 357                    to read minLen bits and then an additional bit at a time, testing
 358                    as we go.  Because there is a trailing last block (with file CRC),
 359                    there is no danger of the overread causing an unexpected EOF for a
 360                    valid compressed file.  As a further optimization, we do the read
 361                    inline (falling back to a call to get_bits if the buffer runs
 362                    dry).  The following (up to got_huff_bits:) is equivalent to
 363                    j=get_bits(bd,hufGroup->maxLen);
 364                  */
 365
 366                 while (bd->inbufBitCount<hufGroup->maxLen) {
 367                         if(bd->inbufPos==bd->inbufCount) {
 368                                 j = get_bits(bd,hufGroup->maxLen);
 369                                 goto got_huff_bits;
 370                         }
 371                         bd->inbufBits=(bd->inbufBits<<8)|bd->inbuf[bd->inbufPos++];
 372                         bd->inbufBitCount+=8;
 373                 };
 374                 bd->inbufBitCount-=hufGroup->maxLen;
 375                 j = (bd->inbufBits>>bd->inbufBitCount)&((1<<hufGroup->maxLen)-1);
 376
 377 got_huff_bits:
 378
 379                 /* Figure how how many bits are in next symbol and unget extras */
 380
 381                 i=hufGroup->minLen;
 382                 while(j>limit[i]) ++i;
 383                 bd->inbufBitCount += (hufGroup->maxLen - i);
 384
 385                 /* Huffman decode value to get nextSym (with bounds checking) */
 386
 387                 if ((i > hufGroup->maxLen)
 388                         || (((unsigned)(j=(j>>(hufGroup->maxLen-i))-base[i]))
 389                                 >= MAX_SYMBOLS))
 390                         return RETVAL_DATA_ERROR;
 391                 nextSym = hufGroup->permute[j];
 392
 393                 /* We have now decoded the symbol, which indicates either a new literal
 394                    byte, or a repeated run of the most recent literal byte.  First,
 395                    check if nextSym indicates a repeated run, and if so loop collecting
 396                    how many times to repeat the last literal. */
 397
 398                 if (((unsigned)nextSym) <= SYMBOL_RUNB) { /* RUNA or RUNB */
 399
 400                         /* If this is the start of a new run, zero out counter */
 401
 402                         if(!runPos) {
 403                                 runPos = 1;
 404                                 t = 0;
 405                         }
 406
 407                         /* Neat trick that saves 1 symbol: instead of or-ing 0 or 1 at
 408                            each bit position, add 1 or 2 instead.  For example,
 409                            1011 is 1<<0 + 1<<1 + 2<<2.  1010 is 2<<0 + 2<<1 + 1<<2.
 410                            You can make any bit pattern that way using 1 less symbol than
 411                            the basic or 0/1 method (except all bits 0, which would use no
 412                            symbols, but a run of length 0 doesn't mean anything in this
 413                            context).  Thus space is saved. */
 414
 415                         t += (runPos << nextSym); /* +runPos if RUNA; +2*runPos if RUNB */
 416                         if(runPos < dbufSize) runPos <<= 1;
 417                         goto end_of_huffman_loop;
 418                 }
 419
 420                 /* When we hit the first non-run symbol after a run, we now know
 421                    how many times to repeat the last literal, so append that many
 422                    copies to our buffer of decoded symbols (dbuf) now.  (The last
 423                    literal used is the one at the head of the mtfSymbol array.) */
 424
 425                 if(runPos) {
 426                         runPos=0;
 427                         if(dbufCount+t>=dbufSize) return RETVAL_DATA_ERROR;
 428
 429                         uc = symToByte[mtfSymbol[0]];
 430                         byteCount[uc] += t;
 431                         while(t--) dbuf[dbufCount++]=uc;
 432                 }
 433
 434                 /* Is this the terminating symbol? */
 435
 436                 if(nextSym>symTotal) break;
 437
 438                 /* At this point, nextSym indicates a new literal character.  Subtract
 439                    one to get the position in the MTF array at which this literal is
 440                    currently to be found.  (Note that the result can't be -1 or 0,
 441                    because 0 and 1 are RUNA and RUNB.  But another instance of the
 442                    first symbol in the mtf array, position 0, would have been handled
 443                    as part of a run above.  Therefore 1 unused mtf position minus
 444                    2 non-literal nextSym values equals -1.) */
 445
 446                 if(dbufCount>=dbufSize) return RETVAL_DATA_ERROR;
 447                 i = nextSym - 1;
 448                 uc = mtfSymbol[i];
 449
 450                 /* Adjust the MTF array.  Since we typically expect to move only a
 451                  * small number of symbols, and are bound by 256 in any case, using
 452                  * memmove here would typically be bigger and slower due to function
 453                  * call overhead and other assorted setup costs. */
 454
 455                 do {
 456                         mtfSymbol[i] = mtfSymbol[i-1];
 457                 } while (--i);
 458                 mtfSymbol[0] = uc;
 459                 uc=symToByte[uc];
 460
 461                 /* We have our literal byte.  Save it into dbuf. */
 462
 463                 byteCount[uc]++;
 464                 dbuf[dbufCount++] = (unsigned int)uc;
 465
 466                 /* Skip group initialization if we're not done with this group.  Done
 467                  * this way to avoid compiler warning. */
 468
 469 end_of_huffman_loop:
 470                 if(symCount--) goto continue_this_group;
 471         }
 472
 473         /* At this point, we've read all the Huffman-coded symbols (and repeated
 474        runs) for this block from the input stream, and decoded them into the
 475            intermediate buffer.  There are dbufCount many decoded bytes in dbuf[].
 476            Now undo the Burrows-Wheeler transform on dbuf.
 477            See http://dogma.net/markn/articles/bwt/bwt.htm
 478          */
 479
 480         /* Turn byteCount into cumulative occurrence counts of 0 to n-1. */
 481
 482         j=0;
 483         for(i=0;i<256;i++) {
 484                 k=j+byteCount[i];
 485                 byteCount[i] = j;
 486                 j=k;
 487         }
 488
 489         /* Figure out what order dbuf would be in if we sorted it. */
 490
 491         for (i=0;i<dbufCount;i++) {
 492                 uc=(unsigned char)(dbuf[i] & 0xff);
 493                 dbuf[byteCount[uc]] |= (i << 8);
 494                 byteCount[uc]++;
 495         }
 496
 497         /* Decode first byte by hand to initialize "previous" byte.  Note that it
 498            doesn't get output, and if the first three characters are identical
 499            it doesn't qualify as a run (hence writeRunCountdown=5). */
 500
 501         if(dbufCount) {
 502                 if(origPtr>=dbufCount) return RETVAL_DATA_ERROR;
 503                 bd->writePos=dbuf[origPtr];
 504             bd->writeCurrent=(unsigned char)(bd->writePos&0xff);
 505                 bd->writePos>>=8;
 506                 bd->writeRunCountdown=5;
 507         }
 508         bd->writeCount=dbufCount;
 509
 510         return RETVAL_OK;
 511 }
 512
 513 /* Undo burrows-wheeler transform on intermediate buffer to produce output.
 514    If start_bunzip was initialized with out_fd=-1, then up to len bytes of
 515    data are written to outbuf.  Return value is number of bytes written or
 516    error (all errors are negative numbers).  If out_fd!=-1, outbuf and len
 517    are ignored, data is written to out_fd and return is RETVAL_OK or error.
 518 */
 519
 520 static int read_bunzip(bunzip_data *bd, char *outbuf, int len)
 521 {
 522         const unsigned int *dbuf;
 523         int pos,current,previous,gotcount;
 524
 525         /* If last read was short due to end of file, return last block now */
 526         if(bd->writeCount<0) return bd->writeCount;
 527
 528         gotcount = 0;
 529         dbuf=bd->dbuf;
 530         pos=bd->writePos;
 531         current=bd->writeCurrent;
 532
 533         /* We will always have pending decoded data to write into the output
 534            buffer unless this is the very first call (in which case we haven't
 535            Huffman-decoded a block into the intermediate buffer yet). */
 536
 537         if (bd->writeCopies) {
 538
 539                 /* Inside the loop, writeCopies means extra copies (beyond 1) */
 540
 541                 --bd->writeCopies;
 542
 543                 /* Loop outputting bytes */
 544
 545                 for(;;) {
 546
 547                         /* If the output buffer is full, snapshot state and return */
 548
 549                         if(gotcount >= len) {
 550                                 bd->writePos=pos;
 551                                 bd->writeCurrent=current;
 552                                 bd->writeCopies++;
 553                                 return len;
 554                         }
 555
 556                         /* Write next byte into output buffer, updating CRC */
 557
 558                         outbuf[gotcount++] = current;
 559                         bd->writeCRC=(((bd->writeCRC)<<8)
 560                                                   ^bd->crc32Table[((bd->writeCRC)>>24)^current]);
 561
 562                         /* Loop now if we're outputting multiple copies of this byte */
 563
 564                         if (bd->writeCopies) {
 565                                 --bd->writeCopies;
 566                                 continue;
 567                         }
 568 decode_next_byte:
 569                         if (!bd->writeCount--) break;
 570                         /* Follow sequence vector to undo Burrows-Wheeler transform */
 571                         previous=current;
 572                         pos=dbuf[pos];
 573                         current=pos&0xff;
 574                         pos>>=8;
 575
 576                         /* After 3 consecutive copies of the same byte, the 4th is a repeat
 577                            count.  We count down from 4 instead
 578                          * of counting up because testing for non-zero is faster */
 579
 580                         if(--bd->writeRunCountdown) {
 581                                 if(current!=previous) bd->writeRunCountdown=4;
 582                         } else {
 583
 584                                 /* We have a repeated run, this byte indicates the count */
 585
 586                                 bd->writeCopies=current;
 587                                 current=previous;
 588                                 bd->writeRunCountdown=5;
 589
 590                                 /* Sometimes there are just 3 bytes (run length 0) */
 591
 592                                 if(!bd->writeCopies) goto decode_next_byte;
 593
 594                                 /* Subtract the 1 copy we'd output anyway to get extras */
 595
 596                                 --bd->writeCopies;
 597                         }
 598                 }
 599
 600                 /* Decompression of this block completed successfully */
 601
 602                 bd->writeCRC=~bd->writeCRC;
 603                 bd->totalCRC=((bd->totalCRC<<1) | (bd->totalCRC>>31)) ^ bd->writeCRC;
 604
 605                 /* If this block had a CRC error, force file level CRC error. */
 606
 607                 if(bd->writeCRC!=bd->headerCRC) {
 608                         bd->totalCRC=bd->headerCRC+1;
 609                         return RETVAL_LAST_BLOCK;
 610                 }
 611         }
 612
 613         /* Refill the intermediate buffer by Huffman-decoding next block of input */
 614         /* (previous is just a convenient unused temp variable here) */
 615
 616         previous=get_next_block(bd);
 617         if(previous) {
 618                 bd->writeCount=previous;
 619                 return (previous!=RETVAL_LAST_BLOCK) ? previous : gotcount;
 620         }
 621         bd->writeCRC=0xffffffffUL;
 622         pos=bd->writePos;
 623         current=bd->writeCurrent;
 624         goto decode_next_byte;
 625 }
 626
 627 /* Allocate the structure, read file header.  If in_fd==-1, inbuf must contain
 628    a complete bunzip file (len bytes long).  If in_fd!=-1, inbuf and len are
 629    ignored, and data is read from file handle into temporary buffer. */
 630
 631 static int start_bunzip(bunzip_data **bdp, int in_fd, unsigned char *inbuf,
 632                                                 int len)
 633 {
 634         bunzip_data *bd;
 635         unsigned int i,j,c;
 636         const unsigned int BZh0=(((unsigned int)'B')<<24)+(((unsigned int)'Z')<<16)
 637                                                         +(((unsigned int)'h')<<8)+(unsigned int)'0';
 638
 639         /* Figure out how much data to allocate */
 640
 641         i=sizeof(bunzip_data);
 642         if(in_fd!=-1) i+=IOBUF_SIZE;
 643
 644         /* Allocate bunzip_data.  Most fields initialize to zero. */
 645
 646         bd=*bdp=xmalloc(i);
 647         memset(bd,0,sizeof(bunzip_data));
 648
 649         /* Setup input buffer */
 650
 651         if(-1==(bd->in_fd=in_fd)) {
 652                 bd->inbuf=inbuf;
 653                 bd->inbufCount=len;
 654         } else bd->inbuf=(unsigned char *)(bd+1);
 655
 656         /* Init the CRC32 table (big endian) */
 657
 658         for(i=0;i<256;i++) {
 659                 c=i<<24;
 660                 for(j=8;j;j--)
 661                         c=c&0x80000000 ? (c<<1)^0x04c11db7 : (c<<1);
 662                 bd->crc32Table[i]=c;
 663         }
 664
 665         /* Setup for I/O error handling via longjmp */
 666
 667         i=setjmp(bd->jmpbuf);
 668         if(i) return i;
 669
 670         /* Ensure that file starts with "BZh['1'-'9']." */
 671
 672         i = get_bits(bd,32);
 673         if (((unsigned int)(i-BZh0-1)) >= 9) return RETVAL_NOT_BZIP_DATA;
 674
 675         /* Fourth byte (ascii '1'-'9'), indicates block size in units of 100k of
 676            uncompressed data.  Allocate intermediate buffer for block. */
 677
 678         bd->dbufSize=100000*(i-BZh0);
 679
 680         bd->dbuf=xmalloc(bd->dbufSize * sizeof(int));
 681         return RETVAL_OK;
 682 }
 683
 684 /* Example usage: decompress src_fd to dst_fd.  (Stops at end of bzip data,
 685    not end of file.) */
 686
 687 int uncompressStream(int src_fd, int dst_fd)
 688 {
 689         char *outbuf;
 690         bunzip_data *bd;
 691         int i;
 692
 693         outbuf=xmalloc(IOBUF_SIZE);
 694         if(!(i=start_bunzip(&bd,src_fd,0,0))) {
 695                 for(;;) {
 696                         if((i=read_bunzip(bd,outbuf,IOBUF_SIZE)) <= 0) break;
 697                         if(i!=write(dst_fd,outbuf,i)) {
 698                                 i=RETVAL_UNEXPECTED_OUTPUT_EOF;
 699                                 break;
 700                         }
 701                 }
 702         }
 703
 704         /* Check CRC and release memory */
 705
 706         if(i==RETVAL_LAST_BLOCK) {
 707                 if (bd->headerCRC!=bd->totalCRC) {
 708                         bb_error_msg("Data integrity error when decompressing.");
 709                 } else {
 710                         i=RETVAL_OK;
 711                 }
 712         } else if (i==RETVAL_UNEXPECTED_OUTPUT_EOF) {
 713                 bb_error_msg("Compressed file ends unexpectedly");
 714         } else {
 715                 bb_error_msg("Decompression failed");
 716         }
 717         free(bd->dbuf);
 718         free(bd);
 719         free(outbuf);
 720
 721         return i;
 722 }
 723
 724 #ifdef TESTING
 725
 726 static char * const bunzip_errors[]={NULL,"Bad file checksum","Not bzip data",
 727                 "Unexpected input EOF","Unexpected output EOF","Data error",
 728                  "Out of memory","Obsolete (pre 0.9.5) bzip format not supported."};
 729
 730 /* Dumb little test thing, decompress stdin to stdout */
 731 int main(int argc, char *argv[])
 732 {
 733         int i=uncompressStream(0,1);
 734         char c;
 735
 736         if(i) fprintf(stderr,"%s\n", bunzip_errors[-i]);
 737     else if(read(0,&c,1)) fprintf(stderr,"Trailing garbage ignored\n");
 738         return -i;
 739 }
 740 #endif