2 * bzip2 is written by Julian Seward <jseward@bzip.org>.
3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4 * See README and LICENSE files in this directory for more information.
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery ---*/
9 /*--- blocksort.c ---*/
10 /*-------------------------------------------------------------*/
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
22 This program is released under the terms of the license contained
24 ------------------------------------------------------------------ */
26 /* #include "bzlib_private.h" */
/* Swap two int32_t lvalues in place via a temporary.
 * NOTE(review): the macro body is truncated in this view — only the
 * temporary's declaration is visible; the two assignments completing
 * the swap (and the statement-block close) are missing. */
28 #define mswap(zz1, zz2) \
30 int32_t zztmp = zz1; \
36 /* No measurable speed gain with inlining */
/* Swap two zzn-element regions of ptr[], starting at indices zzp1 and
 * zzp2, one element at a time using mswap.
 * NOTE(review): the loop driving the per-element swap (advancing
 * zzp1/zzp2 and decrementing zzn) is missing from this view. */
38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
41 mswap(ptr[zzp1], ptr[zzp2]);
/* Return the smaller of two int32_t values. */
50 int32_t mmin(int32_t a, int32_t b)
52 return (a < b) ? a : b;
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting ---*/
58 /*--- algorithm, for repetitive blocks ---*/
59 /*---------------------------------------------*/
61 /*---------------------------------------------*/
/* Shell-style sort of fmap[lo..hi], ordered by eclass[fmap[.]], using
 * gap 4 and then gap 1.  Used by fallbackQSort3 for ranges below its
 * small-range threshold.
 * NOTE(review): the declarations, the loads of ec_tmp, and the final
 * stores back into fmap[j] are missing from this view. */
64 void fallbackSimpleSort(uint32_t* fmap,
/* Gap-4 insertion pass. */
75 for (i = hi-4; i >= lo; i--) {
78 for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
/* Final gap-1 pass: an ordinary insertion sort completes the job. */
84 for (i = hi-1; i >= lo; i--) {
87 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
94 /*---------------------------------------------*/
/* fpush/fpop: push and pop a [lo,hi] work range on the explicit stack
 * used by the iterative fallback quicksort.  Ranges shorter than
 * FALLBACK_QSORT_SMALL_THRESH are handed to fallbackSimpleSort instead.
 * NOTE(review): both macro bodies are truncated in this view. */
95 #define fpush(lz,hz) { \
101 #define fpop(lz,hz) { \
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE 100
/* Iterative 3-way-partitioning quicksort of fmap[loSt..hiSt], keyed by
 * eclass[fmap[.]].  Uses an explicit stack (stackLo/stackHi) instead of
 * recursion; small ranges go to fallbackSimpleSort.
 * NOTE(review): many lines (the stack push/pop sites, partition loop
 * heads, closing braces) are missing from this view — the comments
 * below describe only what is shown. */
111 void fallbackQSort3(uint32_t* fmap,
116 int32_t unLo, unHi, ltLo, gtHi, n, m;
119 int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120 int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
/* Guard against overflowing the explicit recursion stack. */
128 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
/* Small ranges: the simple insertion-style sort wins here. */
131 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132 fallbackSimpleSort(fmap, eclass, lo, hi);
136 /* Random partitioning. Median of 3 sometimes fails to
137 * avoid bad cases. Median of 9 seems to help but
138 * looks rather expensive. This too seems to work but
139 * is cheaper. Guidance for the magic constants
140 * 7621 and 32768 is taken from Sedgewick's algorithms
143 r = ((r * 7621) + 1) % 32768;
146 med = eclass[fmap[lo]];
148 med = eclass[fmap[(lo+hi)>>1]];
150 med = eclass[fmap[hi]];
/* Upward scan: keys equal to the pivot collect in the low band (ltLo). */
157 if (unLo > unHi) break;
158 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
160 mswap(fmap[unLo], fmap[ltLo]);
/* Symmetric downward scan: pivot-equal keys collect at gtHi. */
169 if (unLo > unHi) break;
170 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
172 mswap(fmap[unHi], fmap[gtHi]);
179 if (unLo > unHi) break;
180 mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
183 AssertD(unHi == unLo-1, "fallbackQSort3(2)");
/* Everything equal to the pivot: nothing left to refine here. */
185 if (gtHi < ltLo) continue;
/* Swap the pivot-equal bands from the ends into the middle. */
187 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
/* [lo..n] and [m..hi] are the two sub-ranges still to be sorted. */
190 n = lo + unLo - ltLo - 1;
191 m = hi - (gtHi - unHi) + 1;
/* Handle the larger sub-range via the stack, keep the smaller one —
 * bounds the stack depth.  NOTE(review): the fpush calls themselves
 * are missing from this view. */
193 if (n - lo > hi - m) {
205 #undef FALLBACK_QSORT_SMALL_THRESH
206 #undef FALLBACK_QSORT_STACK_SIZE
209 /*---------------------------------------------*/
212 * eclass exists for [0 .. nblock-1]
213 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
214 * ptr exists for [0 .. nblock-1]
217 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
218 * All other areas of eclass destroyed
219 * fmap [0 .. nblock-1] holds sorted order
220 * bhtab[0 .. 2+(nblock/32)] destroyed
/* Bit-vector helpers over bhtab (the bucket-header bitmap used by
 * fallbackSort): bit zz marks the start of a bucket.  bhtab must be a
 * uint32_t array in scope at the expansion site.
 *
 * Fix: the shifted constant is 1U, not 1.  When (zz & 31) == 31,
 * `1 << 31` left-shifts a signed int into its sign bit, which is
 * undefined behavior in C (C11 6.5.7p4); the unsigned literal makes
 * the shift well-defined and yields the intended bit pattern. */
#define SET_BH(zz)       bhtab[(zz) >> 5] |= (1U << ((zz) & 31))
#define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~(1U << ((zz) & 31))
#define ISSET_BH(zz)     (bhtab[(zz) >> 5] & (1U << ((zz) & 31)))
#define WORD_BH(zz)      bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz) ((zz) & 0x01f)
/* Fallback O(N (log N)^2) sort of the block, used for highly
 * repetitive data and for small blocks (see BZ2_blockSort).  Performs
 * an initial 1-byte radix sort, then inductively refines the buckets,
 * tracking bucket boundaries in the bhtab bitmap.  See the contract
 * comment above for pre/post conditions.
 * NOTE(review): many lines of this function (declarations, loop
 * bodies, closing braces) are missing from this view — comments below
 * describe only what is shown. */
230 void fallbackSort(uint32_t* fmap,
236 int32_t ftabCopy[256];
237 int32_t H, i, j, k, l, r, cc, cc1;
/* Byte view of eclass: per the contract above, it holds the block. */
240 uint8_t* eclass8 = (uint8_t*)eclass;
243 * Initial 1-char radix sort to generate
244 * initial fmap and initial BH bits.
246 for (i = 0; i < 257; i++) ftab[i] = 0;
247 for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
/* Keep the per-byte counts for the block reconstruction at the end. */
248 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
/* Convert counts into cumulative bucket start offsets. */
250 j = ftab[0]; /* bbox: optimized */
251 for (i = 1; i < 257; i++) {
256 for (i = 0; i < nblock; i++) {
/* Bucket-header bitmap: one bit per block position, rounded up. */
263 nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
264 for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
/* Mark the start of each initial 1-byte bucket. */
265 for (i = 0; i < 256; i++) SET_BH(ftab[i]);
268 * Inductively refine the buckets. Kind-of an
269 * "exponential radix sort" (!), inspired by the
270 * Manber-Myers suffix array construction algorithm.
273 /*-- set sentinel bits for block-end detection --*/
274 for (i = 0; i < 32; i++) {
275 SET_BH(nblock + 2*i);
276 CLEAR_BH(nblock + 2*i + 1);
279 /*-- the log(N) loop --*/
283 for (i = 0; i < nblock; i++) {
296 /*-- find the next non-singleton bucket --*/
/* Advance bit-by-bit to a word boundary, then a whole word at a time. */
298 while (ISSET_BH(k) && UNALIGNED_BH(k))
301 while (WORD_BH(k) == 0xffffffff) k += 32;
302 while (ISSET_BH(k)) k++;
307 while (!ISSET_BH(k) && UNALIGNED_BH(k))
310 while (WORD_BH(k) == 0x00000000) k += 32;
311 while (!ISSET_BH(k)) k++;
317 /*-- now [l, r] bracket current bucket --*/
319 nNotDone += (r - l + 1);
320 fallbackQSort3(fmap, eclass, l, r);
322 /*-- scan bucket and generate header bits --*/
324 for (i = l; i <= r; i++) {
325 cc1 = eclass[fmap[i]];
/* Stop once depth H exceeds the block or no bucket needed work. */
335 if (H > nblock || nNotDone == 0)
340 * Reconstruct the original block in
341 * eclass8 [0 .. nblock-1], since the
342 * previous phase destroyed it.
345 for (i = 0; i < nblock; i++) {
346 while (ftabCopy[j] == 0)
349 eclass8[fmap[i]] = (uint8_t)j;
351 AssertH(j < 256, 1005);
361 /*---------------------------------------------*/
362 /*--- The main, O(N^2 log(N)) sorting ---*/
363 /*--- algorithm. Faster for "normal" ---*/
364 /*--- non-repetitive blocks. ---*/
365 /*---------------------------------------------*/
367 /*---------------------------------------------*/
/* TIMES_8/TIMES_12: textually repeat a statement 8 or 12 times when
 * CONFIG_BZIP2_FAST >= 1 (manual unrolling for the hot comparison
 * ladder in mainGtU).  NOTE(review): the #else (non-unrolled) branch
 * is truncated in this view — only its #define heads are visible. */
382 /* Loop unrolling here is actually very useful
383 * (generated code is much simpler),
384 * code size increase is only 270 bytes (i386)
385 * but speeds up compression 10% overall
388 #if CONFIG_BZIP2_FAST >= 1
390 #define TIMES_8(code) \
391 code; code; code; code; \
392 code; code; code; code;
393 #define TIMES_12(code) \
394 code; code; code; code; \
395 code; code; code; code; \
396 code; code; code; code;
400 #define TIMES_8(code) \
407 #define TIMES_12(code) \
/* Interior of mainGtU: compares the two block rotations starting at
 * i1 and i2, byte-first and then via the quadrant[] cached ordering;
 * returns as soon as a differing byte or quadrant value decides the
 * comparison (the `c1 > c2` form suggests a greater-than result —
 * TODO confirm against the missing signature).
 * NOTE(review): only a fragment is visible — the signature, most of
 * the unrolled comparison ladder, and the budget/loop logic are
 * missing from this view. */
417 AssertD(i1 != i2, "mainGtU");
419 c1 = block[i1]; c2 = block[i2];
420 if (c1 != c2) return (c1 > c2);
428 c1 = block[i1]; c2 = block[i2];
429 if (c1 != c2) return (c1 > c2);
430 s1 = quadrant[i1]; s2 = quadrant[i2];
431 if (s1 != s2) return (s1 > s2);
/* Indices wrap around the end of the block (rotations). */
435 if (i1 >= nblock) i1 -= nblock;
436 if (i2 >= nblock) i2 -= nblock;
447 /*---------------------------------------------*/
/* Shell-sort gap sequence (h' = 3h + 1) used by mainSimpleSort.
 * NOTE(review): declared with 14 elements but only 12 initializers are
 * visible — the final entries are missing from this view. */
449 * Knuth's increments seem to work better
450 * than Incerpi-Sedgewick here. Possibly
451 * because the number of elems to sort is
452 * usually small, typically <= 20.
455 const int32_t incs[14] = {
456 1, 4, 13, 40, 121, 364, 1093, 3280,
457 9841, 29524, 88573, 265720,
/* Shell sort of ptr[lo..hi], comparing rotations at byte depth d via
 * mainGtU.  Used by mainQSort3 for small or too-deep ranges; bails out
 * early once *budget goes negative (the caller then abandons the main
 * sort in favour of fallbackSort).
 * NOTE(review): large parts of the body (the value loads, insertion
 * stores, and loop closings) are missing from this view. */
462 void mainSimpleSort(uint32_t* ptr,
471 int32_t i, j, h, bigN, hp;
475 if (bigN < 2) return;
/* Pick the largest gap from incs[] that fits the range size. */
478 while (incs[hp] < bigN) hp++;
481 for (; hp >= 0; hp--) {
490 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
493 if (j <= (lo + h - 1)) break;
498 /* 1.5% overall speedup, +290 bytes */
499 #if CONFIG_BZIP2_FAST >= 3
/* Two manually duplicated copies of the insertion step (unrolling). */
504 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
507 if (j <= (lo + h - 1)) break;
516 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
519 if (j <= (lo + h - 1)) break;
/* Abandon the sort once the effort budget has been exhausted. */
524 if (*budget < 0) return;
530 /*---------------------------------------------*/
532 * The following is an implementation of
533 * an elegant 3-way quicksort for strings,
534 * described in a paper "Fast Algorithms for
535 * Sorting and Searching Strings", by Robert
536 * Sedgewick and Jon L. Bentley.
/* Median of three bytes — pivot selection helper for mainQSort3.
 * NOTE(review): the function body is missing from this view. */
541 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
/* Explicit-stack helpers for mainQSort3: mpush/mpop save and restore a
 * (lo, hi, depth) triple on stackLo/stackHi/stackD; mnextsize measures
 * a pending sub-range and mnextswap exchanges two pending sub-range
 * descriptors (used to order the three partitions by size).
 * MAIN_QSORT_SMALL_THRESH / MAIN_QSORT_DEPTH_THRESH are the cutoffs
 * below/beyond which mainSimpleSort is used instead.
 * NOTE(review): the mpush/mpop bodies are truncated in this view. */
558 #define mpush(lz,hz,dz) \
566 #define mpop(lz,hz,dz) \
574 #define mnextsize(az) (nextHi[az] - nextLo[az])
576 #define mnextswap(az,bz) \
579 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
584 #define MAIN_QSORT_SMALL_THRESH 20
585 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586 #define MAIN_QSORT_STACK_SIZE 100
/* Iterative 3-way ("fat pivot") quicksort of ptr[loSt..hiSt], comparing
 * rotations at byte depth d — the Bentley/Sedgewick scheme referenced
 * in the comment above.  Small or deep ranges are delegated to
 * mainSimpleSort, which also enforces the work budget.
 * NOTE(review): numerous lines (loop heads, mpop, closing braces) are
 * missing from this view — comments below describe only what is shown. */
589 void mainQSort3(uint32_t* ptr,
598 int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599 int32_t sp, lo, hi, d;
601 int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602 int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603 int32_t stackD [MAIN_QSORT_STACK_SIZE];
610 mpush(loSt, hiSt, dSt);
/* Each iteration may push up to three sub-ranges: keep headroom. */
613 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
616 if (hi - lo < MAIN_QSORT_SMALL_THRESH
617 || d > MAIN_QSORT_DEPTH_THRESH
619 mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
/* Median-of-3 pivot taken from the bytes at depth d. */
624 med = (int32_t) mmed3(block[ptr[lo ] + d],
626 block[ptr[(lo+hi) >> 1] + d]);
/* Upward partition scan: pivot-equal keys collect at the low end. */
635 n = ((int32_t)block[ptr[unLo]+d]) - med;
637 mswap(ptr[unLo], ptr[ltLo]);
/* Downward partition scan: pivot-equal keys collect at the high end. */
648 n = ((int32_t)block[ptr[unHi]+d]) - med;
650 mswap(ptr[unHi], ptr[gtHi]);
660 mswap(ptr[unLo], ptr[unHi]);
665 AssertD(unHi == unLo-1, "mainQSort3(2)");
/* Entire range equal to the pivot: re-queue it one byte deeper. */
668 mpush(lo, hi, d + 1);
/* Swap the pivot-equal end bands into the middle of the range. */
672 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
675 n = lo + unLo - ltLo - 1;
676 m = hi - (gtHi - unHi) + 1;
/* Three pending jobs: below-pivot, above-pivot, equal band (deeper). */
678 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
679 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
680 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
/* Order descending by size so the smallest range is popped first. */
682 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
686 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
689 mpush(nextLo[0], nextHi[0], nextD[0]);
690 mpush(nextLo[1], nextHi[1], nextD[1]);
691 mpush(nextLo[2], nextHi[2], nextD[2]);
699 #undef MAIN_QSORT_SMALL_THRESH
700 #undef MAIN_QSORT_DEPTH_THRESH
701 #undef MAIN_QSORT_STACK_SIZE
704 /*---------------------------------------------*/
706 * nblock > N_OVERSHOOT
707 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708 * ((uint8_t*)block32) [0 .. nblock-1] holds block
709 * ptr exists for [0 .. nblock-1]
712 * ((uint8_t*)block32) [0 .. nblock-1] holds block
713 * All other areas of block32 destroyed
714 * ftab[0 .. 65536] destroyed
715 * ptr [0 .. nblock-1] holds sorted order
716 * if (*budget < 0), sorting was abandoned
/* BIGFREQ(b): number of entries in big (first-byte) bucket b, computed
 * from the cumulative ftab offsets.  SETMASK is a flag bit OR'd into
 * ftab entries to mark a small bucket as already sorted (see its use
 * in mainSort); CLEARMASK strips the flag to recover the offset. */
719 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720 #define SETMASK (1 << 21)
721 #define CLEARMASK (~(SETMASK))
/* The main sorting routine: a 2-byte radix sort into 65536 small
 * buckets, followed by quicksorting of unfinished buckets (smallest
 * big bucket first) plus pointer-scanning passes that synthesise the
 * order of the remaining buckets.  Returns early with *budget < 0 to
 * signal that the caller should use fallbackSort instead.  See the
 * contract comment above for pre/post conditions.
 * NOTE(review): many lines of this function are missing from this
 * view — comments below describe only what is shown. */
724 void mainSort(EState* state,
732 int32_t i, j, k, ss, sb;
737 /* bbox: moved to EState to save stack
738 int32_t runningOrder[256];
739 int32_t copyStart[256];
740 int32_t copyEnd [256];
742 #define runningOrder (state->mainSort__runningOrder)
743 #define copyStart (state->mainSort__copyStart)
744 #define copyEnd (state->mainSort__copyEnd)
746 /*-- set up the 2-byte frequency table --*/
747 /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
748 memset(ftab, 0, 65537 * sizeof(ftab[0]));
/* Count 2-byte prefixes; unrolled 4-at-a-time when FAST >= 2. */
753 #if CONFIG_BZIP2_FAST >= 2
754 for (; i >= 3; i -= 4) {
756 j = (j >> 8) | (((uint16_t)block[i]) << 8);
759 j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
762 j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
765 j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
769 for (; i >= 0; i--) {
771 j = (j >> 8) | (((uint16_t)block[i]) << 8);
775 /*-- (emphasises close relationship of block & quadrant) --*/
/* Replicate the first BZ_N_OVERSHOOT bytes past the end of the block
 * so rotation comparisons can run without wrapping checks. */
776 for (i = 0; i < BZ_N_OVERSHOOT; i++) {
777 block [nblock+i] = block[i];
778 quadrant[nblock+i] = 0;
781 /*-- Complete the initial radix sort --*/
/* Counts -> cumulative start offsets for all 65536 small buckets. */
782 j = ftab[0]; /* bbox: optimized */
783 for (i = 1; i <= 65536; i++) {
/* Drop each position into its small bucket (unrolled variant first). */
790 #if CONFIG_BZIP2_FAST >= 2
791 for (; i >= 3; i -= 4) {
792 s = (s >> 8) | (block[i] << 8);
796 s = (s >> 8) | (block[i-1] << 8);
800 s = (s >> 8) | (block[i-2] << 8);
804 s = (s >> 8) | (block[i-3] << 8);
810 for (; i >= 0; i--) {
811 s = (s >> 8) | (block[i] << 8);
818 * Now ftab contains the first loc of every small bucket.
819 * Calculate the running order, from smallest to largest
822 for (i = 0; i <= 255; i++) {
829 /* bbox: was: int32_t h = 1; */
830 /* do h = 3 * h + 1; while (h <= 256); */
/* Shell sort runningOrder[] ascending by big-bucket size. */
835 h = (h * 171) >> 9; /* bbox: fast h/3 */
836 for (i = h; i <= 255; i++) {
837 vv = runningOrder[i];
839 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
840 runningOrder[j] = runningOrder[j-h];
846 runningOrder[j] = vv;
852 * The main sorting loop.
857 for (i = 0; i <= 255; i++) {
860 * Process big buckets, starting with the least full.
861 * Basically this is a 3-step process in which we call
862 * mainQSort3 to sort the small buckets [ss, j], but
863 * also make a big effort to avoid the calls if we can.
865 ss = runningOrder[i];
869 * Complete the big bucket [ss] by quicksorting
870 * any unsorted small buckets [ss, j], for j != ss.
871 * Hopefully previous pointer-scanning phases have already
872 * completed many of the small buckets [ss, j], so
873 * we don't have to sort them at all.
875 for (j = 0; j <= 255; j++) {
878 if (!(ftab[sb] & SETMASK)) {
879 int32_t lo = ftab[sb] & CLEARMASK;
880 int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
883 ptr, block, quadrant, nblock,
884 lo, hi, BZ_N_RADIX, budget
/* Quicksort ran out of budget: abandon; caller falls back. */
886 if (*budget < 0) return;
887 numQSorted += (hi - lo + 1);
894 AssertH(!bigDone[ss], 1006);
898 * Now scan this big bucket [ss] so as to synthesise the
899 * sorted order for small buckets [t, ss] for all t,
900 * including, magically, the bucket [ss,ss] too.
901 * This will avoid doing Real Work in subsequent Step 1's.
904 for (j = 0; j <= 255; j++) {
905 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
906 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
908 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
914 ptr[copyStart[c1]++] = k;
916 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
922 ptr[copyEnd[c1]--] = k;
926 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
927 * Necessity for this case is demonstrated by compressing
928 * a sequence of approximately 48.5 million of character
929 * 251; 1.0.0/1.0.1 will then die here. */
930 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
931 || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
/* Mark every small bucket [t, ss] as fully sorted. */
933 for (j = 0; j <= 255; j++)
934 ftab[(j << 8) + ss] |= SETMASK;
938 * The [ss] big bucket is now done. Record this fact,
939 * and update the quadrant descriptors. Remember to
940 * update quadrants in the overshoot area too, if
941 * necessary. The "if (i < 255)" test merely skips
942 * this updating for the last bucket processed, since
943 * updating for the last bucket is pointless.
945 * The quadrant array provides a way to incrementally
946 * cache sort orderings, as they appear, so as to
947 * make subsequent comparisons in fullGtU() complete
948 * faster. For repetitive blocks this makes a big
949 * difference (but not big enough to be able to avoid
950 * the fallback sorting mechanism, exponential radix sort).
952 * The precise meaning is: at all times:
954 * for 0 <= i < nblock and 0 <= j <= nblock
956 * if block[i] != block[j],
958 * then the relative values of quadrant[i] and
959 * quadrant[j] are meaningless.
962 * if quadrant[i] < quadrant[j]
963 * then the string starting at i lexicographically
964 * precedes the string starting at j
966 * else if quadrant[i] > quadrant[j]
967 * then the string starting at j lexicographically
968 * precedes the string starting at i
971 * the relative ordering of the strings starting
972 * at i and j has not yet been determined.
978 int32_t bbStart = ftab[ss << 8] & CLEARMASK;
979 int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
/* Scale positions so quadrant values always fit in 16 bits. */
982 while ((bbSize >> shifts) > 65534) shifts++;
984 for (j = bbSize-1; j >= 0; j--) {
985 int32_t a2update = ptr[bbStart + j];
986 uint16_t qVal = (uint16_t)(j >> shifts);
987 quadrant[a2update] = qVal;
/* Keep the overshoot copy of the quadrant area in sync. */
988 if (a2update < BZ_N_OVERSHOOT)
989 quadrant[a2update + nblock] = qVal;
991 AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
1004 /*---------------------------------------------*/
1007 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1008 * ((uint8_t*)arr2)[0 .. nblock-1] holds block
1009 * arr1 exists for [0 .. nblock-1]
1012 * ((uint8_t*)arr2) [0 .. nblock-1] holds block
1013 * All other areas of block destroyed
1014 * ftab[0 .. 65536] destroyed
1015 * arr1[0 .. nblock-1] holds sorted order
/* Entry point: sorts all rotations of the current block.  Tiny blocks
 * (< 10000 bytes) and main-sort budget exhaustion both route to
 * fallbackSort; otherwise mainSort runs with the 16-bit quadrant cache
 * placed directly after the block data inside arr2.  See the contract
 * comment above for pre/post conditions.
 * NOTE(review): several lines (remaining locals, the post-sort origPtr
 * scan, closing braces) are missing from this view. */
1018 void BZ2_blockSort(EState* s)
1020 /* In original bzip2 1.0.4, it's a parameter, but 30
1021 * (which was the default) should work ok. */
1022 enum { wfact = 30 };
1024 uint32_t* ptr = s->ptr;
1025 uint8_t* block = s->block;
1026 uint32_t* ftab = s->ftab;
1027 int32_t nblock = s->nblock;
/* Tiny blocks: fallback sort suffices and needs no quadrant cache. */
1032 if (nblock < 10000) {
1033 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1035 /* Calculate the location for quadrant, remembering to get
1036 * the alignment right. Assumes that &(block[0]) is at least
1037 * 2-byte aligned -- this should be ok since block is really
1038 * the first section of arr2.
1040 i = nblock + BZ_N_OVERSHOOT;
1042 quadrant = (uint16_t*)(&(block[i]));
1044 /* (wfact-1) / 3 puts the default-factor-30
1045 * transition point at very roughly the same place as
1046 * with v0.1 and v0.9.0.
1047 * Not that it particularly matters any more, since the
1048 * resulting compressed stream is now the same regardless
1049 * of whether or not we use the main sort or fallback sort.
1051 budget = nblock * ((wfact-1) / 3);
1053 mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
/* Main sort gave up (budget exhausted): redo with the fallback. */
1055 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1060 for (i = 0; i < s->nblock; i++)
/* Sanity: origPtr must have been set by the (not shown) scan above. */
1066 AssertH(s->origPtr != -1, 1003);
1070 /*-------------------------------------------------------------*/
1071 /*--- end blocksort.c ---*/
1072 /*-------------------------------------------------------------*/