2 * bzip2 is written by Julian Seward <jseward@bzip.org>.
3 * Adapted for busybox by Denys Vlasenko <vda.linux@googlemail.com>.
4 * See README and LICENSE files in this directory for more information.
7 /*-------------------------------------------------------------*/
8 /*--- Block sorting machinery ---*/
9 /*--- blocksort.c ---*/
10 /*-------------------------------------------------------------*/
12 /* ------------------------------------------------------------------
13 This file is part of bzip2/libbzip2, a program and library for
14 lossless, block-sorting data compression.
16 bzip2/libbzip2 version 1.0.4 of 20 December 2006
17 Copyright (C) 1996-2006 Julian Seward <jseward@bzip.org>
19 Please read the WARNING, DISCLAIMER and PATENTS sections in the
22 This program is released under the terms of the license contained
24 ------------------------------------------------------------------ */
26 /* #include "bzlib_private.h" */
/* Swap two int32_t lvalues in place via a temporary.
 * NOTE(review): the macro body is truncated in this view — only the
 * temporary's declaration is visible; the two assignments completing
 * the swap (and the statement-block close) are missing. */
28 #define mswap(zz1, zz2) \
30 int32_t zztmp = zz1; \
36 /* No measurable speed gain with inlining */
/* Swap two zzn-element regions of ptr[], starting at indices zzp1 and
 * zzp2, one element at a time using mswap.
 * NOTE(review): the loop driving the per-element swap (advancing
 * zzp1/zzp2 and decrementing zzn) is missing from this view. */
38 void mvswap(uint32_t* ptr, int32_t zzp1, int32_t zzp2, int32_t zzn)
41 mswap(ptr[zzp1], ptr[zzp2]);
/* Return the smaller of two int32_t values. */
50 int32_t mmin(int32_t a, int32_t b)
52 return (a < b) ? a : b;
56 /*---------------------------------------------*/
57 /*--- Fallback O(N log(N)^2) sorting ---*/
58 /*--- algorithm, for repetitive blocks ---*/
59 /*---------------------------------------------*/
61 /*---------------------------------------------*/
/* Shell-style sort of fmap[lo..hi], ordered by eclass[fmap[.]], using
 * gap 4 and then gap 1.  Used by fallbackQSort3 for ranges below its
 * small-range threshold.
 * NOTE(review): the declarations, the loads of ec_tmp, and the final
 * stores back into fmap[j] are missing from this view. */
64 void fallbackSimpleSort(uint32_t* fmap,
/* Gap-4 insertion pass. */
75 for (i = hi-4; i >= lo; i--) {
78 for (j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4)
/* Final gap-1 pass: an ordinary insertion sort completes the job. */
84 for (i = hi-1; i >= lo; i--) {
87 for (j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++)
94 /*---------------------------------------------*/
/* fpush/fpop: push and pop a [lo,hi] work range on the explicit stack
 * used by the iterative fallback quicksort.  Ranges shorter than
 * FALLBACK_QSORT_SMALL_THRESH are handed to fallbackSimpleSort instead.
 * NOTE(review): both macro bodies are truncated in this view. */
95 #define fpush(lz,hz) { \
101 #define fpop(lz,hz) { \
107 #define FALLBACK_QSORT_SMALL_THRESH 10
108 #define FALLBACK_QSORT_STACK_SIZE 100
/* Iterative 3-way-partitioning quicksort of fmap[loSt..hiSt], keyed by
 * eclass[fmap[.]].  Uses an explicit stack (stackLo/stackHi) instead of
 * recursion; small ranges go to fallbackSimpleSort.
 * NOTE(review): many lines (the stack push/pop sites, partition loop
 * heads, closing braces) are missing from this view — the comments
 * below describe only what is shown. */
111 void fallbackQSort3(uint32_t* fmap,
116 int32_t unLo, unHi, ltLo, gtHi, n, m;
119 int32_t stackLo[FALLBACK_QSORT_STACK_SIZE];
120 int32_t stackHi[FALLBACK_QSORT_STACK_SIZE];
/* Guard against overflowing the explicit recursion stack. */
128 AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004);
/* Small ranges: the simple insertion-style sort wins here. */
131 if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
132 fallbackSimpleSort(fmap, eclass, lo, hi);
136 /* Random partitioning. Median of 3 sometimes fails to
137 * avoid bad cases. Median of 9 seems to help but
138 * looks rather expensive. This too seems to work but
139 * is cheaper. Guidance for the magic constants
140 * 7621 and 32768 is taken from Sedgewick's algorithms
143 r = ((r * 7621) + 1) % 32768;
146 med = eclass[fmap[lo]];
148 med = eclass[fmap[(lo+hi)>>1]];
150 med = eclass[fmap[hi]];
/* Upward scan: keys equal to the pivot collect in the low band (ltLo). */
157 if (unLo > unHi) break;
158 n = (int32_t)eclass[fmap[unLo]] - (int32_t)med;
160 mswap(fmap[unLo], fmap[ltLo]);
/* Symmetric downward scan: pivot-equal keys collect at gtHi. */
169 if (unLo > unHi) break;
170 n = (int32_t)eclass[fmap[unHi]] - (int32_t)med;
172 mswap(fmap[unHi], fmap[gtHi]);
179 if (unLo > unHi) break;
180 mswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
183 AssertD(unHi == unLo-1, "fallbackQSort3(2)");
/* Everything equal to the pivot: nothing left to refine here. */
185 if (gtHi < ltLo) continue;
/* Swap the pivot-equal bands from the ends into the middle. */
187 n = mmin(ltLo-lo, unLo-ltLo); mvswap(fmap, lo, unLo-n, n);
188 m = mmin(hi-gtHi, gtHi-unHi); mvswap(fmap, unLo, hi-m+1, m);
/* [lo..n] and [m..hi] are the two sub-ranges still to be sorted. */
190 n = lo + unLo - ltLo - 1;
191 m = hi - (gtHi - unHi) + 1;
/* Handle the larger sub-range via the stack, keep the smaller one —
 * bounds the stack depth.  NOTE(review): the fpush calls themselves
 * are missing from this view. */
193 if (n - lo > hi - m) {
205 #undef FALLBACK_QSORT_SMALL_THRESH
206 #undef FALLBACK_QSORT_STACK_SIZE
209 /*---------------------------------------------*/
212 * eclass exists for [0 .. nblock-1]
213 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
214 * ptr exists for [0 .. nblock-1]
217 * ((uint8_t*)eclass) [0 .. nblock-1] holds block
218 * All other areas of eclass destroyed
219 * fmap [0 .. nblock-1] holds sorted order
220 * bhtab[0 .. 2+(nblock/32)] destroyed
/* Bit-vector helpers over bhtab (the bucket-header bitmap used by
 * fallbackSort): bit zz marks the start of a bucket.  bhtab must be a
 * uint32_t array in scope at the expansion site.
 *
 * Fix: the shifted constant is 1U, not 1.  When (zz & 31) == 31,
 * `1 << 31` left-shifts a signed int into its sign bit, which is
 * undefined behavior in C (C11 6.5.7p4); the unsigned literal makes
 * the shift well-defined and yields the intended bit pattern. */
#define SET_BH(zz)       bhtab[(zz) >> 5] |= (1U << ((zz) & 31))
#define CLEAR_BH(zz)     bhtab[(zz) >> 5] &= ~(1U << ((zz) & 31))
#define ISSET_BH(zz)     (bhtab[(zz) >> 5] & (1U << ((zz) & 31)))
#define WORD_BH(zz)      bhtab[(zz) >> 5]
#define UNALIGNED_BH(zz) ((zz) & 0x01f)
/* Fallback O(N (log N)^2) sort of the block, used for highly
 * repetitive data and for small blocks (see BZ2_blockSort).  Performs
 * an initial 1-byte radix sort, then inductively refines the buckets,
 * tracking bucket boundaries in the bhtab bitmap.  See the contract
 * comment above for pre/post conditions.
 * NOTE(review): many lines of this function (declarations, loop
 * bodies, closing braces) are missing from this view — comments below
 * describe only what is shown. */
230 void fallbackSort(uint32_t* fmap,
236 int32_t ftabCopy[256];
237 int32_t H, i, j, k, l, r, cc, cc1;
/* Byte view of eclass: per the contract above, it holds the block. */
240 uint8_t* eclass8 = (uint8_t*)eclass;
243 * Initial 1-char radix sort to generate
244 * initial fmap and initial BH bits.
246 for (i = 0; i < 257; i++) ftab[i] = 0;
247 for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
/* Keep the per-byte counts for the block reconstruction at the end. */
248 for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i];
/* Convert counts into cumulative bucket start offsets. */
250 j = ftab[0]; /* bbox: optimized */
251 for (i = 1; i < 257; i++) {
256 for (i = 0; i < nblock; i++) {
/* Bucket-header bitmap: one bit per block position, rounded up. */
263 nBhtab = 2 + ((uint32_t)nblock / 32); /* bbox: unsigned div is easier */
264 for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
/* Mark the start of each initial 1-byte bucket. */
265 for (i = 0; i < 256; i++) SET_BH(ftab[i]);
268 * Inductively refine the buckets. Kind-of an
269 * "exponential radix sort" (!), inspired by the
270 * Manber-Myers suffix array construction algorithm.
273 /*-- set sentinel bits for block-end detection --*/
274 for (i = 0; i < 32; i++) {
275 SET_BH(nblock + 2*i);
276 CLEAR_BH(nblock + 2*i + 1);
279 /*-- the log(N) loop --*/
283 for (i = 0; i < nblock; i++) {
296 /*-- find the next non-singleton bucket --*/
/* Advance bit-by-bit to a word boundary, then a whole word at a time. */
298 while (ISSET_BH(k) && UNALIGNED_BH(k))
301 while (WORD_BH(k) == 0xffffffff) k += 32;
302 while (ISSET_BH(k)) k++;
307 while (!ISSET_BH(k) && UNALIGNED_BH(k))
310 while (WORD_BH(k) == 0x00000000) k += 32;
311 while (!ISSET_BH(k)) k++;
317 /*-- now [l, r] bracket current bucket --*/
319 nNotDone += (r - l + 1);
320 fallbackQSort3(fmap, eclass, l, r);
322 /*-- scan bucket and generate header bits --*/
324 for (i = l; i <= r; i++) {
325 cc1 = eclass[fmap[i]];
/* Stop once depth H exceeds the block or no bucket needed work. */
335 if (H > nblock || nNotDone == 0)
340 * Reconstruct the original block in
341 * eclass8 [0 .. nblock-1], since the
342 * previous phase destroyed it.
345 for (i = 0; i < nblock; i++) {
346 while (ftabCopy[j] == 0)
349 eclass8[fmap[i]] = (uint8_t)j;
351 AssertH(j < 256, 1005);
361 /*---------------------------------------------*/
362 /*--- The main, O(N^2 log(N)) sorting ---*/
363 /*--- algorithm. Faster for "normal" ---*/
364 /*--- non-repetitive blocks. ---*/
365 /*---------------------------------------------*/
367 /*---------------------------------------------*/
/* TIMES_8/TIMES_12: textually repeat a statement 8 or 12 times when
 * CONFIG_BZIP2_FAST >= 1 (manual unrolling for the hot comparison
 * ladder in mainGtU).  NOTE(review): the #else (non-unrolled) branch
 * is truncated in this view — only its #define heads are visible. */
382 /* Loop unrolling here is actually very useful
383 * (generated code is much simpler),
384 * code size increase is only 270 bytes (i386)
385 * but speeds up compression 10% overall
388 #if CONFIG_BZIP2_FAST >= 1
390 #define TIMES_8(code) \
391 code; code; code; code; \
392 code; code; code; code;
393 #define TIMES_12(code) \
394 code; code; code; code; \
395 code; code; code; code; \
396 code; code; code; code;
400 #define TIMES_8(code) \
407 #define TIMES_12(code) \
/* Interior of mainGtU: compares the two block rotations starting at
 * i1 and i2, byte-first and then via the quadrant[] cached ordering;
 * returns as soon as a differing byte or quadrant value decides the
 * comparison (the `c1 > c2` form suggests a greater-than result —
 * TODO confirm against the missing signature).
 * NOTE(review): only a fragment is visible — the signature, most of
 * the unrolled comparison ladder, and the budget/loop logic are
 * missing from this view. */
417 AssertD(i1 != i2, "mainGtU");
419 c1 = block[i1]; c2 = block[i2];
420 if (c1 != c2) return (c1 > c2);
428 c1 = block[i1]; c2 = block[i2];
429 if (c1 != c2) return (c1 > c2);
430 s1 = quadrant[i1]; s2 = quadrant[i2];
431 if (s1 != s2) return (s1 > s2);
/* Indices wrap around the end of the block (rotations). */
435 if (i1 >= nblock) i1 -= nblock;
436 if (i2 >= nblock) i2 -= nblock;
447 /*---------------------------------------------*/
/* Shell-sort gap sequence (h' = 3h + 1) used by mainSimpleSort.
 * NOTE(review): declared with 14 elements but only 12 initializers are
 * visible — the final entries are missing from this view. */
449 * Knuth's increments seem to work better
450 * than Incerpi-Sedgewick here. Possibly
451 * because the number of elems to sort is
452 * usually small, typically <= 20.
455 const int32_t incs[14] = {
456 1, 4, 13, 40, 121, 364, 1093, 3280,
457 9841, 29524, 88573, 265720,
/* Shell sort of ptr[lo..hi], comparing rotations at byte depth d via
 * mainGtU.  Used by mainQSort3 for small or too-deep ranges; bails out
 * early once *budget goes negative (the caller then abandons the main
 * sort in favour of fallbackSort).
 * NOTE(review): large parts of the body (the value loads, insertion
 * stores, and loop closings) are missing from this view. */
462 void mainSimpleSort(uint32_t* ptr,
471 int32_t i, j, h, bigN, hp;
475 if (bigN < 2) return;
/* Pick the largest gap from incs[] that fits the range size. */
478 while (incs[hp] < bigN) hp++;
481 for (; hp >= 0; hp--) {
490 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
493 if (j <= (lo + h - 1)) break;
498 /* 1.5% overall speedup, +290 bytes */
499 #if CONFIG_BZIP2_FAST >= 3
/* Two manually duplicated copies of the insertion step (unrolling). */
504 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
507 if (j <= (lo + h - 1)) break;
516 while (mainGtU(ptr[j-h]+d, v+d, block, quadrant, nblock, budget)) {
519 if (j <= (lo + h - 1)) break;
/* Abandon the sort once the effort budget has been exhausted. */
524 if (*budget < 0) return;
530 /*---------------------------------------------*/
532 * The following is an implementation of
533 * an elegant 3-way quicksort for strings,
534 * described in a paper "Fast Algorithms for
535 * Sorting and Searching Strings", by Robert
536 * Sedgewick and Jon L. Bentley.
/* Median of three bytes — pivot selection helper for mainQSort3.
 * NOTE(review): the function body is missing from this view. */
541 uint8_t mmed3(uint8_t a, uint8_t b, uint8_t c)
/* Explicit-stack helpers for mainQSort3: mpush/mpop save and restore a
 * (lo, hi, depth) triple on stackLo/stackHi/stackD; mnextsize measures
 * a pending sub-range and mnextswap exchanges two pending sub-range
 * descriptors (used to order the three partitions by size).
 * MAIN_QSORT_SMALL_THRESH / MAIN_QSORT_DEPTH_THRESH are the cutoffs
 * below/beyond which mainSimpleSort is used instead.
 * NOTE(review): the mpush/mpop bodies are truncated in this view. */
558 #define mpush(lz,hz,dz) \
566 #define mpop(lz,hz,dz) \
574 #define mnextsize(az) (nextHi[az] - nextLo[az])
576 #define mnextswap(az,bz) \
579 tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
580 tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
581 tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; \
584 #define MAIN_QSORT_SMALL_THRESH 20
585 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
586 #define MAIN_QSORT_STACK_SIZE 100
/* Iterative 3-way ("fat pivot") quicksort of ptr[loSt..hiSt], comparing
 * rotations at byte depth d — the Bentley/Sedgewick scheme referenced
 * in the comment above.  Small or deep ranges are delegated to
 * mainSimpleSort, which also enforces the work budget.
 * NOTE(review): numerous lines (loop heads, mpop, closing braces) are
 * missing from this view — comments below describe only what is shown. */
589 void mainQSort3(uint32_t* ptr,
598 int32_t unLo, unHi, ltLo, gtHi, n, m, med;
599 int32_t sp, lo, hi, d;
601 int32_t stackLo[MAIN_QSORT_STACK_SIZE];
602 int32_t stackHi[MAIN_QSORT_STACK_SIZE];
603 int32_t stackD [MAIN_QSORT_STACK_SIZE];
610 mpush(loSt, hiSt, dSt);
/* Each iteration may push up to three sub-ranges: keep headroom. */
613 AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001);
616 if (hi - lo < MAIN_QSORT_SMALL_THRESH
617 || d > MAIN_QSORT_DEPTH_THRESH
619 mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget);
/* Median-of-3 pivot taken from the bytes at depth d. */
624 med = (int32_t) mmed3(block[ptr[lo ] + d],
626 block[ptr[(lo+hi) >> 1] + d]);
/* Upward partition scan: pivot-equal keys collect at the low end. */
635 n = ((int32_t)block[ptr[unLo]+d]) - med;
637 mswap(ptr[unLo], ptr[ltLo]);
/* Downward partition scan: pivot-equal keys collect at the high end. */
648 n = ((int32_t)block[ptr[unHi]+d]) - med;
650 mswap(ptr[unHi], ptr[gtHi]);
660 mswap(ptr[unLo], ptr[unHi]);
665 AssertD(unHi == unLo-1, "mainQSort3(2)");
/* Entire range equal to the pivot: re-queue it one byte deeper. */
668 mpush(lo, hi, d + 1);
/* Swap the pivot-equal end bands into the middle of the range. */
672 n = mmin(ltLo-lo, unLo-ltLo); mvswap(ptr, lo, unLo-n, n);
673 m = mmin(hi-gtHi, gtHi-unHi); mvswap(ptr, unLo, hi-m+1, m);
675 n = lo + unLo - ltLo - 1;
676 m = hi - (gtHi - unHi) + 1;
/* Three pending jobs: below-pivot, above-pivot, equal band (deeper). */
678 nextLo[0] = lo; nextHi[0] = n; nextD[0] = d;
679 nextLo[1] = m; nextHi[1] = hi; nextD[1] = d;
680 nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
/* Order descending by size so the smallest range is popped first. */
682 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
683 if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2);
684 if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1);
686 AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)");
687 AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)");
689 mpush(nextLo[0], nextHi[0], nextD[0]);
690 mpush(nextLo[1], nextHi[1], nextD[1]);
691 mpush(nextLo[2], nextHi[2], nextD[2]);
699 #undef MAIN_QSORT_SMALL_THRESH
700 #undef MAIN_QSORT_DEPTH_THRESH
701 #undef MAIN_QSORT_STACK_SIZE
704 /*---------------------------------------------*/
706 * nblock > N_OVERSHOOT
707 * block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
708 * ((uint8_t*)block32) [0 .. nblock-1] holds block
709 * ptr exists for [0 .. nblock-1]
712 * ((uint8_t*)block32) [0 .. nblock-1] holds block
713 * All other areas of block32 destroyed
714 * ftab[0 .. 65536] destroyed
715 * ptr [0 .. nblock-1] holds sorted order
716 * if (*budget < 0), sorting was abandoned
/* BIGFREQ(b): number of entries in big (first-byte) bucket b, computed
 * from the cumulative ftab offsets.  SETMASK is a flag bit OR'd into
 * ftab entries to mark a small bucket as already sorted (see its use
 * in mainSort); CLEARMASK strips the flag to recover the offset. */
719 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
720 #define SETMASK (1 << 21)
721 #define CLEARMASK (~(SETMASK))
/* The main sorting routine: a 2-byte radix sort into 65536 small
 * buckets, followed by quicksorting of unfinished buckets (smallest
 * big bucket first) plus pointer-scanning passes that synthesise the
 * order of the remaining buckets.  Returns early with *budget < 0 to
 * signal that the caller should use fallbackSort instead.  See the
 * contract comment above for pre/post conditions.
 * NOTE(review): many lines of this function are missing from this
 * view — comments below describe only what is shown. */
724 void mainSort(EState* state,
732 int32_t i, j, k, ss, sb;
737 /* bbox: moved to EState to save stack
738 int32_t runningOrder[256];
739 int32_t copyStart[256];
740 int32_t copyEnd [256];
742 #define runningOrder (state->mainSort__runningOrder)
743 #define copyStart (state->mainSort__copyStart)
744 #define copyEnd (state->mainSort__copyEnd)
746 /*-- set up the 2-byte frequency table --*/
747 /* was: for (i = 65536; i >= 0; i--) ftab[i] = 0; */
748 memset(ftab, 0, 65537 * sizeof(ftab[0]));
/* Count 2-byte prefixes; unrolled 4-at-a-time when FAST >= 2. */
753 #if CONFIG_BZIP2_FAST >= 2
754 for (; i >= 3; i -= 4) {
756 j = (j >> 8) | (((uint16_t)block[i]) << 8);
759 j = (j >> 8) | (((uint16_t)block[i-1]) << 8);
762 j = (j >> 8) | (((uint16_t)block[i-2]) << 8);
765 j = (j >> 8) | (((uint16_t)block[i-3]) << 8);
769 for (; i >= 0; i--) {
771 j = (j >> 8) | (((uint16_t)block[i]) << 8);
775 /*-- (emphasises close relationship of block & quadrant) --*/
/* Replicate the first BZ_N_OVERSHOOT bytes past the end of the block
 * so rotation comparisons can run without wrapping checks. */
776 for (i = 0; i < BZ_N_OVERSHOOT; i++) {
777 block [nblock+i] = block[i];
778 quadrant[nblock+i] = 0;
781 /*-- Complete the initial radix sort --*/
/* Counts -> cumulative start offsets for all 65536 small buckets. */
782 j = ftab[0]; /* bbox: optimized */
783 for (i = 1; i <= 65536; i++) {
/* Drop each position into its small bucket (unrolled variant first). */
790 #if CONFIG_BZIP2_FAST >= 2
791 for (; i >= 3; i -= 4) {
792 s = (s >> 8) | (block[i] << 8);
796 s = (s >> 8) | (block[i-1] << 8);
800 s = (s >> 8) | (block[i-2] << 8);
804 s = (s >> 8) | (block[i-3] << 8);
810 for (; i >= 0; i--) {
811 s = (s >> 8) | (block[i] << 8);
818 * Now ftab contains the first loc of every small bucket.
819 * Calculate the running order, from smallest to largest
822 for (i = 0; i <= 255; i++) {
829 /* bbox: was: int32_t h = 1; */
830 /* do h = 3 * h + 1; while (h <= 256); */
/* Shell sort runningOrder[] ascending by big-bucket size. */
835 h = (h * 171) >> 9; /* bbox: fast h/3 */
836 for (i = h; i <= 255; i++) {
837 vv = runningOrder[i];
839 while (BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv)) {
840 runningOrder[j] = runningOrder[j-h];
846 runningOrder[j] = vv;
852 * The main sorting loop.
857 for (i = 0; i <= 255; i++) {
860 * Process big buckets, starting with the least full.
861 * Basically this is a 3-step process in which we call
862 * mainQSort3 to sort the small buckets [ss, j], but
863 * also make a big effort to avoid the calls if we can.
865 ss = runningOrder[i];
869 * Complete the big bucket [ss] by quicksorting
870 * any unsorted small buckets [ss, j], for j != ss.
871 * Hopefully previous pointer-scanning phases have already
872 * completed many of the small buckets [ss, j], so
873 * we don't have to sort them at all.
875 for (j = 0; j <= 255; j++) {
878 if (!(ftab[sb] & SETMASK)) {
879 int32_t lo = ftab[sb] & CLEARMASK;
880 int32_t hi = (ftab[sb+1] & CLEARMASK) - 1;
883 ptr, block, quadrant, nblock,
884 lo, hi, BZ_N_RADIX, budget
/* Quicksort ran out of budget: abandon; caller falls back. */
886 if (*budget < 0) return;
887 numQSorted += (hi - lo + 1);
894 AssertH(!bigDone[ss], 1006);
898 * Now scan this big bucket [ss] so as to synthesise the
899 * sorted order for small buckets [t, ss] for all t,
900 * including, magically, the bucket [ss,ss] too.
901 * This will avoid doing Real Work in subsequent Step 1's.
904 for (j = 0; j <= 255; j++) {
905 copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
906 copyEnd [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
908 for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
914 ptr[copyStart[c1]++] = k;
916 for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
922 ptr[copyEnd[c1]--] = k;
926 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
927 * Necessity for this case is demonstrated by compressing
928 * a sequence of approximately 48.5 million of character
929 * 251; 1.0.0/1.0.1 will then die here. */
930 AssertH((copyStart[ss]-1 == copyEnd[ss]) \
931 || (copyStart[ss] == 0 && copyEnd[ss] == nblock-1), 1007);
/* Mark every small bucket [t, ss] as fully sorted. */
933 for (j = 0; j <= 255; j++)
934 ftab[(j << 8) + ss] |= SETMASK;
938 * The [ss] big bucket is now done. Record this fact,
939 * and update the quadrant descriptors. Remember to
940 * update quadrants in the overshoot area too, if
941 * necessary. The "if (i < 255)" test merely skips
942 * this updating for the last bucket processed, since
943 * updating for the last bucket is pointless.
945 * The quadrant array provides a way to incrementally
946 * cache sort orderings, as they appear, so as to
947 * make subsequent comparisons in fullGtU() complete
948 * faster. For repetitive blocks this makes a big
949 * difference (but not big enough to be able to avoid
950 * the fallback sorting mechanism, exponential radix sort).
952 * The precise meaning is: at all times:
954 * for 0 <= i < nblock and 0 <= j <= nblock
956 * if block[i] != block[j],
958 * then the relative values of quadrant[i] and
959 * quadrant[j] are meaningless.
962 * if quadrant[i] < quadrant[j]
963 * then the string starting at i lexicographically
964 * precedes the string starting at j
966 * else if quadrant[i] > quadrant[j]
967 * then the string starting at j lexicographically
968 * precedes the string starting at i
971 * the relative ordering of the strings starting
972 * at i and j has not yet been determined.
978 int32_t bbStart = ftab[ss << 8] & CLEARMASK;
979 int32_t bbSize = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
/* Scale positions so quadrant values always fit in 16 bits. */
982 while ((bbSize >> shifts) > 65534) shifts++;
984 for (j = bbSize-1; j >= 0; j--) {
985 int32_t a2update = ptr[bbStart + j];
986 uint16_t qVal = (uint16_t)(j >> shifts);
987 quadrant[a2update] = qVal;
/* Keep the overshoot copy of the quadrant area in sync. */
988 if (a2update < BZ_N_OVERSHOOT)
989 quadrant[a2update + nblock] = qVal;
991 AssertH(((bbSize-1) >> shifts) <= 65535, 1002);
1004 /*---------------------------------------------*/
1007 * arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1008 * ((uint8_t*)arr2)[0 .. nblock-1] holds block
1009 * arr1 exists for [0 .. nblock-1]
1012 * ((uint8_t*)arr2) [0 .. nblock-1] holds block
1013 * All other areas of block destroyed
1014 * ftab[0 .. 65536] destroyed
1015 * arr1[0 .. nblock-1] holds sorted order
/* Entry point: sorts all rotations of the current block.  Tiny blocks
 * (< 10000 bytes) and main-sort budget exhaustion both route to
 * fallbackSort; otherwise mainSort runs with the 16-bit quadrant cache
 * placed directly after the block data inside arr2.  See the contract
 * comment above for pre/post conditions.
 * NOTE(review): several lines (remaining locals, the post-sort origPtr
 * scan, closing braces) are missing from this view. */
1018 void BZ2_blockSort(EState* s)
1020 /* In original bzip2 1.0.4, it's a parameter, but 30
1021 * (which was the default) should work ok. */
1022 enum { wfact = 30 };
1024 uint32_t* ptr = s->ptr;
1025 uint8_t* block = s->block;
1026 uint32_t* ftab = s->ftab;
1027 int32_t nblock = s->nblock;
/* Tiny blocks: fallback sort suffices and needs no quadrant cache. */
1032 if (nblock < 10000) {
1033 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1035 /* Calculate the location for quadrant, remembering to get
1036 * the alignment right. Assumes that &(block[0]) is at least
1037 * 2-byte aligned -- this should be ok since block is really
1038 * the first section of arr2.
1040 i = nblock + BZ_N_OVERSHOOT;
1042 quadrant = (uint16_t*)(&(block[i]));
1044 /* (wfact-1) / 3 puts the default-factor-30
1045 * transition point at very roughly the same place as
1046 * with v0.1 and v0.9.0.
1047 * Not that it particularly matters any more, since the
1048 * resulting compressed stream is now the same regardless
1049 * of whether or not we use the main sort or fallback sort.
1051 budget = nblock * ((wfact-1) / 3);
1053 mainSort(s, ptr, block, quadrant, ftab, nblock, &budget);
/* Main sort gave up (budget exhausted): redo with the fallback. */
1055 fallbackSort(s->arr1, s->arr2, ftab, nblock);
1060 for (i = 0; i < s->nblock; i++)
/* Sanity: origPtr must have been set by the (not shown) scan above. */
1066 AssertH(s->origPtr != -1, 1003);
1070 /*-------------------------------------------------------------*/
1071 /*--- end blocksort.c ---*/
1072 /*-------------------------------------------------------------*/