bzip2: work around bad compiler optimization
authorDenys Vlasenko <vda.linux@googlemail.com>
Sun, 4 Feb 2018 23:34:08 +0000 (00:34 +0100)
committerDenys Vlasenko <vda.linux@googlemail.com>
Sun, 4 Feb 2018 23:34:08 +0000 (00:34 +0100)
gc-6.1.1 x86_64:
function                                             old     new   delta
generateMTFValues                                    380     367     -13

gcc-4.3.1 386:
function                                             old     new   delta
inner_loop                                             -      41     +41
generateMTFValues                                    357     294     -63
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63)            Total: -22 bytes

gcc-6.3.0 386:
function                                             old     new   delta
inner_loop                                             -      36     +36
generateMTFValues                                    363     250    -113
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113)           Total: -77 bytes

The last case, gcc-6.3.0, runs almost 3 times faster after this change.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
archival/libarchive/bz/compress.c

index f6507675869b4fc3e04c73f669507c4ce46a6b52..462740b6c0253e721446c7764bd9a23644d0a5c0 100644 (file)
@@ -158,6 +158,38 @@ void makeMaps_e(EState* s)
 
 
 /*---------------------------------------------------*/
+/*
+ * This bit of code is performance-critical.
+ * On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack,
+ * resulting in abysmal performance (x3 slowdown).
+ * Forcing it into a separate function alleviates register pressure,
+ * and spillage no longer happens.
+ * Other versions of gcc do not exhibit this problem, but out-of-line code
+ * seems to be helping them too (code is both smaller and faster).
+ * Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
+ * without a check for gcc version.
+ */
+static
+#if defined __i386__
+NOINLINE
+#endif
+int inner_loop(uint8_t *yy, uint8_t ll_i)
+{
+       register uint8_t  rtmp;
+       register uint8_t* ryy_j;
+       rtmp  = yy[1];
+       yy[1] = yy[0];
+       ryy_j = &(yy[1]);
+       while (ll_i != rtmp) {
+               register uint8_t rtmp2;
+               ryy_j++;
+               rtmp2  = rtmp;
+               rtmp   = *ryy_j;
+               *ryy_j = rtmp2;
+       }
+       yy[0] = rtmp;
+       return ryy_j - &(yy[0]);
+}
 static NOINLINE
 void generateMTFValues(EState* s)
 {
@@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
        int i;
        int zPend;
        int32_t wr;
-       int32_t EOB;
 
        /*
         * After sorting (eg, here),
@@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
         * compressBlock().
         */
        uint32_t* ptr   = s->ptr;
-       uint8_t*  block = s->block;
-       uint16_t* mtfv  = s->mtfv;
 
        makeMaps_e(s);
-       EOB = s->nInUse+1;
 
        wr = 0;
        zPend = 0;
-       for (i = 0; i <= EOB; i++)
+       for (i = 0; i <= s->nInUse+1; i++)
                s->mtfFreq[i] = 0;
 
        for (i = 0; i < s->nInUse; i++)
@@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
                j = ptr[i] - 1;
                if (j < 0)
                        j += s->nblock;
-               ll_i = s->unseqToSeq[block[j]];
+               ll_i = s->unseqToSeq[s->block[j]];
                AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
 
                if (yy[0] == ll_i) {
@@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
                        while (1) {
 #if 0
                                if (zPend & 1) {
-                                       mtfv[wr] = BZ_RUNB; wr++;
+                                       s->mtfv[wr] = BZ_RUNB; wr++;
                                        s->mtfFreq[BZ_RUNB]++;
                                } else {
-                                       mtfv[wr] = BZ_RUNA; wr++;
+                                       s->mtfv[wr] = BZ_RUNA; wr++;
                                        s->mtfFreq[BZ_RUNA]++;
                                }
 #else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
                                unsigned run = zPend & 1;
-                               mtfv[wr] = run;
+                               s->mtfv[wr] = run;
                                wr++;
                                s->mtfFreq[run]++;
 #endif
@@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
                                goto end;
                        zPend = 0;
                }
-               {
-                       register uint8_t  rtmp;
-                       register uint8_t* ryy_j;
-                       register uint8_t  rll_i;
-                       rtmp  = yy[1];
-                       yy[1] = yy[0];
-                       ryy_j = &(yy[1]);
-                       rll_i = ll_i;
-                       while (rll_i != rtmp) {
-                               register uint8_t rtmp2;
-                               ryy_j++;
-                               rtmp2  = rtmp;
-                               rtmp   = *ryy_j;
-                               *ryy_j = rtmp2;
-                       }
-                       yy[0] = rtmp;
-                       j = ryy_j - &(yy[0]);
-                       mtfv[wr] = j+1;
-                       wr++;
-                       s->mtfFreq[j+1]++;
-               }
+               j = inner_loop(yy, ll_i);
+               s->mtfv[wr] = j+1;
+               wr++;
+               s->mtfFreq[j+1]++;
        }
 
        i = -1;
        if (zPend > 0)
                goto process_zPend; /* "process it and come back here" */
  end:
-       mtfv[wr] = EOB;
+       s->mtfv[wr] = s->nInUse+1;
        wr++;
-       s->mtfFreq[EOB]++;
+       s->mtfFreq[s->nInUse+1]++;
 
        s->nMTF = wr;
 }