From f7f70bf1b3025550ea4ad8d13d977b846a868a06 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 2 Feb 2015 16:07:07 +0100
Subject: [PATCH] gzip: speed up and shrink put_16bit()

function                                             old     new   delta
put_16bit                                            104      98      -6

Signed-off-by: Denys Vlasenko
---
 archival/gzip.c    | 41 ++++++++++++++++++++++++++++++++++-------
 include/platform.h |  2 ++
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/archival/gzip.c b/archival/gzip.c
index 46367f9e6..18d795996 100644
--- a/archival/gzip.c
+++ b/archival/gzip.c
@@ -417,19 +417,46 @@ static void flush_outbuf(void)
 #define put_8bit(c) \
 do { \
 	G1.outbuf[G1.outcnt++] = (c); \
-	if (G1.outcnt == OUTBUFSIZ) flush_outbuf(); \
+	if (G1.outcnt == OUTBUFSIZ) \
+		flush_outbuf(); \
 } while (0)
 
 /* Output a 16 bit value, lsb first */
 static void put_16bit(ush w)
 {
-	if (G1.outcnt < OUTBUFSIZ - 2) {
-		G1.outbuf[G1.outcnt++] = w;
-		G1.outbuf[G1.outcnt++] = w >> 8;
-	} else {
-		put_8bit(w);
-		put_8bit(w >> 8);
+	/* GCC 4.2.1 won't optimize out redundant loads of G1.outcnt
+	 * (probably because of fear of aliasing with G1.outbuf[]
+	 * stores), do it explicitly:
+	 */
+	unsigned outcnt = G1.outcnt;
+	uch *dst = &G1.outbuf[outcnt];
+
+#if BB_UNALIGNED_MEMACCESS_OK && BB_LITTLE_ENDIAN
+	if (outcnt < OUTBUFSIZ-2) {
+		/* Common case */
+		ush *dst16 = (void*) dst;
+		*dst16 = w; /* unaligned LSB 16-bit store */
+		G1.outcnt = outcnt + 2;
+		return;
+	}
+	*dst = (uch)w;
+	w >>= 8;
+#else
+	*dst++ = (uch)w;
+	w >>= 8;
+	if (outcnt < OUTBUFSIZ-2) {
+		/* Common case */
+		*dst = w;
+		G1.outcnt = outcnt + 2;
+		return;
 	}
+#endif
+
+	/* Slowpath: we will need to do flush_outbuf() */
+	G1.outcnt++;
+	if (G1.outcnt == OUTBUFSIZ)
+		flush_outbuf();
+	put_8bit(w);
 }
 
 static void put_32bit(ulg n)
diff --git a/include/platform.h b/include/platform.h
index 0b0fce182..df9594507 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -217,6 +217,7 @@ typedef uint64_t bb__aliased_uint64_t FIX_ALIASING;
  * a lvalue. This makes it more likely to not swap them by mistake
  */
 #if defined(i386) || defined(__x86_64__) || defined(__powerpc__)
+# define BB_UNALIGNED_MEMACCESS_OK 1
 # define move_from_unaligned_int(v, intp) ((v) = *(bb__aliased_int*)(intp))
 # define move_from_unaligned_long(v, longp) ((v) = *(bb__aliased_long*)(longp))
 # define move_from_unaligned16(v, u16p) ((v) = *(bb__aliased_uint16_t*)(u16p))
@@ -225,6 +226,7 @@ typedef uint64_t bb__aliased_uint64_t FIX_ALIASING;
 # define move_to_unaligned32(u32p, v) (*(bb__aliased_uint32_t*)(u32p) = (v))
 /* #elif ... - add your favorite arch today! */
 #else
+# define BB_UNALIGNED_MEMACCESS_OK 0
 /* performs reasonably well (gcc usually inlines memcpy here) */
 # define move_from_unaligned_int(v, intp) (memcpy(&(v), (intp), sizeof(int)))
 # define move_from_unaligned_long(v, longp) (memcpy(&(v), (longp), sizeof(long)))
-- 
2.25.1
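
As a companion to the diff above, here is a minimal standalone sketch of the
technique the patch applies: read the output counter into a local once so the
compiler cannot reload it around the buffer stores, emit both bytes with a
single little-endian 16-bit store in the common case, and fall back to
byte-at-a-time output with a flush when the buffer is nearly full. This is
illustrative code, not busybox source: the tiny OUTBUFSIZ, the
UNALIGNED_LE_OK switch, the printf()-based flush and the main() driver are
all assumptions made for the demo, and memcpy() stands in for the patch's
aliased-pointer store (compilers typically lower a fixed 2-byte memcpy to the
same single unaligned store).

/* sketch.c - standalone illustration; build with: gcc -O2 sketch.c */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define OUTBUFSIZ 16   /* tiny on purpose, to exercise the slow path */

/* Set to 1 only where unaligned stores are cheap AND the host is
 * little-endian (the patch keys this off
 * BB_UNALIGNED_MEMACCESS_OK && BB_LITTLE_ENDIAN). */
#define UNALIGNED_LE_OK 0

static unsigned char outbuf[OUTBUFSIZ];
static unsigned outcnt;

static void flush_outbuf(void)
{
	/* The real code write()s the buffer; printing is enough here. */
	printf("flushing %u bytes\n", outcnt);
	outcnt = 0;
}

static void put_8bit(unsigned char c)
{
	outbuf[outcnt++] = c;
	if (outcnt == OUTBUFSIZ)
		flush_outbuf();
}

/* Output a 16-bit value, least significant byte first. */
static void put_16bit(uint16_t w)
{
	/* One explicit load of the counter, as in the patch. */
	unsigned cnt = outcnt;
	unsigned char *dst = &outbuf[cnt];

#if UNALIGNED_LE_OK
	if (cnt < OUTBUFSIZ - 2) {
		/* Common case: both bytes fit, one 16-bit store. */
		memcpy(dst, &w, 2);
		outcnt = cnt + 2;
		return;
	}
	*dst = (unsigned char)w;
	w >>= 8;
#else
	*dst++ = (unsigned char)w;
	w >>= 8;
	if (cnt < OUTBUFSIZ - 2) {
		/* Common case: second byte also fits. */
		*dst = (unsigned char)w;
		outcnt = cnt + 2;
		return;
	}
#endif

	/* Slow path: account for the first byte, flush if the buffer
	 * is now full, then emit the remaining high byte. */
	outcnt++;
	if (outcnt == OUTBUFSIZ)
		flush_outbuf();
	put_8bit((unsigned char)w);
}

int main(void)
{
	unsigned i;
	for (i = 0; i < 20; i++)
		put_16bit((uint16_t)(0x0100 + i));
	flush_outbuf(); /* drain the remainder */
	return 0;
}

Guarding the fast path on endianness as well as alignment matters because a
raw 16-bit store only yields the LSB-first byte order the gzip format
requires on a little-endian host; big-endian targets take the portable
byte-wise branch.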