From 30a8652fbf16884490cee4a624f039a9ab587269 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 15 Jan 2013 01:12:26 +0100
Subject: [PATCH] sha3: make size/speed optimization decision configurable

Signed-off-by: Denys Vlasenko
---
 libbb/Config.src     | 10 ++++++
 libbb/hash_md5_sha.c | 77 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index ee1b66a45..19021fed1 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -28,6 +28,16 @@ config MD5_SMALL
 	2		3.0		5088
 	3 (smallest)	5.1		4912
 
+config SHA3_SMALL
+	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
+	default 1
+	range 0 1
+	help
+	  Trade binary size versus speed for the sha3sum algorithm.
+	  SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
+	  64-bit x86: +270 bytes of code, 45% faster
+	  32-bit x86: +450 bytes of code, 75% faster
+
 config FEATURE_FAST_TOP
 	bool "Faster /proc scanning code (+100 bytes)"
 	default y
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 06a2400b5..643cf205f 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -918,6 +918,16 @@ void FAST_FUNC sha512_end(sha512_ctx_t *ctx, void *resbuf)
  * Busybox modifications (C) Lauri Kasanen, under the GPLv2.
  */
 
+#if CONFIG_SHA3_SMALL < 0
+# define SHA3_SMALL 0
+#elif CONFIG_SHA3_SMALL > 1
+# define SHA3_SMALL 1
+#else
+# define SHA3_SMALL CONFIG_SHA3_SMALL
+#endif
+
+#define ARCH_IS_64BIT (sizeof(long) >= sizeof(uint64_t))
+
 enum {
 	cKeccakR_SizeInBytes = 576 / 8,
 	cKeccakNumberOfRounds = 24,
@@ -967,8 +977,6 @@ static const uint8_t KeccakF_Mod5[10] = {
 static void KeccakF(uint64_t *state)
 {
 	uint8_t x, y;
-	uint64_t temp;
-	uint64_t BC[5];
 	int round;
 
 	if (BB_BIG_ENDIAN) {
@@ -979,30 +987,61 @@ static void KeccakF(uint64_t *state)
 
 	for (round = 0; round < cKeccakNumberOfRounds; ++round) {
 		/* Theta */
-		for (x = 0; x < 5; ++x) {
-			BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
-				state[15 + x] ^ state[20 + x];
-		}
-		for (x = 0; x < 5; ++x) {
-			temp = BC[KeccakF_Mod5[x + 4]] ^
-				rotl64(BC[KeccakF_Mod5[x + 1]], 1);
-
-			for (y = 0; y <= 20; y += 5) {
-				state[y + x] ^= temp;
+		{
+			uint64_t BC[5];
+			for (x = 0; x < 5; ++x) {
+				BC[x] = state[x] ^ state[5 + x] ^ state[10 + x] ^
+					state[15 + x] ^ state[20 + x];
+			}
+			for (x = 0; x < 5; ++x) {
+				uint64_t temp = BC[KeccakF_Mod5[x + 4]] ^
+					rotl64(BC[KeccakF_Mod5[x + 1]], 1);
+				if (SHA3_SMALL && !ARCH_IS_64BIT) {
+					for (y = 0; y <= 20; y += 5)
+						state[y + x] ^= temp;
+				} else {
+					/* on 64-bit arch, this is actually smaller too */
+					state[0 + x] ^= temp;
+					state[5 + x] ^= temp;
+					state[10 + x] ^= temp;
+					state[15 + x] ^= temp;
+					state[20 + x] ^= temp;
+				}
 			}
 		}
 
 		/* Rho Pi */
-		temp = state[1];
-		for (x = 0; x < 24; ++x) {
-			BC[0] = state[KeccakF_PiLane[x]];
-			state[KeccakF_PiLane[x]] =
-				rotl64(temp, KeccakF_RotationConstants[x]);
-			temp = BC[0];
+		if (SHA3_SMALL) {
+			uint64_t t1 = state[1];
+			for (x = 0; x < 24; ++x) {
+				uint64_t t0 = state[KeccakF_PiLane[x]];
+				state[KeccakF_PiLane[x]] = rotl64(t1, KeccakF_RotationConstants[x]);
+				t1 = t0;
+			}
+		} else {
+			/* Especially large benefit for 32-bit arch:
+			 * 64-bit rotations by non-constant usually are SLOW on those.
+			 * We resort to unrolling here.
+			 * This optimizes out KeccakF_PiLane[] and KeccakF_RotationConstants[],
+			 * but generates 300-500 more bytes of code.
+			 */
+			uint64_t t0;
+			uint64_t t1 = state[1];
+#define RhoPi_twice(x) \
+	t0 = state[KeccakF_PiLane[x  ]]; state[KeccakF_PiLane[x  ]] = rotl64(t1, KeccakF_RotationConstants[x  ]); \
+	t1 = state[KeccakF_PiLane[x+1]]; state[KeccakF_PiLane[x+1]] = rotl64(t0, KeccakF_RotationConstants[x+1]);
+			RhoPi_twice(0); RhoPi_twice(2);
+			RhoPi_twice(4); RhoPi_twice(6);
+			RhoPi_twice(8); RhoPi_twice(10);
+			RhoPi_twice(12); RhoPi_twice(14);
+			RhoPi_twice(16); RhoPi_twice(18);
+			RhoPi_twice(20); RhoPi_twice(22);
+#undef RhoPi_twice
 		}
 
 		/* Chi */
-		for (y = 0; y < 25; y += 5) {
+		for (y = 0; y <= 20; y += 5) {
+			uint64_t BC[5];
 			BC[0] = state[y + 0];
 			BC[1] = state[y + 1];
 			BC[2] = state[y + 2];
-- 
2.25.1
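
Note (not part of the patch): below is a minimal standalone sketch that exercises the two Rho Pi code paths the patch introduces, the SHA3_SMALL=1 rolled loop and the SHA3_SMALL=0 unrolled macro, and checks that they transform the state identically. The PiLane[]/RotConst[] values are the standard Keccak-f[1600] pi lane order and rho rotation offsets; they are assumed to match the KeccakF_PiLane[] and KeccakF_RotationConstants[] tables already present in hash_md5_sha.c (which the patch does not show), and rotl64() is defined locally here.

/* rhopi_check.c - illustrative only, not busybox code */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t rotl64(uint64_t x, unsigned n)
{
	return (x << n) | (x >> (64 - n));
}

/* Standard Keccak-f[1600] pi lane order and rho rotation offsets
 * (assumed to match the tables in hash_md5_sha.c). */
static const uint8_t PiLane[24] = {
	10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4,
	15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1
};
static const uint8_t RotConst[24] = {
	1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14,
	27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44
};

/* SHA3_SMALL=1 style: compact loop, rotation count is a runtime value */
static void rho_pi_small(uint64_t state[25])
{
	unsigned x;
	uint64_t t1 = state[1];
	for (x = 0; x < 24; ++x) {
		uint64_t t0 = state[PiLane[x]];
		state[PiLane[x]] = rotl64(t1, RotConst[x]);
		t1 = t0;
	}
}

/* SHA3_SMALL=0 style: fully unrolled, so the compiler can fold each
 * table lookup and rotation count into constants */
static void rho_pi_fast(uint64_t state[25])
{
	uint64_t t0;
	uint64_t t1 = state[1];
#define RhoPi_twice(x) \
	t0 = state[PiLane[x  ]]; state[PiLane[x  ]] = rotl64(t1, RotConst[x  ]); \
	t1 = state[PiLane[x+1]]; state[PiLane[x+1]] = rotl64(t0, RotConst[x+1]);
	RhoPi_twice(0);  RhoPi_twice(2);
	RhoPi_twice(4);  RhoPi_twice(6);
	RhoPi_twice(8);  RhoPi_twice(10);
	RhoPi_twice(12); RhoPi_twice(14);
	RhoPi_twice(16); RhoPi_twice(18);
	RhoPi_twice(20); RhoPi_twice(22);
#undef RhoPi_twice
}

int main(void)
{
	uint64_t a[25], b[25];
	unsigned i;

	/* fill both copies of the state with the same arbitrary pattern */
	for (i = 0; i < 25; ++i)
		a[i] = b[i] = 0x0123456789abcdefULL * (i + 1);
	rho_pi_small(a);
	rho_pi_fast(b);
	printf("Rho Pi variants %s\n",
		memcmp(a, b, sizeof(a)) == 0 ? "match" : "DIFFER");
	return 0;
}

Compiling this with optimization (e.g. -O2) and comparing the generated code for the two functions gives a rough, compiler- and target-dependent feel for the size/speed tradeoff that the SHA3_SMALL option controls, especially on 32-bit targets where 64-bit rotates by a variable count are expensive.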