From 5c3598307ebbf5a88d1c39fbb2629536e443a5dd Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 21 Jul 2014 15:29:09 +0200 Subject: [PATCH] sha1-ppc.pl: shave off one cycle from BODY_20_39 and improve performance by 10% on POWER[78]. Reviewed-by: Kurt Roeckx --- crypto/sha/asm/sha1-ppc.pl | 16 ++++++++-------- crypto/sha/asm/sha512p8-ppc.pl | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl index 24a5d065d9..df5989610c 100755 --- a/crypto/sha/asm/sha1-ppc.pl +++ b/crypto/sha/asm/sha1-ppc.pl @@ -125,31 +125,31 @@ my ($i,$a,$b,$c,$d,$e,$f)=@_; my $j=$i+1; $code.=<<___ if ($i<79); add $f,$K,$e + xor $t0,$b,$d rotlwi $e,$a,5 xor @X[$j%16],@X[$j%16],@X[($j+2)%16] add $f,$f,@X[$i%16] - xor $t0,$b,$c + xor $t0,$t0,$c xor @X[$j%16],@X[$j%16],@X[($j+8)%16] - add $f,$f,$e + add $f,$f,$t0 rotlwi $b,$b,30 - xor $t0,$t0,$d xor @X[$j%16],@X[$j%16],@X[($j+13)%16] - add $f,$f,$t0 + add $f,$f,$e rotlwi @X[$j%16],@X[$j%16],1 ___ $code.=<<___ if ($i==79); add $f,$K,$e + xor $t0,$b,$d rotlwi $e,$a,5 lwz r16,0($ctx) add $f,$f,@X[$i%16] - xor $t0,$b,$c + xor $t0,$t0,$c lwz r17,4($ctx) - add $f,$f,$e + add $f,$f,$t0 rotlwi $b,$b,30 lwz r18,8($ctx) - xor $t0,$t0,$d lwz r19,12($ctx) - add $f,$f,$t0 + add $f,$f,$e lwz r20,16($ctx) ___ } diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl index cb0268c9d6..a316b31a4f 100755 --- a/crypto/sha/asm/sha512p8-ppc.pl +++ b/crypto/sha/asm/sha512p8-ppc.pl @@ -13,8 +13,8 @@ # always virtualized setup with possibly throttled processor. # Relative comparison is therefore more informative. This module is # ~60% faster than integer-only sha512-ppc.pl. To anchor to something -# else, SHA256 is 16% slower than sha1-ppc.pl and 2.5x slower than -# hardware-assisted aes-128-cbc encrypt. SHA512 is 33% faster than +# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than +# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting # result is degree of computational resources' utilization. POWER8 is # "massively multi-threaded chip" and difference between single- and -- 2.25.1