sha/asm/keccak1600-ppc64.pl: up to 10% performance improvement.
author Andy Polyakov <appro@openssl.org>
Fri, 8 Mar 2019 13:40:56 +0000 (14:40 +0100)
committer Richard Levitte <levitte@openssl.org>
Mon, 11 Mar 2019 11:33:39 +0000 (12:33 +0100)
Reviewed-by: Matt Caswell <matt@openssl.org>
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/8444)

crypto/sha/asm/keccak1600-ppc64.pl

index 5c23841d6a97ca9c19ce5b9ced2fdceda7b91ffd..876632b1e7041d80a0ed728f338ce3b05c698c8b 100755 (executable)
 #
 #              r=1088(*)
 #
-# PPC970/G5    14.6/+120%
-# POWER7       10.3/+100%
-# POWER8       11.5/+85%
-# POWER9       9.4/+45%
+# PPC970/G5    14.0/+130%
+# POWER7       9.7/+110%
+# POWER8       10.6/+100%
+# POWER9       8.2/+66%
 #
 # (*)  Corresponds to SHA3-256. Percentage after slash is improvement
 #      over gcc-4.x-generated KECCAK_1X_ALT code. Newer compilers do
@@ -384,19 +384,19 @@ KeccakF1600:
 .type  dword_le_load,\@function
 .align 5
 dword_le_load:
-       lbzu    r0,1(r3)
-       lbzu    r4,1(r3)
-       lbzu    r5,1(r3)
+       lbz     r0,1(r3)
+       lbz     r4,2(r3)
+       lbz     r5,3(r3)
        insrdi  r0,r4,8,48
-       lbzu    r4,1(r3)
+       lbz     r4,4(r3)
        insrdi  r0,r5,8,40
-       lbzu    r5,1(r3)
+       lbz     r5,5(r3)
        insrdi  r0,r4,8,32
-       lbzu    r4,1(r3)
+       lbz     r4,6(r3)
        insrdi  r0,r5,8,24
-       lbzu    r5,1(r3)
+       lbz     r5,7(r3)
        insrdi  r0,r4,8,16
-       lbzu    r4,1(r3)
+       lbzu    r4,8(r3)
        insrdi  r0,r5,8,8
        insrdi  r0,r4,8,0
        blr
@@ -657,21 +657,21 @@ SHA3_squeeze:
        ${UCMP}i $len,8
        blt     .Lsqueeze_tail
 
-       stbu    r0,1($out)
+       stb     r0,1($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,2($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,3($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,4($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,5($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,6($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stb     r0,7($out)
        srdi    r0,r0,8
-       stbu    r0,1($out)
+       stbu    r0,8($out)
 
        subic.  $len,$len,8
        beq     .Lsqueeze_done