From c4d9ef4cc5bf1c48a74b64879622ae9fd6f26b03 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Sat, 2 Jun 2018 15:25:50 +0200
Subject: [PATCH] sha/asm/sha512p8-ppc.pl: improve POWER9 performance by ~10%.

The biggest part of the improvement, ~7%, resulted from omitting the
constants' table index increment in each round; a minor part came from
rescheduling instructions. Apparently POWER9 (and POWER8) manage to
dispatch instructions more efficiently if they are laid out as if they
have no latency...

Reviewed-by: Rich Salz
(Merged from https://github.com/openssl/openssl/pull/6406)
---
 crypto/sha/asm/sha512p8-ppc.pl | 122 +++++++++++++++------------------
 1 file changed, 55 insertions(+), 67 deletions(-)

diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index 7a8d4358f0..e3f522cb7c 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -37,8 +37,8 @@
 # build of sha512-ppc.pl, presented for reference.
 #
 #		POWER8		POWER9
-# SHA256	9.9 [15.8]	12.2 [12.5]
-# SHA512	6.3 [10.3]	7.7 [7.9]
+# SHA256	9.7 [15.8]	11.2 [12.5]
+# SHA512	6.1 [10.3]	7.0 [7.9]
 
 $flavour=shift;
 $output =shift;
@@ -79,7 +79,8 @@ if ($output =~ /512/) {
 }
 
 $func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -91,16 +92,17 @@ $idx="r7";
 $lrsave="r8";
 $offload="r11";
 $vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+ $x00=0 if ($flavour =~ /osx/);
 
 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
 
 sub ROUND {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 my $j=($i+1)%16;
+my $k=($i+2)%8;
 
 $code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
 	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
@@ -112,26 +114,30 @@ ___
 $code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
 	vperm		@X[$i],@X[$i],@X[$i],$lemask
 ___
+$code.=<<___ if ($i>=15);
+	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
+	vaddu${sz}m	@X[$j],@X[$j],$Sigma
+	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
+	vaddu${sz}m	@X[$j],@X[$j],$Sigma
+	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
+___
 $code.=<<___;
-	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
-	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
-	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
 	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
-	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
-	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
+	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
+	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
 	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
+	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
+	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
 	vxor		$Func,$a,$b
-	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
-	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
 	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
-	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
 	vaddu${sz}m	$d,$d,$h		; d+=h
-	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
-	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
-	lvx		$Ki,$idx,$Tbl		; load next K[i]
-	addi		$idx,$idx,16
-	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
-	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
+	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
+	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
+	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
+	lvx		$Ki,@I[$k],$idx		; load next K[i]
+___
+$code.=<<___ if ($k == 7);
+	addi		$idx,$idx,0x80
 ___
 }
 
@@ -142,21 +148,13 @@ $code=<<___;
 .globl	$func
 .align	6
 $func:
-	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	$STU		$sp,-$FRAME($sp)
 	mflr		$lrsave
-	li		r10,`$FRAME+8*16+15`
-	li		r11,`$FRAME+8*16+31`
-	stvx		v20,r10,$sp		# ABI says so
+	li		r10,`$LOCALS+15`
+	li		r11,`$LOCALS+31`
+	stvx		v24,r10,$sp		# ABI says so
 	addi		r10,r10,32
 	mfspr		$vrsave,256
-	stvx		v21,r11,$sp
-	addi		r11,r11,32
-	stvx		v22,r10,$sp
-	addi		r10,r10,32
-	stvx		v23,r11,$sp
-	addi		r11,r11,32
-	stvx		v24,r10,$sp
-	addi		r10,r10,32
 	stvx		v25,r11,$sp
 	addi		r11,r11,32
 	stvx		v26,r10,$sp
@@ -169,26 +167,26 @@ $func:
 	addi		r11,r11,32
 	stvx		v30,r10,$sp
 	stvx		v31,r11,$sp
-	li		r11,-1
-	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		r11,-4096+255
+	stw		$vrsave,`$FRAME+6*$SIZE_T-4`($sp)	# save vrsave
 	li		$x10,0x10
-	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
 	li		$x20,0x20
-	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
 	li		$x30,0x30
-	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
 	li		$x40,0x40
-	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
 	li		$x50,0x50
-	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
 	li		$x60,0x60
-	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
 	li		$x70,0x70
-	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
 	mtspr		256,r11
 
 	bl		LPICmeup
-	addi		$offload,$sp,$FRAME+15
+	addi		$offload,$sp,`8*$SIZE_T+15`
 ___
 $code.=<<___ if ($LENDIAN);
 	li		$idx,8
@@ -222,9 +220,9 @@ $code.=<<___;
 .align	5
 Loop:
 	lvx		$Ki,$x00,$Tbl
-	li		$idx,16
 	lvx_u		@X[0],0,$inp
 	addi		$inp,$inp,16
+	mr		$idx,$Tbl		# copy $Tbl
 	stvx		$A,$x00,$offload	# offload $A-$H
 	stvx		$B,$x10,$offload
 	stvx		$C,$x20,$offload
@@ -234,8 +232,7 @@ Loop:
 	stvx		$G,$x60,$offload
 	stvx		$H,$x70,$offload
 	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
-	lvx		$Ki,$idx,$Tbl
-	addi		$idx,$idx,16
+	lvx		$Ki,$x10,$Tbl
 ___
 for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
@@ -268,10 +265,9 @@ $code.=<<___;
 	bne		Loop
 ___
 $code.=<<___ if ($SZ==4);
-	lvx		@X[0],$idx,$Tbl
-	addi		$idx,$idx,16
+	lvx		@X[0],$x20,$idx
 	vperm		$A,$A,$B,$Ki		# pack the answer
-	lvx		@X[1],$idx,$Tbl
+	lvx		@X[1],$x30,$idx
 	vperm		$E,$E,$F,$Ki
 	vperm		$A,$A,$C,@X[0]
 	vperm		$E,$E,$G,@X[0]
@@ -291,19 +287,11 @@ $code.=<<___ if ($SZ==8);
 	stvx_u		$G,$x30,$ctx
 ___
 $code.=<<___;
-	li		r10,`$FRAME+8*16+15`
+	li		r10,`$LOCALS+15`
 	mtlr		$lrsave
-	li		r11,`$FRAME+8*16+31`
+	li		r11,`$LOCALS+31`
 	mtspr		256,$vrsave
-	lvx		v20,r10,$sp		# ABI says so
-	addi		r10,r10,32
-	lvx		v21,r11,$sp
-	addi		r11,r11,32
-	lvx		v22,r10,$sp
-	addi		r10,r10,32
-	lvx		v23,r11,$sp
-	addi		r11,r11,32
-	lvx		v24,r10,$sp
+	lvx		v24,r10,$sp		# ABI says so
 	addi		r10,r10,32
 	lvx		v25,r11,$sp
 	addi		r11,r11,32
@@ -317,13 +305,13 @@ $code.=<<___;
 	addi		r11,r11,32
 	lvx		v30,r10,$sp
 	lvx		v31,r11,$sp
-	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
-	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
-	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
-	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
-	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
-	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
-	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
+	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
+	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
+	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
+	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
+	addi		$sp,$sp,$FRAME
 	blr
 	.long		0
 	.byte		0,12,4,1,0x80,6,3,0
-- 
2.25.1
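
For readers of the patch, here is a minimal, self-contained Perl sketch, not
part of the commit, illustrating the constants-table addressing change the
commit message describes: instead of bumping the table index register after
every lvx, the eight offsets 0x00..0x70 stay preloaded in registers (@I) for
the whole loop, and the index advances by 0x80 only once per eight rounds.
Register names mirror sha512p8-ppc.pl; the real ROUND() loads K[i] two rounds
ahead (hence its $k=($i+2)%8), which this sketch simplifies to $i%8.

#!/usr/bin/env perl
# Hypothetical demo, not part of sha512p8-ppc.pl: contrast the old
# per-round index increment with the patch's preloaded-offset scheme.
use strict;
use warnings;

my $Tbl = "r6";                         # base address of the K[i] table
my $idx = "r7";                         # running index (a copy of $Tbl)
my @I   = map("r$_",(0,10,26..31));     # registers preloaded with 0x00..0x70

# Old scheme: every round pays for an extra addi.
sub round_old {
    my $i = shift;
    return "\tlvx\t\$Ki,$idx,$Tbl\t; load K[$i]\n".
           "\taddi\t$idx,$idx,16\t; per-round index bump\n";
}

# New scheme: the load uses a preloaded offset register, and $idx is
# bumped by 8*16 bytes only after every eighth round.
sub round_new {
    my $i = shift;
    my $k = $i%8;
    my $code = "\tlvx\t\$Ki,$I[$k],$idx\t; load K[$i] via offset in $I[$k]\n";
    $code .= "\taddi\t$idx,$idx,0x80\t; one bump per 8 rounds\n" if ($k==7);
    return $code;
}

print "# old:\n"; print round_old($_) for (0..7);
print "# new:\n"; print round_new($_) for (0..7);

Note that the 0x00 offset can live in "r0" because the RA field of an indexed
load reads as literal zero when it names r0; the patch's existing
"$x00=0 if ($flavour =~ /osx/)" line only adjusts the syntax for the OSX
assembler flavour.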