# build of sha512-ppc.pl, presented for reference.
#
#                 POWER8        POWER9
-# SHA256     9.9 [15.8]    12.2 [12.5]
-# SHA512     6.3 [10.3]     7.7  [7.9]
+# SHA256     9.7 [15.8]    11.2 [12.5]
+# SHA512     6.1 [10.3]     7.0  [7.9]
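+#
+# (The figures are presumed to be cycles per processed byte, as in the
+# sibling SHA perlasm modules; lower is better, and the bracketed
+# sha512-ppc.pl numbers are the integer-only build for comparison.)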
$flavour=shift;
$output =shift;
}
$func="sha${bits}_block_p8";
-$FRAME=8*$SIZE_T;
+$LOCALS=8*$SIZE_T+8*16;
+$FRAME=$LOCALS+9*16+6*$SIZE_T;
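+# A sketch of the new frame layout, inferred from the offsets below:
+# 8*$SIZE_T of room for the ABI linkage area, then the 8*16-byte
+# $offload scratch for $A-$H (together $LOCALS), then 9*16 bytes
+# holding the v24-v31 saves plus the vrsave word, then 6*$SIZE_T
+# for the r26-r31 saves at the top of the frame.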
$sp ="r1";
$toc="r2";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
-($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
- $x00=0 if ($flavour =~ /osx/);
+@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));
+ $x00=0 if ($flavour =~ /osx/);
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
-@X=map("v$_",(8..23));
-($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));
+@X=map("v$_",(8..19,24..27));
+($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));
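+# With @X moved to v8-v19/v24-v27 and the four Sigma temporaries
+# collapsed into one, v20-v23 are never touched, so only v24-v31
+# need the ABI-mandated save/restore below.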
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;
+my $k=($i+2)%8;
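+# $k picks one of the eight offset registers in @I (0x00-0x70) for
+# the indexed K load at the bottom of the round: round $i loads
+# K[$i+2], which round $i+1 consumes as its "future h" add.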
$code.=<<___ if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
lvx_u @X[$i+1],0,$inp ; load X[i] in advance
addi $inp,$inp,16
___
$code.=<<___ if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
vperm @X[$i],@X[$i],@X[$i],$lemask
___
+$code.=<<___ if ($i>=15);
+ vshasigma${sz} $Sigma,@X[($j+1)%16],0,0
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vshasigma${sz} $Sigma,@X[($j+14)%16],0,15
+ vaddu${sz}m @X[$j],@X[$j],$Sigma
+ vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]
+___
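+# The hoisted block above is the standard SHA-2 message schedule on a
+# 16-entry circular buffer: with $j == t-16 (mod 16) it computes
+# W[t] = W[t-16] + sigma0(W[t-15]) + sigma1(W[t-2]) + W[t-7],
+# vshasigma's trailing 0,0 selecting sigma0 and 0,15 sigma1.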
$code.=<<___;
- `"vshasigma${sz} $s0,@X[($j+1)%16],0,0" if ($i>=15)`
- vsel $Func,$g,$f,$e ; Ch(e,f,g)
- vshasigma${sz} $S1,$e,1,15 ; Sigma1(e)
vaddu${sz}m $h,$h,@X[$i%16] ; h+=X[i]
- vshasigma${sz} $S0,$a,1,0 ; Sigma0(a)
- `"vshasigma${sz} $s1,@X[($j+14)%16],0,15" if ($i>=15)`
+ vsel $Func,$g,$f,$e ; Ch(e,f,g)
+ vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $h,$h,$Func ; h+=Ch(e,f,g)
+ vshasigma${sz} $Sigma,$e,1,15 ; Sigma1(e)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma1(e)
vxor $Func,$a,$b
- `"vaddu${sz}m @X[$j],@X[$j],@X[($j+9)%16]" if ($i>=15)`
- vaddu${sz}m $h,$h,$S1 ; h+=Sigma1(e)
vsel $Func,$b,$c,$Func ; Maj(a,b,c)
- vaddu${sz}m $g,$g,$Ki ; future h+=K[i]
vaddu${sz}m $d,$d,$h ; d+=h
- vaddu${sz}m $S0,$S0,$Func ; Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s0" if ($i>=15)`
- lvx $Ki,$idx,$Tbl ; load next K[i]
- addi $idx,$idx,16
- vaddu${sz}m $h,$h,$S0 ; h+=Sigma0(a)+Maj(a,b,c)
- `"vaddu${sz}m @X[$j],@X[$j],$s1" if ($i>=15)`
+ vshasigma${sz} $Sigma,$a,1,0 ; Sigma0(a)
+ vaddu${sz}m $Sigma,$Sigma,$Func ; Sigma0(a)+Maj(a,b,c)
+ vaddu${sz}m $h,$h,$Sigma ; h+=Sigma0(a)+Maj(a,b,c)
+ lvx $Ki,@I[$k],$idx ; load next K[i]
+___
+$code.=<<___ if ($k == 7);
+ addi $idx,$idx,0x80
___
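+# K[] is now addressed as @I[$k]+$idx, so $idx only advances by 0x80
+# once the eight 16-byte-spaced offsets are exhausted ($k wrapping at 7).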
}
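+# Note the "future h+=K[i]" pipelining: the caller rotates @V after
+# each round, so this round's $g is the next round's $h; adding K to
+# $g here means every round enters with its K already folded into h,
+# which is also why the loop pre-adds K[0] to $H before round 0.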
.globl $func
.align 6
$func:
- $STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+ $STU $sp,-$FRAME($sp)
mflr $lrsave
- li r10,`$FRAME+8*16+15`
- li r11,`$FRAME+8*16+31`
- stvx v20,r10,$sp # ABI says so
+ li r10,`$LOCALS+15`
+ li r11,`$LOCALS+31`
+ stvx v24,r10,$sp # ABI says so
addi r10,r10,32
mfspr $vrsave,256
- stvx v21,r11,$sp
- addi r11,r11,32
- stvx v22,r10,$sp
- addi r10,r10,32
- stvx v23,r11,$sp
- addi r11,r11,32
- stvx v24,r10,$sp
- addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
- li r11,-1
- stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
+ li r11,-4096+255
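+ # -4096+255 = 0xfffff0ff: VRSAVE mask flagging v0-v19 and v24-v31
+ # as live, with the now-unused v20-v23 left clear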
+ stw $vrsave,`$FRAME-6*$SIZE_T-4`($sp) # save vrsave
li $x10,0x10
- $PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+ $PUSH r26,`$FRAME-6*$SIZE_T`($sp)
li $x20,0x20
- $PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+ $PUSH r27,`$FRAME-5*$SIZE_T`($sp)
li $x30,0x30
- $PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+ $PUSH r28,`$FRAME-4*$SIZE_T`($sp)
li $x40,0x40
- $PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+ $PUSH r29,`$FRAME-3*$SIZE_T`($sp)
li $x50,0x50
- $PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+ $PUSH r30,`$FRAME-2*$SIZE_T`($sp)
li $x60,0x60
- $PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+ $PUSH r31,`$FRAME-1*$SIZE_T`($sp)
li $x70,0x70
- $PUSH $lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+ $PUSH $lrsave,`$FRAME+$LRSAVE`($sp)
mtspr 256,r11
bl LPICmeup
- addi $offload,$sp,$FRAME+15
+ addi $offload,$sp,`8*$SIZE_T+15`
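+ # lvx/stvx ignore the low four EA bits, so the +15 yields a
+ # 16-byte-aligned $offload for either $SIZE_T without masking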
___
$code.=<<___ if ($LENDIAN);
li $idx,8
.align 5
Loop:
lvx $Ki,$x00,$Tbl
- li $idx,16
lvx_u @X[0],0,$inp
addi $inp,$inp,16
+ mr $idx,$Tbl # copy $Tbl
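+ # $idx now walks K[] via the @I offsets instead of being
+ # post-incremented after every load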
stvx $A,$x00,$offload # offload $A-$H
stvx $B,$x10,$offload
stvx $C,$x20,$offload
stvx $D,$x30,$offload
stvx $E,$x40,$offload
stvx $F,$x50,$offload
stvx $G,$x60,$offload
stvx $H,$x70,$offload
vaddu${sz}m $H,$H,$Ki # h+K[i]
- lvx $Ki,$idx,$Tbl
- addi $idx,$idx,16
+ lvx $Ki,$x10,$Tbl
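+ # $Ki now holds K[1], keeping the "future h+=K[i]" add one
+ # round ahead through the unrolled body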
___
for ($i=0;$i<16;$i++) { &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne Loop
___
$code.=<<___ if ($SZ==4);
- lvx @X[0],$idx,$Tbl
- addi $idx,$idx,16
+ lvx @X[0],$x20,$idx
vperm $A,$A,$B,$Ki # pack the answer
- lvx @X[1],$idx,$Tbl
+ lvx @X[1],$x30,$idx
vperm $E,$E,$F,$Ki
vperm $A,$A,$C,@X[0]
vperm $E,$E,$G,@X[0]
stvx_u $G,$x30,$ctx
___
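+# (Each of $A-$H carries its result in a separate vector lane, so the
+# vperm pairs above merge A|B|C|D and E|F|G|H into two vectors before
+# the 128-bit context stores; the permute masks presumably live just
+# past K[] in the table LPICmeup points at.)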
$code.=<<___;
- li r10,`$FRAME+8*16+15`
+ li r10,`$LOCALS+15`
mtlr $lrsave
- li r11,`$FRAME+8*16+31`
+ li r11,`$LOCALS+31`
mtspr 256,$vrsave
- lvx v20,r10,$sp # ABI says so
- addi r10,r10,32
- lvx v21,r11,$sp
- addi r11,r11,32
- lvx v22,r10,$sp
- addi r10,r10,32
- lvx v23,r11,$sp
- addi r11,r11,32
- lvx v24,r10,$sp
+ lvx v24,r10,$sp # ABI says so
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
- $POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
- $POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
- $POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
- $POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
- $POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
- $POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
- addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+ $POP r26,`$FRAME-6*$SIZE_T`($sp)
+ $POP r27,`$FRAME-5*$SIZE_T`($sp)
+ $POP r28,`$FRAME-4*$SIZE_T`($sp)
+ $POP r29,`$FRAME-3*$SIZE_T`($sp)
+ $POP r30,`$FRAME-2*$SIZE_T`($sp)
+ $POP r31,`$FRAME-1*$SIZE_T`($sp)
+ addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,6,3,0