From 603ebe03529101424670051aa0c616dc6e037b28 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Fri, 1 Dec 2017 22:32:48 +0100
Subject: [PATCH] modes/asm/ghashv8-armx.pl: handle lengths not divisible by 4x.

Reviewed-by: Rich Salz
(Merged from https://github.com/openssl/openssl/pull/4830)
---
 crypto/modes/asm/ghashv8-armx.pl | 146 +++++++++++++++++++++++++++++--
 1 file changed, 137 insertions(+), 9 deletions(-)

diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index 8cb2e452cd..7e57238706 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -255,9 +255,8 @@ $code.=<<___;
 gcm_ghash_v8:
 ___
 $code.=<<___		if ($flavour =~ /64/);
-	bic		$inc,$len,#63
-	cmp		$len,$inc
-	b.eq		.Lgcm_ghash_v8_4x
+	cmp		$len,#64
+	b.hs		.Lgcm_ghash_v8_4x
 ___
 $code.=<<___		if ($flavour !~ /64/);
 	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
@@ -421,12 +420,10 @@ gcm_ghash_v8_4x:
 	vmov.i8		$xC2,#0xe1
 	vld1.64		{$H3-$H4},[$Htbl]	@ load twisted H^3, ..., H^4
 	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
-#ifndef __ARMEB__
-	vrev64.8	$Xl,$Xl
-#endif
 
 	vld1.64		{$I0-$j3},[$inp],#64
 #ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
 	vrev64.8	$j1,$j1
 	vrev64.8	$j2,$j2
 	vrev64.8	$j3,$j3
@@ -459,8 +456,8 @@
 	veor		$Yh,$Yh,$I1
 	veor		$Ym,$Ym,$j1
 
-	subs		$len,$len,#64
-	b.eq		.Ltail4x
+	subs		$len,$len,#128
+	b.lo		.Ltail4x
 
 	b		.Loop4x
 
@@ -525,7 +522,7 @@
 	veor		$Ym,$Ym,$j1
 
 	subs		$len,$len,#64
-	b.ne		.Loop4x
+	b.hs		.Loop4x
 
 .Ltail4x:
 	veor		$t0,$I0,$Xl
@@ -540,6 +537,137 @@
 	veor		$Xh,$Xh,$Yh
 	veor		$Xm,$Xm,$Ym
 
+	adds		$len,$len,#64
+	b.eq		.Ldone4x
+
+	cmp		$len,#32
+	b.lo		.Lone
+	b.eq		.Ltwo
+.Lthree:
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	vld1.64		{$I0-$j2},[$inp]
+	veor		$Xm,$Xm,$t2
+#ifndef __ARMEB__
+	vrev64.8	$j1,$j1
+	vrev64.8	$j2,$j2
+	vrev64.8	$I0,$I0
+#endif
+
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	vext.8		$I2,$j2,$j2,#8
+	vext.8		$I1,$j1,$j1,#8
+	veor		$Xl,$Xm,$t2
+
+	vpmull.p64	$Yl,$H,$I2		@ H·Ii+2
+	veor		$j2,$j2,$I2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	vpmull2.p64	$Yh,$H,$I2
+	vpmull.p64	$Ym,$Hhl,$j2
+	veor		$Xl,$Xl,$t2
+	vpmull.p64	$j3,$H2,$I1		@ H^2·Ii+1
+	veor		$j1,$j1,$I1
+	vext.8		$Xl,$Xl,$Xl,#8
+
+	vpmull2.p64	$I1,$H2,$I1
+	veor		$t0,$I0,$Xl
+	vpmull2.p64	$j1,$Hhl,$j1
+	vext.8		$IN,$t0,$t0,#8
+
+	veor		$Yl,$Yl,$j3
+	veor		$Yh,$Yh,$I1
+	veor		$Ym,$Ym,$j1
+
+	vpmull.p64	$Xl,$H3,$IN		@ H^3·(Xi+Ii)
+	veor		$t0,$t0,$IN
+	vpmull2.p64	$Xh,$H3,$IN
+	vpmull.p64	$Xm,$H34,$t0
+
+	veor		$Xl,$Xl,$Yl
+	veor		$Xh,$Xh,$Yh
+	veor		$Xm,$Xm,$Ym
+	b		.Ldone4x
+
+.align	4
+.Ltwo:
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	vld1.64		{$I0-$j1},[$inp]
+	veor		$Xm,$Xm,$t2
+#ifndef __ARMEB__
+	vrev64.8	$j1,$j1
+	vrev64.8	$I0,$I0
+#endif
+
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	vext.8		$I1,$j1,$j1,#8
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	vext.8		$Xl,$Xl,$Xl,#8
+
+	vpmull.p64	$Yl,$H,$I1		@ H·Ii+1
+	veor		$j1,$j1,$I1
+
+	veor		$t0,$I0,$Xl
+	vext.8		$IN,$t0,$t0,#8
+
+	vpmull2.p64	$Yh,$H,$I1
+	vpmull.p64	$Ym,$Hhl,$j1
+
+	vpmull.p64	$Xl,$H2,$IN		@ H^2·(Xi+Ii)
+	veor		$t0,$t0,$IN
+	vpmull2.p64	$Xh,$H2,$IN
+	vpmull2.p64	$Xm,$Hhl,$t0
+
+	veor		$Xl,$Xl,$Yl
+	veor		$Xh,$Xh,$Yh
+	veor		$Xm,$Xm,$Ym
+	b		.Ldone4x
+
+.align	4
+.Lone:
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	vld1.64		{$I0},[$inp]
+	veor		$Xm,$Xm,$t2
+#ifndef __ARMEB__
+	vrev64.8	$I0,$I0
+#endif
+
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	vext.8		$Xl,$Xl,$Xl,#8
+
+	veor		$t0,$I0,$Xl
+	vext.8		$IN,$t0,$t0,#8
+
+	vpmull.p64	$Xl,$H,$IN
+	veor		$t0,$t0,$IN
+	vpmull2.p64	$Xh,$H,$IN
+	vpmull.p64	$Xm,$Hhl,$t0
+
+.Ldone4x:
 	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
 	veor		$t2,$Xl,$Xh
 	veor		$Xm,$Xm,$t1
-- 
2.25.1
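
The control-flow change amounts to this: gcm_ghash_v8 previously entered the four-blocks-at-a-time PMULL path only when the input length was a multiple of 64 bytes, while with this patch it enters that path for any input of at least 64 bytes, and the new .Lone/.Ltwo/.Lthree tails fold a remaining 16-, 32- or 48-byte group using powers of H up to H^3. The C sketch below is only an illustration of how an input is split between the 4x loop and the tail, under the assumption that len is a multiple of the 16-byte GHASH block size; split_ghash_input is a hypothetical helper written for this note, not an OpenSSL function.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Hypothetical model of how the patched gcm_ghash_v8 consumes its input:
 * full 64-byte (4-block) passes through .Loop4x/.Ltail4x, then a tail of
 * 0..3 blocks handled by .Ldone4x/.Lone/.Ltwo/.Lthree.  Inputs shorter
 * than 64 bytes still go down the pre-existing one/two-block path.
 */
static int split_ghash_input(size_t len, size_t *passes4x, size_t *tail_blocks)
{
    assert(len % 16 == 0);              /* GHASH operates on 16-byte blocks */
    if (len < 64)
        return 0;                       /* cmp $len,#64; b.hs not taken */
    *passes4x = len / 64;               /* iterations of the 4x loop */
    *tail_blocks = (len % 64) / 16;     /* 0..3 leftover blocks */
    return 1;
}

int main(void)
{
    size_t p, t;
    /* 112 bytes (7 blocks): one 4x pass plus the new .Lthree tail.  Before
     * the patch this length was not a multiple of 64, so the whole input
     * was processed on the slower non-4x path. */
    if (split_ghash_input(112, &p, &t))
        printf("112 bytes -> %zu x 64-byte pass(es) + %zu tail block(s)\n", p, t);
    return 0;
}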