From 5599c7331b90d9d29c9914c2a95c16d91485415a Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Fri, 14 Feb 2014 17:06:15 +0100
Subject: [PATCH] aes/asm/aesni-x86_64.pl: further optimization for Atom
 Silvermont.

Improve CBC decrypt and CTR by ~13% and ~16% respectively, which adds
up to ~25%/33% improvement over the "pre-Silvermont" version.
[Add performance table to aesni-x86.pl.]
---
 crypto/aes/asm/aesni-x86.pl    |  11 ++
 crypto/aes/asm/aesni-x86_64.pl | 189 +++++++++++++++++++++++++++++----
 2 files changed, 179 insertions(+), 21 deletions(-)

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 6fcbb9581d..c3df97db7b 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -43,6 +43,17 @@
 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

+######################################################################
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#			CBC en-/decrypt	CTR	XTS	ECB
+# Westmere	3.77/1.37	1.37	1.52	1.27
+# * Bridge	5.07/0.98	0.99	1.09	0.91
+# Haswell	4.44/0.80	0.97	1.03	0.72
+# Atom		5.77/3.56	3.67	4.03	3.46
+# Bulldozer	5.80/0.98	1.05	1.24	0.93
+
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
 			# crypto/aes/asm/aes-586.pl:-)
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 96ef5c5114..708fabd3de 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -158,25 +158,19 @@
 # in CTR mode AES instruction interleave factor was chosen to be 6x.

 ######################################################################
-# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
-# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
-# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
-# instruction latency is 9 cycles and that they can be issued every
-# cycle.
-
-######################################################################
-# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
-# decrypt, CTR and ECB, 0.73 in XTS.
-
-######################################################################
-# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
-# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizeable
-# modes [other than XTS] are actually suboptimal, because of penalties
-# incurred by operations on %xmm8-15, which are inevitable with such
-# high instruction interleave factors. This means that performance can
-# be improved by decreasing the interleave factor, but then it would
-# negatively affect other platforms in relatively larger degree.
-# Run-time detection would solve the dilemma...
+# Current large-block performance in cycles per byte processed with
+# 128-bit key (less is better).
+#
+#			CBC en-/decrypt	CTR	XTS	ECB
+# Westmere	3.77/1.25	1.25	1.25	1.26
+# * Bridge	5.07/0.74	0.75	0.90	0.85
+# Haswell	4.44/0.63	0.63	0.73	0.63
+# Atom		5.75/3.54	3.56	4.12	3.87(*)
+# Bulldozer	5.77/0.70	0.72	0.90	0.70
+#
+# (*) Atom ECB result is suboptimal because of penalties incurred
+#	by operations on %xmm8-15. As ECB is not considered
+#	critical, nothing was done to mitigate the problem.

 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
"movups" : "movups"; ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; +$code.=".extern OPENSSL_ia32cap_P\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... @@ -1119,7 +1114,9 @@ $code.=<<___; lea 7($ctr),%r9 mov %r10d,0x60+12(%rsp) bswap %r9d + mov OPENSSL_ia32cap_P+4(%rip),%r10d xor $key0,%r9d + and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE mov %r9d,0x70+12(%rsp) $movkey 0x10($key),$rndkey1 @@ -1130,10 +1127,104 @@ $code.=<<___; cmp \$8,$len jb .Lctr32_tail + sub \$6,$len + cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE + je .Lctr32_6x + lea 0x80($key),$key # size optimization - sub \$8,$len + sub \$2,$len jmp .Lctr32_loop8 +.align 16 +.Lctr32_6x: + shl \$4,$rounds + mov \$48,$rnds_ + bswap $key0 + lea 32($key,$rounds),$key # end of key schedule + sub %rax,%r10 # twisted $rounds + jmp .Lctr32_loop6 + +.align 16 +.Lctr32_loop6: + add \$6,$ctr + $movkey -48($key,$rnds_),$rndkey0 + aesenc $rndkey1,$inout0 + mov $ctr,%eax + xor $key0,%eax + aesenc $rndkey1,$inout1 + movbe %eax,`0x00+12`(%rsp) + lea 1($ctr),%eax + aesenc $rndkey1,$inout2 + xor $key0,%eax + movbe %eax,`0x10+12`(%rsp) + aesenc $rndkey1,$inout3 + lea 2($ctr),%eax + xor $key0,%eax + aesenc $rndkey1,$inout4 + movbe %eax,`0x20+12`(%rsp) + lea 3($ctr),%eax + aesenc $rndkey1,$inout5 + $movkey -32($key,$rnds_),$rndkey1 + xor $key0,%eax + + aesenc $rndkey0,$inout0 + movbe %eax,`0x30+12`(%rsp) + lea 4($ctr),%eax + aesenc $rndkey0,$inout1 + xor $key0,%eax + movbe %eax,`0x40+12`(%rsp) + aesenc $rndkey0,$inout2 + lea 5($ctr),%eax + xor $key0,%eax + aesenc $rndkey0,$inout3 + movbe %eax,`0x50+12`(%rsp) + mov %r10,%rax # mov $rnds_,$rounds + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey -16($key,$rnds_),$rndkey0 + + call .Lenc_loop6 + + movdqu ($inp),$inout6 + movdqu 0x10($inp),$inout7 + movdqu 0x20($inp),$in0 + movdqu 0x30($inp),$in1 + movdqu 0x40($inp),$in2 + movdqu 0x50($inp),$in3 + lea 0x60($inp),$inp + $movkey -64($key,$rnds_),$rndkey1 + pxor $inout0,$inout6 + movaps 0x00(%rsp),$inout0 + pxor $inout1,$inout7 + movaps 0x10(%rsp),$inout1 + pxor $inout2,$in0 + movaps 0x20(%rsp),$inout2 + pxor $inout3,$in1 + movaps 0x30(%rsp),$inout3 + pxor $inout4,$in2 + movaps 0x40(%rsp),$inout4 + pxor $inout5,$in3 + movaps 0x50(%rsp),$inout5 + movdqu $inout6,($out) + movdqu $inout7,0x10($out) + movdqu $in0,0x20($out) + movdqu $in1,0x30($out) + movdqu $in2,0x40($out) + movdqu $in3,0x50($out) + lea 0x60($out),$out + + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + + lea -48($rnds_),$rounds + lea -80($key,$rnds_),$key # restore $key + neg $rounds + shr \$4,$rounds # restore $rounds + jmp .Lctr32_tail + .align 32 .Lctr32_loop8: add \$8,$ctr @@ -2455,10 +2546,15 @@ $code.=<<___; movdqa $inout3,$in3 movdqu 0x50($inp),$inout5 movdqa $inout4,$in4 + mov OPENSSL_ia32cap_P+4(%rip),%r9d cmp \$0x70,$len jbe .Lcbc_dec_six_or_seven - sub \$0x70,$len + and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE + sub \$0x50,$len + cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE + je .Lcbc_dec_loop6_enter + sub \$0x20,$len lea 0x70($key),$key # size optimization jmp .Lcbc_dec_loop8_enter .align 16 @@ -2638,6 +2734,51 @@ $code.=<<___; movdqa $inout6,$inout0 jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_loop6: + movups $inout5,($out) + lea 0x10($out),$out + movdqu 0x00($inp),$inout0 # load input + movdqu 0x10($inp),$inout1 + movdqa $inout0,$in0 + movdqu 0x20($inp),$inout2 + movdqa $inout1,$in1 + movdqu 0x30($inp),$inout3 + 
+	movdqa	$inout2,$in2
+	movdqu	0x40($inp),$inout4
+	movdqa	$inout3,$in3
+	movdqu	0x50($inp),$inout5
+	movdqa	$inout4,$in4
+.Lcbc_dec_loop6_enter:
+	lea	0x60($inp),$inp
+	movdqa	$inout5,$inout6
+
+	call	_aesni_decrypt6
+
+	pxor	$iv,$inout0		# ^= IV
+	movdqa	$inout6,$iv
+	pxor	$in0,$inout1
+	movdqu	$inout0,($out)
+	pxor	$in1,$inout2
+	movdqu	$inout1,0x10($out)
+	pxor	$in2,$inout3
+	movdqu	$inout2,0x20($out)
+	pxor	$in3,$inout4
+	mov	$key_,$key
+	movdqu	$inout3,0x30($out)
+	pxor	$in4,$inout5
+	mov	$rnds_,$rounds
+	movdqu	$inout4,0x40($out)
+	lea	0x50($out),$out
+	sub	\$0x60,$len
+	ja	.Lcbc_dec_loop6
+
+	movdqa	$inout5,$inout0
+	add	\$0x50,$len
+	jle	.Lcbc_dec_tail_collected
+	movups	$inout5,($out)
+	lea	0x10($out),$out
+
 .Lcbc_dec_tail:
 	movups	($inp),$inout0
 	sub	\$0x10,$len
@@ -3360,8 +3501,14 @@ sub aesni {
     return $line;
 }

+sub movbe {
+	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
+}
+
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
+$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

 print	$code;

-- 
2.25.1
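
A note on the run-time detection the patch introduces: the generated code
reads the second dword of OPENSSL_ia32cap_P, which caches CPUID.1:ECX, where
bit 22 is MOVBE and bit 26 is XSAVE. Masking with 1<<26|1<<22 and comparing
against 1<<22 selects processors that advertise MOVBE but not XSAVE; among
AES-NI-capable CPUs of the time, that combination identifies Atom Silvermont,
which is why it gates the lower-interleave 6x CTR and CBC-decrypt paths.
Below is a minimal Perl sketch of the same predicate, assuming the feature
word has already been fetched; the helper name is illustrative, not part of
OpenSSL:

    #!/usr/bin/env perl
    use strict; use warnings;

    my $MOVBE = 1<<22;		# CPUID.1:ECX.MOVBE
    my $XSAVE = 1<<26;		# CPUID.1:ECX.XSAVE

    # Same test as "and \$(1<<26|1<<22)" + "cmp \$(1<<22)" in the patch:
    # true only when MOVBE is present and XSAVE is absent.
    sub prefer_6x_interleave {
        my $ecx = shift;	# contents of OPENSSL_ia32cap_P+4
        return (($ecx & ($XSAVE|$MOVBE)) == $MOVBE) ? 1 : 0;
    }

    # Bit 25 is AESNI; a Silvermont-like word has MOVBE without XSAVE.
    printf "Silvermont-like: %d\n", prefer_6x_interleave($MOVBE|1<<25);         # 1
    printf "Haswell-like:    %d\n", prefer_6x_interleave($MOVBE|$XSAVE|1<<25);  # 0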
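The movbe() helper at the end of the patch exists because assemblers that
predate MOVBE cannot assemble it, so the script rewrites every
"movbe %eax,disp(%rsp)" in the generated text into raw opcode bytes:
0x0f,0x38,0xf1 is the MOVBE m32,r32 opcode, 0x44 is the ModRM byte
(mod=01/disp8, reg=%eax, rm=SIB), 0x24 is the SIB byte selecting %rsp as
base, and the trailing byte is the displacement captured by the regex. A
self-contained sketch of that encoding (the function name is illustrative):

    #!/usr/bin/env perl
    use strict; use warnings;

    # Hand-encode "movbe %eax,disp8(%rsp)" as the .byte sequence the
    # patch's movbe() substitution emits:
    #   0f 38 f1  = MOVBE m32,r32 opcode
    #   44        = ModRM: mod=01 (disp8), reg=000 (%eax), rm=100 (SIB)
    #   24        = SIB: no index, base=100 (%rsp)
    #   last byte = 8-bit displacement
    sub movbe_eax_disp8_rsp {
        my $disp = shift;
        die "disp8 out of range" if $disp < 0 || $disp > 127;
        return sprintf ".byte\t0x0f,0x38,0xf1,0x44,0x24,%d", $disp;
    }

    print movbe_eax_disp8_rsp(12), "\n";   # what "movbe %eax,12(%rsp)" becomes

Note that the backtick-eval substitution runs first, so the counter-block
stores in .Lctr32_loop6 reach the movbe() rule as plain decimal displacements
12 through 0x50+12 = 92; the disp8 form is therefore always sufficient, which
is why the substitution only handles the "%eax,disp(%rsp)" shape.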