From d608b4d6629b5a19c4e96ff4ae599cef95d74c8e Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sat, 2 May 2009 09:04:17 +0000 Subject: [PATCH] AES-NI engine jumbo update. --- crypto/aes/asm/aesni-x86.pl | 270 +++++++---- crypto/aes/asm/aesni-x86_64.pl | 836 +++++++++++++++++---------------- crypto/engine/eng_aesni.c | 17 +- test/test_aesni | 12 +- 4 files changed, 631 insertions(+), 504 deletions(-) diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index fe0cbe0b5e..86062a9940 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -29,8 +29,8 @@ $rounds="ecx"; $key="edx"; $inp="esi"; $out="edi"; -$rounds_="ebx"; -$key_="ebp"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key $inout0="xmm0"; $inout1="xmm1"; @@ -39,26 +39,23 @@ $rndkey0="xmm3"; $rndkey1="xmm4"; $ivec="xmm5"; $in0="xmm6"; -$in1="xmm7"; +$in1="xmm7"; $inout3="xmm7"; -sub _aesni_generate1 # folded loop +# Inline version of internal aesni_[en|de]crypt1 +sub aesni_inline_generate1 { my $p=shift; - &function_begin_B("_aesni_${p}rypt1"); - &$movekey ($rndkey0,&QWP(0,$key)); - &$movekey ($rndkey1,&QWP(16,$key)); - &lea ($key,&DWP(16,$key)); - &pxor ($inout0,$rndkey0); - &dec ($rounds); - &set_label("${p}1_loop",16); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &set_label("${p}1_loop"); eval"&aes${p} ($inout0,$rndkey1)"; &dec ($rounds); - &lea ($key,&DWP(16,$key)); &$movekey ($rndkey1,&QWP(0,$key)); - &jnz (&label("${p}1_loop")); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop")); eval"&aes${p}last ($inout0,$rndkey1)"; - &ret(); - &function_end_B("_aesni_${p}rypt1"); } sub aesni_generate1 # fully unrolled loop @@ -67,7 +64,7 @@ sub aesni_generate1 # fully unrolled loop &function_begin_B("_aesni_${p}rypt1"); &$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey1,&QWP(0x10,$key)); - &cmp ($rounds,12); + &cmp ($rounds,11); &pxor 
($inout0,$rndkey0); &$movekey ($rndkey0,&QWP(0x20,$key)); &lea ($key,&DWP(0x30,$key)); @@ -107,52 +104,52 @@ sub aesni_generate1 # fully unrolled loop &function_end_B("_aesni_${p}rypt1"); } -&aesni_generate1("enc"); # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +# &aesni_generate1("enc"); &function_begin_B("${PREFIX}_encrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); - &call ("_aesni_encrypt1"); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_encrypt"); -&aesni_generate1("dec"); # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +# &aesni_generate1("dec"); &function_begin_B("${PREFIX}_decrypt"); &mov ("eax",&wparam(0)); &mov ($key,&wparam(2)); &movups ($inout0,&QWP(0,"eax")); &mov ($rounds,&DWP(240,$key)); &mov ("eax",&wparam(1)); - &call ("_aesni_decrypt1"); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &movups (&QWP(0,"eax"),$inout0); &ret (); &function_end_B("${PREFIX}_decrypt"); - -# _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave -# factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned -# out that it can be scheduled only every *second* cycle. Thus 3x -# interleave is the one providing optimal utilization, i.e. when -# subroutine's throughput is virtually same as of non-interleaved -# subroutine for number of input blocks up to 3. This is why it -# handles even double-block inputs. Larger interleave factor would -# perform suboptimally on shorter inputs... - + +# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave +# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] +# latency is 6, it turned out that it can be scheduled only every +# *second* cycle. Thus 3x interleave is the one providing optimal +# utilization, i.e. 
when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. As soon +# as/if Intel improves throughput by making it possible to schedule +# the instructions in question *every* cycle I would have to +# implement 6x interleave and use it in loop... sub aesni_generate3 { my $p=shift; &function_begin_B("_aesni_${p}rypt3"); &$movekey ($rndkey0,&QWP(0,$key)); - &$movekey ($rndkey1,&QWP(16,$key)); &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); &lea ($key,&DWP(32,$key)); &pxor ($inout0,$rndkey0); &pxor ($inout1,$rndkey0); - &dec ($rounds); &pxor ($inout2,$rndkey0); &jmp (&label("${p}3_loop")); &set_label("${p}3_loop",16); @@ -177,14 +174,59 @@ sub aesni_generate3 &ret(); &function_end_B("_aesni_${p}rypt3"); } + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... 
+sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shr ($rounds,1); + &lea ($key,&DWP(32,$key)); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &jmp (&label("${p}3_loop")); + &set_label("${p}3_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout1,$rndkey0)"; + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); if ($PREFIX eq "aesni") { # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); - &function_begin("aesni_ecb_encrypt"); &mov ($inp,&wparam(0)); &mov ($out,&wparam(1)); @@ -200,79 +242,121 @@ if ($PREFIX eq "aesni") { &mov ($rounds_,$rounds); # backup $rounds &jz (&label("ecb_decrypt")); - &sub ($len,0x30); - &jc (&label("ecb_enc_tail")); - jmp (&label("ecb_enc_loop3")); + &sub ($len,0x40); + &jbe (&label("ecb_enc_tail")); + &jmp (&label("ecb_enc_loop3")); &set_label("ecb_enc_loop3",16); &movups ($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups 
($inout2,&QWP(0x20,$inp)); - &lea ($inp,&DWP(0x30,$inp)); &call ("_aesni_encrypt3"); - &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); - &movups (&QWP(0x10,$out),$inout1); + &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); + &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key - &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds - &lea ($out,&DWP(0x30,$out)); - &jnc (&label("ecb_enc_loop3")); + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("ecb_enc_loop3")); &set_label("ecb_enc_tail"); - &add ($len,0x30); + &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); - je (&label("ecb_enc_one")); + &je (&label("ecb_enc_one")); + &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); - &call ("_aesni_encrypt3"); + &je (&label("ecb_enc_two")); + &cmp ($len,0x30); + &movups ($inout2,&QWP(0x20,$inp)); + &je (&label("ecb_enc_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_encrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); jmp (&label("ecb_ret")); &set_label("ecb_enc_one",16); - &call ("_aesni_encrypt1"); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &movups (&QWP(0,$out),$inout0); &jmp (&label("ecb_ret")); +&set_label("ecb_enc_two",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_three",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + &set_label("ecb_decrypt",16); - &sub ($len,0x30); - &jc (&label("ecb_dec_tail")); - jmp (&label("ecb_dec_loop3")); + &sub ($len,0x40); + &jbe (&label("ecb_dec_tail")); + &jmp (&label("ecb_dec_loop3")); &set_label("ecb_dec_loop3",16); &movups 
($inout0,&QWP(0,$inp)); &movups ($inout1,&QWP(0x10,$inp)); &movups ($inout2,&QWP(0x20,$inp)); &call ("_aesni_decrypt3"); - &movups (&QWP(0,$out),$inout0); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); - &movups (&QWP(0x10,$out),$inout1); + &lea ($out,&DWP(0x30,$out)); + &movups (&QWP(-0x30,$out),$inout0); &mov ($key,$key_); # restore $key - &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(-0x20,$out),$inout1); &mov ($rounds,$rounds_); # restore $rounds - &lea ($out,&DWP(0x30,$out)); - &jnc (&label("ecb_dec_loop3")); + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("ecb_dec_loop3")); &set_label("ecb_dec_tail"); - &add ($len,0x30); + &add ($len,0x40); &jz (&label("ecb_ret")); &cmp ($len,0x10); &movups ($inout0,&QWP(0,$inp)); - je (&label("ecb_dec_one")); + &je (&label("ecb_dec_one")); + &cmp ($len,0x20); &movups ($inout1,&QWP(0x10,$inp)); - &call ("_aesni_decrypt3"); + &je (&label("ecb_dec_two")); + &cmp ($len,0x30); + &movups ($inout2,&QWP(0x20,$inp)); + &je (&label("ecb_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_decrypt4"); &movups (&QWP(0,$out),$inout0); &movups (&QWP(0x10,$out),$inout1); - jmp (&label("ecb_ret")); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &jmp (&label("ecb_ret")); &set_label("ecb_dec_one",16); - &call ("_aesni_decrypt1"); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_two",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_three",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); &set_label("ecb_ret"); &function_end("aesni_ecb_encrypt"); @@ -288,7 +372,7 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(3)); &test ($len,$len); &mov ($key_,&wparam(4)); - &je (&label("cbc_ret")); + &jz 
(&label("cbc_ret")); &cmp (&wparam(5),0); &movups ($ivec,&QWP(0,$key_)); # load IV @@ -307,12 +391,12 @@ if ($PREFIX eq "aesni") { &movups ($ivec,&QWP(0,$inp)); &lea ($inp,&DWP(16,$inp)); &pxor ($inout0,$ivec); - &call ("_aesni_encrypt1"); + &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1"); &sub ($len,16); + &lea ($out,&DWP(16,$out)); &mov ($rounds,$rounds_); # restore $rounds &mov ($key,$key_); # restore $key - &movups (&QWP(0,$out),$inout0); - &lea ($out,&DWP(16,$out)); + &movups (&QWP(-16,$out),$inout0); &jnc (&label("cbc_enc_loop")); &add ($len,16); &jnz (&label("cbc_enc_tail")); @@ -333,8 +417,8 @@ if ($PREFIX eq "aesni") { &jmp (&label("cbc_enc_loop")); &set_label("cbc_decrypt",16); - &sub ($len,0x30); - &jc (&label("cbc_dec_tail")); + &sub ($len,0x40); + &jbe (&label("cbc_dec_tail")); &jmp (&label("cbc_dec_loop3")); &set_label("cbc_dec_loop3",16); @@ -346,20 +430,20 @@ if ($PREFIX eq "aesni") { &call ("_aesni_decrypt3"); &sub ($len,0x30); &lea ($inp,&DWP(0x30,$inp)); + &lea ($out,&DWP(0x30,$out)); &pxor ($inout0,$ivec); &pxor ($inout1,$in0); &movups ($ivec,&QWP(-0x10,$inp)); &pxor ($inout2,$in1); - &movups (&QWP(0,$out),$inout0); + &movups (&QWP(-0x30,$out),$inout0); &mov ($rounds,$rounds_) # restore $rounds - &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(-0x20,$out),$inout1); &mov ($key,$key_); # restore $key - &movups (&QWP(0x20,$out),$inout2); - &lea ($out,&DWP(0x30,$out)); - &jnc (&label("cbc_dec_loop3")); + &movups (&QWP(-0x10,$out),$inout2); + &ja (&label("cbc_dec_loop3")); &set_label("cbc_dec_tail"); - &add ($len,0x30); + &add ($len,0x40); &jz (&label("cbc_ret")); &movups ($inout0,&QWP(0,$inp)); @@ -371,19 +455,26 @@ if ($PREFIX eq "aesni") { &movaps ($in1,$inout1); &jbe (&label("cbc_dec_two")); &movups ($inout2,&QWP(0x20,$inp)); - &call ("_aesni_decrypt3"); + &cmp ($len,0x30); + &jbe (&label("cbc_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &call ("_aesni_decrypt4"); + &movups ($rndkey0,&QWP(0x10,$inp)); + &movups 
($rndkey1,&QWP(0x20,$inp)); &pxor ($inout0,$ivec); - &movups ($ivec,&QWP(0x20,$inp)); &pxor ($inout1,$in0); - &pxor ($inout2,$in1); + &movups ($ivec,&QWP(0x30,$inp)); &movups (&QWP(0,$out),$inout0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey1); &movups (&QWP(0x10,$out),$inout1); - &movaps ($inout0,$inout2); - &lea ($out,&DWP(0x20,$out)); + &movups (&QWP(0x20,$out),$inout2); + &movaps ($inout0,$inout3); + &lea ($out,&DWP(0x30,$out)); &jmp (&label("cbc_dec_tail_collected")); &set_label("cbc_dec_one"); - &call ("_aesni_decrypt1"); + &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1"); &pxor ($inout0,$ivec); &movaps ($ivec,$in0); &jmp (&label("cbc_dec_tail_collected")); @@ -396,6 +487,18 @@ if ($PREFIX eq "aesni") { &movaps ($inout0,$inout1); &movaps ($ivec,$in1); &lea ($out,&DWP(0x10,$out)); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_three"); + &call ("_aesni_decrypt3"); + &pxor ($inout0,$ivec); + &pxor ($inout1,$in0); + &pxor ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movaps ($inout0,$inout2); + &movups ($ivec,&QWP(0x20,$inp)); + &lea ($out,&DWP(0x20,$out)); &set_label("cbc_dec_tail_collected"); &and ($len,15); @@ -446,7 +549,7 @@ if ($PREFIX eq "aesni") { &jne (&label("bad_keybits")); &set_label("10rounds",16); - &mov ($rounds,10); + &mov ($rounds,9); &$movekey (&QWP(-16,$key),"xmm0"); # round 0 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 &call (&label("key_128_cold")); @@ -487,7 +590,7 @@ if ($PREFIX eq "aesni") { &set_label("12rounds",16); &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey - &mov ($rounds,12); + &mov ($rounds,11); &$movekey (&QWP(-16,$key),"xmm0") # round 0 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 &call (&label("key_192a_cold")); @@ -540,7 +643,7 @@ if ($PREFIX eq "aesni") { &set_label("14rounds",16); &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey - &mov ($rounds,14); + &mov ($rounds,13); &lea ($key,&DWP(16,$key)); &$movekey 
(&QWP(-32,$key),"xmm0"); # round 0 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 @@ -625,10 +728,10 @@ if ($PREFIX eq "aesni") { &mov ($key,&wparam(2)); &call ("_aesni_set_encrypt_key"); &mov ($key,&wparam(2)); - &shl ($rounds,4) # actually rounds after _aesni_set_encrypt_key + &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key &test ("eax","eax"); &jnz (&label("dec_key_ret")); - &lea ("eax",&DWP(0,$key,$rounds)); # end of key schedule + &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule &$movekey ("xmm0",&QWP(0,$key)); # just swap &$movekey ("xmm1",&QWP(0,"eax")); @@ -636,9 +739,8 @@ if ($PREFIX eq "aesni") { &$movekey (&QWP(0,$key),"xmm1"); &lea ($key,&DWP(16,$key)); &lea ("eax",&DWP(-16,"eax")); - &jmp (&label("dec_key_inverse")); -&set_label("dec_key_inverse",16); +&set_label("dec_key_inverse"); &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse &$movekey ("xmm1",&QWP(0,"eax")); &aesimc ("xmm0","xmm0"); diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 4ed3932b75..c9d7485637 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -11,9 +11,6 @@ # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for # details]. -# -# TODO: -# - Win64 SEH handlers; $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -33,15 +30,15 @@ die "can't locate x86_64-xlate.pl"; open STDOUT,"| $^X $xlate $flavour $output"; $movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code=".text\n"; $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! - -# this is natural argument order for public $PREFIX_*crypt... +# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... $inp="%rdi"; $out="%rsi"; -# ... and for $PREFIX_[ebc|cbc]_encrypt in particular. 
$len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! $ivp="%r8"; # cbc @@ -52,162 +49,169 @@ $key_="%r11"; # backup copy for $key # %xmm register layout $inout0="%xmm0"; $inout1="%xmm1"; $inout2="%xmm2"; $inout3="%xmm3"; -$inout4="%xmm4"; $inout5="%xmm5"; -$rndkey0="%xmm6"; $rndkey1="%xmm7"; +$rndkey0="%xmm4"; $rndkey1="%xmm5"; -$iv="%xmm8"; -$in0="%xmm9"; $in1="%xmm10"; -$in2="%xmm11"; $in3="%xmm12"; -$in4="%xmm13"; $in5="%xmm14"; +$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt +$in1="%xmm8"; $in2="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. # # Why folded loop? Because aes[enc|dec] is slow enough to accommodate # cycles which take care of loop variables... { my $sn; -sub aesni_encrypt1 { -my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_; +sub aesni_generate1 { +my ($p,$key,$rounds)=@_; ++$sn; $code.=<<___; $movkey ($key),$rndkey0 $movkey 16($key),$rndkey1 - lea 16($key),$key - pxor $rndkey0,$data - dec $rounds - jmp .Loop_enc1_$sn -.align 16 -.Loop_enc1_$sn: - aesenc $rndkey1,$data + lea 32($key),$key + pxor $rndkey0,$inout0 +.Loop_${p}1_$sn: + aes${p} $rndkey1,$inout0 dec $rounds - lea 16($key),$key $movkey ($key),$rndkey1 - jnz .Loop_enc1_$sn # loop body is 16 bytes - - aesenclast $rndkey1,$data -___ -}} -{ my $sn; -sub aesni_decrypt1 { -my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_; -++$sn; -$code.=<<___; - $movkey ($key),$rndkey0 - $movkey 16($key),$rndkey1 lea 16($key),$key - pxor $rndkey0,$data - dec $rounds - jmp .Loop_dec1_$sn -.align 16 -.Loop_dec1_$sn: - aesdec $rndkey1,$data - dec $rounds - lea 16($key),$key - $movkey ($key),$rndkey1 - jnz .Loop_dec1_$sn # loop body is 16 bytes - - aesdeclast $rndkey1,$data + jnz .Loop_${p}1_$sn # loop body is 16 bytes + aes${p}last $rndkey1,$inout0 ___ }} - -# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); # +{ my ($inp,$out,$key) = @_4args; + $code.=<<___; .globl ${PREFIX}_encrypt 
-.type ${PREFIX}_encrypt,\@function,3 +.type ${PREFIX}_encrypt,\@abi-omnipotent .align 16 ${PREFIX}_encrypt: - movups ($inp),%xmm0 # load input - mov 240(%rdx),$rounds # pull $rounds + movups ($inp),$inout0 # load input + mov 240($key),$rounds # pull $rounds ___ - &aesni_encrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds); + &aesni_generate1("enc",$key,$rounds); $code.=<<___; - movups %xmm0,(%rsi) # output + movups $inout0,($out) # output ret .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt -___ -# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); -# -$code.=<<___; .globl ${PREFIX}_decrypt -.type ${PREFIX}_decrypt,\@function,3 +.type ${PREFIX}_decrypt,\@abi-omnipotent .align 16 ${PREFIX}_decrypt: - movups ($inp),%xmm0 # load input - mov 240(%rdx),$rounds # pull $rounds + movups ($inp),$inout0 # load input + mov 240($key),$rounds # pull $rounds ___ - &aesni_decrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds); + &aesni_generate1("dec",$key,$rounds); $code.=<<___; - movups %xmm0,($out) # output + movups $inout0,($out) # output ret .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt ___ +} -# _aesni_[en|de]crypt6 are private interfaces, 6 denotes interleave -# factor. Why 6x? Because aes[enc|dec] latency is 6 and 6x interleave -# provides optimal utilization, so that subroutine's throughput is -# virtually same for *any* number [naturally up to 6] of input blocks -# as for non-interleaved subroutine. This is why it handles even -# double-, tripple-, quad- and penta-block inputs. Larger interleave -# factor, e.g. 8x, would perform suboptimally on these shorter inputs... -sub aesni_generate6 { +# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave +# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] +# latency is 6, it turned out that it can be scheduled only every +# *second* cycle. Thus 3x interleave is the one providing optimal +# utilization, i.e. 
when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. As soon +# as/if Intel improves throughput by making it possible to schedule +# the instructions in question *every* cycle I would have to +# implement 6x interleave and use it in loop... +sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* -# preserved. $inout[0-5] is cipher/clear text... +# preserved. $inout[0-2] is cipher/clear text... $code.=<<___; -.type _aesni_${dir}rypt6,\@abi-omnipotent +.type _aesni_${dir}rypt3,\@abi-omnipotent .align 16 -_aesni_${dir}rypt6: +_aesni_${dir}rypt3: $movkey ($key),$rndkey0 + shr \$1,$rounds $movkey 16($key),$rndkey1 + lea 32($key),$key + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + pxor $rndkey0,$inout2 + +.L${dir}_loop3: + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey0,$inout0 + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + jnz .L${dir}_loop3 + + aes${dir} $rndkey1,$inout0 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + ret +.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 +___ +} +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-3] is cipher/clear text... 
+$code.=<<___; +.type _aesni_${dir}rypt4,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt4: + $movkey ($key),$rndkey0 shr \$1,$rounds + $movkey 16($key),$rndkey1 lea 32($key),$key - dec $rounds pxor $rndkey0,$inout0 pxor $rndkey0,$inout1 pxor $rndkey0,$inout2 pxor $rndkey0,$inout3 - pxor $rndkey0,$inout4 - pxor $rndkey0,$inout5 - jmp .L${dir}_loop6 -.align 16 -.L${dir}_loop6: + +.L${dir}_loop4: aes${dir} $rndkey1,$inout0 $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 aes${dir} $rndkey0,$inout0 $movkey 16($key),$rndkey1 aes${dir} $rndkey0,$inout1 lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 - aes${dir} $rndkey0,$inout4 - aes${dir} $rndkey0,$inout5 - jnz .L${dir}_loop6 + jnz .L${dir}_loop4 + aes${dir} $rndkey1,$inout0 $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey1,$inout4 - aes${dir} $rndkey1,$inout5 aes${dir}last $rndkey0,$inout0 aes${dir}last $rndkey0,$inout1 aes${dir}last $rndkey0,$inout2 aes${dir}last $rndkey0,$inout3 - aes${dir}last $rndkey0,$inout4 - aes${dir}last $rndkey0,$inout5 ret -.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ___ } -&aesni_generate6("enc"); -&aesni_generate6("dec"); +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); if ($PREFIX eq "aesni") { # void aesni_ecb_encrypt (const void *in, void *out, @@ -219,48 +223,36 @@ $code.=<<___; .align 16 aesni_ecb_encrypt: cmp \$16,$len # check length - jb .Lecb_abort -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,16(%rsp) -___ -$code.=<<___; + jb .Lecb_ret + mov 240($key),$rounds # pull $rounds and \$-16,$len mov $key,$key_ # backup $key - test %r8d,%r8d + test %r8d,%r8d # 5th argument 
mov $rounds,$rnds_ # backup $rounds jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - sub \$0x60,$len - jc .Lecb_enc_tail - jmp .Lecb_enc_loop6 + sub \$0x40,$len + jbe .Lecb_enc_tail + jmp .Lecb_enc_loop3 .align 16 -.Lecb_enc_loop6: +.Lecb_enc_loop3: movups ($inp),$inout0 movups 0x10($inp),$inout1 movups 0x20($inp),$inout2 - movups 0x30($inp),$inout3 - movups 0x40($inp),$inout4 - movups 0x50($inp),$inout5 - call _aesni_encrypt6 - movups $inout0,($out) - sub \$0x60,$len - movups $inout1,0x10($out) - lea 0x60($inp),$inp - movups $inout2,0x20($out) + call _aesni_encrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + movups $inout0,-0x30($out) mov $rnds_,$rounds # restore $rounds - movups $inout3,0x30($out) + movups $inout1,-0x20($out) mov $key_,$key # restore $key - movups $inout4,0x40($out) - movups $inout5,0x50($out) - lea 0x60($out),$out - jnc .Lecb_enc_loop6 + movups $inout2,-0x10($out) + ja .Lecb_enc_loop3 .Lecb_enc_tail: - add \$0x60,$len + add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len @@ -272,75 +264,57 @@ $code.=<<___; cmp \$0x30,$len movups 0x20($inp),$inout2 je .Lecb_enc_three - cmp \$0x40,$len movups 0x30($inp),$inout3 - je .Lecb_enc_four - movups 0x40($inp),$inout4 - call _aesni_encrypt6 + call _aesni_encrypt4 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) - movups $inout4,0x40($out) jmp .Lecb_ret .align 16 .Lecb_enc_one: ___ - &aesni_encrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); + &aesni_generate1("enc",$key,$rounds); $code.=<<___; movups $inout0,($out) jmp .Lecb_ret .align 16 .Lecb_enc_two: - call _aesni_encrypt6 + call _aesni_encrypt3 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret .align 16 .Lecb_enc_three: - call _aesni_encrypt6 + call _aesni_encrypt3 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) jmp .Lecb_ret -.align 16 -.Lecb_enc_four: - call _aesni_encrypt6 - movups $inout0,($out) - movups 
$inout1,0x10($out) - movups $inout2,0x20($out) - movups $inout3,0x30($out) - jmp .Lecb_ret #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - sub \$0x60,$len - jc .Lecb_dec_tail - jmp .Lecb_dec_loop6 + sub \$0x40,$len + jbe .Lecb_dec_tail + jmp .Lecb_dec_loop3 .align 16 -.Lecb_dec_loop6: +.Lecb_dec_loop3: movups ($inp),$inout0 movups 0x10($inp),$inout1 movups 0x20($inp),$inout2 - movups 0x30($inp),$inout3 - movups 0x40($inp),$inout4 - movups 0x50($inp),$inout5 - call _aesni_decrypt6 - movups $inout0,($out) - sub \$0x60,$len - movups $inout1,0x10($out) - lea 0x60($inp),$inp - movups $inout2,0x20($out) + call _aesni_decrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out + movups $inout0,-0x30($out) mov $rnds_,$rounds # restore $rounds - movups $inout3,0x30($out) + movups $inout1,-0x20($out) mov $key_,$key # restore $key - movups $inout4,0x40($out) - movups $inout5,0x50($out) - lea 0x60($out),$out - jnc .Lecb_dec_loop6 + movups $inout2,-0x10($out) + ja .Lecb_dec_loop3 .Lecb_dec_tail: - add \$0x60,$len + add \$0x40,$len jz .Lecb_ret cmp \$0x10,$len @@ -352,54 +326,34 @@ $code.=<<___; cmp \$0x30,$len movups 0x20($inp),$inout2 je .Lecb_dec_three - cmp \$0x40,$len movups 0x30($inp),$inout3 - je .Lecb_dec_four - movups 0x40($inp),$inout4 - call _aesni_decrypt6 + call _aesni_decrypt4 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) - movups $inout4,0x40($out) jmp .Lecb_ret .align 16 .Lecb_dec_one: ___ - &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); + &aesni_generate1("dec",$key,$rounds); $code.=<<___; movups $inout0,($out) jmp .Lecb_ret .align 16 .Lecb_dec_two: - call _aesni_decrypt6 + call _aesni_decrypt3 movups $inout0,($out) movups $inout1,0x10($out) jmp .Lecb_ret .align 16 .Lecb_dec_three: - call _aesni_decrypt6 - movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) - jmp .Lecb_ret -.align 16 -.Lecb_dec_four: - call 
_aesni_decrypt6 + call _aesni_decrypt3 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) - movups $inout3,0x30($out) .Lecb_ret: -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - lea 0x28(%rsp),%rsp -___ -$code.=<<___; -.Lecb_abort: ret .size aesni_ecb_encrypt,.-aesni_ecb_encrypt ___ @@ -408,7 +362,7 @@ ___ # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); -$reserved = $win64?0x90:-0x18; # used in decrypt +$reserved = $win64?0x40:-0x18; # used in decrypt $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -416,34 +370,35 @@ $code.=<<___; ${PREFIX}_cbc_encrypt: test $len,$len # check length jz .Lcbc_ret - mov 240($key),$rounds # pull $rounds + + mov 240($key),$rnds_ # pull $rounds mov $key,$key_ # backup $key - test %r9d,%r9d - mov $rounds,$rnds_ # backup $rounds + test %r9d,%r9d # 6th argument jz .Lcbc_decrypt #--------------------------- CBC ENCRYPT ------------------------------# - movups ($ivp),%xmm0 # load iv as initial state + movups ($ivp),$inout0 # load iv as initial state cmp \$16,$len + mov $rnds_,$rounds jb .Lcbc_enc_tail sub \$16,$len jmp .Lcbc_enc_loop .align 16 .Lcbc_enc_loop: - movups ($inp),%xmm2 # load input + movups ($inp),$inout1 # load input lea 16($inp),$inp - pxor %xmm2,%xmm0 + pxor $inout1,$inout0 ___ - &aesni_encrypt1("%xmm0","%xmm1","%xmm2",$key,$rounds); + &aesni_generate1("enc",$key,$rounds); $code.=<<___; - movups %xmm0,($out) # store output sub \$16,$len lea 16($out),$out - mov $rnds_,$rounds # restore $rounds - mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + mov $key_,$key # restore $key + movups $inout0,-16($out) # store output jnc .Lcbc_enc_loop add \$16,$len jnz .Lcbc_enc_tail - movups %xmm0,($ivp) + movups $inout0,($ivp) jmp .Lcbc_ret .Lcbc_enc_tail: @@ -465,59 +420,44 @@ $code.=<<___; .Lcbc_decrypt: ___ $code.=<<___ if ($win64); - lea 
-0xa8(%rsp),%rsp + lea -0x58(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) - movaps %xmm10,0x40(%rsp) - movaps %xmm11,0x50(%rsp) - movaps %xmm12,0x60(%rsp) - movaps %xmm13,0x70(%rsp) - movaps %xmm14,0x80(%rsp) +.Lcbc_decrypt_body: ___ $code.=<<___; movups ($ivp),$iv - sub \$0x60,$len - jc .Lcbc_dec_tail - jmp .Lcbc_dec_loop6 + sub \$0x40,$len + mov $rnds_,$rounds + jbe .Lcbc_dec_tail + jmp .Lcbc_dec_loop3 .align 16 -.Lcbc_dec_loop6: +.Lcbc_dec_loop3: movups ($inp),$inout0 movups 0x10($inp),$inout1 movups 0x20($inp),$inout2 - movups 0x30($inp),$inout3 movaps $inout0,$in0 - movups 0x40($inp),$inout4 movaps $inout1,$in1 - movups 0x50($inp),$inout5 movaps $inout2,$in2 - movaps $inout3,$in3 - movaps $inout4,$in4 - movaps $inout5,$in5 - call _aesni_decrypt6 + call _aesni_decrypt3 + sub \$0x30,$len + lea 0x30($inp),$inp + lea 0x30($out),$out pxor $iv,$inout0 pxor $in0,$inout1 - movups $inout0,($out) - sub \$0x60,$len + movaps $in2,$iv pxor $in1,$inout2 - movups $inout1,0x10($out) - lea 0x60($inp),$inp - pxor $in2,$inout3 - movups $inout2,0x20($out) + movups $inout0,-0x30($out) mov $rnds_,$rounds # restore $rounds - pxor $in3,$inout4 - movups $inout3,0x30($out) + movups $inout1,-0x20($out) mov $key_,$key # restore $key - pxor $in4,$inout5 - movups $inout4,0x40($out) - movaps $in5,$iv - movups $inout5,0x50($out) - lea 0x60($out),$out - jnc .Lcbc_dec_loop6 + movups $inout2,-0x10($out) + ja .Lcbc_dec_loop3 .Lcbc_dec_tail: - add \$0x60,$len + add \$0x40,$len movups $iv,($ivp) jz .Lcbc_dec_ret @@ -534,42 +474,29 @@ $code.=<<___; movaps $inout2,$in2 jbe .Lcbc_dec_three movups 0x30($inp),$inout3 - cmp \$0x40,$len - movaps $inout3,$in3 - jbe .Lcbc_dec_four - movups 0x40($inp),$inout4 - cmp \$0x50,$len - movaps $inout4,$in4 - jbe .Lcbc_dec_five - movups 0x50($inp),$inout5 - movaps $inout5,$in5 - call _aesni_decrypt6 + call _aesni_decrypt4 pxor $iv,$inout0 + movups 0x30($inp),$iv pxor $in0,$inout1 movups $inout0,($out) pxor 
$in1,$inout2 movups $inout1,0x10($out) pxor $in2,$inout3 movups $inout2,0x20($out) - pxor $in3,$inout4 - movups $inout3,0x30($out) - pxor $in4,$inout5 - movups $inout4,0x40($out) - movaps $in5,$iv - movaps $inout5,$inout0 - lea 0x50($out),$out + movaps $inout3,$inout0 + lea 0x30($out),$out jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_one: ___ - &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds); + &aesni_generate1("dec",$key,$rounds); $code.=<<___; pxor $iv,$inout0 movaps $in0,$iv jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: - call _aesni_decrypt6 + call _aesni_decrypt3 pxor $iv,$inout0 pxor $in0,$inout1 movups $inout0,($out) @@ -579,7 +506,7 @@ $code.=<<___; jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: - call _aesni_decrypt6 + call _aesni_decrypt3 pxor $iv,$inout0 pxor $in0,$inout1 movups $inout0,($out) @@ -590,36 +517,6 @@ $code.=<<___; lea 0x20($out),$out jmp .Lcbc_dec_tail_collected .align 16 -.Lcbc_dec_four: - call _aesni_decrypt6 - pxor $iv,$inout0 - pxor $in0,$inout1 - movups $inout0,($out) - pxor $in1,$inout2 - movups $inout1,0x10($out) - pxor $in2,$inout3 - movups $inout2,0x20($out) - movaps $in3,$iv - movaps $inout3,$inout0 - lea 0x30($out),$out - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_five: - call _aesni_decrypt6 - pxor $iv,$inout0 - pxor $in0,$inout1 - movups $inout0,($out) - pxor $in1,$inout2 - movups $inout1,0x10($out) - pxor $in2,$inout3 - movups $inout2,0x20($out) - pxor $in3,$inout4 - movups $inout3,0x30($out) - movaps $in4,$iv - movaps $inout4,$inout0 - lea 0x40($out),$out - jmp .Lcbc_dec_tail_collected -.align 16 .Lcbc_dec_tail_collected: and \$15,$len movups $iv,($ivp) @@ -640,12 +537,7 @@ $code.=<<___ if ($win64); movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 - movaps 0x40(%rsp),%xmm10 - movaps 0x50(%rsp),%xmm11 - movaps 0x60(%rsp),%xmm12 - movaps 0x70(%rsp),%xmm13 - movaps 0x80(%rsp),%xmm14 - lea 0xa8(%rsp),%rsp + lea 0x58(%rsp),%rsp ___ $code.=<<___; .Lcbc_ret: @@ -653,61 
+545,49 @@ $code.=<<___; .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ___ -{ -# this is natural argument order for $PREFIX_set_[en|de]crypt_key -my $inp="%rdi"; -my $bits="%esi"; -my $key="%rdx"; - -# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, -# AES_KEY *key) -$code.=<<___; -.globl ${PREFIX}_set_encrypt_key -.type ${PREFIX}_set_encrypt_key,\@function,3 -.align 16 -${PREFIX}_set_encrypt_key: - call _aesni_set_encrypt_key - ret -.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key -___ -# int $PREFIX_set_decrypt_key(const unsigned char *userKey, const int bits, -# AES_KEY *key) +# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, +# int bits, AES_KEY *key) +{ my ($inp,$bits,$key) = @_4args; + $bits =~ s/%r/%e/; + $code.=<<___; .globl ${PREFIX}_set_decrypt_key -.type ${PREFIX}_set_decrypt_key,\@function,3 +.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent .align 16 ${PREFIX}_set_decrypt_key: + .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 call _aesni_set_encrypt_key - shl \$4,%esi # actually rounds after _aesni_set_encrypt_key + shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key test %eax,%eax jnz .Ldec_key_ret - lea (%rdx,%rsi),%rsi# points at the end of key schedule - - $movkey (%rdx),%xmm0 # just swap - $movkey (%rsi),%xmm1 - $movkey %xmm0,(%rsi) - $movkey %xmm1,(%rdx) - lea 16(%rdx),%rdx - lea -16(%rsi),%rsi - jmp .Ldec_key_inverse -.align 16 + lea 16($key,$bits),$inp # points at the end of key schedule + + $movkey ($key),%xmm0 # just swap + $movkey ($inp),%xmm1 + $movkey %xmm0,($inp) + $movkey %xmm1,($key) + lea 16($key),$key + lea -16($inp),$inp + .Ldec_key_inverse: - $movkey (%rdx),%xmm0 # swap and inverse - $movkey (%rsi),%xmm1 + $movkey ($key),%xmm0 # swap and inverse + $movkey ($inp),%xmm1 aesimc %xmm0,%xmm0 aesimc %xmm1,%xmm1 - lea 16(%rdx),%rdx - lea -16(%rsi),%rsi - cmp %rdx,%rsi - $movkey %xmm0,16(%rsi) - $movkey %xmm1,-16(%rdx) + lea 16($key),$key + lea -16($inp),$inp + cmp $key,$inp + $movkey 
%xmm0,16($inp) + $movkey %xmm1,-16($key) ja .Ldec_key_inverse - $movkey (%rdx),%xmm0 # inverse middle + $movkey ($key),%xmm0 # inverse middle aesimc %xmm0,%xmm0 - $movkey %xmm0,(%rsi) + $movkey %xmm0,($inp) .Ldec_key_ret: + add \$8,%rsp ret +.LSEH_end_set_decrypt_key: .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key ___ @@ -721,27 +601,31 @@ ___ # and is contained in %xmm0-5 to meet Win64 ABI requirement. # $code.=<<___; -.type _aesni_set_encrypt_key,\@abi-omnipotent +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent .align 16 +${PREFIX}_set_encrypt_key: _aesni_set_encrypt_key: - test %rdi,%rdi - jz .Lbad_pointer - test %rdx,%rdx - jz .Lbad_pointer - - movups (%rdi),%xmm0 # pull first 128 bits of *userKey + .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 + test $inp,$inp + mov \$-1,%rax + jz .Lenc_key_ret + test $key,$key + jz .Lenc_key_ret + + movups ($inp),%xmm0 # pull first 128 bits of *userKey pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 - lea 16(%rdx),%rcx - cmp \$256,%esi + lea 16($key),%rax + cmp \$256,$bits je .L14rounds - cmp \$192,%esi + cmp \$192,$bits je .L12rounds - cmp \$128,%esi + cmp \$128,$bits jne .Lbad_keybits - + .L10rounds: - mov \$10,%esi # 10 rounds for 128-bit key - $movkey %xmm0,(%rdx) # round 0 + mov \$9,$bits # 10 rounds for 128-bit key + $movkey %xmm0,($key) # round 0 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 call .Lkey_expansion_128_cold aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 @@ -762,29 +646,16 @@ _aesni_set_encrypt_key: call .Lkey_expansion_128 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 call .Lkey_expansion_128 - $movkey %xmm0,(%rcx) - mov %esi,80(%rcx) # 240(%rdx) + $movkey %xmm0,(%rax) + mov $bits,80(%rax) # 240(%rdx) xor %eax,%eax - ret + jmp .Lenc_key_ret -.align 16 -.Lkey_expansion_128: - $movkey %xmm0,(%rcx) - lea 16(%rcx),%rcx -.Lkey_expansion_128_cold: - shufps \$0b00010000,%xmm0,%xmm4 - pxor %xmm4, %xmm0 - shufps \$0b10001100,%xmm0,%xmm4 - pxor %xmm4, %xmm0 - pshufd 
\$0b11111111,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm0 - ret - .align 16 .L12rounds: - movq 16(%rdi),%xmm2 # remaining 1/3 of *userKey - mov \$12,%esi # 12 rounds for 192 - $movkey %xmm0,(%rdx) # round 0 + movq 16($inp),%xmm2 # remaining 1/3 of *userKey + mov \$11,$bits # 12 rounds for 192 + $movkey %xmm0,($key) # round 0 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 call .Lkey_expansion_192a_cold aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 @@ -801,48 +672,18 @@ _aesni_set_encrypt_key: call .Lkey_expansion_192a aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 call .Lkey_expansion_192b - $movkey %xmm0,(%rcx) - mov %esi,48(%rcx) # 240(%rdx) + $movkey %xmm0,(%rax) + mov $bits,48(%rax) # 240(%rdx) xor %rax, %rax - ret + jmp .Lenc_key_ret -.align 16 -.Lkey_expansion_192a: - $movkey %xmm0,(%rcx) - lea 16(%rcx),%rcx -.Lkey_expansion_192a_cold: - movaps %xmm2, %xmm5 -.Lkey_expansion_192b_warm: - shufps \$0b00010000,%xmm0,%xmm4 - movaps %xmm2,%xmm3 - pxor %xmm4,%xmm0 - shufps \$0b10001100,%xmm0,%xmm4 - pslldq \$4,%xmm3 - pxor %xmm4,%xmm0 - pshufd \$0b01010101,%xmm1,%xmm1 # critical path - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd \$0b11111111,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret - -.align 16 -.Lkey_expansion_192b: - movaps %xmm0,%xmm3 - shufps \$0b01000100,%xmm0,%xmm5 - $movkey %xmm5,(%rcx) - shufps \$0b01001110,%xmm2,%xmm3 - $movkey %xmm3,16(%rcx) - lea 32(%rcx),%rcx - jmp .Lkey_expansion_192b_warm - .align 16 .L14rounds: - movups 16(%rdi),%xmm2 # remaning half of *userKey - mov \$14,%esi # 14 rounds for 256 - lea 16(%rcx),%rcx - $movkey %xmm0,(%rdx) # round 0 - $movkey %xmm2,16(%rdx) # round 1 + movups 16($inp),%xmm2 # remaining half of *userKey + mov \$13,$bits # 14 rounds for 256 + lea 16(%rax),%rax + $movkey %xmm0,($key) # round 0 + $movkey %xmm2,16($key) # round 1 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 call .Lkey_expansion_256a_cold aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 @@ -869,15 +710,66 @@ _aesni_set_encrypt_key: call .Lkey_expansion_256b
aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 call .Lkey_expansion_256a - $movkey %xmm0,(%rcx) - mov %esi,16(%rcx) # 240(%rdx) + $movkey %xmm0,(%rax) + mov $bits,16(%rax) # 240(%rdx) xor %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + mov \$-2,%rax +.Lenc_key_ret: + add \$8,%rsp + ret +.LSEH_end_set_encrypt_key: + +.align 16 +.Lkey_expansion_128: + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps \$0b00010000,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pxor %xmm4, %xmm0 + pshufd \$0b11111111,%xmm1,%xmm1 # critical path + pxor %xmm1,%xmm0 + ret + +.align 16 +.Lkey_expansion_192a: + $movkey %xmm0,(%rax) + lea 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2, %xmm5 +.Lkey_expansion_192b_warm: + shufps \$0b00010000,%xmm0,%xmm4 + movaps %xmm2,%xmm3 + pxor %xmm4,%xmm0 + shufps \$0b10001100,%xmm0,%xmm4 + pslldq \$4,%xmm3 + pxor %xmm4,%xmm0 + pshufd \$0b01010101,%xmm1,%xmm1 # critical path + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd \$0b11111111,%xmm0,%xmm3 + pxor %xmm3,%xmm2 ret +.align 16 +.Lkey_expansion_192b: + movaps %xmm0,%xmm3 + shufps \$0b01000100,%xmm0,%xmm5 + $movkey %xmm5,(%rax) + shufps \$0b01001110,%xmm2,%xmm3 + $movkey %xmm3,16(%rax) + lea 32(%rax),%rax + jmp .Lkey_expansion_192b_warm + .align 16 .Lkey_expansion_256a: - $movkey %xmm2,(%rcx) - lea 16(%rcx),%rcx + $movkey %xmm2,(%rax) + lea 16(%rax),%rax .Lkey_expansion_256a_cold: shufps \$0b00010000,%xmm0,%xmm4 pxor %xmm4,%xmm0 @@ -889,8 +781,8 @@ _aesni_set_encrypt_key: .align 16 .Lkey_expansion_256b: - $movkey %xmm0,(%rcx) - lea 16(%rcx),%rcx + $movkey %xmm0,(%rax) + lea 16(%rax),%rax shufps \$0b00010000,%xmm2,%xmm4 pxor %xmm4,%xmm2 @@ -899,15 +791,7 @@ _aesni_set_encrypt_key: pshufd \$0b10101010,%xmm1,%xmm1 # critical path pxor %xmm1,%xmm2 ret - -.align 16 -.Lbad_pointer: - mov \$-1, %rax - ret -.Lbad_keybits: - mov \$-2, %rax - ret -.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key +.size 
${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key ___ } @@ -916,6 +800,150 @@ $code.=<<___; .align 64 ___ +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type cbc_se_handler,\@abi-omnipotent +.align 16 +cbc_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lcbc_decrypt(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lin_prologue + + lea .Lcbc_decrypt_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<cbc_decrypt_body + jb .Lrestore_rax + + lea .Lcbc_ret(%rip),%r10 + cmp %r10,%rbx # context->Rip>="epilogue" label + jae .Lin_prologue + + lea 0(%rax),%rsi # top of stack + lea 512($context),%rdi # &context.Xmm6 + mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x58(%rax),%rax # adjust stack pointer + jmp .Lin_prologue + +.Lrestore_rax: + mov 120($context),%rax # pull context->Rax +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + jmp .Lcommon_seh_exit +.size cbc_se_handler,.-cbc_se_handler + +.type ecb_se_handler,\@abi-omnipotent +.align 16 +ecb_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + +.Lcommon_seh_exit: + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx #
arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ecb_se_handler,.-ecb_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_ecb_encrypt + .rva .LSEH_end_${PREFIX}_ecb_encrypt + .rva .LSEH_info_ecb + + .rva .LSEH_begin_${PREFIX}_cbc_encrypt + .rva .LSEH_end_${PREFIX}_cbc_encrypt + .rva .LSEH_info_cbc + + .rva ${PREFIX}_set_encrypt_key + .rva .LSEH_end_set_encrypt_key + .rva .LSEH_info_key + + .rva ${PREFIX}_set_decrypt_key + .rva .LSEH_end_set_decrypt_key + .rva .LSEH_info_key +.section .xdata +.align 8 +.LSEH_info_ecb: + .byte 9,0,0,0 + .rva ecb_se_handler +.LSEH_info_cbc: + .byte 9,0,0,0 + .rva cbc_se_handler +.LSEH_info_key: + .byte 0x01,0x04,0x01,0x00 + .byte 0x04,0x02,0x00,0x00 +___ +} + sub rex { local *opcode=shift; my ($dst,$src)=@_; diff --git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c index 6707418614..5491063811 100644 --- a/crypto/engine/eng_aesni.c +++ b/crypto/engine/eng_aesni.c @@ -147,8 +147,9 @@ static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher, ((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1))) /* Engine names */ -static const char *aesni_id = "aesni"; -static const char *aesni_name = "Intel AES-NI engine"; +static const char aesni_id[] = "aesni", + aesni_name[] = "Intel AES-NI engine", + no_aesni_name[] = "Intel AES-NI engine (no-aesni)"; /* ===== Engine "management" functions ===== */ @@ -156,15 +157,15 @@ static const char *aesni_name = "Intel AES-NI engine"; static int
aesni_bind_helper(ENGINE *e) { - if (!(OPENSSL_ia32cap_P[1] & (1UL << (57-32)))) - return 0; + int engage = (OPENSSL_ia32cap_P[1] & (1 << (57-32))) != 0; /* Register everything or return with an error */ if (!ENGINE_set_id(e, aesni_id) || - !ENGINE_set_name(e, aesni_name) || + !ENGINE_set_name(e, engage ? aesni_name : no_aesni_name) || !ENGINE_set_init_function(e, aesni_init) || - !ENGINE_set_ciphers (e, aesni_ciphers)) + (engage && !ENGINE_set_ciphers (e, aesni_ciphers)) + ) return 0; /* Everything looks good */ @@ -286,14 +287,14 @@ static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out, { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv, &ctx->num, ctx->encrypt, - aesni_encrypt); + (block128_f)aesni_encrypt); return 1; } static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) { AES_KEY *key = AESNI_ALIGN(ctx->cipher_data); CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv, - &ctx->num, aesni_encrypt); + &ctx->num, (block128_f)aesni_encrypt); return 1; } diff --git a/test/test_aesni b/test/test_aesni index 87f5da191e..e8fb63ee2b 100755 --- a/test/test_aesni +++ b/test/test_aesni @@ -14,23 +14,19 @@ else exit 1; fi -if $PROG engine aesni | grep aesni; then +if $PROG engine aesni | grep -v no-aesni; then HASH=`cat $PROG | $PROG dgst -hex` - ACE_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ + AES_ALGS=" aes-128-ecb aes-192-ecb aes-256-ecb \ aes-128-cbc aes-192-cbc aes-256-cbc \ aes-128-cfb aes-192-cfb aes-256-cfb \ aes-128-ofb aes-192-ofb aes-256-ofb" - BUFSIZE="16 32 48 64 80 96 128 999" - ACE_ALGS=" aes-128-cbc aes-192-cbc aes-256-cbc \ - aes-128-cfb aes-192-cfb aes-256-cfb \ - aes-128-ofb aes-192-ofb aes-256-ofb" - BUFSIZE="48 64 80 96 128 999" + BUFSIZE="16 32 48 64 80 96 128 144 999" nerr=0 - for alg in $ACE_ALGS; do + for alg in $AES_ALGS; do echo $alg for bufsize in $BUFSIZE; do TEST=`( cat $PROG | \ -- 2.25.1