From 42660b3cf1762354a3a670b85af9f19325aac1fb Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Fri, 14 Oct 2011 09:21:03 +0000 Subject: [PATCH] aesni-x86[_64].pl: pull from HEAD. --- crypto/aes/asm/aesni-x86.pl | 68 ++++++++++++++++--------------- crypto/aes/asm/aesni-x86_64.pl | 74 ++++++++++++++++++---------------- 2 files changed, 75 insertions(+), 67 deletions(-) diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl index b3c8d1f60a..3dc345b585 100644 --- a/crypto/aes/asm/aesni-x86.pl +++ b/crypto/aes/asm/aesni-x86.pl @@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -602,34 +603,30 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack - &mov ($rounds,1); + &mov ($rounds_,1); &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); + &shr ($rounds,1); + &lea ($key_,&DWP(0,$key)); &movdqa ($inout3,&QWP(0,"esp")); - &pshufb ($ivec,$inout3); # keep iv in reverse order - - &mov ($rounds,&DWP(240,$key)); - &mov ($key_,$key); - &mov ($rounds_,$rounds); &movdqa ($inout0,$ivec); + &mov ($rounds_,$rounds); + &pshufb ($ivec,$inout3); &set_label("ccm64_enc_outer"); - &movups ($in0,&QWP(0,$inp)); - &pshufb ($inout0,$inout3); - &mov ($key,$key_); + &$movekey ($rndkey0,&QWP(0,$key_)); &mov ($rounds,$rounds_); + &movups ($in0,&QWP(0,$inp)); - &$movekey ($rndkey0,&QWP(0,$key)); - &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); - &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key)); &xorps ($inout0,$rndkey0); - &xorps ($cmac,$in0); # cmac^=inp + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($rndkey0,$in0); + &lea ($key,&DWP(32,$key_)); + &xorps ($cmac,$rndkey0); # cmac^=inp &$movekey ($rndkey0,&QWP(0,$key)); &set_label("ccm64_enc2_loop"); @@ -644,16 +641,17 @@ if ($PREFIX eq "aesni") { &jnz (&label("ccm64_enc2_loop")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); + &paddq ($ivec,&QWP(16,"esp")); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); - &paddq ($ivec,&QWP(16,"esp")); &dec ($len); &lea ($inp,&DWP(16,$inp)); &xorps ($in0,$inout0); # inp^=E(ivec) &movdqa ($inout0,$ivec); - &movups (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); &jnz (&label("ccm64_enc_outer")); &mov ("esp",&DWP(48,"esp")); @@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") { &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); # compose byte-swap control mask for pshufb on stack &mov (&DWP(0,"esp"),0x0c0d0e0f); @@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") { &mov (&DWP(12,"esp"),0x00010203); # compose counter increment vector on stack - &mov ($rounds,1); + &mov ($rounds_,1); &xor ($key_,$key_); - &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(16,"esp"),$rounds_); &mov (&DWP(20,"esp"),$key_); &mov (&DWP(24,"esp"),$key_); &mov (&DWP(28,"esp"),$key_); &movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout0,$ivec); - &pshufb ($ivec,$inout3); # keep iv in reverse order - &mov ($rounds,&DWP(240,$key)); &mov ($key_,$key); &mov ($rounds_,$rounds); + &pshufb ($ivec,$inout3); if ($inline) { &aesni_inline_generate1("enc"); } else { &call ("_aesni_encrypt1"); } - -&set_label("ccm64_dec_outer"); - &paddq ($ivec,&QWP(16,"esp")); &movups ($in0,&QWP(0,$inp)); # load inp - &xorps ($in0,$inout0); - &movdqa ($inout0,$ivec); + &paddq ($ivec,&QWP(16,"esp")); &lea ($inp,&QWP(16,$inp)); - &pshufb ($inout0,$inout3); - &mov ($key,$key_); + &jmp (&label("ccm64_dec_outer")); + +&set_label("ccm64_dec_outer",16); + &xorps ($in0,$inout0); # inp ^= E(ivec) + &movdqa ($inout0,$ivec); &mov ($rounds,$rounds_); - &movups (&QWP(0,$out),$in0); + &movups (&QWP(0,$out),$in0); # save output &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); &sub ($len,1); &jz (&label("ccm64_dec_break")); - &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey0,&QWP(0,$key_)); &shr ($rounds,1); - &$movekey ($rndkey1,&QWP(16,$key)); + &$movekey ($rndkey1,&QWP(16,$key_)); &xorps ($in0,$rndkey0); - &lea ($key,&DWP(32,$key)); + &lea ($key,&DWP(32,$key_)); &xorps ($inout0,$rndkey0); &xorps ($cmac,$in0); # cmac^=out &$movekey ($rndkey0,&QWP(0,$key)); @@ -737,13 +735,17 @@ if ($PREFIX eq "aesni") { &aesenc ($cmac,$rndkey0); &$movekey ($rndkey0,&QWP(0,$key)); &jnz (&label("ccm64_dec2_loop")); + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); &aesenc ($inout0,$rndkey1); &aesenc ($cmac,$rndkey1); + &lea ($inp,&QWP(16,$inp)); &aesenclast ($inout0,$rndkey0); &aesenclast ($cmac,$rndkey0); &jmp (&label("ccm64_dec_outer")); &set_label("ccm64_dec_break",16); + &mov ($key,$key_); if ($inline) { &aesni_inline_generate1("enc",$cmac,$in0); } else diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index ae0ad7f809..499f3b3f42 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -821,8 +821,8 @@ ___ { my $cmac="%r9"; # 6th argument -my $increment="%xmm8"; -my $bswap_mask="%xmm9"; +my $increment="%xmm6"; +my $bswap_mask="%xmm7"; $code.=<<___; .globl aesni_ccm64_encrypt_blocks @@ -839,30 +839,29 @@ $code.=<<___ if ($win64); .Lccm64_enc_body: ___ $code.=<<___; + mov 240($key),$rounds # key->rounds movdqu ($ivp),$iv - movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - pshufb $bswap_mask,$iv # keep iv in reverse order - mov 240($key),$rounds # key->rounds - mov $key,$key_ - mov $rounds,$rnds_ + shr \$1,$rounds + lea 0($key),$key_ + movdqu ($cmac),$inout1 movdqa $iv,$inout0 - + mov $rounds,$rnds_ + pshufb $bswap_mask,$iv + jmp .Lccm64_enc_outer +.align 16 .Lccm64_enc_outer: - movups ($inp),$in0 # load inp - pshufb $bswap_mask,$inout0 - mov $key_,$key + $movkey ($key_),$rndkey0 mov $rnds_,$rounds + movups ($inp),$in0 # load inp - $movkey ($key),$rndkey0 - shr \$1,$rounds - $movkey 16($key),$rndkey1 - xorps $rndkey0,$in0 - lea 32($key),$key - xorps $rndkey0,$inout0 - xorps $inout1,$in0 # cmac^=inp + xorps $rndkey0,$inout0 # counter + $movkey 16($key_),$rndkey1 + xorps $in0,$rndkey0 + lea 32($key_),$key + xorps $rndkey0,$inout1 # cmac^=inp $movkey ($key),$rndkey0 .Lccm64_enc2_loop: @@ -877,16 +876,17 @@ $code.=<<___; jnz .Lccm64_enc2_loop aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 + paddq $increment,$iv aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout1 - paddq $increment,$iv dec $len lea 16($inp),$inp xorps $inout0,$in0 # inp ^= E(iv) movdqa $iv,$inout0 movups $in0,($out) # save output lea 16($out),$out + pshufb $bswap_mask,$inout0 jnz .Lccm64_enc_outer movups $inout1,($cmac) @@ -919,39 +919,40 @@ $code.=<<___ if ($win64); .Lccm64_dec_body: ___ $code.=<<___; - movdqu ($ivp),$iv + mov 240($key),$rounds # key->rounds + movups ($ivp),$iv movdqu ($cmac),$inout1 movdqa .Lincrement64(%rip),$increment movdqa .Lbswap_mask(%rip),$bswap_mask - mov 240($key),$rounds # key->rounds - movdqa $iv,$inout0 - pshufb $bswap_mask,$iv # keep iv in reverse order + movaps $iv,$inout0 mov $rounds,$rnds_ mov $key,$key_ + pshufb $bswap_mask,$iv ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; -.Lccm64_dec_outer: - paddq $increment,$iv movups ($inp),$in0 # load inp - xorps $inout0,$in0 - movdqa $iv,$inout0 + paddq $increment,$iv lea 16($inp),$inp - pshufb $bswap_mask,$inout0 - mov $key_,$key + jmp .Lccm64_dec_outer +.align 16 +.Lccm64_dec_outer: + xorps $inout0,$in0 # inp ^= E(iv) + movdqa $iv,$inout0 mov $rnds_,$rounds - movups $in0,($out) + movups $in0,($out) # save output lea 16($out),$out + pshufb $bswap_mask,$inout0 sub \$1,$len jz .Lccm64_dec_break - $movkey ($key),$rndkey0 + $movkey ($key_),$rndkey0 shr \$1,$rounds - $movkey 16($key),$rndkey1 + $movkey 16($key_),$rndkey1 xorps $rndkey0,$in0 - lea 32($key),$key + lea 32($key_),$key xorps $rndkey0,$inout0 xorps $in0,$inout1 # cmac^=out $movkey ($key),$rndkey0 @@ -966,15 +967,20 @@ $code.=<<___; aesenc $rndkey0,$inout1 $movkey 0($key),$rndkey0 jnz .Lccm64_dec2_loop + movups ($inp),$in0 # load inp + paddq $increment,$iv aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout1 + lea 16($inp),$inp aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 jmp .Lccm64_dec_outer .align 16 .Lccm64_dec_break: + #xorps $in0,$inout1 # cmac^=out ___ - &aesni_generate1("enc",$key,$rounds,$inout1); + &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); $code.=<<___; movups $inout1,($cmac) ___ -- 2.25.1