From d608b4d6629b5a19c4e96ff4ae599cef95d74c8e Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Sat, 2 May 2009 09:04:17 +0000
Subject: [PATCH] AES-NI engine jumbo update.

---
 crypto/aes/asm/aesni-x86.pl    | 270 +++++++----
 crypto/aes/asm/aesni-x86_64.pl | 836 +++++++++++++++++----------------
 crypto/engine/eng_aesni.c      |  17 +-
 test/test_aesni                |  12 +-
 4 files changed, 631 insertions(+), 504 deletions(-)

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index fe0cbe0b5e..86062a9940 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -29,8 +29,8 @@ $rounds="ecx";
 $key="edx";
 $inp="esi";
 $out="edi";
-$rounds_="ebx";
-$key_="ebp";
+$rounds_="ebx";	# backup copy for $rounds
+$key_="ebp";	# backup copy for $key
 
 $inout0="xmm0";
 $inout1="xmm1";
@@ -39,26 +39,23 @@ $rndkey0="xmm3";
 $rndkey1="xmm4";
 $ivec="xmm5";
 $in0="xmm6";
-$in1="xmm7";
+$in1="xmm7";	$inout3="xmm7";
 
-sub _aesni_generate1	# folded loop
+# Inline version of internal aesni_[en|de]crypt1
+sub aesni_inline_generate1
 { my $p=shift;
 
-    &function_begin_B("_aesni_${p}rypt1");
-	&$movekey	($rndkey0,&QWP(0,$key));
-	&$movekey	($rndkey1,&QWP(16,$key));
-	&lea		($key,&DWP(16,$key));
-	&pxor		($inout0,$rndkey0);
-	&dec		($rounds);
-    &set_label("${p}1_loop",16);
+    &$movekey		($rndkey0,&QWP(0,$key));
+    &$movekey		($rndkey1,&QWP(16,$key));
+    &lea		($key,&DWP(32,$key));
+    &pxor		($inout0,$rndkey0);
+    &set_label("${p}1_loop");
 	eval"&aes${p}	($inout0,$rndkey1)";
 	&dec		($rounds);
-	&lea		($key,&DWP(16,$key));
 	&$movekey	($rndkey1,&QWP(0,$key));
-	&jnz		(&label("${p}1_loop"));
+	&lea		($key,&DWP(16,$key));
+    &jnz		(&label("${p}1_loop"));
     eval"&aes${p}last	($inout0,$rndkey1)";
-    &ret();
-    &function_end_B("_aesni_${p}rypt1");
 }
 
 sub aesni_generate1	# fully unrolled loop
@@ -67,7 +64,7 @@ sub aesni_generate1	# fully unrolled loop
     &function_begin_B("_aesni_${p}rypt1");
 	&$movekey	($rndkey0,&QWP(0,$key));
 	&$movekey	($rndkey1,&QWP(0x10,$key));
-	&cmp		($rounds,12);
+	&cmp		($rounds,11);
 	&pxor		($inout0,$rndkey0);
 	&$movekey	($rndkey0,&QWP(0x20,$key));
 	&lea		($key,&DWP(0x30,$key));
@@ -107,52 +104,52 @@ sub aesni_generate1	# fully unrolled loop
     &function_end_B("_aesni_${p}rypt1");
 }
 
-&aesni_generate1("enc");
 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+# &aesni_generate1("enc");
 &function_begin_B("${PREFIX}_encrypt");
 	&mov	("eax",&wparam(0));
 	&mov	($key,&wparam(2));
 	&movups	($inout0,&QWP(0,"eax"));
 	&mov	($rounds,&DWP(240,$key));
 	&mov	("eax",&wparam(1));
-	&call	("_aesni_encrypt1");
+	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
 	&movups	(&QWP(0,"eax"),$inout0);
 	&ret	();
 &function_end_B("${PREFIX}_encrypt");
 
-&aesni_generate1("dec");
 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+# &aesni_generate1("dec");
 &function_begin_B("${PREFIX}_decrypt");
 	&mov	("eax",&wparam(0));
 	&mov	($key,&wparam(2));
 	&movups	($inout0,&QWP(0,"eax"));
 	&mov	($rounds,&DWP(240,$key));
 	&mov	("eax",&wparam(1));
-	&call	("_aesni_decrypt1");
+	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt1");
 	&movups	(&QWP(0,"eax"),$inout0);
 	&ret	();
 &function_end_B("${PREFIX}_decrypt");
-
-# _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave
-# factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned
-# out that it can be scheduled only every *second* cycle. Thus 3x
-# interleave is the one providing optimal utilization, i.e. when
-# subroutine's throughput is virtually same as of non-interleaved
-# subroutine for number of input blocks up to 3. This is why it
-# handles even double-block inputs. Larger interleave factor would
-# perform suboptimally on shorter inputs... 
-
+
+# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
+# factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
+# latency is 6, it turned out that it can be scheduled only every
+# *second* cycle. Thus 3x interleave is the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement a 2x subroutine. As soon
+# as/if Intel improves throughput by making it possible to schedule
+# the instructions in question *every* cycle, I would have to
+# implement 6x interleave and use it in the loop...
 sub aesni_generate3
 { my $p=shift;
 
     &function_begin_B("_aesni_${p}rypt3");
 	&$movekey	($rndkey0,&QWP(0,$key));
-	&$movekey	($rndkey1,&QWP(16,$key));
 	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
 	&lea		($key,&DWP(32,$key));
 	&pxor		($inout0,$rndkey0);
 	&pxor		($inout1,$rndkey0);
-	&dec		($rounds);
 	&pxor		($inout2,$rndkey0);
 	&jmp		(&label("${p}3_loop"));
     &set_label("${p}3_loop",16);
@@ -177,14 +174,59 @@ sub aesni_generate3
     &ret();
     &function_end_B("_aesni_${p}rypt3");
 }
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4	# emits _aesni_${p}rypt4, 4x interleave; modifies $key and $rounds
+{ my $p=shift;		# "enc" or "dec", spliced into the aes${p}/aes${p}last mnemonics
+
+    &function_begin_B("_aesni_${p}rypt4");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&shr		($rounds,1);		# each loop pass below applies two round keys
+	&lea		($key,&DWP(32,$key));	# step past the two round keys loaded above
+	&pxor		($inout0,$rndkey0);	# first round key is XORed into every block
+	&pxor		($inout1,$rndkey0);
+	&pxor		($inout2,$rndkey0);
+	&pxor		($inout3,$rndkey0);	# $inout3 shares xmm7 with $in1
+	&jmp		(&label("${p}4_loop"));
+    &set_label("${p}4_loop",16);
+	eval"&aes${p}	($inout0,$rndkey1)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);		# result is tested by the jnz closing the loop
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout1,$rndkey0)";
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	&jnz		(&label("${p}4_loop"));
+    eval"&aes${p}	($inout0,$rndkey1)";	# tail: one more aes${p}, then aes${p}last
+    &$movekey		($rndkey0,&QWP(0,$key));
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt4");
+}
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
 
 if ($PREFIX eq "aesni") {
 # void aesni_ecb_encrypt (const void *in, void *out,
 #                         size_t length, const AES_KEY *key,
 #                         int enc);
-
 &function_begin("aesni_ecb_encrypt");
 	&mov	($inp,&wparam(0));
 	&mov	($out,&wparam(1));
@@ -200,79 +242,121 @@ if ($PREFIX eq "aesni") {
 	&mov	($rounds_,$rounds);	# backup $rounds
 	&jz	(&label("ecb_decrypt"));
 
-	&sub	($len,0x30);
-	&jc	(&label("ecb_enc_tail"));
-	jmp	(&label("ecb_enc_loop3"));
+	&sub	($len,0x40);
+	&jbe	(&label("ecb_enc_tail"));
+	&jmp	(&label("ecb_enc_loop3"));
 
 &set_label("ecb_enc_loop3",16);
 	&movups	($inout0,&QWP(0,$inp));
 	&movups	($inout1,&QWP(0x10,$inp));
 	&movups	($inout2,&QWP(0x20,$inp));
-	&lea	($inp,&DWP(0x30,$inp));
 	&call	("_aesni_encrypt3");
-	&movups	(&QWP(0,$out),$inout0);
 	&sub	($len,0x30);
-	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($inp,&DWP(0x30,$inp));
+	&lea	($out,&DWP(0x30,$out));
+	&movups	(&QWP(-0x30,$out),$inout0);
 	&mov	($key,$key_);		# restore $key
-	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(-0x20,$out),$inout1);
 	&mov	($rounds,$rounds_);	# restore $rounds
-	&lea	($out,&DWP(0x30,$out));
-	&jnc	(&label("ecb_enc_loop3"));
+	&movups	(&QWP(-0x10,$out),$inout2);
+	&ja	(&label("ecb_enc_loop3"));
 
 &set_label("ecb_enc_tail");
-	&add	($len,0x30);
+	&add	($len,0x40);
 	&jz	(&label("ecb_ret"));
 
 	&cmp	($len,0x10);
 	&movups	($inout0,&QWP(0,$inp));
-	je	(&label("ecb_enc_one"));
+	&je	(&label("ecb_enc_one"));
+	&cmp	($len,0x20);
 	&movups	($inout1,&QWP(0x10,$inp));
-	&call	("_aesni_encrypt3");
+	&je	(&label("ecb_enc_two"));
+	&cmp	($len,0x30);
+	&movups	($inout2,&QWP(0x20,$inp));
+	&je	(&label("ecb_enc_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&call	("_aesni_encrypt4");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
 	jmp	(&label("ecb_ret"));
 
 &set_label("ecb_enc_one",16);
-	&call	("_aesni_encrypt1");
+	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
 	&movups	(&QWP(0,$out),$inout0);
 	&jmp	(&label("ecb_ret"));
 
+&set_label("ecb_enc_two",16);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
 &set_label("ecb_decrypt",16);
-	&sub	($len,0x30);
-	&jc	(&label("ecb_dec_tail"));
-	jmp	(&label("ecb_dec_loop3"));
+	&sub	($len,0x40);
+	&jbe	(&label("ecb_dec_tail"));
+	&jmp	(&label("ecb_dec_loop3"));
 
 &set_label("ecb_dec_loop3",16);
 	&movups	($inout0,&QWP(0,$inp));
 	&movups	($inout1,&QWP(0x10,$inp));
 	&movups	($inout2,&QWP(0x20,$inp));
 	&call	("_aesni_decrypt3");
-	&movups	(&QWP(0,$out),$inout0);
 	&sub	($len,0x30);
 	&lea	($inp,&DWP(0x30,$inp));
-	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($out,&DWP(0x30,$out));
+	&movups	(&QWP(-0x30,$out),$inout0);
 	&mov	($key,$key_);		# restore $key
-	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(-0x20,$out),$inout1);
 	&mov	($rounds,$rounds_);	# restore $rounds
-	&lea	($out,&DWP(0x30,$out));
-	&jnc	(&label("ecb_dec_loop3"));
+	&movups	(&QWP(-0x10,$out),$inout2);
+	&ja	(&label("ecb_dec_loop3"));
 
 &set_label("ecb_dec_tail");
-	&add	($len,0x30);
+	&add	($len,0x40);
 	&jz	(&label("ecb_ret"));
 
 	&cmp	($len,0x10);
 	&movups	($inout0,&QWP(0,$inp));
-	je	(&label("ecb_dec_one"));
+	&je	(&label("ecb_dec_one"));
+	&cmp	($len,0x20);
 	&movups	($inout1,&QWP(0x10,$inp));
-	&call	("_aesni_decrypt3");
+	&je	(&label("ecb_dec_two"));
+	&cmp	($len,0x30);
+	&movups	($inout2,&QWP(0x20,$inp));
+	&je	(&label("ecb_dec_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&call	("_aesni_decrypt4");
 	&movups	(&QWP(0,$out),$inout0);
 	&movups	(&QWP(0x10,$out),$inout1);
-	jmp	(&label("ecb_ret"));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&jmp	(&label("ecb_ret"));
 
 &set_label("ecb_dec_one",16);
-	&call	("_aesni_decrypt1");
+	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt1");
 	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
 
 &set_label("ecb_ret");
 &function_end("aesni_ecb_encrypt");
@@ -288,7 +372,7 @@ if ($PREFIX eq "aesni") {
 	&mov	($key,&wparam(3));
 	&test	($len,$len);
 	&mov	($key_,&wparam(4));
-	&je	(&label("cbc_ret"));
+	&jz	(&label("cbc_ret"));
 
 	&cmp	(&wparam(5),0);
 	&movups	($ivec,&QWP(0,$key_));	# load IV
@@ -307,12 +391,12 @@ if ($PREFIX eq "aesni") {
 	&movups	($ivec,&QWP(0,$inp));
 	&lea	($inp,&DWP(16,$inp));
 	&pxor	($inout0,$ivec);
-	&call	("_aesni_encrypt1");
+	&aesni_inline_generate1("enc");	# &call	("_aesni_encrypt1");
 	&sub	($len,16);
+	&lea	($out,&DWP(16,$out));
 	&mov	($rounds,$rounds_);	# restore $rounds
 	&mov	($key,$key_);		# restore $key
-	&movups	(&QWP(0,$out),$inout0);
-	&lea	($out,&DWP(16,$out));
+	&movups	(&QWP(-16,$out),$inout0);
 	&jnc	(&label("cbc_enc_loop"));
 	&add	($len,16);
 	&jnz	(&label("cbc_enc_tail"));
@@ -333,8 +417,8 @@ if ($PREFIX eq "aesni") {
 	&jmp	(&label("cbc_enc_loop"));
 
 &set_label("cbc_decrypt",16);
-	&sub	($len,0x30);
-	&jc	(&label("cbc_dec_tail"));
+	&sub	($len,0x40);
+	&jbe	(&label("cbc_dec_tail"));
 	&jmp	(&label("cbc_dec_loop3"));
 
 &set_label("cbc_dec_loop3",16);
@@ -346,20 +430,20 @@ if ($PREFIX eq "aesni") {
 	&call	("_aesni_decrypt3");
 	&sub	($len,0x30);
 	&lea	($inp,&DWP(0x30,$inp));
+	&lea	($out,&DWP(0x30,$out));
 	&pxor	($inout0,$ivec);
 	&pxor	($inout1,$in0);
 	&movups	($ivec,&QWP(-0x10,$inp));
 	&pxor	($inout2,$in1);
-	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(-0x30,$out),$inout0);
 	&mov	($rounds,$rounds_)	# restore $rounds
-	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(-0x20,$out),$inout1);
 	&mov	($key,$key_);		# restore $key
-	&movups	(&QWP(0x20,$out),$inout2);
-	&lea	($out,&DWP(0x30,$out));
-	&jnc	(&label("cbc_dec_loop3"));
+	&movups	(&QWP(-0x10,$out),$inout2);
+	&ja	(&label("cbc_dec_loop3"));
 
 &set_label("cbc_dec_tail");
-	&add	($len,0x30);
+	&add	($len,0x40);
 	&jz	(&label("cbc_ret"));
 
 	&movups	($inout0,&QWP(0,$inp));
@@ -371,19 +455,26 @@ if ($PREFIX eq "aesni") {
 	&movaps	($in1,$inout1);
 	&jbe	(&label("cbc_dec_two"));
 	&movups	($inout2,&QWP(0x20,$inp));
-	&call	("_aesni_decrypt3");
+	&cmp	($len,0x30);
+	&jbe	(&label("cbc_dec_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&call	("_aesni_decrypt4");
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&movups	($rndkey1,&QWP(0x20,$inp));
 	&pxor	($inout0,$ivec);
-	&movups	($ivec,&QWP(0x20,$inp));
 	&pxor	($inout1,$in0);
-	&pxor	($inout2,$in1);
+	&movups	($ivec,&QWP(0x30,$inp));
 	&movups	(&QWP(0,$out),$inout0);
+	&pxor	($inout2,$rndkey0);
+	&pxor	($inout3,$rndkey1);
 	&movups	(&QWP(0x10,$out),$inout1);
-	&movaps	($inout0,$inout2);
-	&lea	($out,&DWP(0x20,$out));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movaps	($inout0,$inout3);
+	&lea	($out,&DWP(0x30,$out));
 	&jmp	(&label("cbc_dec_tail_collected"));
 
 &set_label("cbc_dec_one");
-	&call	("_aesni_decrypt1");
+	&aesni_inline_generate1("dec");	# &call	("_aesni_decrypt1");
 	&pxor	($inout0,$ivec);
 	&movaps	($ivec,$in0);
 	&jmp	(&label("cbc_dec_tail_collected"));
@@ -396,6 +487,18 @@ if ($PREFIX eq "aesni") {
 	&movaps	($inout0,$inout1);
 	&movaps	($ivec,$in1);
 	&lea	($out,&DWP(0x10,$out));
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three");
+	&call	("_aesni_decrypt3");
+	&pxor	($inout0,$ivec);
+	&pxor	($inout1,$in0);
+	&pxor	($inout2,$in1);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movaps	($inout0,$inout2);
+	&movups	($ivec,&QWP(0x20,$inp));
+	&lea	($out,&DWP(0x20,$out));
 
 &set_label("cbc_dec_tail_collected");
 	&and	($len,15);
@@ -446,7 +549,7 @@ if ($PREFIX eq "aesni") {
 	&jne	(&label("bad_keybits"));
 
 &set_label("10rounds",16);
-	&mov		($rounds,10);
+	&mov		($rounds,9);
 	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
 	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
 	&call		(&label("key_128_cold"));
@@ -487,7 +590,7 @@ if ($PREFIX eq "aesni") {
 
 &set_label("12rounds",16);
 	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
-	&mov		($rounds,12);
+	&mov		($rounds,11);
 	&$movekey	(&QWP(-16,$key),"xmm0")		# round 0
 	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
 	&call		(&label("key_192a_cold"));
@@ -540,7 +643,7 @@ if ($PREFIX eq "aesni") {
 
 &set_label("14rounds",16);
 	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
-	&mov		($rounds,14);
+	&mov		($rounds,13);
 	&lea		($key,&DWP(16,$key));
 	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
 	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
@@ -625,10 +728,10 @@ if ($PREFIX eq "aesni") {
 	&mov	($key,&wparam(2));
 	&call	("_aesni_set_encrypt_key");
 	&mov	($key,&wparam(2));
-	&shl	($rounds,4)	# actually rounds after _aesni_set_encrypt_key
+	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
 	&test	("eax","eax");
 	&jnz	(&label("dec_key_ret"));
-	&lea	("eax",&DWP(0,$key,$rounds));	# end of key schedule
+	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
 
 	&$movekey	("xmm0",&QWP(0,$key));	# just swap
 	&$movekey	("xmm1",&QWP(0,"eax"));
@@ -636,9 +739,8 @@ if ($PREFIX eq "aesni") {
 	&$movekey	(&QWP(0,$key),"xmm1");
 	&lea		($key,&DWP(16,$key));
 	&lea		("eax",&DWP(-16,"eax"));
-	&jmp		(&label("dec_key_inverse"));
 
-&set_label("dec_key_inverse",16);
+&set_label("dec_key_inverse");
 	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
 	&$movekey	("xmm1",&QWP(0,"eax"));
 	&aesimc		("xmm0","xmm0");
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 4ed3932b75..c9d7485637 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -11,9 +11,6 @@
 # OpenSSL context it's used with Intel engine, but can also be used as
 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
 # details].
-#
-# TODO:
-# - Win64 SEH handlers;
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -33,15 +30,15 @@ die "can't locate x86_64-xlate.pl";
 open STDOUT,"| $^X $xlate $flavour $output";
 
 $movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
+@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
 $code=".text\n";
 
 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
-
-# this is natural argument order for public $PREFIX_*crypt...
+# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
 $inp="%rdi";
 $out="%rsi";
-# ... and for $PREFIX_[ebc|cbc]_encrypt in particular.
 $len="%rdx";
 $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
 $ivp="%r8";	# cbc
@@ -52,162 +49,169 @@ $key_="%r11";	# backup copy for $key
 # %xmm register layout
 $inout0="%xmm0";	$inout1="%xmm1";
 $inout2="%xmm2";	$inout3="%xmm3";
-$inout4="%xmm4";	$inout5="%xmm5";
-$rndkey0="%xmm6";	$rndkey1="%xmm7";
+$rndkey0="%xmm4";	$rndkey1="%xmm5";
 
-$iv="%xmm8";
-$in0="%xmm9";	$in1="%xmm10";
-$in2="%xmm11";	$in3="%xmm12";
-$in4="%xmm13";	$in5="%xmm14";
+$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
+$in1="%xmm8";		$in2="%xmm9";
 
 # Inline version of internal aesni_[en|de]crypt1.
 #
 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
 # cycles which take care of loop variables...
 { my $sn;
-sub aesni_encrypt1 {
-my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_;
+sub aesni_generate1 {
+my ($p,$key,$rounds)=@_;
 ++$sn;
 $code.=<<___;
 	$movkey	($key),$rndkey0
 	$movkey	16($key),$rndkey1
-	lea	16($key),$key
-	pxor	$rndkey0,$data
-	dec	$rounds
-	jmp	.Loop_enc1_$sn
-.align	16
-.Loop_enc1_$sn:
-	aesenc	$rndkey1,$data
+	lea	32($key),$key
+	pxor	$rndkey0,$inout0
+.Loop_${p}1_$sn:
+	aes${p}	$rndkey1,$inout0
 	dec	$rounds
-	lea	16($key),$key
 	$movkey	($key),$rndkey1
-	jnz	.Loop_enc1_$sn	# loop body is 16 bytes
-
-	aesenclast	$rndkey1,$data
-___
-}}
-{ my $sn;
-sub aesni_decrypt1 {
-my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_;
-++$sn;
-$code.=<<___;
-	$movkey	($key),$rndkey0
-	$movkey	16($key),$rndkey1
 	lea	16($key),$key
-	pxor	$rndkey0,$data
-	dec	$rounds
-	jmp	.Loop_dec1_$sn
-.align	16
-.Loop_dec1_$sn:
-	aesdec	$rndkey1,$data
-	dec	$rounds
-	lea	16($key),$key
-	$movkey	($key),$rndkey1
-	jnz	.Loop_dec1_$sn	# loop body is 16 bytes
-
-	aesdeclast	$rndkey1,$data
+	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
+	aes${p}last	$rndkey1,$inout0
 ___
 }}
-
-# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
 #
+{ my ($inp,$out,$key) = @_4args;
+
 $code.=<<___;
 .globl	${PREFIX}_encrypt
-.type	${PREFIX}_encrypt,\@function,3
+.type	${PREFIX}_encrypt,\@abi-omnipotent
 .align	16
 ${PREFIX}_encrypt:
-	movups	($inp),%xmm0		# load input
-	mov	240(%rdx),$rounds	# pull $rounds
+	movups	($inp),$inout0		# load input
+	mov	240($key),$rounds	# pull $rounds
 ___
-	&aesni_encrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds);
+	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
-	movups	%xmm0,(%rsi)		# output
+	movups	$inout0,($out)		# output
 	ret
 .size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
-___
 
-# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
-#
-$code.=<<___;
 .globl	${PREFIX}_decrypt
-.type	${PREFIX}_decrypt,\@function,3
+.type	${PREFIX}_decrypt,\@abi-omnipotent
 .align	16
 ${PREFIX}_decrypt:
-	movups	($inp),%xmm0		# load input
-	mov	240(%rdx),$rounds	# pull $rounds
+	movups	($inp),$inout0		# load input
+	mov	240($key),$rounds	# pull $rounds
 ___
-	&aesni_decrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds);
+	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
-	movups	%xmm0,($out)		# output
+	movups	$inout0,($out)		# output
 	ret
 .size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
 ___
+}
 
-# _aesni_[en|de]crypt6 are private interfaces, 6 denotes interleave
-# factor. Why 6x? Because aes[enc|dec] latency is 6 and 6x interleave
-# provides optimal utilization, so that subroutine's throughput is
-# virtually same for *any* number [naturally up to 6] of input blocks
-# as for non-interleaved subroutine. This is why it handles even
-# double-, tripple-, quad- and penta-block inputs. Larger interleave
-# factor, e.g. 8x, would perform suboptimally on these shorter inputs...
-sub aesni_generate6 {
+# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
+# factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
+# latency is 6, it turned out that it can be scheduled only every
+# *second* cycle. Thus 3x interleave is the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement a 2x subroutine. As soon
+# as/if Intel improves throughput by making it possible to schedule
+# the instructions in question *every* cycle, I would have to
+# implement 6x interleave and use it in the loop...
+sub aesni_generate3 {
 my $dir=shift;
 # As already mentioned it takes in $key and $rounds, which are *not*
-# preserved. $inout[0-5] is cipher/clear text...
+# preserved. $inout[0-2] is cipher/clear text...
 $code.=<<___;
-.type	_aesni_${dir}rypt6,\@abi-omnipotent
+.type	_aesni_${dir}rypt3,\@abi-omnipotent
 .align	16
-_aesni_${dir}rypt6:
+_aesni_${dir}rypt3:
 	$movkey	($key),$rndkey0
+	shr	\$1,$rounds
 	$movkey	16($key),$rndkey1
+	lea	32($key),$key
+	pxor	$rndkey0,$inout0
+	pxor	$rndkey0,$inout1
+	pxor	$rndkey0,$inout2
+
+.L${dir}_loop3:
+	aes${dir}	$rndkey1,$inout0
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey0,$inout0
+	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	jnz		.L${dir}_loop3
+
+	aes${dir}	$rndkey1,$inout0
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	ret
+.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+___
+}
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement would be <20%,
+# so it's not worth it...
+sub aesni_generate4 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-3] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt4,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt4:
+	$movkey	($key),$rndkey0
 	shr	\$1,$rounds
+	$movkey	16($key),$rndkey1
 	lea	32($key),$key
-	dec	$rounds
 	pxor	$rndkey0,$inout0
 	pxor	$rndkey0,$inout1
 	pxor	$rndkey0,$inout2
 	pxor	$rndkey0,$inout3
-	pxor	$rndkey0,$inout4
-	pxor	$rndkey0,$inout5
-	jmp	.L${dir}_loop6
-.align	16
-.L${dir}_loop6:
+
+.L${dir}_loop4:
 	aes${dir}	$rndkey1,$inout0
 	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
-	aes${dir}	$rndkey1,$inout4
-	aes${dir}	$rndkey1,$inout5
 	aes${dir}	$rndkey0,$inout0
 	$movkey		16($key),$rndkey1
 	aes${dir}	$rndkey0,$inout1
 	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
 	aes${dir}	$rndkey0,$inout3
-	aes${dir}	$rndkey0,$inout4
-	aes${dir}	$rndkey0,$inout5
-	jnz		.L${dir}_loop6
+	jnz		.L${dir}_loop4
+
 	aes${dir}	$rndkey1,$inout0
 	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
-	aes${dir}	$rndkey1,$inout4
-	aes${dir}	$rndkey1,$inout5
 	aes${dir}last	$rndkey0,$inout0
 	aes${dir}last	$rndkey0,$inout1
 	aes${dir}last	$rndkey0,$inout2
 	aes${dir}last	$rndkey0,$inout3
-	aes${dir}last	$rndkey0,$inout4
-	aes${dir}last	$rndkey0,$inout5
 	ret
-.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
 ___
 }
-&aesni_generate6("enc");
-&aesni_generate6("dec");
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
 
 if ($PREFIX eq "aesni") {
 # void aesni_ecb_encrypt (const void *in, void *out,
@@ -219,48 +223,36 @@ $code.=<<___;
 .align	16
 aesni_ecb_encrypt:
 	cmp	\$16,$len		# check length
-	jb	.Lecb_abort
-___
-$code.=<<___ if ($win64);
-	lea	-0x28(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,16(%rsp)
-___
-$code.=<<___;
+	jb	.Lecb_ret
+
 	mov	240($key),$rounds	# pull $rounds
 	and	\$-16,$len
 	mov	$key,$key_		# backup $key
-	test	%r8d,%r8d
+	test	%r8d,%r8d		# 5th argument
 	mov	$rounds,$rnds_		# backup $rounds
 	jz	.Lecb_decrypt
 #--------------------------- ECB ENCRYPT ------------------------------#
-	sub	\$0x60,$len
-	jc	.Lecb_enc_tail
-	jmp	.Lecb_enc_loop6
+	sub	\$0x40,$len
+	jbe	.Lecb_enc_tail
+	jmp	.Lecb_enc_loop3
 .align 16
-.Lecb_enc_loop6:
+.Lecb_enc_loop3:
 	movups	($inp),$inout0
 	movups	0x10($inp),$inout1
 	movups	0x20($inp),$inout2
-	movups	0x30($inp),$inout3
-	movups	0x40($inp),$inout4
-	movups	0x50($inp),$inout5
-	call	_aesni_encrypt6
-	movups	$inout0,($out)
-	sub	\$0x60,$len
-	movups	$inout1,0x10($out)
-	lea	0x60($inp),$inp
-	movups	$inout2,0x20($out)
+	call	_aesni_encrypt3
+	sub	\$0x30,$len
+	lea	0x30($inp),$inp
+	lea	0x30($out),$out
+	movups	$inout0,-0x30($out)
 	mov	$rnds_,$rounds		# restore $rounds
-	movups	$inout3,0x30($out)
+	movups	$inout1,-0x20($out)
 	mov	$key_,$key		# restore $key
-	movups	$inout4,0x40($out)
-	movups	$inout5,0x50($out)
-	lea	0x60($out),$out
-	jnc	.Lecb_enc_loop6
+	movups	$inout2,-0x10($out)
+	ja	.Lecb_enc_loop3
 
 .Lecb_enc_tail:
-	add	\$0x60,$len
+	add	\$0x40,$len
 	jz	.Lecb_ret
 
 	cmp	\$0x10,$len
@@ -272,75 +264,57 @@ $code.=<<___;
 	cmp	\$0x30,$len
 	movups	0x20($inp),$inout2
 	je	.Lecb_enc_three
-	cmp	\$0x40,$len
 	movups	0x30($inp),$inout3
-	je	.Lecb_enc_four
-	movups	0x40($inp),$inout4
-	call	_aesni_encrypt6
+	call	_aesni_encrypt4
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
 	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_one:
 ___
-	&aesni_encrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
+	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
 	movups	$inout0,($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_two:
-	call	_aesni_encrypt6
+	call	_aesni_encrypt3
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_three:
-	call	_aesni_encrypt6
+	call	_aesni_encrypt3
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
 	jmp	.Lecb_ret
-.align	16
-.Lecb_enc_four:
-	call	_aesni_encrypt6
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
-	jmp	.Lecb_ret
 #--------------------------- ECB DECRYPT ------------------------------#
 .align	16
 .Lecb_decrypt:
-	sub	\$0x60,$len
-	jc	.Lecb_dec_tail
-	jmp	.Lecb_dec_loop6
+	sub	\$0x40,$len
+	jbe	.Lecb_dec_tail
+	jmp	.Lecb_dec_loop3
 .align 16
-.Lecb_dec_loop6:
+.Lecb_dec_loop3:
 	movups	($inp),$inout0
 	movups	0x10($inp),$inout1
 	movups	0x20($inp),$inout2
-	movups	0x30($inp),$inout3
-	movups	0x40($inp),$inout4
-	movups	0x50($inp),$inout5
-	call	_aesni_decrypt6
-	movups	$inout0,($out)
-	sub	\$0x60,$len
-	movups	$inout1,0x10($out)
-	lea	0x60($inp),$inp
-	movups	$inout2,0x20($out)
+	call	_aesni_decrypt3
+	sub	\$0x30,$len
+	lea	0x30($inp),$inp
+	lea	0x30($out),$out
+	movups	$inout0,-0x30($out)
 	mov	$rnds_,$rounds		# restore $rounds
-	movups	$inout3,0x30($out)
+	movups	$inout1,-0x20($out)
 	mov	$key_,$key		# restore $key
-	movups	$inout4,0x40($out)
-	movups	$inout5,0x50($out)
-	lea	0x60($out),$out
-	jnc	.Lecb_dec_loop6
+	movups	$inout2,-0x10($out)
+	ja	.Lecb_dec_loop3
 
 .Lecb_dec_tail:
-	add	\$0x60,$len
+	add	\$0x40,$len
 	jz	.Lecb_ret
 
 	cmp	\$0x10,$len
@@ -352,54 +326,34 @@ $code.=<<___;
 	cmp	\$0x30,$len
 	movups	0x20($inp),$inout2
 	je	.Lecb_dec_three
-	cmp	\$0x40,$len
 	movups	0x30($inp),$inout3
-	je	.Lecb_dec_four
-	movups	0x40($inp),$inout4
-	call	_aesni_decrypt6
+	call	_aesni_decrypt4
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
 	movups	$inout3,0x30($out)
-	movups	$inout4,0x40($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_one:
 ___
-	&aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
+	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
 	movups	$inout0,($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_two:
-	call	_aesni_decrypt6
+	call	_aesni_decrypt3
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_dec_three:
-	call	_aesni_decrypt6
-	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
-	jmp	.Lecb_ret
-.align	16
-.Lecb_dec_four:
-	call	_aesni_decrypt6
+	call	_aesni_decrypt3
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
-	movups	$inout3,0x30($out)
 
 .Lecb_ret:
-___
-$code.=<<___ if ($win64);
-	movaps	(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	lea	0x28(%rsp),%rsp
-___
-$code.=<<___;
-.Lecb_abort:
 	ret
 .size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
 ___
@@ -408,7 +362,7 @@ ___
 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 #			    size_t length, const AES_KEY *key,
 #			    unsigned char *ivp,const int enc);
-$reserved = $win64?0x90:-0x18;	# used in decrypt
+$reserved = $win64?0x40:-0x18;	# used in decrypt
 $code.=<<___;
 .globl	${PREFIX}_cbc_encrypt
 .type	${PREFIX}_cbc_encrypt,\@function,6
@@ -416,34 +370,35 @@ $code.=<<___;
 ${PREFIX}_cbc_encrypt:
 	test	$len,$len		# check length
 	jz	.Lcbc_ret
-	mov	240($key),$rounds	# pull $rounds
+
+	mov	240($key),$rnds_	# pull $rounds
 	mov	$key,$key_		# backup $key
-	test	%r9d,%r9d
-	mov	$rounds,$rnds_		# backup $rounds
+	test	%r9d,%r9d		# 6th argument
 	jz	.Lcbc_decrypt
 #--------------------------- CBC ENCRYPT ------------------------------#
-	movups	($ivp),%xmm0	# load iv as initial state
+	movups	($ivp),$inout0		# load iv as initial state
 	cmp	\$16,$len
+	mov	$rnds_,$rounds
 	jb	.Lcbc_enc_tail
 	sub	\$16,$len
 	jmp	.Lcbc_enc_loop
 .align 16
 .Lcbc_enc_loop:
-	movups	($inp),%xmm2	# load input
+	movups	($inp),$inout1		# load input
 	lea	16($inp),$inp
-	pxor	%xmm2,%xmm0
+	pxor	$inout1,$inout0
 ___
-	&aesni_encrypt1("%xmm0","%xmm1","%xmm2",$key,$rounds);
+	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
-	movups	%xmm0,($out)	# store output
 	sub	\$16,$len
 	lea	16($out),$out
-	mov	$rnds_,$rounds	# restore $rounds
-	mov	$key_,$key	# restore $key
+	mov	$rnds_,$rounds		# restore $rounds
+	mov	$key_,$key		# restore $key
+	movups	$inout0,-16($out)	# store output
 	jnc	.Lcbc_enc_loop
 	add	\$16,$len
 	jnz	.Lcbc_enc_tail
-	movups	%xmm0,($ivp)
+	movups	$inout0,($ivp)
 	jmp	.Lcbc_ret
 
 .Lcbc_enc_tail:
@@ -465,59 +420,44 @@ $code.=<<___;
 .Lcbc_decrypt:
 ___
 $code.=<<___ if ($win64);
-	lea	-0xa8(%rsp),%rsp
+	lea	-0x58(%rsp),%rsp
 	movaps	%xmm6,(%rsp)
 	movaps	%xmm7,0x10(%rsp)
 	movaps	%xmm8,0x20(%rsp)
 	movaps	%xmm9,0x30(%rsp)
-	movaps	%xmm10,0x40(%rsp)
-	movaps	%xmm11,0x50(%rsp)
-	movaps	%xmm12,0x60(%rsp)
-	movaps	%xmm13,0x70(%rsp)
-	movaps	%xmm14,0x80(%rsp)
+.Lcbc_decrypt_body:
 ___
 $code.=<<___;
 	movups	($ivp),$iv
-	sub	\$0x60,$len
-	jc	.Lcbc_dec_tail
-	jmp	.Lcbc_dec_loop6
+	sub	\$0x40,$len
+	mov	$rnds_,$rounds
+	jbe	.Lcbc_dec_tail
+	jmp	.Lcbc_dec_loop3
 .align 16
-.Lcbc_dec_loop6:
+.Lcbc_dec_loop3:
 	movups	($inp),$inout0
 	movups	0x10($inp),$inout1
 	movups	0x20($inp),$inout2
-	movups	0x30($inp),$inout3
 	movaps	$inout0,$in0
-	movups	0x40($inp),$inout4
 	movaps	$inout1,$in1
-	movups	0x50($inp),$inout5
 	movaps	$inout2,$in2
-	movaps	$inout3,$in3
-	movaps	$inout4,$in4
-	movaps	$inout5,$in5
-	call	_aesni_decrypt6
+	call	_aesni_decrypt3
+	sub	\$0x30,$len
+	lea	0x30($inp),$inp
+	lea	0x30($out),$out
 	pxor	$iv,$inout0
 	pxor	$in0,$inout1
-	movups	$inout0,($out)
-	sub	\$0x60,$len
+	movaps	$in2,$iv
 	pxor	$in1,$inout2
-	movups	$inout1,0x10($out)
-	lea	0x60($inp),$inp
-	pxor	$in2,$inout3
-	movups	$inout2,0x20($out)
+	movups	$inout0,-0x30($out)
 	mov	$rnds_,$rounds	# restore $rounds
-	pxor	$in3,$inout4
-	movups	$inout3,0x30($out)
+	movups	$inout1,-0x20($out)
 	mov	$key_,$key	# restore $key
-	pxor	$in4,$inout5
-	movups	$inout4,0x40($out)
-	movaps	$in5,$iv
-	movups	$inout5,0x50($out)
-	lea	0x60($out),$out
-	jnc	.Lcbc_dec_loop6
+	movups	$inout2,-0x10($out)
+	ja	.Lcbc_dec_loop3
 
 .Lcbc_dec_tail:
-	add	\$0x60,$len
+	add	\$0x40,$len
 	movups	$iv,($ivp)
 	jz	.Lcbc_dec_ret
 
@@ -534,42 +474,29 @@ $code.=<<___;
 	movaps	$inout2,$in2
 	jbe	.Lcbc_dec_three
 	movups	0x30($inp),$inout3
-	cmp	\$0x40,$len
-	movaps	$inout3,$in3
-	jbe	.Lcbc_dec_four
-	movups	0x40($inp),$inout4
-	cmp	\$0x50,$len
-	movaps	$inout4,$in4
-	jbe	.Lcbc_dec_five
-	movups	0x50($inp),$inout5
-	movaps	$inout5,$in5
-	call	_aesni_decrypt6
+	call	_aesni_decrypt4
 	pxor	$iv,$inout0
+	movups	0x30($inp),$iv
 	pxor	$in0,$inout1
 	movups	$inout0,($out)
 	pxor	$in1,$inout2
 	movups	$inout1,0x10($out)
 	pxor	$in2,$inout3
 	movups	$inout2,0x20($out)
-	pxor	$in3,$inout4
-	movups	$inout3,0x30($out)
-	pxor	$in4,$inout5
-	movups	$inout4,0x40($out)
-	movaps	$in5,$iv
-	movaps	$inout5,$inout0
-	lea	0x50($out),$out
+	movaps	$inout3,$inout0
+	lea	0x30($out),$out
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_one:
 ___
-	&aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
+	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
 	pxor	$iv,$inout0
 	movaps	$in0,$iv
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_two:
-	call	_aesni_decrypt6
+	call	_aesni_decrypt3
 	pxor	$iv,$inout0
 	pxor	$in0,$inout1
 	movups	$inout0,($out)
@@ -579,7 +506,7 @@ $code.=<<___;
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_three:
-	call	_aesni_decrypt6
+	call	_aesni_decrypt3
 	pxor	$iv,$inout0
 	pxor	$in0,$inout1
 	movups	$inout0,($out)
@@ -590,36 +517,6 @@ $code.=<<___;
 	lea	0x20($out),$out
 	jmp	.Lcbc_dec_tail_collected
 .align	16
-.Lcbc_dec_four:
-	call	_aesni_decrypt6
-	pxor	$iv,$inout0
-	pxor	$in0,$inout1
-	movups	$inout0,($out)
-	pxor	$in1,$inout2
-	movups	$inout1,0x10($out)
-	pxor	$in2,$inout3
-	movups	$inout2,0x20($out)
-	movaps	$in3,$iv
-	movaps	$inout3,$inout0
-	lea	0x30($out),$out
-	jmp	.Lcbc_dec_tail_collected
-.align	16
-.Lcbc_dec_five:
-	call	_aesni_decrypt6
-	pxor	$iv,$inout0
-	pxor	$in0,$inout1
-	movups	$inout0,($out)
-	pxor	$in1,$inout2
-	movups	$inout1,0x10($out)
-	pxor	$in2,$inout3
-	movups	$inout2,0x20($out)
-	pxor	$in3,$inout4
-	movups	$inout3,0x30($out)
-	movaps	$in4,$iv
-	movaps	$inout4,$inout0
-	lea	0x40($out),$out
-	jmp	.Lcbc_dec_tail_collected
-.align	16
 .Lcbc_dec_tail_collected:
 	and	\$15,$len
 	movups	$iv,($ivp)
@@ -640,12 +537,7 @@ $code.=<<___ if ($win64);
 	movaps	0x10(%rsp),%xmm7
 	movaps	0x20(%rsp),%xmm8
 	movaps	0x30(%rsp),%xmm9
-	movaps	0x40(%rsp),%xmm10
-	movaps	0x50(%rsp),%xmm11
-	movaps	0x60(%rsp),%xmm12
-	movaps	0x70(%rsp),%xmm13
-	movaps	0x80(%rsp),%xmm14
-	lea	0xa8(%rsp),%rsp
+	lea	0x58(%rsp),%rsp
 ___
 $code.=<<___;
 .Lcbc_ret:
@@ -653,61 +545,49 @@ $code.=<<___;
 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
 ___
 
-{
-# this is natural argument order for $PREFIX_set_[en|de]crypt_key 
-my $inp="%rdi";
-my $bits="%esi";
-my $key="%rdx";
-
-# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
-#                              AES_KEY *key)
-$code.=<<___;
-.globl	${PREFIX}_set_encrypt_key
-.type	${PREFIX}_set_encrypt_key,\@function,3
-.align	16
-${PREFIX}_set_encrypt_key:
-	call	_aesni_set_encrypt_key
-	ret
-.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
-___
-# int $PREFIX_set_decrypt_key(const unsigned char *userKey, const int bits,
-#                               AES_KEY *key)
+# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
+#				int bits, AES_KEY *key)
+{ my ($inp,$bits,$key) = @_4args;
+  $bits =~ s/%r/%e/;
+
 $code.=<<___;
 .globl	${PREFIX}_set_decrypt_key
-.type	${PREFIX}_set_decrypt_key,\@function,3
+.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
 .align	16
 ${PREFIX}_set_decrypt_key:
+	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
 	call	_aesni_set_encrypt_key
-	shl	\$4,%esi	# actually rounds after _aesni_set_encrypt_key
+	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
 	test	%eax,%eax
 	jnz	.Ldec_key_ret
-	lea	(%rdx,%rsi),%rsi# points at the end of key schedule
-
-	$movkey	(%rdx),%xmm0	# just swap
-	$movkey	(%rsi),%xmm1
-	$movkey	%xmm0,(%rsi)
-	$movkey	%xmm1,(%rdx)
-	lea	16(%rdx),%rdx
-	lea	-16(%rsi),%rsi
-	jmp	.Ldec_key_inverse
-.align 16
+	lea	16($key,$bits),$inp	# points at the end of key schedule
+
+	$movkey	($key),%xmm0		# just swap
+	$movkey	($inp),%xmm1
+	$movkey	%xmm0,($inp)
+	$movkey	%xmm1,($key)
+	lea	16($key),$key
+	lea	-16($inp),$inp
+
 .Ldec_key_inverse:
-	$movkey	(%rdx),%xmm0	# swap and inverse
-	$movkey	(%rsi),%xmm1
+	$movkey	($key),%xmm0		# swap and inverse
+	$movkey	($inp),%xmm1
 	aesimc	%xmm0,%xmm0
 	aesimc	%xmm1,%xmm1
-	lea	16(%rdx),%rdx
-	lea	-16(%rsi),%rsi
-	cmp	%rdx,%rsi
-	$movkey	%xmm0,16(%rsi)
-	$movkey	%xmm1,-16(%rdx)
+	lea	16($key),$key
+	lea	-16($inp),$inp
+	cmp	$key,$inp
+	$movkey	%xmm0,16($inp)
+	$movkey	%xmm1,-16($key)
 	ja	.Ldec_key_inverse
 
-	$movkey	(%rdx),%xmm0	# inverse middle
+	$movkey	($key),%xmm0		# inverse middle
 	aesimc	%xmm0,%xmm0
-	$movkey	%xmm0,(%rsi)
+	$movkey	%xmm0,($inp)
 .Ldec_key_ret:
+	add	\$8,%rsp
 	ret
+.LSEH_end_set_decrypt_key:
 .size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
 ___
 
@@ -721,27 +601,31 @@ ___
 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
 #
 $code.=<<___;
-.type	_aesni_set_encrypt_key,\@abi-omnipotent
+.globl	${PREFIX}_set_encrypt_key
+.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
 .align	16
+${PREFIX}_set_encrypt_key:
 _aesni_set_encrypt_key:
-	test	%rdi,%rdi
-	jz	.Lbad_pointer
-	test	%rdx,%rdx
-	jz	.Lbad_pointer
-
-	movups	(%rdi),%xmm0		# pull first 128 bits of *userKey
+	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
+	test	$inp,$inp
+	mov	\$-1,%rax
+	jz	.Lenc_key_ret
+	test	$key,$key
+	jz	.Lenc_key_ret
+
+	movups	($inp),%xmm0		# pull first 128 bits of *userKey
 	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
-	lea	16(%rdx),%rcx
-	cmp	\$256,%esi
+	lea	16($key),%rax
+	cmp	\$256,$bits
 	je	.L14rounds
-	cmp	\$192,%esi
+	cmp	\$192,$bits
 	je	.L12rounds
-	cmp	\$128,%esi
+	cmp	\$128,$bits
 	jne	.Lbad_keybits
-
+
 .L10rounds:
-	mov	\$10,%esi			# 10 rounds for 128-bit key
-	$movkey	%xmm0,(%rdx)			# round 0
+	mov	\$9,$bits			# 10 rounds for 128-bit key
+	$movkey	%xmm0,($key)			# round 0
 	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
 	call		.Lkey_expansion_128_cold
 	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
@@ -762,29 +646,16 @@ _aesni_set_encrypt_key:
 	call		.Lkey_expansion_128
 	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
 	call		.Lkey_expansion_128
-	$movkey	%xmm0,(%rcx)
-	mov	%esi,80(%rcx)	# 240(%rdx)
+	$movkey	%xmm0,(%rax)
+	mov	$bits,80(%rax)	# 240($key)
 	xor	%eax,%eax
-	ret
+	jmp	.Lenc_key_ret
 
-.align	16
-.Lkey_expansion_128:
-	$movkey	%xmm0,(%rcx)
-	lea	16(%rcx),%rcx
-.Lkey_expansion_128_cold:
-	shufps	\$0b00010000,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
-	shufps	\$0b10001100,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm0
-	ret
-
 .align	16
 .L12rounds:
-	movq	16(%rdi),%xmm2			# remaining 1/3 of *userKey
-	mov	\$12,%esi			# 12 rounds for 192
-	$movkey	%xmm0,(%rdx)			# round 0
+	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
+	mov	\$11,$bits			# 12 rounds for 192
+	$movkey	%xmm0,($key)			# round 0
 	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
 	call		.Lkey_expansion_192a_cold
 	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
@@ -801,48 +672,18 @@ _aesni_set_encrypt_key:
 	call		.Lkey_expansion_192a
 	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
 	call		.Lkey_expansion_192b
-	$movkey	%xmm0,(%rcx)
-	mov	%esi,48(%rcx)	# 240(%rdx)
+	$movkey	%xmm0,(%rax)
+	mov	$bits,48(%rax)	# 240($key)
 	xor	%rax, %rax
-	ret
+	jmp	.Lenc_key_ret
 
-.align 16
-.Lkey_expansion_192a:
-	$movkey	%xmm0,(%rcx)
-	lea	16(%rcx),%rcx
-.Lkey_expansion_192a_cold:
-	movaps	%xmm2, %xmm5
-.Lkey_expansion_192b_warm:
-	shufps	\$0b00010000,%xmm0,%xmm4
-	movaps	%xmm2,%xmm3
-	pxor	%xmm4,%xmm0
-	shufps	\$0b10001100,%xmm0,%xmm4
-	pslldq	\$4,%xmm3
-	pxor	%xmm4,%xmm0
-	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
-	pxor	%xmm3,%xmm2
-	pxor	%xmm1,%xmm0
-	pshufd	\$0b11111111,%xmm0,%xmm3
-	pxor	%xmm3,%xmm2
-	ret
-
-.align 16
-.Lkey_expansion_192b:
-	movaps	%xmm0,%xmm3
-	shufps	\$0b01000100,%xmm0,%xmm5
-	$movkey	%xmm5,(%rcx)
-	shufps	\$0b01001110,%xmm2,%xmm3
-	$movkey	%xmm3,16(%rcx)
-	lea	32(%rcx),%rcx
-	jmp	.Lkey_expansion_192b_warm
-
 .align	16
 .L14rounds:
-	movups	16(%rdi),%xmm2			# remaning half of *userKey
-	mov	\$14,%esi			# 14 rounds for 256
-	lea	16(%rcx),%rcx
-	$movkey	%xmm0,(%rdx)			# round 0
-	$movkey	%xmm2,16(%rdx)			# round 1
+	movups	16($inp),%xmm2			# remaining half of *userKey
+	mov	\$13,$bits			# 14 rounds for 256
+	lea	16(%rax),%rax
+	$movkey	%xmm0,($key)			# round 0
+	$movkey	%xmm2,16($key)			# round 1
 	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
 	call		.Lkey_expansion_256a_cold
 	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
@@ -869,15 +710,66 @@ _aesni_set_encrypt_key:
 	call		.Lkey_expansion_256b
 	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
 	call		.Lkey_expansion_256a
-	$movkey	%xmm0,(%rcx)
-	mov	%esi,16(%rcx)	# 240(%rdx)
+	$movkey	%xmm0,(%rax)
+	mov	$bits,16(%rax)	# 240($key)
 	xor	%rax,%rax
+	jmp	.Lenc_key_ret
+
+.align	16
+.Lbad_keybits:
+	mov	\$-2,%rax
+.Lenc_key_ret:
+	add	\$8,%rsp
+	ret
+.LSEH_end_set_encrypt_key:
+
+.align	16
+.Lkey_expansion_128:
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
+.Lkey_expansion_128_cold:
+	shufps	\$0b00010000,%xmm0,%xmm4
+	pxor	%xmm4, %xmm0
+	shufps	\$0b10001100,%xmm0,%xmm4
+	pxor	%xmm4, %xmm0
+	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
+	pxor	%xmm1,%xmm0
+	ret
+
+.align 16
+.Lkey_expansion_192a:
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
+.Lkey_expansion_192a_cold:
+	movaps	%xmm2, %xmm5
+.Lkey_expansion_192b_warm:
+	shufps	\$0b00010000,%xmm0,%xmm4
+	movaps	%xmm2,%xmm3
+	pxor	%xmm4,%xmm0
+	shufps	\$0b10001100,%xmm0,%xmm4
+	pslldq	\$4,%xmm3
+	pxor	%xmm4,%xmm0
+	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
+	pxor	%xmm3,%xmm2
+	pxor	%xmm1,%xmm0
+	pshufd	\$0b11111111,%xmm0,%xmm3
+	pxor	%xmm3,%xmm2
 	ret
 
+.align 16
+.Lkey_expansion_192b:
+	movaps	%xmm0,%xmm3
+	shufps	\$0b01000100,%xmm0,%xmm5
+	$movkey	%xmm5,(%rax)
+	shufps	\$0b01001110,%xmm2,%xmm3
+	$movkey	%xmm3,16(%rax)
+	lea	32(%rax),%rax
+	jmp	.Lkey_expansion_192b_warm
+
 .align	16
 .Lkey_expansion_256a:
-	$movkey	%xmm2,(%rcx)
-	lea	16(%rcx),%rcx
+	$movkey	%xmm2,(%rax)
+	lea	16(%rax),%rax
 .Lkey_expansion_256a_cold:
 	shufps	\$0b00010000,%xmm0,%xmm4
 	pxor	%xmm4,%xmm0
@@ -889,8 +781,8 @@ _aesni_set_encrypt_key:
 
 .align 16
 .Lkey_expansion_256b:
-	$movkey	%xmm0,(%rcx)
-	lea	16(%rcx),%rcx
+	$movkey	%xmm0,(%rax)
+	lea	16(%rax),%rax
 
 	shufps	\$0b00010000,%xmm2,%xmm4
 	pxor	%xmm4,%xmm2
@@ -899,15 +791,7 @@ _aesni_set_encrypt_key:
 	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
 	pxor	%xmm1,%xmm2
 	ret
-
-.align	16
-.Lbad_pointer:
-	mov \$-1, %rax
-	ret
-.Lbad_keybits:
-	mov \$-2, %rax
-	ret
-.size	_aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
 ___
 }
 
@@ -916,6 +800,150 @@ $code.=<<___;
 .align	64
 ___
 
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	cbc_se_handler,\@abi-omnipotent
+.align	16
+cbc_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lcbc_decrypt(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lin_prologue
+
+	lea	.Lcbc_decrypt_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
+	jb	.Lrestore_rax
+
+	lea	.Lcbc_ret(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>="epilogue" label
+	jae	.Lin_prologue
+
+	lea	0(%rax),%rsi		# top of stack
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x58(%rax),%rax		# adjust stack pointer
+	jmp	.Lin_prologue
+
+.Lrestore_rax:
+	mov	120($context),%rax
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	jmp	.Lcommon_seh_exit
+.size	cbc_se_handler,.-cbc_se_handler
+
+.type	ecb_se_handler,\@abi-omnipotent
+.align	16
+ecb_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp		# room for RtlVirtualUnwind args 5-8 at 32..56(%rsp)
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	8(%rax),%rdi		# %rdi image kept at 8(context->Rsp)
+	mov	16(%rax),%rsi		# %rsi image kept at 16(context->Rsp)
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+.Lcommon_seh_exit:			# shared tail, cbc_se_handler jumps here too
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords, rep movsq count
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	ecb_se_handler,.-ecb_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
+	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
+	.rva	.LSEH_info_ecb
+
+	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_info_cbc
+
+	.rva	${PREFIX}_set_encrypt_key
+	.rva	.LSEH_end_set_encrypt_key
+	.rva	.LSEH_info_key
+
+	.rva	${PREFIX}_set_decrypt_key
+	.rva	.LSEH_end_set_decrypt_key
+	.rva	.LSEH_info_key
+.section	.xdata
+.align	8
+.LSEH_info_ecb:
+	.byte	9,0,0,0
+	.rva	ecb_se_handler
+.LSEH_info_cbc:
+	.byte	9,0,0,0
+	.rva	cbc_se_handler
+.LSEH_info_key:
+	.byte	0x01,0x04,0x01,0x00
+	.byte	0x04,0x02,0x00,0x00
+___
+}
+
 sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
diff --git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c
index 6707418614..5491063811 100644
--- a/crypto/engine/eng_aesni.c
+++ b/crypto/engine/eng_aesni.c
@@ -147,8 +147,9 @@ static int aesni_ciphers(ENGINE *e, const EVP_CIPHER **cipher,
 	((void *)(((unsigned long)(x)+AESNI_MIN_ALIGN-1)&~(AESNI_MIN_ALIGN-1)))
 
 /* Engine names */
-static const char *aesni_id = "aesni";
-static const char *aesni_name = "Intel AES-NI engine";
+static const char   aesni_id[] = "aesni",
+		    aesni_name[] = "Intel AES-NI engine",
+		    no_aesni_name[] = "Intel AES-NI engine (no-aesni)";
 
 /* ===== Engine "management" functions ===== */
 
@@ -156,15 +157,15 @@ static const char *aesni_name = "Intel AES-NI engine";
 static int
 aesni_bind_helper(ENGINE *e)
 {
-	if (!(OPENSSL_ia32cap_P[1] & (1UL << (57-32))))
-		return 0;
+	int engage = (OPENSSL_ia32cap_P[1] & (1 << (57-32))) != 0;
 
 	/* Register everything or return with an error */
 	if (!ENGINE_set_id(e, aesni_id) ||
-	    !ENGINE_set_name(e, aesni_name) ||
+	    !ENGINE_set_name(e, engage ? aesni_name : no_aesni_name) ||
 
 	    !ENGINE_set_init_function(e, aesni_init) ||
-	    !ENGINE_set_ciphers (e, aesni_ciphers))
+	    (engage && !ENGINE_set_ciphers (e, aesni_ciphers))
+	    )
 		return 0;
 
 	/* Everything looks good */
@@ -286,14 +287,14 @@ static int aesni_cipher_cfb(EVP_CIPHER_CTX *ctx, unsigned char *out,
 {	AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
 	CRYPTO_cfb128_encrypt(in, out, inl, key, ctx->iv,
 				&ctx->num, ctx->encrypt,
-				aesni_encrypt);
+				(block128_f)aesni_encrypt);
 	return 1;
 }
 static int aesni_cipher_ofb(EVP_CIPHER_CTX *ctx, unsigned char *out,
 		 const unsigned char *in, size_t inl)
 {	AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
 	CRYPTO_ofb128_encrypt(in, out, inl, key, ctx->iv,
-				&ctx->num, aesni_encrypt);
+				&ctx->num, (block128_f)aesni_encrypt);
 	return 1;
 }
 
diff --git a/test/test_aesni b/test/test_aesni
index 87f5da191e..e8fb63ee2b 100755
--- a/test/test_aesni
+++ b/test/test_aesni
@@ -14,23 +14,19 @@ else
     exit 1;
 fi
 
-if $PROG engine aesni | grep aesni; then
+if $PROG engine aesni | grep -v no-aesni; then
 
     HASH=`cat $PROG | $PROG dgst -hex`
 
-    ACE_ALGS="	aes-128-ecb aes-192-ecb aes-256-ecb \
+    AES_ALGS="	aes-128-ecb aes-192-ecb aes-256-ecb \
 		aes-128-cbc aes-192-cbc aes-256-cbc \
 		aes-128-cfb aes-192-cfb aes-256-cfb \
 		aes-128-ofb aes-192-ofb aes-256-ofb"
-    BUFSIZE="16 32 48 64 80 96 128 999"
-    ACE_ALGS="	aes-128-cbc aes-192-cbc aes-256-cbc \
-		aes-128-cfb aes-192-cfb aes-256-cfb \
-		aes-128-ofb aes-192-ofb aes-256-ofb"
-    BUFSIZE="48 64 80 96 128 999"
+    BUFSIZE="16 32 48 64 80 96 128 144 999"
 
     nerr=0
 
-    for alg in $ACE_ALGS; do
+    for alg in $AES_ALGS; do
 	echo $alg
 	for bufsize in $BUFSIZE; do
 	    TEST=`(	cat $PROG | \
-- 
2.25.1