From 558ff0f0c1d87d74e70b0a70ddd67c0ff7f596ad Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Thu, 24 Apr 2014 10:13:30 +0200
Subject: [PATCH] aes/asm/bsaes-x86_64.pl: Atom-specific optimization.

---
 crypto/aes/asm/bsaes-x86_64.pl | 72 +++++++++++++++-------------------
 1 file changed, 32 insertions(+), 40 deletions(-)

diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
index 41b90f0844..d2c3978b96 100644
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -38,8 +38,8 @@
 #              Emilia's        this(*)         difference
 #
 # Core 2       9.30            8.69            +7%
-# Nehalem(**)  7.63            6.98            +9%
-# Atom         17.1            17.4            -2%(***)
+# Nehalem(**)  7.63            6.88            +11%
+# Atom         17.1            16.4            +4%
 #
 # (*)   Comparison is not completely fair, because "this" is ECB,
 #       i.e. no extra processing such as counter values calculation
@@ -50,14 +50,6 @@
 # (**)  Results were collected on Westmere, which is considered to
 #       be equivalent to Nehalem for this code.
 #
-# (***) Slowdown on Atom is rather strange per se, because original
-#       implementation has a number of 9+-bytes instructions, which
-#       are bad for Atom front-end, and which I eliminated completely.
-#       In attempt to address deterioration sbox() was tested in FP
-#       SIMD "domain" (movaps instead of movdqa, xorps instead of
-#       pxor, etc.). While it resulted in nominal 4% improvement on
-#       Atom, it hurted Westmere by more than 2x factor.
-#
 # As for key schedule conversion subroutine. Interface to OpenSSL
 # relies on per-invocation on-the-fly conversion. This naturally
 # has impact on performance, especially for short inputs. Conversion
@@ -67,7 +59,7 @@
 #              conversion      conversion/8x block
 # Core 2       240             0.22
 # Nehalem      180             0.20
-# Atom         430             0.19
+# Atom         430             0.20
 #
 # The ratio values mean that 128-byte blocks will be processed
 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@ -83,9 +75,9 @@
 # Add decryption procedure. Performance in CPU cycles spent to decrypt
 # one byte out of 4096-byte buffer with 128-bit key is:
 #
-# Core 2       9.83
-# Nehalem      7.74
-# Atom         19.0
+# Core 2       9.98
+# Nehalem      7.80
+# Atom         17.9
 #
 # November 2011.
 #
@@ -434,21 +426,21 @@ my $mask=pop;
 $code.=<<___;
        pxor    0x00($key),@x[0]
        pxor    0x10($key),@x[1]
-       pshufb  $mask,@x[0]
        pxor    0x20($key),@x[2]
-       pshufb  $mask,@x[1]
        pxor    0x30($key),@x[3]
-       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[0]
+       pshufb  $mask,@x[1]
        pxor    0x40($key),@x[4]
-       pshufb  $mask,@x[3]
        pxor    0x50($key),@x[5]
-       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[3]
        pxor    0x60($key),@x[6]
-       pshufb  $mask,@x[5]
        pxor    0x70($key),@x[7]
+       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[5]
        pshufb  $mask,@x[6]
-       lea     0x80($key),$key
        pshufb  $mask,@x[7]
+       lea     0x80($key),$key
 ___
 }
 
@@ -820,18 +812,18 @@ _bsaes_encrypt8:
        movdqa  0x50($const), @XMM[8]   # .LM0SR
        pxor    @XMM[9], @XMM[0]        # xor with round0 key
        pxor    @XMM[9], @XMM[1]
-       pshufb  @XMM[8], @XMM[0]
        pxor    @XMM[9], @XMM[2]
-       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[3]
-       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[0]
+       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[4]
-       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[5]
-       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[6]
-       pshufb  @XMM[8], @XMM[5]
        pxor    @XMM[9], @XMM[7]
+       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[5]
        pshufb  @XMM[8], @XMM[6]
        pshufb  @XMM[8], @XMM[7]
 _bsaes_encrypt8_bitslice:
@@ -884,18 +876,18 @@ _bsaes_decrypt8:
        movdqa  -0x30($const), @XMM[8]  # .LM0ISR
        pxor    @XMM[9], @XMM[0]        # xor with round0 key
        pxor    @XMM[9], @XMM[1]
-       pshufb  @XMM[8], @XMM[0]
        pxor    @XMM[9], @XMM[2]
-       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[3]
-       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[0]
+       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[4]
-       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[5]
-       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[6]
-       pshufb  @XMM[8], @XMM[5]
        pxor    @XMM[9], @XMM[7]
+       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[5]
        pshufb  @XMM[8], @XMM[6]
        pshufb  @XMM[8], @XMM[7]
 ___
@@ -1937,21 +1929,21 @@ $code.=<<___;
        movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
        pxor    @XMM[9], @XMM[0]        # xor with round0 key
        pxor    @XMM[9], @XMM[1]
-       pshufb  @XMM[8], @XMM[0]
        pxor    @XMM[9], @XMM[2]
-       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[3]
-       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[0]
+       pshufb  @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[4]
-       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[5]
-       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[2]
+       pshufb  @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[6]
-       pshufb  @XMM[8], @XMM[5]
        pxor    @XMM[9], @XMM[7]
+       pshufb  @XMM[8], @XMM[4]
+       pshufb  @XMM[8], @XMM[5]
        pshufb  @XMM[8], @XMM[6]
-       lea     .LBS0(%rip), %r11       # constants table
        pshufb  @XMM[8], @XMM[7]
+       lea     .LBS0(%rip), %r11       # constants table
        mov     %ebx,%r10d              # pass rounds
        call    _bsaes_encrypt8_bitslice

-- 
2.25.1
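
[Note, not part of the patch] Every hunk above applies the same rescheduling: where the old code strictly alternated one round-key pxor with the pshufb that consumes its result, the new order issues further independent pxor instructions first and defers the pshufb pairs, widening the distance between each pxor and the dependent shuffle. The excerpt below merely restates that pattern outside the diff, using the register names and offsets from the first hunk; the explanation of why this helps Atom (in-order issue, dependency spacing) is an assumption on the reviewer's part, as the patch itself only records the measured numbers.

       # old order: each pshufb follows the pxor that feeds it by only
       # two or three instructions
       pxor    0x20($key),@x[2]
       pshufb  $mask,@x[1]
       pxor    0x30($key),@x[3]
       pshufb  $mask,@x[2]

       # new order: two independent pxor back to back, then two pshufb
       # whose inputs were produced at least four instructions earlier;
       # per the updated table, Atom goes from 17.4 to 16.4 cycles/byte
       # and Nehalem/Westmere from 6.98 to 6.88
       pxor    0x20($key),@x[2]
       pxor    0x30($key),@x[3]
       pshufb  $mask,@x[0]
       pshufb  $mask,@x[1]

The presumed benefit is that Atom's in-order pipeline no longer stalls on a pshufb issued right behind the pxor that produces its input, while the table shows the reordering is neutral on Core 2 and slightly positive on the out-of-order Nehalem/Westmere cores.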