From b59f92e75d334c9281082a02faa6c68afb614fd2 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Sat, 30 Aug 2014 19:13:49 +0200
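Subject: [PATCH] x86[_64] assembly pack: add Silvermont performance data.

In sha1-586.pl, also group the Atom row with the new Silvermont row and
replace the open question about limited ILP with an explanation
(front-end MS-ROM penalties, and limited rotate parallelism on Silvermont).
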
Reviewed-by: Rich Salz <rsalz@openssl.org>
---
 crypto/aes/asm/bsaes-x86_64.pl   | 2 ++
 crypto/aes/asm/vpaes-x86.pl      | 1 +
 crypto/aes/asm/vpaes-x86_64.pl   | 1 +
 crypto/modes/asm/ghash-x86_64.pl | 1 +
 crypto/sha/asm/sha1-586.pl       | 7 +++++--
 crypto/sha/asm/sha256-586.pl     | 1 +
 crypto/sha/asm/sha512-586.pl     | 1 +
 crypto/sha/asm/sha512-x86_64.pl  | 1 +
 8 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
index d2c3978b96..3f7d33c45b 100644
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -40,6 +40,7 @@
 # Core 2    	9.30		8.69		+7%
 # Nehalem(**) 	7.63		6.88		+11%
 # Atom	    	17.1		16.4		+4%
+# Silvermont	-		12.9
 #
 # (*)	Comparison is not completely fair, because "this" is ECB,
 #	i.e. no extra processing such as counter values calculation
@@ -78,6 +79,7 @@
 # Core 2	9.98
 # Nehalem	7.80
 # Atom		17.9
+# Silvermont	14.0
 #
 # November 2011.
 #
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl
index bacf42cf0f..2ba149c3f9 100644
--- a/crypto/aes/asm/vpaes-x86.pl
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -30,6 +30,7 @@
 # Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
 # Nehalem	27.9/40.4/18.1		10.2/11.9
 # Atom		70.7/92.1/60.1		61.1/75.4(***)
+# Silvermont	45.4/62.9/24.1		49.2/61.1(***)
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
index 40ef342d97..f2ef318fae 100644
--- a/crypto/aes/asm/vpaes-x86_64.pl
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@@ -30,6 +30,7 @@
 # Core 2(**)	29.6/41.1/14.3		21.9/25.2(***)
 # Nehalem	29.6/40.3/14.6		10.0/11.8
 # Atom		57.3/74.2/32.1		60.9/77.2(***)
+# Silvermont	52.7/64.0/19.5		48.8/60.8(***)
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index 1e79227338..ce7d1cb8ba 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -64,6 +64,7 @@
 # Ivy Bridge	1.80(+7%)
 # Haswell	0.55(+93%) (if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
+# Silvermont	2.88(+13%)
 
 # March 2013
 #
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index 59da867848..8377299b1e 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -93,16 +93,19 @@
 # P4		10.6		-
 # AMD K8	7.1		-
 # Core2		7.3		6.0/+22%	-
-# Atom		12.5		9.3(*)/+35%	-
 # Westmere	7.3		5.5/+33%	-
 # Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
 # Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
 # Haswell	6.5		4.3/+51%	4.1(**)/+58%
 # Bulldozer	11.6		6.0/+92%
 # VIA Nano	10.6		7.5/+41%
+# Atom		12.5		9.3(*)/+35%
+# Silvermont	14.5		9.9(*)/+46%
 #
 # (*)	Loop is 1056 instructions long and expected result is ~8.25.
-#	It remains mystery [to me] why ILP is limited to 1.7.
+#	The discrepancy is due to front-end limitations, the
+#	so-called MS-ROM penalties, and on Silvermont also to
+#	the limited parallelism of rotate instructions.
 #
 # (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
 
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index 0c2a778e7c..6462e45ba7 100644
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -53,6 +53,7 @@
 # Bulldozer	36	-	27/22		17.0	13.6
 # VIA Nano	36	-	25/22		16.8	16.5
 # Atom		50	-	30/25		21.9	18.9
+# Silvermont	40	-	34/31		22.9	20.6
 #
 # (*)	numbers after slash are for unrolled loop, where applicable;
 # (**)	x86_64 assembly performance is presented for reference
diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl
index 9fc792964f..e96ec00314 100644
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@@ -28,6 +28,7 @@
 # Bulldozer	121	-	50	14.0	13.5
 # VIA Nano	91	-	52	33	14.7
 # Atom		126	-	68	48(***)	14.7
+# Silvermont	97	-	58	42(***)	17.5
 #
 # (*)	whichever best applicable.
 # (**)	x86_64 assembler performance is presented for reference
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 476e99fee8..b7b44b4411 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -89,6 +89,7 @@
 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
 # Atom		23.0	18.9(+22%)  -		    14.7    -
+# Silvermont	27.4	20.6(+33%)  -		    17.5    -
 #
 # (*)	whichever best applicable;
 # (**)	switch from ror to shrd stands for fair share of improvement;
-- 
2.25.1