# AES-128-CBC +SHA1 stitch gain
# Westmere 3.77[+5.6] 9.37 6.65 +41%
# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
+# Ivy Bridge 5.05[+4.7] 9.75 5.59 +74%
+# Bulldozer 5.77[+6.1] 11.87 6.47 +83%
#
# AES-192-CBC
# Westmere 4.51 10.11 6.97 +45%
# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
+# Ivy Bridge 6.05 10.75 6.07 +77%
+# Bulldozer 6.89 12.99 7.02 +85%
#
# AES-256-CBC
# Westmere 5.25 10.85 7.25 +50%
# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
+# Ivy Bridge 7.05 11.75 7.12 +65%
+# Bulldozer 8.00 14.10 8.24 +71%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
# background information. Above numbers in parentheses are SSSE3
# AES-128-CBC AES-192-CBC AES-256-CBC
# Westmere 1.31 1.55 1.80
# Sandy Bridge 0.93 1.06 1.22
+# Ivy Bridge 0.92 1.06 1.21
+# Bulldozer 0.76 0.90 1.04
$flavour = shift;	# first argument; assuming file scope, shift reads @ARGV. NOTE(review): presumably the perlasm flavour - confirm
$output = shift;	# second argument. NOTE(review): presumably the output file name - confirm
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
# in CTR mode, the AES instruction interleave factor was chosen to be 6x.
+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
+# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
+# instruction latency is 9 cycles and that they can be issued every
+# cycle.
+
# NOTE(review): $PREFIX is presumably prepended to the generated symbol
# names - confirm against its uses further down the script.
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates a drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)
# P4 125/125 17.8 84(***)
# Opteron 66 /70 10.1 30
# Core2 54 /67 8.4 18
+# Atom 105/105 16.8 53
+# VIA Nano 69 /71 13.0 27
#
# (*) gcc 3.4.x was observed to generate a few percent slower code,
# which is one of the reasons why 2.95.3 results were chosen,
# similar manner resulted in almost 20% degradation on Sandy Bridge,
# where original 64-bit code processes one byte in 1.95 cycles.
+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
# Split the script path in $0 at its last '/' or '\' and keep the leading
# directory part (separator included) in $dir.
# NOTE(review): if $0 contains no directory separator the match fails and
# $1 keeps whatever value it had before - confirm how the script is invoked.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Append the script's own directory and ${dir}../../perlasm to the module
# search path, then load x86asm.pl through @INC.
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# P4 28.6 14.0 +100%
# Opteron 19.3 7.7 +150%
# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
#
# (*) comparison is not completely fair, because C results are
# for vanilla "256B" implementation, while assembler results
# Westmere 5.1/+94%(**)
# Sandy Bridge 5.0/+8%
# Atom 12.6/+6%
+# VIA Nano 6.4/+9%
+# Ivy Bridge 4.9/±0%
+# Bulldozer 4.9/+15%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. And vice
# Westmere 4.3 5.2 9.5 7.0 +36%
# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
# Atom 9.3 6.5 15.8 11.1 +42%
+# VIA Nano 6.3 5.4 11.7 8.6 +37%
+# Ivy Bridge 4.1 5.2 9.3 6.0 +54%
+# Bulldozer 4.5 5.4 9.9 7.7 +29%
#
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
# is +53%...
# Westmere 4.2/+60%
# Sandy Bridge 4.2/+120%
# Atom 9.3/+80%
+# VIA Nano 6.4/+4%
+# Ivy Bridge 4.1/+30%
+# Bulldozer 4.5/+30%(*)
#
# (*) But corresponding loop has fewer instructions, which should have
# positive effect on upcoming Bulldozer, which has one fewer ALU.
# Atom 12.5 9.5(*)/+32% -
# Westmere 7.3 5.6/+30% -
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
+# Ivy Bridge 7.2 4.9/+47% 4.8(**)/+50%
+# Bulldozer 11.6 6.2/+88%
+# VIA Nano 10.6 7.5/+41%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# It remains a mystery [to me] why ILP is limited to 1.7.
# Atom 11.0 9.7/+13% -
# Westmere 7.1 5.6/+27% -
# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
+# Ivy Bridge 6.4 4.8/+33% 4.7/+36%
+# Bulldozer 10.9 6.1/+79%
+# VIA Nano 10.2 7.4/+38%
$flavour = shift;	# first argument; assuming file scope, shift reads @ARGV. NOTE(review): presumably the perlasm flavour - confirm
$output = shift;	# second argument. NOTE(review): presumably the output file name - confirm
#
# Performance in clock cycles per processed byte (less is better):
#
-# Pentium PIII P4 AMD K8 Core2
-# gcc 100 75 116 54 66
-# icc 97 77 95 55 57
-# x86 asm 61 56 82 36 40
-# SSE2 asm - - 38 24 20
-# x86_64 asm(*) - - 30 10.0 10.5
+# PIII P4 AMD K8 Core2 SB Atom Bldzr
+# gcc 75 116 54 66 58 126 121
+# icc 77 95 55 57 - - -
+# x86 asm 56 82 36 40 35 68 50
+# SSE2 asm - 38 24 20 16 64(**) 18
+# x86_64 asm(*) - 33 9.6 10.3 11.3 14.7 13.5
#
-# (*) x86_64 assembler performance is presented for reference
-# purposes.
+# (*) x86_64 assembler performance is presented for reference
+# purposes.
+# (**) paddq is incredibly slow on Atom.
#
# IALU code-path is optimized for elder Pentiums. On vanilla Pentium
# performance improvement over compiler generated code reaches ~60%,
&bswap ("edx");
&mov (&DWP(8*9+4,"esp"),"ecx");
&mov (&DWP(8*9+0,"esp"),"edx");
+ &jmp (&label("00_14_sse2"));
&set_label("00_14_sse2",16);
&mov ("eax",&DWP(0,"edi"));