X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;ds=sidebyside;f=crypto%2Frc4%2Fasm%2Frc4-x86_64.pl;h=fa227631870354f78eee9f23b1f54a008d81c110;hb=46bf83f07ae1ba7fda435c90af93960e77159f4b;hp=44466ee97a001c31b3086789025b2b98e1c8c9dd;hpb=4bb90087d745c26401e09a3bd10137d7b05e9ea3;p=oweals%2Fopenssl.git

diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 44466ee97a..fa22763187 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -78,23 +78,31 @@
 
 # May 2011
 #
-# The only code path that was not modified is P4-specific one. New
-# AMD code path is inspired by and Intel optimization is heavily
-# based on submission from Maxim Locktyukhin of Intel. Current
-# performance in cycles per processed byte (less is better) and
-# improvement coefficients relative to previous version of this
-# module are:
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in attempt to optimize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
 #
-# Opteron	5.3/+0%
+# Opteron	5.3/+0%(*)
 # P4		6.5
-# Core2		6.2/+15%(*)
+# Core2		6.2/+15%(**)
 # Westmere	4.2/+60%
 # Sandy Bridge	4.2/+120%
 # Atom		9.3/+80%
+# VIA Nano	6.4/+4%
+# Ivy Bridge	4.1/+30%
+# Bulldozer	4.5/+30%(*)
 #
-# (*)	Note that this result is ~15% lower than result for 32-bit
-#	code, meaning that it's possible to improve it, but it's
-#	more than likely at the cost of the others...
+# (*)	But corresponding loop has fewer instructions, which should have
+#	positive effect on upcoming Bulldozer, which has one less ALU.
+#	For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**)	Note that Core2 result is ~15% lower than corresponding result
+#	for 32-bit code, meaning that it's possible to improve it,
+#	but more than likely at the cost of the others (see rc4-586.pl
+#	to get the idea)...
 
 $flavour = shift;
 $output  = shift;
@@ -107,7 +115,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
 
 $dat="%rdi";	    # arg1
 $len="%rsi";	    # arg2
@@ -117,6 +126,7 @@ $out="%rcx";	    # arg4
 {
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	RC4
 .type	RC4,\@function,4
@@ -423,7 +433,6 @@ $idx="%r8";
 $ido="%r9";
 
 $code.=<<___;
-.extern	OPENSSL_ia32cap_P
 .globl	RC4_set_key
 .type	RC4_set_key,\@function,3
 .align	16
@@ -503,11 +512,13 @@ RC4_options:
 	lea	.Lopts(%rip),%rax
 	mov	OPENSSL_ia32cap_P(%rip),%edx
 	bt	\$20,%edx
-	jnc	.Ldone
-	add	\$12,%rax
+	jc	.L8xchar
 	bt	\$30,%edx
 	jnc	.Ldone
-	add	\$13,%rax
+	add	\$25,%rax
+	ret
+.L8xchar:
+	add	\$12,%rax
 .Ldone:
 	ret
 .align	64