X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;ds=sidebyside;f=crypto%2Frc4%2Fasm%2Frc4-x86_64.pl;h=fa227631870354f78eee9f23b1f54a008d81c110;hb=46bf83f07ae1ba7fda435c90af93960e77159f4b;hp=44466ee97a001c31b3086789025b2b98e1c8c9dd;hpb=4bb90087d745c26401e09a3bd10137d7b05e9ea3;p=oweals%2Fopenssl.git diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index 44466ee97a..fa22763187 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -78,23 +78,31 @@ # May 2011 # -# The only code path that was not modified is P4-specific one. New -# AMD code path is inspired by and Intel optimization is heavily -# based on submission from Maxim Locktyukhin of Intel. Current -# performance in cycles per processed byte (less is better) and -# improvement coefficients relative to previous version of this -# module are: +# The only code path that was not modified is P4-specific one. Non-P4 +# Intel code path optimization is heavily based on submission by Maxim +# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used +# some of the ideas even in attempt to optimize the original RC4_INT +# code path... Current performance in cycles per processed byte (less +# is better) and improvement coefficients relative to previous +# version of this module are: # -# Opteron 5.3/+0% +# Opteron 5.3/+0%(*) # P4 6.5 -# Core2 6.2/+15%(*) +# Core2 6.2/+15%(**) # Westmere 4.2/+60% # Sandy Bridge 4.2/+120% # Atom 9.3/+80% +# VIA Nano 6.4/+4% +# Ivy Bridge 4.1/+30% +# Bulldozer 4.5/+30%(*) # -# (*) Note that this result is ~15% lower than result for 32-bit -# code, meaning that it's possible to improve it, but it's -# more than likely at the cost of the others... +# (*) But corresponding loop has fewer instructions, which should have +# positive effect on upcoming Bulldozer, which has one less ALU. +# For reference, Intel code runs at 6.8 cpb rate on Opteron. 
+# (**) Note that Core2 result is ~15% lower than corresponding result +# for 32-bit code, meaning that it's possible to improve it, +# but more than likely at the cost of the others (see rc4-586.pl +# to get the idea)... $flavour = shift; $output = shift; @@ -107,7 +115,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open STDOUT,"| $^X $xlate $flavour $output"; +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; $dat="%rdi"; # arg1 $len="%rsi"; # arg2 @@ -117,6 +126,7 @@ $out="%rcx"; # arg4 { $code=<<___; .text +.extern OPENSSL_ia32cap_P .globl RC4 .type RC4,\@function,4 @@ -423,7 +433,6 @@ $idx="%r8"; $ido="%r9"; $code.=<<___; -.extern OPENSSL_ia32cap_P .globl RC4_set_key .type RC4_set_key,\@function,3 .align 16 @@ -503,11 +512,13 @@ RC4_options: lea .Lopts(%rip),%rax mov OPENSSL_ia32cap_P(%rip),%edx bt \$20,%edx - jnc .Ldone - add \$12,%rax + jc .L8xchar bt \$30,%edx jnc .Ldone - add \$13,%rax + add \$25,%rax + ret +.L8xchar: + add \$12,%rax .Ldone: ret .align 64