From 9babf3929bf1f546aa646d9e1e2a934ccfe0b333 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 2 Apr 2007 09:50:14 +0000 Subject: [PATCH] RC4_set_key for x86_64 and Core2 optimization. PR: 1447 --- Configure | 2 +- TABLE | 8 +-- crypto/rc4/asm/rc4-x86_64.pl | 115 +++++++++++++++++++++++++++++++++++ crypto/x86_64cpuid.pl | 58 ++++++++++++++++++ 4 files changed, 178 insertions(+), 5 deletions(-) diff --git a/Configure b/Configure index 5160b10202..c24cf9807f 100755 --- a/Configure +++ b/Configure @@ -120,7 +120,7 @@ my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o mo86-elf.o:dx86-elf.o yx86 my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o"; my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::"; my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::"; diff --git a/TABLE b/TABLE index 203cb25e85..d7b1732e2d 100644 --- a/TABLE +++ b/TABLE @@ -268,7 +268,7 @@ $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o $cast_obj = -$rc4_obj = rc4-x86_64.o rc4_skey.o +$rc4_obj = 
rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o @@ -3152,7 +3152,7 @@ $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o $cast_obj = -$rc4_obj = rc4-x86_64.o rc4_skey.o +$rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o @@ -3964,7 +3964,7 @@ $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o $cast_obj = -$rc4_obj = rc4-x86_64.o rc4_skey.o +$rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o @@ -3992,7 +3992,7 @@ $bf_obj = $md5_obj = md5-x86_64.o $sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o $cast_obj = -$rc4_obj = rc4-x86_64.o rc4_skey.o +$rc4_obj = rc4-x86_64.o $rmd160_obj = $rc5_obj = $wp_obj = wp-x86_64.o diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index 5236afec12..36a9429ef7 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -49,6 +49,14 @@ # is not implemented, then this final RC4_CHAR code-path should be # preferred, as it provides better *all-round* performance]. +# Intel Core2 was observed to perform poorly on both code paths:-( It +# apparently suffers from some kind of partial register stall, which +# occurs in 64-bit mode only [as a virtually identical 32-bit loop was +# observed to outperform the 64-bit one by almost 50%]. Adding two movzb to +# cloop1 boosts its performance by 80%! This loop appears to be the optimal +# fit for Core2 and therefore the code was modified to skip cloop8 on +# this CPU. 
+ $output=shift; open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; @@ -152,6 +160,8 @@ $code.=<<___; movzb ($dat,$XX[0]),$TX[0]#d test \$-8,$len jz .Lcloop1 + cmp \$0,260($dat) + jnz .Lcloop1 push %rbx jmp .Lcloop8 .align 16 @@ -235,6 +245,111 @@ $code.=<<___; .size RC4,.-RC4 ___ +$idx="%r8"; +$ido="%r9"; + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl RC4_set_key +.type RC4_set_key,\@function,3 +.align 16 +RC4_set_key: + lea 8($dat),$dat + lea ($inp,$len),$inp + neg $len + mov $len,%rcx + xor %eax,%eax + xor $ido,$ido + xor %r10,%r10 + xor %r11,%r11 + + mov OPENSSL_ia32cap_P(%rip),$idx#d + bt \$20,$idx#d + jnc .Lw1stloop + bt \$30,$idx#d + setc $ido#b + mov $ido#d,260($dat) + jmp .Lc1stloop + +.align 16 +.Lw1stloop: + mov %eax,($dat,%rax,4) + add \$1,%al + jnc .Lw1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lw2ndloop: + mov ($dat,$ido,4),%r10d + add ($inp,$len,1),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx,4),%r11d + cmovz %rcx,$len + mov %r10d,($dat,$idx,4) + mov %r11d,($dat,$ido,4) + add \$1,$ido#b + jnc .Lw2ndloop + jmp .Lexit_key + +.align 16 +.Lc1stloop: + mov %al,($dat,%rax) + add \$1,%al + jnc .Lc1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lc2ndloop: + mov ($dat,$ido),%r10b + add ($inp,$len),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx),%r11b + jnz .Lcnowrap + mov %rcx,$len +.Lcnowrap: + mov %r10b,($dat,$idx) + mov %r11b,($dat,$ido) + add \$1,$ido#b + jnc .Lc2ndloop + movl \$-1,256($dat) + +.align 16 +.Lexit_key: + xor %eax,%eax + mov %eax,-8($dat) + mov %eax,-4($dat) + ret +.size RC4_set_key,.-RC4_set_key + +.globl RC4_options +.type RC4_options,\@function,0 +.align 16 +RC4_options: + .picmeup %rax + lea .Lopts-.(%rax),%rax + mov OPENSSL_ia32cap_P(%rip),%edx + bt \$20,%edx + jnc .Ldone + add \$12,%rax + bt \$30,%edx + jnc .Ldone + add \$13,%rax +.Ldone: + ret +.align 64 +.Lopts: +.asciz "rc4(8x,int)" +.asciz "rc4(8x,char)" +.asciz "rc4(1x,char)" +.asciz "RC4 for x86_64, OpenSSL project" +.align 64 
+.size RC4_options,.-RC4_options +___ + $code =~ s/#([bwd])/$1/gm; print $code; diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index 4d88ad191b..f9f2827636 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -48,8 +48,37 @@ OPENSSL_wipe_cpu ENDP OPENSSL_ia32_cpuid PROC mov r8,rbx + + xor eax,eax + cpuid + xor eax,eax + cmp ebx,0756e6547h + setne al + mov r9d,eax + cmp edx,049656e69h + setne al + or r9d,eax + cmp ecx,06c65746eh + setne al + or r9d,eax + mov eax,1 cpuid + bt edx,28 + jnc \$Ldone + cmp r9,0 + jne \$Lnotintel + or edx,000100000h + and ah,15 + cmp ah,15 + je \$Lnotintel + or edx,040000000h +\$Lnotintel: + shr ebx,16 + cmp bl,1 + ja \$Ldone + and edx,0efffffffh +\$Ldone: shl rcx,32 mov eax,edx mov rbx,r8 @@ -124,8 +153,37 @@ OPENSSL_wipe_cpu: .align 16 OPENSSL_ia32_cpuid: movq %rbx,%r8 + + xor %eax,%eax + cpuid + xor %eax,%eax + cmp \$0x756e6547,%ebx # "Genu" + setne %al + mov %eax,%r9d + cmp \$0x49656e69,%edx # "ineI" + setne %al + or %eax,%r9d + cmp \$0x6c65746e,%ecx # "ntel" + setne %al + or %eax,%r9d + movl \$1,%eax cpuid + bt \$28,%edx # test hyper-threading bit + jnc .Ldone + cmp \$0,%r9 + jne .Lnotintel + or \$1<<20,%edx # use reserved bit to engage RC4_CHAR + and \$15,%ah + cmp \$15,%ah # examine Family ID + je .Lnotintel + or \$1<<30,%edx # use reserved bit to skip unrolled loop +.Lnotintel: + shr \$16,%ebx + cmp \$1,%bl # see if cache is shared + ja .Ldone + and \$~(1<<28),%edx +.Ldone: shlq \$32,%rcx movl %edx,%eax movq %r8,%rbx -- 2.25.1