my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o mo86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o:wp_block.o w86mmx-cof.o";
my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o mo86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o:wp_block.o w86mmx-out.o";
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4_skey.o:::wp-x86_64.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o";
my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::::::::";
my $alpha_asm=":bn_asm.o alpha-mont.o::::::::::";
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o rc4_skey.o
+$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o rc4_skey.o
+$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o rc4_skey.o
+$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
$md5_obj = md5-x86_64.o
$sha1_obj = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
$cast_obj =
-$rc4_obj = rc4-x86_64.o rc4_skey.o
+$rc4_obj = rc4-x86_64.o
$rmd160_obj =
$rc5_obj =
$wp_obj = wp-x86_64.o
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [a virtually identical 32-bit loop was
+# observed to outperform the 64-bit one by almost 50%]. Adding two movzb
+# instructions to cloop1 boosts its performance by 80%! This loop appears
+# to be an optimal fit for Core2, and therefore the code was modified to
+# skip cloop8 on this CPU.
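
A rough C illustration of the dispatch described above (a standalone sketch; the variable and sample value are made up, not OpenSSL's API): bit 20 of the capability word selects the byte-wide RC4_CHAR layout, and bit 30 additionally disables the unrolled 8x loop on Core2-class parts, which corresponds to the three strings reported by RC4_options() further down.

#include <stdio.h>

int main(void)
{
	unsigned int ia32cap = (1u << 20) | (1u << 30);	/* pretend we run on a Core2 */

	/* same decision tree as RC4_set_key/RC4_options below */
	if (!(ia32cap & (1u << 20)))
		printf("rc4(8x,int)\n");	/* default: 4-byte S-box, unrolled cloop8 */
	else if (!(ia32cap & (1u << 30)))
		printf("rc4(8x,char)\n");	/* byte-wide S-box, still unrolled        */
	else
		printf("rc4(1x,char)\n");	/* Core2: plain byte-at-a-time cloop1     */
	return 0;
}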
+
$output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
movzb ($dat,$XX[0]),$TX[0]#d
test \$-8,$len
jz .Lcloop1
+	cmp	\$0,260($dat)		# Core2 flag set by RC4_set_key?
+	jnz	.Lcloop1		# yes, the 1x loop is faster there
push %rbx
jmp .Lcloop8
.align 16
.size RC4,.-RC4
___
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern OPENSSL_ia32cap_P
+.globl RC4_set_key
+.type RC4_set_key,\@function,3
+.align 16
+RC4_set_key:
+ lea 8($dat),$dat
+ lea ($inp,$len),$inp
+ neg $len
+ mov $len,%rcx
+ xor %eax,%eax
+ xor $ido,$ido
+ xor %r10,%r10
+ xor %r11,%r11
+
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	bt	\$20,$idx#d		# RC4_CHAR key schedule?
+	jnc	.Lw1stloop		# no, use 4-byte S-box entries
+	bt	\$30,$idx#d		# skip unrolled loop (Core2)?
+	setc	$ido#b
+	mov	$ido#d,260($dat)	# flag checked by RC4() before cloop8
+	jmp	.Lc1stloop
+
+.align 16
+.Lw1stloop:
+ mov %eax,($dat,%rax,4)
+ add \$1,%al
+ jnc .Lw1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lw2ndloop:
+ mov ($dat,$ido,4),%r10d
+ add ($inp,$len,1),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx,4),%r11d
+ cmovz %rcx,$len
+ mov %r10d,($dat,$idx,4)
+ mov %r11d,($dat,$ido,4)
+ add \$1,$ido#b
+ jnc .Lw2ndloop
+ jmp .Lexit_key
+
+.align 16
+.Lc1stloop:
+ mov %al,($dat,%rax)
+ add \$1,%al
+ jnc .Lc1stloop
+
+ xor $ido,$ido
+ xor $idx,$idx
+.align 16
+.Lc2ndloop:
+ mov ($dat,$ido),%r10b
+ add ($inp,$len),$idx#b
+ add %r10b,$idx#b
+ add \$1,$len
+ mov ($dat,$idx),%r11b
+ jnz .Lcnowrap
+ mov %rcx,$len
+.Lcnowrap:
+ mov %r10b,($dat,$idx)
+ mov %r11b,($dat,$ido)
+ add \$1,$ido#b
+ jnc .Lc2ndloop
+	movl	\$-1,256($dat)		# mark key schedule as byte-wide for RC4()
+
+.align 16
+.Lexit_key:
+ xor %eax,%eax
+ mov %eax,-8($dat)
+ mov %eax,-4($dat)
+ ret
+.size RC4_set_key,.-RC4_set_key
+
+.globl RC4_options
+.type RC4_options,\@function,0
+.align 16
+RC4_options:
+ .picmeup %rax
+ lea .Lopts-.(%rax),%rax
+ mov OPENSSL_ia32cap_P(%rip),%edx
+ bt \$20,%edx
+ jnc .Ldone
+ add \$12,%rax
+ bt \$30,%edx
+ jnc .Ldone
+ add \$13,%rax
+.Ldone:
+ ret
+.align 64
+.Lopts:
+.asciz "rc4(8x,int)"
+.asciz "rc4(8x,char)"
+.asciz "rc4(1x,char)"
+.asciz "RC4 for x86_64, OpenSSL project"
+.align 64
+.size RC4_options,.-RC4_options
+___
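
For reference, here is a minimal C rendering of what the .Lw1stloop/.Lw2ndloop pair in RC4_set_key above computes, i.e. the standard RC4 key schedule with 4-byte S-box entries; .Lc1stloop/.Lc2ndloop is the same with byte entries, and the Core2 flag stored at 260($dat) is omitted. The struct layout and names are illustrative, not the real RC4_KEY.

#include <stddef.h>

typedef struct {
	unsigned int x, y;
	unsigned int data[256];		/* 4-byte entries, as in .Lw1stloop */
} RC4_KEY_SKETCH;

static void rc4_set_key_sketch(RC4_KEY_SKETCH *key, size_t len,
                               const unsigned char *data)
{
	unsigned int i, j = 0;

	for (i = 0; i < 256; i++)	/* .Lw1stloop: S[i] = i */
		key->data[i] = i;
	for (i = 0; i < 256; i++) {	/* .Lw2ndloop: mix the key in */
		unsigned int t = key->data[i];

		j = (j + t + data[i % len]) & 0xff;
		key->data[i] = key->data[j];	/* swap S[i] and S[j] */
		key->data[j] = t;
	}
	key->x = key->y = 0;		/* .Lexit_key zeroes x and y */
}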
+
$code =~ s/#([bwd])/$1/gm;	# expand "#b/#w/#d" width hints, e.g. "%r8#d" -> "%r8d"
print $code;
OPENSSL_ia32_cpuid PROC
mov r8,rbx
+
+ xor eax,eax
+ cpuid
+ xor eax,eax
+	cmp	ebx,0756e6547h		; "Genu"
+	setne	al
+	mov	r9d,eax
+	cmp	edx,049656e69h		; "ineI"
+	setne	al
+	or	r9d,eax
+	cmp	ecx,06c65746eh		; "ntel"
+	setne	al
+	or	r9d,eax
+
mov eax,1
cpuid
+	bt	edx,28			; test hyper-threading bit
+	jnc	\$Ldone
+	cmp	r9,0
+	jne	\$Lnotintel
+	or	edx,000100000h		; use reserved bit to engage RC4_CHAR
+	and	ah,15
+	cmp	ah,15			; examine Family ID
+	je	\$Lnotintel
+	or	edx,040000000h		; use reserved bit to skip unrolled loop
+\$Lnotintel:
+	shr	ebx,16
+	cmp	bl,1			; see if cache is shared
+	ja	\$Ldone
+	and	edx,0efffffffh
+\$Ldone:
shl rcx,32
mov eax,edx
mov rbx,r8
.align 16
OPENSSL_ia32_cpuid:
movq %rbx,%r8
+
+ xor %eax,%eax
+ cpuid
+ xor %eax,%eax
+ cmp \$0x756e6547,%ebx # "Genu"
+ setne %al
+ mov %eax,%r9d
+ cmp \$0x49656e69,%edx # "ineI"
+ setne %al
+ or %eax,%r9d
+ cmp \$0x6c65746e,%ecx # "ntel"
+ setne %al
+ or %eax,%r9d
+
movl \$1,%eax
cpuid
+ bt \$28,%edx # test hyper-threading bit
+ jnc .Ldone
+ cmp \$0,%r9
+ jne .Lnotintel
+ or \$1<<20,%edx # use reserved bit to engage RC4_CHAR
+ and \$15,%ah
+ cmp \$15,%ah # examine Family ID
+ je .Lnotintel
+ or \$1<<30,%edx # use reserved bit to skip unrolled loop
+.Lnotintel:
+ shr \$16,%ebx
+ cmp \$1,%bl # see if cache is shared
+ ja .Ldone
+ and \$~(1<<28),%edx
+.Ldone:
shlq \$32,%rcx
movl %edx,%eax
movq %r8,%rbx
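
A hedged sketch of how a caller could unpack the value assembled by the shlq/movl pair above: EDX (with the repurposed bits 20/28/30) ends up in the low half of RAX and ECX in the high half. The no-argument prototype is assumed from the code shown here, not taken from an OpenSSL header.

#include <stdint.h>
#include <stdio.h>

extern uint64_t OPENSSL_ia32_cpuid(void);	/* the routine defined above */

int main(void)
{
	uint64_t cap = OPENSSL_ia32_cpuid();
	uint32_t edx = (uint32_t)cap;		/* CPUID(1).EDX plus repurposed bits */
	uint32_t ecx = (uint32_t)(cap >> 32);	/* CPUID(1).ECX                      */

	printf("RC4_CHAR=%u skip-8x=%u HTT/shared-cache=%u SSE3=%u\n",
	       (edx >> 20) & 1, (edx >> 30) & 1, (edx >> 28) & 1, ecx & 1);
	return 0;
}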