RC4 tune-up for Intel P4 core, both 32- and 64-bit ones.

author     Andy Polyakov <appro@openssl.org>   Sun, 21 Nov 2004 10:36:25 +0000 (10:36 +0000)
committer  Andy Polyakov <appro@openssl.org>   Sun, 21 Nov 2004 10:36:25 +0000 (10:36 +0000)

As it's apparently impossible to compose blended code which would
perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR
code-path is introduced and the P4 core is detected at run-time. This
way we keep the original performance on non-P4 implementations and
turbo-charge P4 performance by a factor of 2.8x (on the 32-bit core).
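
A minimal C sketch of the dispatch idea described above, offered for orientation only (it is not part of the commit): bit #28 of the CPUID feature word in EDX, the Hyper-Threading flag, is treated as a "this looks like a P4" hint. The helper name is hypothetical, and the capability word is passed in as a parameter so the snippet stands alone; in the tree it lives in OPENSSL_ia32cap_P.

    /* Orientation sketch only, not code from this commit. */
    static int prefer_rc4_char_sketch(unsigned long ia32cap)
    {
        /* bit #28 = Hyper-Threading, present on P4-class cores */
        return (ia32cap & (1UL << 28)) != 0;
    }

The rc4_skey.c and assembler hunks below branch on exactly this hint: the key schedule is laid out byte-wide on P4 and left as-is everywhere else.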

Configure
TABLE
crypto/amd64cpuid.pl
crypto/perlasm/x86unix.pl
crypto/rc4/asm/rc4-586.pl
crypto/rc4/asm/rc4-amd64.pl
crypto/rc4/rc4_locl.h
crypto/rc4/rc4_skey.c
doc/crypto/OPENSSL_ia32cap.pod

index cc91c3dcb8f84ac73950acdb9c9895fb0eae932c..cce2af2b82208a4d1f738013168ffebe206fecba 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -318,7 +318,7 @@ my %table=(
 "linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64",  "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"linux-x86_64",        "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-x86_64",        "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-parisc",        "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
 #### SPARC Linux setups
diff --git a/TABLE b/TABLE
index 2910ab3f13ce263a4fe724bcecd6a69f60e49a29..e8a258608b00471b25a9d81988621d7f2a55bf15 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -2086,7 +2086,7 @@ $unistd       =
 $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -rdynamic -ldl
-$bn_ops       = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
+$bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
@@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
-$cpuid_obj    = 
+$cpuid_obj    = amd64cpuid.o
 $bn_obj       = asm/x86_64-gcc.o
 $des_obj      = 
 $aes_obj      = 
index baf801d0623c6e88ed9dd057b8a8df05cccd85a7..097f6b8d5c17349876b03e5363768f985c5fa6e6 100644 (file)
--- a/crypto/amd64cpuid.pl
+++ b/crypto/amd64cpuid.pl
@@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
 open STDOUT,">$output" || die "can't open $output: $!";
 
 print<<___ if(defined($win64a));
-TEXT   SEGMENT
+_TEXT  SEGMENT
 PUBLIC OPENSSL_rdtsc
 ALIGN  16
-OPENSSL_rdtsc  PROC NEAR
+OPENSSL_rdtsc  PROC
        rdtsc
        shl     rdx,32
        or      rax,rdx
        ret
 OPENSSL_rdtsc  ENDP
-TEXT   ENDS
+
+PUBLIC OPENSSL_atomic_add
+ALIGN  16
+OPENSSL_atomic_add     PROC
+       mov     eax,DWORD PTR[rcx]
+\$Lspin:       lea     r8,DWORD PTR[rdx+rax]
+lock   cmpxchg DWORD PTR[rcx],r8d
+       jne     \$Lspin
+       mov     eax,r8d
+       cdqe    
+       ret
+OPENSSL_atomic_add     ENDP
+
+PUBLIC OPENSSL_wipe_cpu
+ALIGN  16
+OPENSSL_wipe_cpu       PROC
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       xor     rcx,rcx
+       xor     rdx,rdx
+       xor     r8,r8
+       xor     r9,r9
+       xor     r10,r10
+       xor     r11,r11
+       lea     rax,QWORD PTR[rsp+8]
+       ret
+OPENSSL_wipe_cpu       ENDP
+
+OPENSSL_ia32_cpuid     PROC
+       mov     r8,rbx
+       mov     eax,1
+       cpuid
+       shl     rcx,32
+       mov     eax,edx
+       mov     rbx,r8
+       or      rax,rcx
+       ret
+OPENSSL_ia32_cpuid     ENDP
+_TEXT  ENDS
+
+CRT\$XIU       SEGMENT
+EXTRN  OPENSSL_cpuid_setup:PROC
+DQ     OPENSSL_cpuid_setup
+CRT\$XIU       ENDS
 END
 ___
 print<<___ if(!defined($win64a));
@@ -27,4 +74,66 @@ OPENSSL_rdtsc:
        or      %rdx,%rax
        ret
 .size  OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl OPENSSL_atomic_add
+.type  OPENSSL_atomic_add,\@function
+.align 16
+OPENSSL_atomic_add:
+       movl    (%rdi),%eax
+.Lspin:        lea     (%rsi,%rax),%r8
+lock;  cmpxchg %r8d,(%rdi)
+       jne     .Lspin
+       mov     %r8d,%eax
+       cdqe
+       ret
+.size  OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl OPENSSL_wipe_cpu
+.type  OPENSSL_wipe_cpu,\@function
+.align 16
+OPENSSL_wipe_cpu:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       xor     %rcx,%rcx
+       xor     %rdx,%rdx
+       xor     %rsi,%rsi
+       xor     %rdi,%rdi
+       xor     %r8,%r8
+       xor     %r9,%r9
+       xor     %r10,%r10
+       xor     %r11,%r11
+       lea     8(%rsp),%rax
+       ret
+.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.globl OPENSSL_ia32_cpuid
+.align 16
+OPENSSL_ia32_cpuid:
+       mov     %rbx,%r8
+       mov     \$1,%eax
+       cpuid
+       shl     \$32,%rcx
+       mov     %edx,%eax
+       mov     %r8,%rbx
+       or      %rcx,%rax
+       ret
+.size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.section       .init
+       call    OPENSSL_cpuid_setup
+       .align  16
 ___
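
Besides the rdtsc and cpuid helpers, the hunk above adds OPENSSL_atomic_add, which is a compare-and-swap retry loop around lock cmpxchg. A rough C equivalent, offered purely as an editorial sketch with GCC's __sync builtin standing in for the locked instruction:

    /* Editorial sketch of the OPENSSL_atomic_add semantics shown above. */
    static int atomic_add_sketch(volatile int *ptr, int amount)
    {
        int oldv, newv;

        do {
            oldv = *ptr;              /* movl (%rdi),%eax         */
            newv = oldv + amount;     /* lea  (%rsi,%rax),%r8     */
        } while (!__sync_bool_compare_and_swap(ptr, oldv, newv));
                                      /* lock cmpxchg %r8d,(%rdi) */
        return newv;                  /* mov %r8d,%eax ; cdqe     */
    }

The trailing cdqe in the assembly merely sign-extends the 32-bit result into %rax before returning.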
index 7d87eb1701bf53edf35ab407b3b438b38dad7577..867fa09e48f93d860eef9ff77776db31c5b728c9 100644 (file)
--- a/crypto/perlasm/x86unix.pl
+++ b/crypto/perlasm/x86unix.pl
@@ -161,7 +161,7 @@ sub main'shl        { &out2("sall",@_); }
 sub main'shr   { &out2("shrl",@_); }
 sub main'xor   { &out2("xorl",@_); }
 sub main'xorb  { &out2("xorb",@_); }
-sub main'add   { &out2("addl",@_); }
+sub main'add   { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
 sub main'adc   { &out2("adcl",@_); }
 sub main'sub   { &out2("subl",@_); }
 sub main'sbb   { &out2("sbbl",@_); }
@@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
 sub main'jnc   { &out1("jnc",@_); }
 sub main'jno   { &out1("jno",@_); }
 sub main'dec   { &out1("decl",@_); }
-sub main'inc   { &out1("incl",@_); }
+sub main'inc   { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
 sub main'push  { &out1("pushl",@_); $stack+=4; }
 sub main'pop   { &out1("popl",@_); $stack-=4; }
 sub main'pushf { &out0("pushfl"); $stack+=4; }
@@ -205,9 +205,10 @@ sub main'nop       { &out0("nop"); }
 sub main'test  { &out2("testl",@_); }
 sub main'bt    { &out2("btl",@_); }
 sub main'leave { &out0("leave"); }
-sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
-sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
+sub main'cpuid { &out0(".byte\t0x0f,0xa2"); }
+sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
 sub main'halt  { &out0("hlt"); }
+sub main'movz  { &out2("movzb",@_); }
 
 # SSE2
 sub main'emms  { &out0("emms"); }
@@ -558,7 +559,7 @@ sub main'file_end
                pushl   %ebx
                movl    %edx,%edi
                movl    \$1,%eax
-               .byte 0x0f; .byte 0xa2
+               .byte   0x0f,0xa2
                orl     \$1<<10,%edx
                movl    %edx,0(%edi)
                popl    %ebx
index 7ef889e5a1357cfedceb13855521d061b7d419e7..dbe3803f55b018c90baf5d2c0f5ac811d71896e8 100644 (file)
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -7,10 +7,10 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],"rc4-586.pl");
 
-$tx="eax";
-$ty="ebx";
-$x="ecx";
-$y="edx";
+$x="eax";
+$y="ebx";
+$tx="ecx";
+$ty="edx";
 $in="esi";
 $out="edi";
 $d="ebp";
@@ -31,7 +31,7 @@ sub RC4_loop
                        {
                         &mov($ty,      &swtmp(2));
                        &cmp($ty,       $in);
-                        &jle(&label("finished"));
+                        &jbe(&label("finished"));
                        &inc($in);
                        }
                else
@@ -39,7 +39,7 @@ sub RC4_loop
                        &add($ty,       8);
                         &inc($in);
                        &cmp($ty,       $in);
-                        &jl(&label("finished"));
+                        &jb(&label("finished"));
                        &mov(&swtmp(2), $ty);
                        }
                }
@@ -88,35 +88,44 @@ sub RC4
 
        &function_begin_B($name,"");
 
+       &mov($ty,&wparam(1));           # len
+       &cmp($ty,0);
+       &jne(&label("proceed"));
+       &ret();
+       &set_label("proceed");
+
        &comment("");
 
        &push("ebp");
         &push("ebx");
-       &mov(   $d,     &wparam(0));    # key
-        &mov(  $ty,    &wparam(1));    # num
        &push("esi");
         &push("edi");
+       &mov(   $d,     &wparam(0));    # key
+        &mov(  $in,    &wparam(2));
 
        &mov(   $x,     &DWP(0,$d,"",1));
         &mov(  $y,     &DWP(4,$d,"",1));
 
-       &mov(   $in,    &wparam(2));
+       &mov(   $out,   &wparam(3));
         &inc(  $x);
 
        &stack_push(3); # 3 temp variables
         &add(  $d,     8);
        &and(   $x,             0xff);
 
+       # detect compressed schedule, see commentary section in rc4_skey.c...
+       &cmp(&DWP(256,$d),-1);
+       &je(&label("RC4_CHAR"));
+
         &lea(  $ty,    &DWP(-8,$ty,$in));
 
        # check for 0 length input
 
-       &mov(   $out,   &wparam(3));
         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
        &mov(   $tx,    &DWP(0,$d,$x,4));
 
         &cmp(  $ty,    $in);
-       &jl(    &label("end")); # less than 8 bytes
+       &jb(    &label("end")); # less than 8 bytes
 
        &set_label("start");
 
@@ -148,7 +157,7 @@ sub RC4
        &mov(   &DWP(-4,$out,"",0),     $tx);
         &mov(  $tx,            &DWP(0,$d,$x,4));
        &cmp($in,       $ty);
-        &jle(&label("start"));
+        &jbe(&label("start"));
 
        &set_label("end");
 
@@ -162,6 +171,32 @@ sub RC4
        &RC4_loop(5,0,1);
        &RC4_loop(6,1,1);
 
+       &jmp(&label("finished"));
+
+       &align(16);
+       # this is essentially Intel P4 specific codepath, see rc4_skey.c...
+       &set_label("RC4_CHAR");
+
+       &lea    ($ty,&DWP(0,$in,$ty));
+       &mov    (&swtmp(2),$ty);
+
+       # strangely enough unrolled loop performs over 20% slower...
+       &set_label("RC4_CHAR_loop");
+               &movz   ($tx,&BP(0,$d,$x));
+               &add    (&LB($y),&LB($tx));
+               &movz   ($ty,&BP(0,$d,$y));
+               &movb   (&BP(0,$d,$y),&LB($tx));
+               &movb   (&BP(0,$d,$x),&LB($ty));
+               &add    (&LB($ty),&LB($tx));
+               &movz   ($ty,&BP(0,$d,$ty));
+               &xorb   (&LB($ty),&BP(0,$in));
+               &movb   (&BP(0,$out),&LB($ty));
+               &inc    (&LB($x));
+               &inc    ($in);
+               &inc    ($out);
+               &cmp    ($in,&swtmp(2));
+       &jb     (&label("RC4_CHAR_loop"));
+
        &set_label("finished");
        &dec(   $x);
         &stack_pop(3);
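
For readers who would rather not assemble the perlasm in their heads, the RC4_CHAR loop added above is plain byte-at-a-time RC4 over the compressed 256-byte table. An editorial C rendering follows; the function name and signature are illustrative, not OpenSSL API:

    #include <stddef.h>

    /* Editorial sketch of the byte-wide loop generated above. */
    static void rc4_char_loop_sketch(unsigned char S[256],
                                     unsigned char *xp, unsigned char *yp,
                                     const unsigned char *in,
                                     unsigned char *out, size_t len)
    {
        unsigned char x = *xp, y = *yp, tx, ty;

        while (len--) {
            x = (unsigned char)(x + 1);       /* inc  xl              */
            tx = S[x];                        /* movz tx, S[x]        */
            y  = (unsigned char)(y + tx);     /* add  yl, txl         */
            ty = S[y];                        /* movz ty, S[y]        */
            S[y] = tx;                        /* swap S[x] and S[y]   */
            S[x] = ty;
            *out++ = *in++ ^ S[(unsigned char)(tx + ty)];  /* keystream */
        }
        *xp = x; *yp = y;
    }

The committed loop pre-increments x before entering and bumps it at the bottom of each iteration, but the per-byte work is the same; note the perlasm comment that unrolling this loop actually ran over 20% slower.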
index cc3f0c0231b8c4ad6c37d58dbd2b8cf4cc0463b1..35e426d561a35a0c117cc3161132c07bb5df8a00 100755 (executable)
--- a/crypto/rc4/asm/rc4-amd64.pl
+++ b/crypto/rc4/asm/rc4-amd64.pl
 # Latter means that if you want to *estimate* what to expect from
 # *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
 
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 is to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on either AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+
 $output=shift;
 
 $win64a=1 if ($output =~ /win64a.[s|asm]/);
@@ -90,6 +97,8 @@ $code.=<<___;
        add     \$8,$dat
        movl    `&PTR("DWORD:-8[$dat]")`,$XX#d
        movl    `&PTR("DWORD:-4[$dat]")`,$YY#d
+       cmpl    \$-1,`&PTR("DWORD:256[$dat]")`
+       je      .LRC4_CHAR
        test    \$-8,$len
        jz      .Lloop1
 .align 16
@@ -167,6 +176,24 @@ $code.=<<___;
        dec     $len
        jnz     .Lloop1
        jmp     .Lexit
+
+.align 16
+.LRC4_CHAR:
+       inc     $XX#b
+       movzb   `&PTR("BYTE:[$dat+$XX]")`,$TX#d
+       add     $TX#b,$YY#b
+       movzb   `&PTR("BYTE:[$dat+$YY]")`,$TY#d
+       movb    $TX#b,`&PTR("BYTE:[$dat+$YY]")`
+       movb    $TY#b,`&PTR("BYTE:[$dat+$XX]")`
+       add     $TX#b,$TY#b
+       movzb   `&PTR("BYTE:[$dat+$TY]")`,$TY#d
+       xorb    `&PTR("BYTE:[$inp]")`,$TY#b
+       movb    $TY#b,`&PTR("BYTE:[$out]")`
+       inc     $inp
+       inc     $out
+       dec     $len
+       jnz     .LRC4_CHAR
+       jmp     .Lexit
 ___
 $code.=<<___ if (defined($win64a));
 RC4    ENDP
@@ -189,6 +216,8 @@ if (defined($win64a)) {
     $code =~ s/mov[bwlq]/mov/gm;
     $code =~ s/movzb/movzx/gm;
     $code =~ s/repret/DB\t0F3h,0C3h/gm;
+    $code =~ s/cmpl/cmp/gm;
+    $code =~ s/xorb/xor/gm;
 } else {
     $code =~ s/([QD]*WORD|BYTE)://gm;
     $code =~ s/repret/.byte\t0xF3,0xC3/gm;
index 3bb80b6ce9e07b85b6043996918d9049725626a3..c712e1632ea5e121a3a109cf74f807b6840470bc 100644 (file)
--- a/crypto/rc4/rc4_locl.h
+++ b/crypto/rc4/rc4_locl.h
@@ -1,4 +1,5 @@
 #ifndef HEADER_RC4_LOCL_H
 #define HEADER_RC4_LOCL_H
 #include <openssl/opensslconf.h>
+#include <cryptlib.h>
 #endif
index bb10c1ebe2892a3b69092f64975e5a33a7fc1b19..781ff2d8b9b86e6b2fda2c6f276201b3da9feef6 100644 (file)
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c
@@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
         unsigned int i;
         
         d= &(key->data[0]);
-       for (i=0; i<256; i++)
-               d[i]=i;
         key->x = 0;     
         key->y = 0;     
         id1=id2=0;     
 
-#define SK_LOOP(n) { \
+#define SK_LOOP(d,n) { \
                tmp=d[(n)]; \
                id2 = (data[id1] + tmp + id2) & 0xff; \
                if (++id1 == len) id1=0; \
                d[(n)]=d[id2]; \
                d[id2]=tmp; }
 
+#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
+# if   defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
+       defined(__INTEL__) || \
+       defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
+       if (sizeof(RC4_INT) > 1) {
+               /*
+                * Unlike all other x86 [and x86_64] implementations,
+                * Intel P4 core [including EM64T] was found to perform
+                * poorly with wider RC4_INT. Performance improvement
+                * for IA-32 hand-coded assembler turned out to be 2.8x
+                * if re-coded for RC4_CHAR! It's however inappropriate
+                * to just switch to RC4_CHAR for x86[_64], as non-P4
+                * implementations suffer from significant performance
+                * losses then, e.g. PIII exhibits >2x deterioration,
+                * and so does Opteron. In order to assure optimal
+                * all-round performance, let us [try to] detect P4 at
+                * run-time by checking upon HTT bit in CPU capability
+                * vector and set up compressed key schedule, which is
+                * recognized by correspondingly updated assembler
+                * module...
+                *                              <appro@fy.chalmers.se>
+                */
+               if (OPENSSL_ia32cap_P & (1<<28)) {
+                       unsigned char *cp=(unsigned char *)d;
+
+                       for (i=0;i<256;i++) cp[i]=i;
+                       for (i=0;i<256;i++) SK_LOOP(cp,i);
+                       /* mark schedule as compressed! */
+                       d[256/sizeof(RC4_INT)]=-1;
+                       return;
+               }
+       }
+# endif
+#endif
+       for (i=0; i < 256; i++) d[i]=i;
        for (i=0; i < 256; i+=4)
                {
-               SK_LOOP(i+0);
-               SK_LOOP(i+1);
-               SK_LOOP(i+2);
-               SK_LOOP(i+3);
+               SK_LOOP(d,i+0);
+               SK_LOOP(d,i+1);
+               SK_LOOP(d,i+2);
+               SK_LOOP(d,i+3);
                }
        }
     
index 790e8e9b1ef2d120c6a55a2f5546274547be42f4..ec6b655c17b3459260ab7ff91e7aeb026658b616 100644 (file)
--- a/doc/crypto/OPENSSL_ia32cap.pod
+++ b/doc/crypto/OPENSSL_ia32cap.pod
@@ -14,11 +14,12 @@ OPENSSL_ia32cap
 Value returned by OPENSSL_ia32cap_loc() is address of a variable
 containing IA-32 processor capabilities bit vector as it appears in EDX
 register after executing CPUID instruction with EAX=1 input value (see
-Intel Application Note #241618). Naturally it's meaningful on IA-32
+Intel Application Note #241618). Naturally it's meaningful on IA-32[E]
 platforms only. The variable is normally set up automatically upon
 toolkit initialization, but can be manipulated afterwards to modify
-crypto library behaviour. For the moment of this writing only two bits
-are significant, namely bit #26 denoting SSE2 support, and bit #4
+crypto library behaviour. For the moment of this writing three bits are
+significant, namely bit #28 denoting Hyperthreading, which is used to
+distinguish Intel P4 core, bit #26 denoting SSE2 support, and bit #4
 denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
 for example disables high-performance SSE2 code present in the crypto
 library. You might have to do this if target OpenSSL application is
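
A short editorial example of the run-time manipulation this paragraph describes, assuming the unsigned long *OPENSSL_ia32cap_loc(void) prototype from this page's SYNOPSIS and its declaration in <openssl/crypto.h>:

    #include <openssl/crypto.h>

    /* Editorial sketch: mask capability bits before the library uses them. */
    static void downgrade_ia32cap_sketch(void)
    {
        unsigned long *cap = OPENSSL_ia32cap_loc();

        *cap &= ~(1UL << 26);       /* bit #26: behave as if SSE2 were absent */
        /* *cap &= ~(1UL << 28); */ /* bit #28: keys scheduled afterwards skip
                                       the P4/RC4_CHAR layout                  */
    }

Such masking only affects decisions made after it runs, so it should happen before keys are scheduled or other algorithm-specific setup takes place.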