RC4 tune-up for Intel P4 core, both 32- and 64-bit ones.

author     Andy Polyakov <appro@openssl.org>   Sun, 21 Nov 2004 10:36:25 +0000 (10:36 +0000)
committer  Andy Polyakov <appro@openssl.org>   Sun, 21 Nov 2004 10:36:25 +0000 (10:36 +0000)

As it's apparently impossible to compose blended code which would
perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR
code-path is introduced and the P4 core is detected at run-time. This
way we keep the original performance on non-P4 implementations and
turbo-charge P4 performance by a factor of 2.8x (on the 32-bit core).
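
A minimal C sketch of the dispatch idea described above, offered for orientation only (it is not part of the commit): bit #28 of the CPUID feature word in EDX, the Hyper-Threading flag, is treated as a "this looks like a P4" hint. The helper name is hypothetical, and the capability word is passed in as a parameter so the snippet stands alone; in the tree it lives in OPENSSL_ia32cap_P.

    /* Orientation sketch only, not code from this commit. */
    static int prefer_rc4_char_sketch(unsigned long ia32cap)
    {
        /* bit #28 = Hyper-Threading, present on P4-class cores */
        return (ia32cap & (1UL << 28)) != 0;
    }

The rc4_skey.c and assembler hunks below branch on exactly this hint: the key schedule is laid out byte-wide on P4 and left as-is everywhere else.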

Configure
TABLE
crypto/amd64cpuid.pl
crypto/perlasm/x86unix.pl
crypto/rc4/asm/rc4-586.pl
crypto/rc4/asm/rc4-amd64.pl
crypto/rc4/rc4_locl.h
crypto/rc4/rc4_skey.c
doc/crypto/OPENSSL_ia32cap.pod

index cc91c3dcb8f84ac73950acdb9c9895fb0eae932c..cce2af2b82208a4d1f738013168ffebe206fecba 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -318,7 +318,7 @@ my %table=(
 "linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64",  "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"linux-x86_64",        "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"linux-x86_64",        "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-parisc",        "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
 #### SPARC Linux setups
diff --git a/TABLE b/TABLE
index 2910ab3f13ce263a4fe724bcecd6a69f60e49a29..e8a258608b00471b25a9d81988621d7f2a55bf15 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -2086,7 +2086,7 @@ $unistd       =
 $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -rdynamic -ldl
-$bn_ops       = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
+$bn_ops       = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
 $cpuid_obj    = x86cpuid-elf.o
 $bn_obj       = asm/bn86-elf.o asm/co86-elf.o
 $des_obj      = asm/dx86-elf.o asm/yx86-elf.o
@@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
-$cpuid_obj    = 
+$cpuid_obj    = amd64cpuid.o
 $bn_obj       = asm/x86_64-gcc.o
 $des_obj      = 
 $aes_obj      = 
index baf801d0623c6e88ed9dd057b8a8df05cccd85a7..097f6b8d5c17349876b03e5363768f985c5fa6e6 100644 (file)
--- a/crypto/amd64cpuid.pl
+++ b/crypto/amd64cpuid.pl
@@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
 open STDOUT,">$output" || die "can't open $output: $!";
 
 print<<___ if(defined($win64a));
-TEXT   SEGMENT
+_TEXT  SEGMENT
 PUBLIC OPENSSL_rdtsc
 ALIGN  16
-OPENSSL_rdtsc  PROC NEAR
+OPENSSL_rdtsc  PROC
        rdtsc
        shl     rdx,32
        or      rax,rdx
        ret
 OPENSSL_rdtsc  ENDP
-TEXT   ENDS
+
+PUBLIC OPENSSL_atomic_add
+ALIGN  16
+OPENSSL_atomic_add     PROC
+       mov     eax,DWORD PTR[rcx]
+\$Lspin:       lea     r8,DWORD PTR[rdx+rax]
+lock   cmpxchg DWORD PTR[rcx],r8d
+       jne     \$Lspin
+       mov     eax,r8d
+       cdqe    
+       ret
+OPENSSL_atomic_add     ENDP
+
+PUBLIC OPENSSL_wipe_cpu
+ALIGN  16
+OPENSSL_wipe_cpu       PROC
+       pxor    xmm0,xmm0
+       pxor    xmm1,xmm1
+       pxor    xmm2,xmm2
+       pxor    xmm3,xmm3
+       pxor    xmm4,xmm4
+       pxor    xmm5,xmm5
+       xor     rcx,rcx
+       xor     rdx,rdx
+       xor     r8,r8
+       xor     r9,r9
+       xor     r10,r10
+       xor     r11,r11
+       lea     rax,QWORD PTR[rsp+8]
+       ret
+OPENSSL_wipe_cpu       ENDP
+
+OPENSSL_ia32_cpuid     PROC
+       mov     r8,rbx
+       mov     eax,1
+       cpuid
+       shl     rcx,32
+       mov     eax,edx
+       mov     rbx,r8
+       or      rax,rcx
+       ret
+OPENSSL_ia32_cpuid     ENDP
+_TEXT  ENDS
+
+CRT\$XIU       SEGMENT
+EXTRN  OPENSSL_cpuid_setup:PROC
+DQ     OPENSSL_cpuid_setup
+CRT\$XIU       ENDS
 END
 ___
 print<<___ if(!defined($win64a));
@@ -27,4 +74,66 @@ OPENSSL_rdtsc:
        or      %rdx,%rax
        ret
 .size  OPENSSL_rdtsc,.-OPENSSL_rdtsc
+
+.globl OPENSSL_atomic_add
+.type  OPENSSL_atomic_add,\@function
+.align 16
+OPENSSL_atomic_add:
+       movl    (%rdi),%eax
+.Lspin:        lea     (%rsi,%rax),%r8
+lock;  cmpxchg %r8d,(%rdi)
+       jne     .Lspin
+       mov     %r8d,%eax
+       cdqe
+       ret
+.size  OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.globl OPENSSL_wipe_cpu
+.type  OPENSSL_wipe_cpu,\@function
+.align 16
+OPENSSL_wipe_cpu:
+       pxor    %xmm0,%xmm0
+       pxor    %xmm1,%xmm1
+       pxor    %xmm2,%xmm2
+       pxor    %xmm3,%xmm3
+       pxor    %xmm4,%xmm4
+       pxor    %xmm5,%xmm5
+       pxor    %xmm6,%xmm6
+       pxor    %xmm7,%xmm7
+       pxor    %xmm8,%xmm8
+       pxor    %xmm9,%xmm9
+       pxor    %xmm10,%xmm10
+       pxor    %xmm11,%xmm11
+       pxor    %xmm12,%xmm12
+       pxor    %xmm13,%xmm13
+       pxor    %xmm14,%xmm14
+       pxor    %xmm15,%xmm15
+       xor     %rcx,%rcx
+       xor     %rdx,%rdx
+       xor     %rsi,%rsi
+       xor     %rdi,%rdi
+       xor     %r8,%r8
+       xor     %r9,%r9
+       xor     %r10,%r10
+       xor     %r11,%r11
+       lea     8(%rsp),%rax
+       ret
+.size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.globl OPENSSL_ia32_cpuid
+.align 16
+OPENSSL_ia32_cpuid:
+       mov     %rbx,%r8
+       mov     \$1,%eax
+       cpuid
+       shl     \$32,%rcx
+       mov     %edx,%eax
+       mov     %r8,%rbx
+       or      %rcx,%rax
+       ret
+.size  OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.section       .init
+       call    OPENSSL_cpuid_setup
+       .align  16
 ___
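
Besides the rdtsc and cpuid helpers, the hunk above adds OPENSSL_atomic_add, which is a compare-and-swap retry loop around lock cmpxchg. A rough C equivalent, offered purely as an editorial sketch with GCC's __sync builtin standing in for the locked instruction:

    /* Editorial sketch of the OPENSSL_atomic_add semantics shown above. */
    static int atomic_add_sketch(volatile int *ptr, int amount)
    {
        int oldv, newv;

        do {
            oldv = *ptr;              /* movl (%rdi),%eax         */
            newv = oldv + amount;     /* lea  (%rsi,%rax),%r8     */
        } while (!__sync_bool_compare_and_swap(ptr, oldv, newv));
                                      /* lock cmpxchg %r8d,(%rdi) */
        return newv;                  /* mov %r8d,%eax ; cdqe     */
    }

The trailing cdqe in the assembly merely sign-extends the 32-bit result into %rax before returning.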
index 7d87eb1701bf53edf35ab407b3b438b38dad7577..867fa09e48f93d860eef9ff77776db31c5b728c9 100644 (file)
--- a/crypto/perlasm/x86unix.pl
+++ b/crypto/perlasm/x86unix.pl
@@ -161,7 +161,7 @@ sub main'shl        { &out2("sall",@_); }
 sub main'shr   { &out2("shrl",@_); }
 sub main'xor   { &out2("xorl",@_); }
 sub main'xorb  { &out2("xorb",@_); }
-sub main'add   { &out2("addl",@_); }
+sub main'add   { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
 sub main'adc   { &out2("adcl",@_); }
 sub main'sub   { &out2("subl",@_); }
 sub main'sbb   { &out2("sbbl",@_); }
@@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
 sub main'jnc   { &out1("jnc",@_); }
 sub main'jno   { &out1("jno",@_); }
 sub main'dec   { &out1("decl",@_); }
-sub main'inc   { &out1("incl",@_); }
+sub main'inc   { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
 sub main'push  { &out1("pushl",@_); $stack+=4; }
 sub main'pop   { &out1("popl",@_); $stack-=4; }
 sub main'pushf { &out0("pushfl"); $stack+=4; }
@@ -205,9 +205,10 @@ sub main'nop       { &out0("nop"); }
 sub main'test  { &out2("testl",@_); }
 sub main'bt    { &out2("btl",@_); }
 sub main'leave { &out0("leave"); }
-sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
-sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
+sub main'cpuid { &out0(".byte\t0x0f,0xa2"); }
+sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
 sub main'halt  { &out0("hlt"); }
+sub main'movz  { &out2("movzb",@_); }
 
 # SSE2
 sub main'emms  { &out0("emms"); }
@@ -558,7 +559,7 @@ sub main'file_end
                pushl   %ebx
                movl    %edx,%edi
                movl    \$1,%eax
-               .byte 0x0f; .byte 0xa2
+               .byte   0x0f,0xa2
                orl     \$1<<10,%edx
                movl    %edx,0(%edi)
                popl    %ebx
index 7ef889e5a1357cfedceb13855521d061b7d419e7..dbe3803f55b018c90baf5d2c0f5ac811d71896e8 100644 (file)
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -7,10 +7,10 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],"rc4-586.pl");
 
-$tx="eax";
-$ty="ebx";
-$x="ecx";
-$y="edx";
+$x="eax";
+$y="ebx";
+$tx="ecx";
+$ty="edx";
 $in="esi";
 $out="edi";
 $d="ebp";
@@ -31,7 +31,7 @@ sub RC4_loop
                        {
                         &mov($ty,      &swtmp(2));
                        &cmp($ty,       $in);
-                        &jle(&label("finished"));
+                        &jbe(&label("finished"));
                        &inc($in);
                        }
                else
@@ -39,7 +39,7 @@ sub RC4_loop
                        &add($ty,       8);
                         &inc($in);
                        &cmp($ty,       $in);
-                        &jl(&label("finished"));
+                        &jb(&label("finished"));
                        &mov(&swtmp(2), $ty);
                        }
                }
@@ -88,35 +88,44 @@ sub RC4
 
        &function_begin_B($name,"");
 
+       &mov($ty,&wparam(1));           # len
+       &cmp($ty,0);
+       &jne(&label("proceed"));
+       &ret();
+       &set_label("proceed");
+
        &comment("");
 
        &push("ebp");
         &push("ebx");
-       &mov(   $d,     &wparam(0));    # key
-        &mov(  $ty,    &wparam(1));    # num
        &push("esi");
         &push("edi");
+       &mov(   $d,     &wparam(0));    # key
+        &mov(  $in,    &wparam(2));
 
        &mov(   $x,     &DWP(0,$d,"",1));
         &mov(  $y,     &DWP(4,$d,"",1));
 
-       &mov(   $in,    &wparam(2));
+       &mov(   $out,   &wparam(3));
         &inc(  $x);
 
        &stack_push(3); # 3 temp variables
         &add(  $d,     8);
        &and(   $x,             0xff);
 
+       # detect compressed schedule, see commentary section in rc4_skey.c...
+       &cmp(&DWP(256,$d),-1);
+       &je(&label("RC4_CHAR"));
+
         &lea(  $ty,    &DWP(-8,$ty,$in));
 
        # check for 0 length input
 
-       &mov(   $out,   &wparam(3));
         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
        &mov(   $tx,    &DWP(0,$d,$x,4));
 
         &cmp(  $ty,    $in);
-       &jl(    &label("end")); # less than 8 bytes
+       &jb(    &label("end")); # less than 8 bytes
 
        &set_label("start");
 
@@ -148,7 +157,7 @@ sub RC4
        &mov(   &DWP(-4,$out,"",0),     $tx);
         &mov(  $tx,            &DWP(0,$d,$x,4));
        &cmp($in,       $ty);
-        &jle(&label("start"));
+        &jbe(&label("start"));
 
        &set_label("end");
 
@@ -162,6 +171,32 @@ sub RC4
        &RC4_loop(5,0,1);
        &RC4_loop(6,1,1);
 
+       &jmp(&label("finished"));
+
+       &align(16);
+       # this is essentially Intel P4 specific codepath, see rc4_skey.c...
+       &set_label("RC4_CHAR");
+
+       &lea    ($ty,&DWP(0,$in,$ty));
+       &mov    (&swtmp(2),$ty);
+
+       # strangely enough unrolled loop performs over 20% slower...
+       &set_label("RC4_CHAR_loop");
+               &movz   ($tx,&BP(0,$d,$x));
+               &add    (&LB($y),&LB($tx));
+               &movz   ($ty,&BP(0,$d,$y));
+               &movb   (&BP(0,$d,$y),&LB($tx));
+               &movb   (&BP(0,$d,$x),&LB($ty));
+               &add    (&LB($ty),&LB($tx));
+               &movz   ($ty,&BP(0,$d,$ty));
+               &xorb   (&LB($ty),&BP(0,$in));
+               &movb   (&BP(0,$out),&LB($ty));
+               &inc    (&LB($x));
+               &inc    ($in);
+               &inc    ($out);
+               &cmp    ($in,&swtmp(2));
+       &jb     (&label("RC4_CHAR_loop"));
+
        &set_label("finished");
        &dec(   $x);
         &stack_pop(3);
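
For readers who would rather not assemble the perlasm in their heads, the RC4_CHAR loop added above is plain byte-at-a-time RC4 over the compressed 256-byte table. An editorial C rendering follows; the function name and signature are illustrative, not OpenSSL API:

    #include <stddef.h>

    /* Editorial sketch of the byte-wide loop generated above. */
    static void rc4_char_loop_sketch(unsigned char S[256],
                                     unsigned char *xp, unsigned char *yp,
                                     const unsigned char *in,
                                     unsigned char *out, size_t len)
    {
        unsigned char x = *xp, y = *yp, tx, ty;

        while (len--) {
            x = (unsigned char)(x + 1);       /* inc  xl              */
            tx = S[x];                        /* movz tx, S[x]        */
            y  = (unsigned char)(y + tx);     /* add  yl, txl         */
            ty = S[y];                        /* movz ty, S[y]        */
            S[y] = tx;                        /* swap S[x] and S[y]   */
            S[x] = ty;
            *out++ = *in++ ^ S[(unsigned char)(tx + ty)];  /* keystream */
        }
        *xp = x; *yp = y;
    }

The committed loop pre-increments x before entering and bumps it at the bottom of each iteration, but the per-byte work is the same; note the perlasm comment that unrolling this loop actually ran over 20% slower.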
index cc3f0c0231b8c4ad6c37d58dbd2b8cf4cc0463b1..35e426d561a35a0c117cc3161132c07bb5df8a00 100755 (executable)
--- a/crypto/rc4/asm/rc4-amd64.pl
+++ b/crypto/rc4/asm/rc4-amd64.pl
 # Latter means that if you want to *estimate* what to expect from
 # *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
 
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 is to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on either AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+
 $output=shift;
 
 $win64a=1 if ($output =~ /win64a.[s|asm]/);
@@ -90,6 +97,8 @@ $code.=<<___;
        add     \$8,$dat
        movl    `&PTR("DWORD:-8[$dat]")`,$XX#d
        movl    `&PTR("DWORD:-4[$dat]")`,$YY#d
+       cmpl    \$-1,`&PTR("DWORD:256[$dat]")`
+       je      .LRC4_CHAR
        test    \$-8,$len
        jz      .Lloop1
 .align 16
@@ -167,6 +176,24 @@ $code.=<<___;
        dec     $len
        jnz     .Lloop1
        jmp     .Lexit
+
+.align 16
+.LRC4_CHAR:
+       inc     $XX#b
+       movzb   `&PTR("BYTE:[$dat+$XX]")`,$TX#d
+       add     $TX#b,$YY#b
+       movzb   `&PTR("BYTE:[$dat+$YY]")`,$TY#d
+       movb    $TX#b,`&PTR("BYTE:[$dat+$YY]")`
+       movb    $TY#b,`&PTR("BYTE:[$dat+$XX]")`
+       add     $TX#b,$TY#b
+       movzb   `&PTR("BYTE:[$dat+$TY]")`,$TY#d
+       xorb    `&PTR("BYTE:[$inp]")`,$TY#b
+       movb    $TY#b,`&PTR("BYTE:[$out]")`
+       inc     $inp
+       inc     $out
+       dec     $len
+       jnz     .LRC4_CHAR
+       jmp     .Lexit
 ___
 $code.=<<___ if (defined($win64a));
 RC4    ENDP
@@ -189,6 +216,8 @@ if (defined($win64a)) {
     $code =~ s/mov[bwlq]/mov/gm;
     $code =~ s/movzb/movzx/gm;
     $code =~ s/repret/DB\t0F3h,0C3h/gm;
+    $code =~ s/cmpl/cmp/gm;
+    $code =~ s/xorb/xor/gm;
 } else {
     $code =~ s/([QD]*WORD|BYTE)://gm;
     $code =~ s/repret/.byte\t0xF3,0xC3/gm;
index 3bb80b6ce9e07b85b6043996918d9049725626a3..c712e1632ea5e121a3a109cf74f807b6840470bc 100644 (file)
--- a/crypto/rc4/rc4_locl.h
+++ b/crypto/rc4/rc4_locl.h
@@ -1,4 +1,5 @@
 #ifndef HEADER_RC4_LOCL_H
 #define HEADER_RC4_LOCL_H
 #include <openssl/opensslconf.h>
+#include <cryptlib.h>
 #endif
index bb10c1ebe2892a3b69092f64975e5a33a7fc1b19..781ff2d8b9b86e6b2fda2c6f276201b3da9feef6 100644 (file)
--- a/crypto/rc4/rc4_skey.c
+++ b/crypto/rc4/rc4_skey.c
@@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
         unsigned int i;
         
         d= &(key->data[0]);
-       for (i=0; i<256; i++)
-               d[i]=i;
         key->x = 0;     
         key->y = 0;     
         id1=id2=0;     
 
-#define SK_LOOP(n) { \
+#define SK_LOOP(d,n) { \
                tmp=d[(n)]; \
                id2 = (data[id1] + tmp + id2) & 0xff; \
                if (++id1 == len) id1=0; \
                d[(n)]=d[id2]; \
                d[id2]=tmp; }
 
+#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
+# if   defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
+       defined(__INTEL__) || \
+       defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
+       if (sizeof(RC4_INT) > 1) {
+               /*
+                * Unlike all other x86 [and x86_64] implementations,
+                * Intel P4 core [including EM64T] was found to perform
+                * poorly with wider RC4_INT. Performance improvement
+                * for IA-32 hand-coded assembler turned out to be 2.8x
+                * if re-coded for RC4_CHAR! It's however inappropriate
+                * to just switch to RC4_CHAR for x86[_64], as non-P4
+                * implementations suffer from significant performance
+                * losses then, e.g. PIII exhibits >2x deterioration,
+                * and so does Opteron. In order to assure optimal
+                * all-round performance, let us [try to] detect P4 at
+                * run-time by checking upon HTT bit in CPU capability
+                * vector and set up compressed key schedule, which is
+                * recognized by correspondingly updated assembler
+                * module...
+                *                              <appro@fy.chalmers.se>
+                */
+               if (OPENSSL_ia32cap_P & (1<<28)) {
+                       unsigned char *cp=(unsigned char *)d;
+
+                       for (i=0;i<256;i++) cp[i]=i;
+                       for (i=0;i<256;i++) SK_LOOP(cp,i);
+                       /* mark schedule as compressed! */
+                       d[256/sizeof(RC4_INT)]=-1;
+                       return;
+               }
+       }
+# endif
+#endif
+       for (i=0; i < 256; i++) d[i]=i;
        for (i=0; i < 256; i+=4)
                {
-               SK_LOOP(i+0);
-               SK_LOOP(i+1);
-               SK_LOOP(i+2);
-               SK_LOOP(i+3);
+               SK_LOOP(d,i+0);
+               SK_LOOP(d,i+1);
+               SK_LOOP(d,i+2);
+               SK_LOOP(d,i+3);
                }
        }
     
index 790e8e9b1ef2d120c6a55a2f5546274547be42f4..ec6b655c17b3459260ab7ff91e7aeb026658b616 100644 (file)
--- a/doc/crypto/OPENSSL_ia32cap.pod
+++ b/doc/crypto/OPENSSL_ia32cap.pod
@@ -14,11 +14,12 @@ OPENSSL_ia32cap
 Value returned by OPENSSL_ia32cap_loc() is address of a variable
 containing IA-32 processor capabilities bit vector as it appears in EDX
 register after executing CPUID instruction with EAX=1 input value (see
-Intel Application Note #241618). Naturally it's meaningful on IA-32
+Intel Application Note #241618). Naturally it's meaningful on IA-32[E]
 platforms only. The variable is normally set up automatically upon
 toolkit initialization, but can be manipulated afterwards to modify
-crypto library behaviour. For the moment of this writing only two bits
-are significant, namely bit #26 denoting SSE2 support, and bit #4
+crypto library behaviour. For the moment of this writing three bits are
+significant, namely bit #28 denoting Hyperthreading, which is used to
+distinguish Intel P4 core, bit #26 denoting SSE2 support, and bit #4
 denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
 for example disables high-performance SSE2 code present in the crypto
 library. You might have to do this if target OpenSSL application is
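
A short editorial example of the run-time manipulation this paragraph describes, assuming the unsigned long *OPENSSL_ia32cap_loc(void) prototype from this page's SYNOPSIS and its declaration in <openssl/crypto.h>:

    #include <openssl/crypto.h>

    /* Editorial sketch: mask capability bits before the library uses them. */
    static void downgrade_ia32cap_sketch(void)
    {
        unsigned long *cap = OPENSSL_ia32cap_loc();

        *cap &= ~(1UL << 26);       /* bit #26: behave as if SSE2 were absent */
        /* *cap &= ~(1UL << 28); */ /* bit #28: keys scheduled afterwards skip
                                       the P4/RC4_CHAR layout                  */
    }

Such masking only affects decisions made after it runs, so it should happen before keys are scheduled or other algorithm-specific setup takes place.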