From b9064221499d6091dbdac4acacd23247faf318ba Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Mon, 16 May 2011 20:35:11 +0000
Subject: [PATCH] x86[_64]cpuid.pl: handle new extensions.

---
 crypto/x86_64cpuid.pl          | 57 ++++++++++++++++++++--------
 crypto/x86cpuid.pl             | 63 +++++++++++++++++++++++--------
 doc/crypto/OPENSSL_ia32cap.pod | 69 ++++++++++++++++++++++------------
 3 files changed, 133 insertions(+), 56 deletions(-)

diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index ecfcfc763c..ba95f0b229 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -47,7 +47,7 @@ OPENSSL_rdtsc:
 .type	OPENSSL_ia32_cpuid,\@abi-omnipotent
 .align	16
 OPENSSL_ia32_cpuid:
-	mov	%rbx,%r8
+	mov	%rbx,%r8		# save %rbx
 
 	xor	%eax,%eax
 	cpuid
@@ -79,7 +79,15 @@ OPENSSL_ia32_cpuid:
 	# AMD specific
 	mov	\$0x80000000,%eax
 	cpuid
-	cmp	\$0x80000008,%eax
+	cmp	\$0x80000001,%eax	# is extended level 0x80000001 there?
+	jb	.Lintel
+	mov	%eax,%r10d		# keep max extended level for later
+	mov	\$0x80000001,%eax
+	cpuid
+	or	%ecx,%r9d		# fold extended %ecx flags into %r9d
+	and	\$0x00000801,%r9d	# keep bit 0 and AMD XOP bit, 1<<11
+
+	cmp	\$0x80000008,%r10d	# compare saved max extended level
 	jb	.Lintel
 
 	mov	\$0x80000008,%eax
@@ -90,12 +98,12 @@ OPENSSL_ia32_cpuid:
 	mov	\$1,%eax
 	cpuid
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	shr	\$16,%ebx		# number of logical processors
 	cmp	%r10b,%bl
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
-	jmp	.Ldone
+	jmp	.Lgeneric
 
 .Lintel:
 	cmp	\$4,%r11d
@@ -121,21 +129,38 @@ OPENSSL_ia32_cpuid:
 	or	\$0x40000000,%edx	# use reserved bit to skip unrolled loop
 .Lnotintel:
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
 	cmp	\$0,%r10d
-	je	.Ldone
+	je	.Lgeneric
 
 	or	\$0x10000000,%edx	# 1<<28
 	shr	\$16,%ebx
 	cmp	\$1,%bl			# see if cache is shared
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
-.Ldone:
+.Lgeneric:
+	and	\$0x00000800,%r9d	# isolate AMD XOP flag
+	and	\$0xfffff7ff,%ecx
+	or	%r9d,%ecx		# merge AMD XOP flag
+
 	shl	\$32,%rcx
-	mov	%edx,%eax
-	mov	%r8,%rbx
-	or	%rcx,%rax
+	mov	%edx,%ebx
+	or	%rcx,%rbx		# compose capability vector in %rbx
+	bt	\$27+32,%rcx		# check OSXSAVE bit
+	jnc	.Lclear_avx
+	xor	%ecx,%ecx		# XCR0
+	.byte	0x0f,0x01,0xd0		# xgetbv
+	and	\$6,%eax		# isolate XMM and YMM state support
+	cmp	\$6,%eax
+	je	.Ldone
+.Lclear_avx:
+	btr	\$28+32,%rbx		# clear AVX, FMA and AMD XOP bits one
+	btr	\$12+32,%rbx		# by one, so the %edx half in the low
+	btr	\$11+32,%rbx		# 32 bits of %rbx is left untouched
+.Ldone:
+	mov	%rbx,%rax
+	mov	%r8,%rbx		# restore %rbx
 	ret
 .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 
@@ -250,7 +275,7 @@ OPENSSL_instrument_bus:
 	mov	%eax,$lasttick	# lasttick = tick
 	mov	\$0,$lastdiff	# lastdiff = 0
 	clflush	($out)
-	lock
+	.byte	0xf0		# lock
 	add	$lastdiff,($out)
 	jmp	.Loop
 .align	16
@@ -260,7 +285,7 @@ OPENSSL_instrument_bus:
 	mov	%edx,$lasttick
 	mov	%eax,$lastdiff
 	clflush	($out)
-	lock
+	.byte	0xf0		# lock
 	add	%eax,($out)
 	lea	4($out),$out
 	sub	\$1,$cnt
@@ -284,7 +309,7 @@ OPENSSL_instrument_bus2:
 	mov	\$0,$lastdiff	# lastdiff = 0
 
 	clflush	($out)
-	lock
+	.byte	0xf0		# lock
 	add	$lastdiff,($out)
 
 	rdtsc			# collect 1st diff
@@ -294,7 +319,7 @@ OPENSSL_instrument_bus2:
 	mov	%eax,$lastdiff	# lastdiff = diff
 .Loop2:
 	clflush	($out)
-	lock
+	.byte	0xf0		# lock
 	add	%eax,($out)	# accumulate diff
 
 	sub	\$1,$max
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index 0513398739..f424c2debe 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -20,7 +20,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&pop	("eax");
 	&xor	("ecx","eax");
 	&bt	("ecx",21);
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&xor	("eax","eax");
 	&cpuid	();
 	&mov	("edi","eax");		# max value for standard query level
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	# AMD specific
 	&mov	("eax",0x80000000);
 	&cpuid	();
-	&cmp	("eax",0x80000008);
+	&cmp	("eax",0x80000001);	# is extended level 0x80000001 there?
+	&jb	(&label("intel"));
+	&mov	("esi","eax");		# keep max extended level for later
+	&mov	("eax",0x80000001);
+	&cpuid	();
+	&or	("ebp","ecx");		# fold extended ecx flags into ebp
+	&and	("ebp",1<<11|1);	# keep bit 0 and AMD XOP bit, 1<<11
+	&cmp	("esi",0x80000008);	# compare saved max extended level
 	&jb	(&label("intel"));
 
 	&mov	("eax",0x80000008);
@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&mov	("eax",1);
 	&cpuid	();
 	&bt	("edx",28);
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&shr	("ebx",16);
 	&and	("ebx",0xff);
 	&cmp	("ebx","esi");
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit
-	&jmp	(&label("done"));
+	&jmp	(&label("generic"));
 	
 &set_label("intel");
 	&cmp	("edi",4);
@@ -93,19 +100,42 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&or	("edx",1<<20);		# use reserved bit to engage RC4_CHAR
 &set_label("notP4");
 	&bt	("edx",28);		# test hyper-threading bit
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&and	("edx",0xefffffff);
 	&cmp	("edi",0);
-	&je	(&label("done"));
+	&je	(&label("generic"));
 
 	&or	("edx",0x10000000);
 	&shr	("ebx",16);
 	&cmp	(&LB("ebx"),1);
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
+
+&set_label("generic");
+	&and	("ebp",1<<11);		# isolate AMD XOP flag
+	&and	("ecx",~(1<<11));	# clear bit 11 in ecx before the merge
+	&mov	("esi","edx");		# esi is what "done" returns in eax
+	&or	("ebp","ecx");		# merge AMD XOP flag
+
+	&bt	("ecx",26);		# check XSAVE bit
+	&jnc	(&label("done"));
+	&bt	("ecx",27);		# check OSXSAVE bit
+	&jnc	(&label("clear_xmm"));
+	&xor	("ecx","ecx");		# XCR0
+	&data_byte(0x0f,0x01,0xd0);	# xgetbv
+	&and	("eax",6);		# isolate XMM and YMM state support
+	&cmp	("eax",6);
+	&je	(&label("done"));	# both enabled, keep every flag
+	&cmp	("eax",2);
+	&je	(&label("clear_avx"));	# XMM state only, drop AVX-class flags
+&set_label("clear_xmm");
+	&and	("ebp",~(1<<25|1<<1));	# clear AESNI and PCLMULQDQ bits
+	&and	("esi",~(1<<24));	# clear FXSR
+&set_label("clear_avx");
+	&and	("ebp",~(1<<28|1<<12|1<<11));# clear AVX, FMA and AMD XOP bits
 &set_label("done");
-	&mov	("eax","edx");
-	&mov	("edx","ecx");
+	&mov	("eax","esi");		# return the adjusted edx flags in eax
+	&mov	("edx","ebp");		# return the adjusted ecx flags in edx
 &function_end("OPENSSL_ia32_cpuid");
 
 &external_label("OPENSSL_ia32cap_P");
@@ -199,8 +229,10 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&bt	(&DWP(0,"ecx"),1);
 	&jnc	(&label("no_x87"));
 	if ($sse2) {
-		&bt	(&DWP(0,"ecx"),26);
-		&jnc	(&label("no_sse2"));
+		&mov	("ecx",&DWP(0,"ecx"));	# ecx is a pointer here, load the flags
+		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
+		&cmp	("ecx",1<<26|1<<24);	# both must be set to touch XMM
+		&jne	(&label("no_sse2"));
 		&pxor	("xmm0","xmm0");
 		&pxor	("xmm1","xmm1");
 		&pxor	("xmm2","xmm2");
@@ -331,7 +362,7 @@ my $max = "ebp";
 	&mov	($lasttick,"eax");	# lasttick = tick
 	&mov	($lastdiff,0);		# lastdiff = 0
 	&clflush(&DWP(0,$out));
-	&lock	();
+	&data_byte(0xf0);		# lock
 	&add	(&DWP(0,$out),$lastdiff);
 	&jmp	(&label("loop"));
 
@@ -342,7 +373,7 @@ my $max = "ebp";
 	&mov	($lasttick,"edx");	# lasttick = tick
 	&mov	($lastdiff,"eax");	# lastdiff = diff
 	&clflush(&DWP(0,$out));
-	&lock	();
+	&data_byte(0xf0);		# lock
 	&add	(&DWP(0,$out),"eax");	# accumulate diff
 	&lea	($out,&DWP(4,$out));	# ++$out
 	&sub	($cnt,1);		# --$cnt
@@ -371,7 +402,7 @@ my $max = "ebp";
 	&mov	($lastdiff,0);		# lastdiff = 0
 
 	&clflush(&DWP(0,$out));
-	&lock	();
+	&data_byte(0xf0);		# lock
 	&add	(&DWP(0,$out),$lastdiff);
 
 	&rdtsc	();			# collect 1st diff
@@ -383,7 +414,7 @@ my $max = "ebp";
 
 &set_label("loop2",16);
 	&clflush(&DWP(0,$out));
-	&lock	();
+	&data_byte(0xf0);		# lock
 	&add	(&DWP(0,$out),"eax");	# accumulate diff
 
 	&sub	($max,1);
diff --git a/doc/crypto/OPENSSL_ia32cap.pod b/doc/crypto/OPENSSL_ia32cap.pod
index dca2e20ace..af6b4f3a4d 100644
--- a/doc/crypto/OPENSSL_ia32cap.pod
+++ b/doc/crypto/OPENSSL_ia32cap.pod
@@ -2,7 +2,7 @@
 
 =head1 NAME
 
-OPENSSL_ia32cap - finding the IA-32 processor capabilities
+OPENSSL_ia32cap - the IA-32 processor capabilities vector
 
 =head1 SYNOPSIS
 
@@ -18,30 +18,56 @@ input value (see Intel Application Note #241618). Naturally it's
 meaningful on x86 and x86_64 platforms only. The variable is normally
 set up automatically upon toolkit initialization, but can be
 manipulated afterwards to modify crypto library behaviour. For the
-moment of this writing seven bits are significant, namely:
-
-1. bit #4 denoting presence of Time-Stamp Counter.
-2. bit #20, reserved by Intel, is used to choose among RC4 code
-   paths;
-3. bit #23 denoting MMX support;
-4. bit #25 denoting SSE support;
-5. bit #26 denoting SSE2 support;
-6. bit #28 denoting Hyperthreading, which is used to distiguish
-   cores with shared cache;
-7. bit #30, reserved by Intel, is used to choose among RC4 code
-   paths;
-8. bit #57 denoting Intel AES instruction set extension;
+moment of this writing the following bits are significant:
+
+=over 4
+
+=item bit #4 denoting presence of Time-Stamp Counter;
+
+=item bit #19 denoting availability of CLFLUSH instruction;
+
+=item bit #20, reserved by Intel, is used to choose among RC4 code paths;
+
+=item bit #23 denoting MMX support;
+
+=item bit #24, FXSR bit, denoting availability of XMM registers;
+
+=item bit #25 denoting SSE support;
+
+=item bit #26 denoting SSE2 support;
+
+=item bit #28 denoting Hyperthreading, which is used to distinguish
+      cores with shared cache;
+
+=item bit #30, reserved by Intel, is used to choose among RC4 code
+      paths;
+
+=item bit #33 denoting availability of PCLMULQDQ instruction;
+
+=item bit #41 denoting SSSE3, Supplemental SSE3, support;
+
+=item bit #43 denoting AMD XOP support (forced to zero on Intel);
+
+=item bit #57 denoting AES-NI instruction set extension;
+
+=item bit #59, OSXSAVE bit, denoting availability of YMM registers;
+
+=item bit #60 denoting AVX extension;
+
+=back
 
 For example, clearing bit #26 at run-time disables high-performance
-SSE2 code present in the crypto library. You might have to do this if
-target OpenSSL application is executed on SSE2 capable CPU, but under
-control of OS which does not support SSE2 extentions. Even though you
-can manipulate the value programmatically, you most likely will find it
-more appropriate to set up an environment variable with the same name
-prior starting target application, e.g. on Intel P4 processor 'env
-OPENSSL_ia32cap=0x12900010 apps/openssl', to achieve same effect
-without modifying the application source code. Alternatively you can
-reconfigure the toolkit with no-sse2 option and recompile.
+SSE2 code present in the crypto library, while clearing bit #24
+disables SSE2 code operating on 128-bit XMM register bank. You might
+have to do the latter if target OpenSSL application is executed on SSE2
+capable CPU, but under control of OS that does not enable XMM
+registers. Even though you can manipulate the value programmatically,
+you most likely will find it more appropriate to set up an environment
+variable with the same name prior to starting target application, e.g. on
+Intel P4 processor 'env OPENSSL_ia32cap=0x16980010 apps/openssl', to
+achieve the same effect without modifying the application source code.
+Alternatively you can reconfigure the toolkit with no-sse2 option and
+recompile.
 
 Less intuituve is clearing bit #28. The truth is that it's not copied
 from CPUID output verbatim, but is adjusted to reflect whether or not
@@ -49,4 +71,3 @@ the data cache is actually shared between logical cores. This in turn
 affects the decision on whether or not expensive countermeasures
 against cache-timing attacks are applied, most notably in AES assembler
 module.
-=cut
-- 
2.25.1