From: Andy Polyakov Date: Mon, 24 Jul 2017 19:50:52 +0000 (+0200) Subject: x86_64 assembly pack: "optimize" for Knights Landing. X-Git-Tag: OpenSSL_1_1_0g~126 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=738a9dd53cacce593cd7d67e18e1273549640a79;p=oweals%2Fopenssl.git x86_64 assembly pack: "optimize" for Knights Landing. "Optimize" is in quotes because it's rather a "salvage operation" for now. The idea is to identify processor capability flags that drive Knights Landing to suboptimal code paths and mask them. Two flags were identified, XSAVE and ADCX/ADOX. The former affects the choice of the AES-NI code path specific to Silvermont (Knights Landing is of Silvermont "ancestry"). And 64-bit ADCX/ADOX instructions are effectively mishandled at decode time. In both cases we are looking at ~2x improvement. Hardware used for benchmarking courtesy of Atos, experiments run by Romain Dolbeau. Kudos! This is a minimalistic backport of 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. Thanks to David Benjamin for spotting the typo in Knights Landing detection! 
Reviewed-by: Rich Salz (Merged from https://github.com/openssl/openssl/pull/4006) --- diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index afc50af020..7995b5cc25 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -143,8 +143,19 @@ OPENSSL_ia32_cpuid: or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs and \$15,%ah cmp \$15,%ah # examine Family ID - jne .Lnotintel + jne .LnotP4 or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR +.LnotP4: + cmp \$6,%ah + jne .Lnotintel + and \$0x0fff0ff0,%eax + cmp \$0x00050670,%eax # Knights Landing + je .Lknights + cmp \$0x00080650,%eax # Knights Mill (according to sde) + jne .Lnotintel +.Lknights: + and \$0xfbffffff,%ecx # clear XSAVE flag to mimic Silvermont + .Lnotintel: bt \$28,%edx # test hyper-threading bit jnc .Lgeneric @@ -169,6 +180,10 @@ OPENSSL_ia32_cpuid: mov \$7,%eax xor %ecx,%ecx cpuid + bt \$26,%r9d # check XSAVE bit, cleared on Knights + jc .Lnotknights + and \$0xfff7ffff,%ebx # clear ADCX/ADOX flag +.Lnotknights: mov %ebx,8(%rdi) # save extended feature flags .Lno_extended_info: