x86_64 assembly pack: "optimize" for Knights Landing.
author: Andy Polyakov <appro@openssl.org>
Mon, 24 Jul 2017 19:50:52 +0000 (21:50 +0200)
committer: Andy Polyakov <appro@openssl.org>
Tue, 25 Jul 2017 19:32:31 +0000 (21:32 +0200)
"Optimize" is in quotes because it's rather a "salvage operation"
for now. The idea is to identify processor capability flags that
drive Knights Landing to suboptimal code paths and mask them.
Two flags were identified: XSAVE and ADCX/ADOX. The former affects
the choice of the AES-NI code path specific to Silvermont (Knights
Landing is of Silvermont "ancestry"), while 64-bit ADCX/ADOX
instructions are effectively mishandled at decode time. In both cases
we are looking at a ~2x improvement.

Hardware used for benchmarking courtesy of Atos, experiments run by
Romain Dolbeau <romain.dolbeau@atos.net>. Kudos!

This is a minimalistic backport of 64d92d74985ebb3d0be58a9718f9e080a14a8e7f

Thanks to David Benjamin for spotting the typo in Knights Landing detection!

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4006)

crypto/x86_64cpuid.pl

index afc50af020fbc3e2b698cd41afa3453a8a3dfe5c..7995b5cc2560132034400856f59d6df18fcdd9c7 100644 (file)
@@ -143,8 +143,19 @@ OPENSSL_ia32_cpuid:
        or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
        and     \$15,%ah
        cmp     \$15,%ah                # examine Family ID
-       jne     .Lnotintel
+       jne     .LnotP4
        or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+       cmp     \$6,%ah
+       jne     .Lnotintel
+       and     \$0x0fff0ff0,%eax
+       cmp     \$0x00050670,%eax       # Knights Landing
+       je      .Lknights
+       cmp     \$0x00080650,%eax       # Knights Mill (according to sde)
+       jne     .Lnotintel
+.Lknights:
+       and     \$0xfbffffff,%ecx       # clear XSAVE flag to mimic Silvermont
+
 .Lnotintel:
        bt      \$28,%edx               # test hyper-threading bit
        jnc     .Lgeneric
@@ -169,6 +180,10 @@ OPENSSL_ia32_cpuid:
        mov     \$7,%eax
        xor     %ecx,%ecx
        cpuid
+       bt      \$26,%r9d               # check XSAVE bit, cleared on Knights
+       jc      .Lnotknights
+       and     \$0xfff7ffff,%ebx       # clear ADCX/ADOX flag
+.Lnotknights:
        mov     %ebx,8(%rdi)            # save extended feature flags
 .Lno_extended_info: