aes/asm/bsaes-x86_64.pl: Atom-specific optimization.

author Andy Polyakov <appro@openssl.org>

Thu, 24 Apr 2014 08:13:30 +0000 (10:13 +0200)

committer Andy Polyakov <appro@openssl.org>

Thu, 24 Apr 2014 08:14:46 +0000 (10:14 +0200)
author Andy Polyakov <appro@openssl.org>
Thu, 24 Apr 2014 08:13:30 +0000 (10:13 +0200)
committer Andy Polyakov <appro@openssl.org>
Thu, 24 Apr 2014 08:14:46 +0000 (10:14 +0200)
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl

index 41b90f08443f512bb0079910720936afff275fc7..d2c3978b962e451f0e901f0e578138813c5ef08c 100644 (file)
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -38,8 +38,8 @@
  #              Emilia's        this(*)         difference
  #
  # Core 2       9.30            8.69            +7%
-# Nehalem(**)  7.63            6.98            +9%
-# Atom         17.1            17.4            -2%(***)
+# Nehalem(**)  7.63            6.88            +11%
+# Atom         17.1            16.4            +4%
  #
  # (*)  Comparison is not completely fair, because "this" is ECB,
  #      i.e. no extra processing such as counter values calculation
@@ -50,14 +50,6 @@
  # (**) Results were collected on Westmere, which is considered to
  #      be equivalent to Nehalem for this code.
  #
-# (***)        Slowdown on Atom is rather strange per se, because original
-#      implementation has a number of 9+-bytes instructions, which
-#      are bad for Atom front-end, and which I eliminated completely.
-#      In attempt to address deterioration sbox() was tested in FP
-#      SIMD "domain" (movaps instead of movdqa, xorps instead of
-#      pxor, etc.). While it resulted in nominal 4% improvement on
-#      Atom, it hurted Westmere by more than 2x factor.
-#
  # As for key schedule conversion subroutine. Interface to OpenSSL
  # relies on per-invocation on-the-fly conversion. This naturally
  # has impact on performance, especially for short inputs. Conversion
@@ -67,7 +59,7 @@
  #              conversion      conversion/8x block
  # Core 2       240             0.22
  # Nehalem      180             0.20
-# Atom         430             0.19
+# Atom         430             0.20
  #
  # The ratio values mean that 128-byte blocks will be processed
  # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@ -83,9 +75,9 @@
  # Add decryption procedure. Performance in CPU cycles spent to decrypt
  # one byte out of 4096-byte buffer with 128-bit key is:
  #
-# Core 2       9.83
-# Nehalem      7.74
-# Atom         19.0
+# Core 2       9.98
+# Nehalem      7.80
+# Atom         17.9
  #
  # November 2011.
  #
@@ -434,21 +426,21 @@ my $mask=pop;
  $code.=<<___;
         pxor    0x00($key),@x[0]
         pxor    0x10($key),@x[1]
-       pshufb  $mask,@x[0]
         pxor    0x20($key),@x[2]
-       pshufb  $mask,@x[1]
         pxor    0x30($key),@x[3]
-       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[0]
+       pshufb  $mask,@x[1]
         pxor    0x40($key),@x[4]
-       pshufb  $mask,@x[3]
         pxor    0x50($key),@x[5]
-       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[3]
         pxor    0x60($key),@x[6]
-       pshufb  $mask,@x[5]
         pxor    0x70($key),@x[7]
+       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[5]
         pshufb  $mask,@x[6]
-       lea     0x80($key),$key
         pshufb  $mask,@x[7]
+       lea     0x80($key),$key
  ___
  }
  
@@ -820,18 +812,18 @@ _bsaes_encrypt8:
         movdqa  0x50($const), @XMM[8]   # .LM0SR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
          pshufb @XMM[8], @XMM[7]
  _bsaes_encrypt8_bitslice:
@@ -884,18 +876,18 @@ _bsaes_decrypt8:
         movdqa  -0x30($const), @XMM[8]  # .LM0ISR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
          pshufb @XMM[8], @XMM[7]
  ___
@@ -1937,21 +1929,21 @@ $code.=<<___;
         movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
-       lea     .LBS0(%rip), %r11       # constants table
          pshufb @XMM[8], @XMM[7]
+       lea     .LBS0(%rip), %r11       # constants table
         mov     %ebx,%r10d              # pass rounds
  
         call    _bsaes_encrypt8_bitslice
author	Andy Polyakov <appro@openssl.org>
	Thu, 24 Apr 2014 08:13:30 +0000 (10:13 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Thu, 24 Apr 2014 08:14:46 +0000 (10:14 +0200)