aes/asm/bsaes-*.pl: improve decrypt performance.

author Andy Polyakov <appro@openssl.org>

Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)

committer Andy Polyakov <appro@openssl.org>

Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)
author Andy Polyakov <appro@openssl.org>
Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)
committer Andy Polyakov <appro@openssl.org>
Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl

index 31b93a948263c99e359c6686cd3ae44a0f6d3511..f3d96d9325737fba399dc5e6613d223e03fcb7a1 100644 (file)
--- a/crypto/aes/asm/bsaes-armv7.pl
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -23,14 +23,14 @@
  # to collect performance results, which for Cortex-A8 core are:
  #
  # encrypt      19.5 cycles per byte processed with 128-bit key
-# decrypt      24.0 cycles per byte processed with 128-bit key
+# decrypt      22.1 cycles per byte processed with 128-bit key
  # key conv.    440  cycles per 128-bit key/0.18 of 8x block
  #
-# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
  # which is [much] worse than anticipated (for further details see
  # http://www.openssl.org/~appro/Snapdragon-S4.html).
  #
-# Cortex-A15 manages in 14.2/19.6 cycles [when integer-only code
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
  # manages in 20.0 cycles].
  #
  # When comparing to x86_64 results keep in mind that NEON unit is
@@ -377,6 +377,7 @@ sub MixColumns {
  # modified to emit output in order suitable for feeding back to aesenc[last]
  my @x=@_[0..7];
  my @t=@_[8..15];
+my $inv=@_[16];        # optional
  $code.=<<___;
         vext.8  @t[0], @x[0], @x[0], #12        @ x0 <<< 32
         vext.8  @t[1], @x[1], @x[1], #12
@@ -417,8 +418,9 @@ $code.=<<___;
         veor    @t[3], @t[3], @x[7]
          vext.8 @x[6], @x[2], @x[2], #8
         veor    @x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
         veor    @x[2], @t[0], @t[4]
-
         veor    @x[4], @x[4], @t[3]
         veor    @x[5], @x[5], @t[7]
         veor    @x[3], @x[3], @t[6]
@@ -426,9 +428,18 @@ $code.=<<___;
         veor    @x[6], @x[6], @t[2]
          @ vmov @x[7], @t[1]
  ___
+$code.=<<___ if ($inv);
+       veor    @t[3], @t[3], @x[4]
+       veor    @x[5], @x[5], @t[7]
+       veor    @x[2], @x[3], @t[6]
+       veor    @x[3], @t[0], @t[4]
+       veor    @x[4], @x[6], @t[2]
+       vmov    @x[6], @t[3]
+        @ vmov @x[7], @t[1]
+___
  }
  
-sub InvMixColumns {
+sub InvMixColumns_orig {
  my @x=@_[0..7];
  my @t=@_[8..15];
  
@@ -581,6 +592,54 @@ $code.=<<___;
  ___
  }
  
+sub InvMixColumns {    # appends the InvMixColumns instruction sequence to $code
+my @x=@_[0..7];        # args 0..7: operand names the emitted code reads and overwrites (presumably NEON registers - confirm at call site)
+my @t=@_[8..15];       # args 8..15: operand names used as temporaries; the emitted code overwrites all eight
+# InvMixColumns is emitted as a cheap pre-multiplication followed by the regular MixColumns sequence.
+# Thanks to Jussi Kivilinna for providing pointer to the factorization
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+# i.e. InvMixColumns matrix = MixColumns matrix x circulant(05,00,04,00); the circulant part is emitted first, below.
+$code.=<<___;
+       @ multiplication by 0x05-0x00-0x04-0x00
+       vext.8  @t[0], @x[0], @x[0], #8
+       vext.8  @t[6], @x[6], @x[6], #8
+       vext.8  @t[7], @x[7], @x[7], #8
+       veor    @t[0], @t[0], @x[0]
+       vext.8  @t[1], @x[1], @x[1], #8
+       veor    @t[6], @t[6], @x[6]
+       vext.8  @t[2], @x[2], @x[2], #8
+       veor    @t[7], @t[7], @x[7]
+       vext.8  @t[3], @x[3], @x[3], #8
+       veor    @t[1], @t[1], @x[1]
+       vext.8  @t[4], @x[4], @x[4], #8
+       veor    @t[2], @t[2], @x[2]
+       vext.8  @t[5], @x[5], @x[5], #8
+       veor    @t[3], @t[3], @x[3]
+       veor    @t[4], @t[4], @x[4]
+       veor    @t[5], @t[5], @x[5]
+
+        veor   @x[0], @x[0], @t[6]
+        veor   @x[1], @x[1], @t[6]
+        veor   @x[2], @x[2], @t[0]
+        veor   @x[4], @x[4], @t[2]
+        veor   @x[3], @x[3], @t[1]
+        veor   @x[1], @x[1], @t[7]
+        veor   @x[2], @x[2], @t[7]
+        veor   @x[4], @x[4], @t[6]
+        veor   @x[5], @x[5], @t[3]
+        veor   @x[3], @x[3], @t[6]
+        veor   @x[6], @x[6], @t[4]
+        veor   @x[4], @x[4], @t[7]
+        veor   @x[5], @x[5], @t[7]
+        veor   @x[7], @x[7], @t[5]
+___
+       &MixColumns     (@x,@t,1);      # 17th arg 1 is MixColumns' optional $inv; flipped 2<->3 and 4<->6
+}
+
  sub swapmove {
  my ($a,$b,$n,$mask,$t)=@_;
  $code.=<<___;
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl

index ceb02b50ddb60d283345b27ad8a5a3d4a3b92534..8cde67681e48a943df570201c44c987ea31be979 100644 (file)
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -83,9 +83,9 @@
  # Add decryption procedure. Performance in CPU cycles spent to decrypt
  # one byte out of 4096-byte buffer with 128-bit key is:
  #
-# Core 2       11.0
-# Nehalem      9.16
-# Atom         20.9
+# Core 2       9.83
+# Nehalem      7.74
+# Atom         18.9 (estimated, not measured yet)
  #
  # November 2011.
  #
@@ -456,6 +456,7 @@ sub MixColumns {
  # modified to emit output in order suitable for feeding back to aesenc[last]
  my @x=@_[0..7];
  my @t=@_[8..15];
+my $inv=@_[16];        # optional
  $code.=<<___;
         pshufd  \$0x93, @x[0], @t[0]    # x0 <<< 32
         pshufd  \$0x93, @x[1], @t[1]
@@ -497,7 +498,8 @@ $code.=<<___;
         pxor    @t[4], @t[0]
          pshufd \$0x4E, @x[2], @x[6]
         pxor    @t[5], @t[1]
-
+___
+$code.=<<___ if (!$inv);
         pxor    @t[3], @x[4]
         pxor    @t[7], @x[5]
         pxor    @t[6], @x[3]
@@ -505,9 +507,20 @@ $code.=<<___;
         pxor    @t[2], @x[6]
          movdqa @t[1], @x[7]
  ___
+$code.=<<___ if ($inv);
+       pxor    @x[4], @t[3]
+       pxor    @t[7], @x[5]
+       pxor    @x[3], @t[6]
+        movdqa @t[0], @x[3]
+       pxor    @t[2], @x[6]
+        movdqa @t[6], @x[2]
+        movdqa @t[1], @x[7]
+        movdqa @x[6], @x[4]
+        movdqa @t[3], @x[6]
+___
  }
  
-sub InvMixColumns {
+sub InvMixColumns_orig {
  my @x=@_[0..7];
  my @t=@_[8..15];
  
@@ -661,6 +674,54 @@ $code.=<<___;
  ___
  }
  
+sub InvMixColumns {    # appends the InvMixColumns instruction sequence to $code
+my @x=@_[0..7];        # args 0..7: operand names the emitted code reads and overwrites (presumably %xmm registers - confirm at call site)
+my @t=@_[8..15];       # args 8..15: operand names used as temporaries; the emitted code overwrites all eight
+# InvMixColumns is emitted as a cheap pre-multiplication followed by the regular MixColumns sequence.
+# Thanks to Jussi Kivilinna for providing pointer to the factorization
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+# i.e. InvMixColumns matrix = MixColumns matrix x circulant(05,00,04,00); the circulant part is emitted first, below.
+$code.=<<___;
+       # multiplication by 0x05-0x00-0x04-0x00
+       pshufd  \$0x4E, @x[0], @t[0]
+       pshufd  \$0x4E, @x[6], @t[6]
+       pxor    @x[0], @t[0]
+       pshufd  \$0x4E, @x[7], @t[7]
+       pxor    @x[6], @t[6]
+       pshufd  \$0x4E, @x[1], @t[1]
+       pxor    @x[7], @t[7]
+       pshufd  \$0x4E, @x[2], @t[2]
+       pxor    @x[1], @t[1]
+       pshufd  \$0x4E, @x[3], @t[3]
+       pxor    @x[2], @t[2]
+        pxor   @t[6], @x[0]
+        pxor   @t[6], @x[1]
+       pshufd  \$0x4E, @x[4], @t[4]
+       pxor    @x[3], @t[3]
+        pxor   @t[0], @x[2]
+        pxor   @t[1], @x[3]
+       pshufd  \$0x4E, @x[5], @t[5]
+       pxor    @x[4], @t[4]
+        pxor   @t[7], @x[1]
+        pxor   @t[2], @x[4]
+       pxor    @x[5], @t[5]
+
+        pxor   @t[7], @x[2]
+        pxor   @t[6], @x[3]
+        pxor   @t[6], @x[4]
+        pxor   @t[3], @x[5]
+        pxor   @t[4], @x[6]
+        pxor   @t[7], @x[4]
+        pxor   @t[7], @x[5]
+        pxor   @t[5], @x[7]
+___
+       &MixColumns     (@x,@t,1);      # 17th arg 1 is MixColumns' optional $inv; flipped 2<->3 and 4<->6
+}
+
  sub aesenc {                           # not used
  my @b=@_[0..7];
  my @t=@_[8..15];
author	Andy Polyakov <appro@openssl.org>
	Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Thu, 3 Oct 2013 21:08:31 +0000 (23:08 +0200)
crypto/aes/asm/bsaes-armv7.pl		patch \| blob \| history
crypto/aes/asm/bsaes-x86_64.pl		patch \| blob \| history