bsaes-armv7.pl: closest shave. While 0.3 cpb improvement on S4 appears [...]

author Andy Polyakov <appro@openssl.org>

Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)

committer Andy Polyakov <appro@openssl.org>

Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)
author Andy Polyakov <appro@openssl.org>
Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)
committer Andy Polyakov <appro@openssl.org>
Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl

index 5047f0d56eaadc4310ce7b29a9ecc81103e64e63..d901c58f5a6a4789fa9242383e7dfb0264b6cb8b 100644 (file)
--- a/crypto/aes/asm/bsaes-armv7.pl
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -18,11 +18,13 @@
  # only low-level primitives and unsupported entry points, just enough
  # to collect performance results, which for Cortex-A8 core are:
  #
-# encrypt      19.7 cycles per byte processed with 128-bit key
-# decrypt      24.1 cycles per byte processed with 128-bit key
-# key conv.    440  cycles per 128-bit key/0.17 of 8x block
+# encrypt      19.5 cycles per byte processed with 128-bit key
+# decrypt      24.0 cycles per byte processed with 128-bit key
+# key conv.    440  cycles per 128-bit key/0.18 of 8x block
  #
-# Snapdragon S4 encrypts byte in 17.9 cycles and decrypts in 22.9.
+# Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 22.6,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
  #
  # When comparing to x86_64 results keep in mind that NEON unit is
  # [mostly] single-issue and thus can't [fully] benefit from
@@ -282,35 +284,32 @@ $code.=<<___;
         vand    @s[2], @x[5], @x[1]
         vorr    @s[3], @x[4], @x[0]
         veor    @t[3], @t[3], @s[0]
-       veor    @t[2], @t[2], @s[1]
         veor    @t[1], @t[1], @s[2]
         veor    @t[0], @t[0], @s[3]
+       veor    @t[2], @t[2], @s[1]
  
         @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  
         @ new smaller inversion
  
-       veor    @s[0], @t[3], @t[2]
-       vand    @t[3], @t[3], @t[1]
+       vand    @s[2], @t[3], @t[1]
+       vmov    @s[0], @t[0]
  
-       veor    @s[2], @t[0], @t[3]
-       veor    @s[1], @t[2], @t[3]
+       veor    @s[1], @t[2], @s[2]
+       veor    @s[3], @t[0], @s[2]
+       veor    @s[2], @t[0], @s[2]     @ @s[2]=@s[3]
  
-       vand    @s[3], @s[0], @s[2]
         vbsl    @s[1], @t[1], @t[0]
+       vbsl    @s[3], @t[3], @t[2]
+       veor    @t[3], @t[3], @t[2]
  
-       veor    @s[3], @s[3], @t[2]
-       veor    @t[2], @s[2], @s[1]
-
-       vand    @t[2], @t[2], @t[0]
+       vbsl    @s[0], @s[1], @s[2]
         vbsl    @t[0], @s[2], @s[1]
  
-       veor    @s[2], @s[2], @t[2]
+       vand    @s[2], @s[0], @s[3]
         veor    @t[1], @t[1], @t[0]
  
-       vand    @s[2], @s[2], @s[3]
-
-       veor    @s[2], @s[2], @s[0]
+       veor    @s[2], @s[2], @t[3]
  ___
  # output in s3, s2, s1, t1
author	Andy Polyakov <appro@openssl.org>
	Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Fri, 7 Sep 2012 12:29:18 +0000 (12:29 +0000)