crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.
author	Andy Polyakov <appro@openssl.org>
Mon, 9 Sep 2013 19:43:21 +0000 (21:43 +0200)
committer	Andy Polyakov <appro@openssl.org>
Mon, 9 Sep 2013 19:43:21 +0000 (21:43 +0200)
Avoid occasional performance drops of up to 8% by keeping the 128-byte
stack frame from aliasing the key schedule modulo 4KB.
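
The new .Ldec_no_key_aliasing / .Lenc_no_key_aliasing blocks below measure,
at run time, how far the 128-byte stack frame sits from the expanded key
schedule within a 4KB window and, when the two are close enough to suffer
4K aliasing, lower %rsp so the frame lands immediately below the key
schedule's page offset. A minimal C model of that arithmetic (function and
variable names are mine, not the patch's; `rsp` is assumed already
128-byte aligned, as it is after the `and $-128,%rsp` in the prologue):

    #include <stdint.h>

    /* Sketch of the aliasing check added below.  `key` is the key-schedule
     * pointer before the +0x80 "size optimization" bias; 0xf80 keeps bits
     * 7..11, i.e. the 128-byte-granular position inside a 4KB page.      */
    static uintptr_t dec_no_key_aliasing(uintptr_t rsp, uintptr_t key)
    {
        uintptr_t in0  = (key - 0x80) & 0xf80; /* lea -0x80($key),$in0; and $end0,$in0 */
        uintptr_t end0 = (rsp & 0xf80) - in0;  /* and %rsp,$end0; sub $in0,$end0       */

        /* jc   -> borrow: the frame already sits below the key's page
         *         offset, nothing to do;
         * jnc  -> after cmp $768: at least 768 bytes apart, nothing to do.
         * Unsigned arithmetic folds both skips into one test, because a
         * borrow wraps end0 far above 768.                               */
        if (end0 < 768)
            rsp -= end0;  /* sub $end0,%rsp: the frame now ends at the key
                           * schedule's page offset, so the two no longer
                           * overlap modulo 4KB                           */
        return rsp;
    }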

crypto/modes/asm/aesni-gcm-x86_64.pl

index 31987146b0e1d444d32cfab95a273a430c185881..3781933917227dd127352fa8d7d066804dacba13 100644
@@ -21,8 +21,8 @@
 # justify. This module is based on combination of Intel submissions,
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
-# processor. [Exact performance numbers to be added at launch.]
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
        vzeroupper
 
        vmovdqu         ($ivp),$T1              # input counter value
-       sub             \$128,%rsp
+       add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
+       lea             -0x80($key),$in0        # borrow $in0
+       mov             \$0xf80,$end0           # borrow $end0
        vmovdqu         ($Xip),$Xi              # load Xi
-       and             \$-64,%rsp              # ensure stack alignment
+       and             \$-128,%rsp             # ensure stack alignment
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        lea             0x80($key),$key         # size optimization
        lea             0x20+0x20($Xip),$Xip    # size optimization
        mov             0xf0-0x80($key),$rounds
        vpshufb         $Ii,$Xi,$Xi
 
+       and             $end0,$in0
+       and             %rsp,$end0
+       sub             $in0,$end0
+       jc              .Ldec_no_key_aliasing
+       cmp             \$768,$end0
+       jnc             .Ldec_no_key_aliasing
+       sub             $end0,%rsp              # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
        vmovdqu         0x50($inp),$Z3          # I[5]
        lea             ($inp),$in0
        vmovdqu         0x40($inp),$Z0
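
A quick way to convince oneself that the jc/jnc logic above leaves no
gap: walk every 128-byte-aligned frame position and a 4KB window of
16-byte-aligned key positions, and assert the post-condition. A throwaway
harness, meant to be compiled together with the dec_no_key_aliasing()
sketch above (illustrative only, not part of the patch):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        for (uintptr_t rsp = 0x7ffe0000; rsp < 0x7ffe1000; rsp += 128)
            for (uintptr_t key = 0x601000; key < 0x602000; key += 16) {
                uintptr_t adj = dec_no_key_aliasing(rsp, key);
                uintptr_t d   = (adj & 0xf80) - ((key - 0x80) & 0xf80);
                /* Either the frame was snapped flush below the key
                 * schedule (d == 0), or it was already a safe distance
                 * away (d >= 768; a borrow wraps d far above 768).     */
                assert(d == 0 || d >= 768);
            }
        puts("no frame position left inside the aliasing window");
        return 0;
    }
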
@@ -621,14 +632,25 @@ $code.=<<___;
        vzeroupper
 
        vmovdqu         ($ivp),$T1              # input counter value
-       sub             \$128,%rsp
+       add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
+       lea             -0x80($key),$in0        # borrow $in0
+       mov             \$0xf80,$end0           # borrow $end0
        lea             0x80($key),$key         # size optimization
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
-       and             \$-64,%rsp              # ensure stack alignment
+       and             \$-128,%rsp             # ensure stack alignment
        mov             0xf0-0x80($key),$rounds
 
+       and             $end0,$in0
+       and             %rsp,$end0
+       sub             $in0,$end0
+       jc              .Lenc_no_key_aliasing
+       cmp             \$768,$end0
+       jnc             .Lenc_no_key_aliasing
+       sub             $end0,%rsp              # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
        lea             ($out),$in0
        lea             -0xc0($out,$len),$end0
        shr             \$4,$len
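
The remaining one-instruction substitutions in each prologue are easy to
miss; a note on my reading of them, since the patch itself does not
comment on these lines:

    /* sub $128,%rsp  ->  add $-128,%rsp: x86-64 sign-extends 8-bit
     * immediates, so -128 fits in imm8 while +128 needs imm32, and the
     * add encodes three bytes shorter for the same effect on %rsp:
     *
     *   48 83 c4 80              add  $-128,%rsp    (4 bytes)
     *   48 81 ec 80 00 00 00     sub  $128,%rsp     (7 bytes)
     *
     * and $-64,%rsp  ->  and $-128,%rsp: the frame is now aligned to
     * the 128-byte granularity of the 0xf80 mask, which also keeps
     * %rsp 128-byte aligned after `sub $end0,%rsp` ($end0 only ever
     * holds multiples of 128 on that path).                           */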