From 98e143f118aedc2fa79fa0ae90f1b039da106309 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 13 Feb 2014 14:36:02 +0100 Subject: [PATCH] ghash-x86[_64].pl: ~15% improvement on Atom Silvermont (other processors unaffected). --- crypto/modes/asm/ghash-x86.pl | 16 +++++---- crypto/modes/asm/ghash-x86_64.pl | 62 +++++++++++++++++++------------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index e6b9663c13..23a5527b30 100644 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_; &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 &movdqa ($Xhn,$Xn); &pxor ($T1,$Xn); # + &lea ($inp,&DWP(32,$inp)); # i+=2 &pclmulqdq ($Xn,$Hkey,0x00); ####### &pclmulqdq ($Xhn,$Hkey,0x11); ####### - &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &nop (); - &lea ($inp,&DWP(32,$inp)); # i+=2 &sub ($len,0x20); &jbe (&label("even_tail")); &jmp (&label("mod_loop")); @@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_; &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) &movdqa ($Xhi,$Xi); &pxor ($T2,$Xi); # + &nop (); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &movups ($Hkey,&QWP(0,$Htbl)); # load H &pclmulqdq ($T2,$T3,0x10); ####### - &movdqa ($T3,&QWP(0,$const)); + &movups ($Hkey,&QWP(0,$Htbl)); # load H &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &movdqa ($T3,&QWP(0,$const)); &xorps ($Xhi,$Xhn); &movdqu ($Xhn,&QWP(0,$inp)); # Ii &pxor ($T1,$Xi); # aggregated Karatsuba post-processing &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pxor ($T1,$Xhi); # - &pxor ($T2,$T1); # &pshufb ($Xhn,$T3); + &pxor ($T2,$T1); # &movdqa ($T1,$T2); # &psrldq ($T2,8); @@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_; &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # - &movups ($T3,&QWP(32,$Htbl)); &pclmulqdq ($Xn,$Hkey,0x00); ####### + &movups ($T3,&QWP(32,$Htbl)); &psllq ($Xi,57); # &movdqa ($T1,$Xi); # &pslldq ($Xi,8); @@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_; &movdqa ($T2,$Xi); # 2nd phase &psrlq ($Xi,1); &pxor ($T1,$Xhn); + &pxor ($Xhi,$T2); # &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 - &pxor ($Xhi,$T2); # &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 7904248070..04001e6aae 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -214,6 +214,7 @@ ___ $code=<<___; .text +.extern OPENSSL_ia32cap_P .globl gcm_gmult_4bit .type gcm_gmult_4bit,\@function,2 @@ -597,7 +598,8 @@ ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; - my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10)); + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); + my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul @@ -624,7 +626,6 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 - mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey @@ -640,10 +641,16 @@ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; + mov OPENSSL_ia32cap_P+4(%rip),%eax cmp \$0x30,$len jb .Lskip4x + and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE + cmp \$`1<<22`,%eax # check for MOVBE without XSAVE + je .Lskip4x + sub \$0x30,$len + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 @@ -819,51 +826,54 @@ $code.=<<___; pxor $T1,$Xi # Ii+Xi movdqa 
$Xln,$Xhn - pshufd \$0b01001110,$Xln,$T1 - pxor $Xln,$T1 + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn - pclmulqdq \$0x00,$HK,$T1 + pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 + nop sub \$0x20,$len jbe .Leven_tail + nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T2 # - pxor $Xi,$T2 # + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi - pclmulqdq \$0x10,$HK,$T2 + pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$Xhn # Ii + pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$Xhn movdqu 16($inp),$Xln # Ii+1 - pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xhi,$T1 pxor $Xhn,$Xhi # "Ii+Xi", consume early - pxor $T1,$T2 + pxor $T1,$Xmn pshufb $T3,$Xln - movdqa $T2,$T1 # + movdqa $Xmn,$T1 # psrldq \$8,$T1 - pslldq \$8,$T2 # + pslldq \$8,$Xmn # pxor $T1,$Xhi - pxor $T2,$Xi # + pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi - pclmulqdq \$0x00,$Hkey,$Xln ####### pxor $Xi,$T1 # + pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # @@ -871,9 +881,9 @@ $code.=<<___; pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi + pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # - pshufd \$0b01001110,$Xhn,$T1 - pxor $Xhn,$T1 # + pxor $Xhn,$Xmn # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase @@ -882,33 +892,35 @@ $code.=<<___; pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # + lea 32($inp),$inp psrlq \$1,$Xi # - pclmulqdq \$0x00,$HK,$T1 ####### + pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # + .byte 0x66,0x90 - lea 32($inp),$inp sub \$0x20,$len ja .Lmod_loop .Leven_tail: movdqa $Xi,$Xhi - pshufd \$0b01001110,$Xi,$T2 # - pxor $Xi,$T2 # + movdqa $Xmn,$T1 + pshufd \$0b01001110,$Xi,$Xmn # + pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi - pclmulqdq \$0x10,$HK,$T2 + pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 - pxor $T1,$T2 - movdqa $T2,$T1 # + pxor $T1,$Xmn + movdqa $Xmn,$T1 # psrldq \$8,$T1 - pslldq \$8,$T2 # + pslldq \$8,$Xmn # pxor $T1,$Xhi - pxor $T2,$Xi # + pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; -- 2.25.1
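For reference: the hunks above do not change the GHASH mathematics. They interleave loads, shuffles and the Karatsuba post-processing between the pclmulqdq instructions, and pad with nop (the .byte 0x66,0x90 is an explicit two-byte NOP), so that Silvermont's in-order pipeline has independent work while the multiplies are in flight; on an out-of-order core the two orderings retire the same way, consistent with the subject's "other processors unaffected". Replacing the scratch $T1/$T2 with the loop-carried $Xmn defers the aggregated Karatsuba post-processing of one block into the next iteration, overlapping it with that block's multiplies. The new cpuid test singles out a CPU reporting MOVBE but not XSAVE (the pattern used here for Atom Silvermont) and routes it past the 4x aggregated path via .Lskip4x. What each iteration computes is one 128x128 carry-less multiply via three-term Karatsuba plus the two-phase reduction (reduction_alg9). The C-intrinsics sketch below shows those two steps in isolation; it is illustrative only, the helper names gf128_clmul/gf128_reduce and the standalone structure are not OpenSSL's, and HK is assumed to cache H.hi^H.lo in both halves, as $HK does in the patch.

/* Sketch of one GHASH block step as scheduled above: Karatsuba
 * carry-less multiply plus reduction_alg9.  Build with a PCLMUL-capable
 * target, e.g. gcc/clang -msse2 -mpclmul.  Names are illustrative. */
#include <immintrin.h>

/* (hi:lo) = X * H over GF(2)[x] using three PCLMULQDQs.
 * HK caches H.hi ^ H.lo per key, playing the role of $HK. */
static void gf128_clmul(__m128i X, __m128i H, __m128i HK,
                        __m128i *lo, __m128i *hi)
{
    __m128i T = _mm_xor_si128(_mm_shuffle_epi32(X, 0x4E), X); /* pshufd 0b01001110: X.hi^X.lo */
    __m128i L = _mm_clmulepi64_si128(X, H, 0x00);  /* X.lo * H.lo */
    __m128i U = _mm_clmulepi64_si128(X, H, 0x11);  /* X.hi * H.hi */
    __m128i M = _mm_clmulepi64_si128(T, HK, 0x00); /* (X.hi^X.lo)*(H.hi^H.lo) */

    M   = _mm_xor_si128(M, _mm_xor_si128(L, U));   /* Karatsuba post-processing */
    *lo = _mm_xor_si128(L, _mm_slli_si128(M, 8));  /* fold middle term, low half  */
    *hi = _mm_xor_si128(U, _mm_srli_si128(M, 8));  /* fold middle term, high half */
}

/* Two-phase reduction of (Xhi:Xi) modulo the bit-reflected GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1, transliterating the
 * psllq/psrlq sequence visible in the hunks (reduction_alg9). */
static __m128i gf128_reduce(__m128i Xhi, __m128i Xi)
{
    __m128i T1, T2;

    T2 = Xi;                        /* 1st phase */
    T1 = Xi;
    Xi = _mm_slli_epi64(Xi, 5);     /* psllq $5  */
    T1 = _mm_xor_si128(T1, Xi);
    Xi = _mm_slli_epi64(Xi, 1);     /* psllq $1  */
    Xi = _mm_xor_si128(Xi, T1);
    Xi = _mm_slli_epi64(Xi, 57);    /* psllq $57 */
    T1 = Xi;
    Xi = _mm_slli_si128(Xi, 8);     /* pslldq $8 */
    T1 = _mm_srli_si128(T1, 8);     /* psrldq $8 */
    Xi  = _mm_xor_si128(Xi, T2);
    Xhi = _mm_xor_si128(Xhi, T1);

    T2 = Xi;                        /* 2nd phase */
    Xi = _mm_srli_epi64(Xi, 1);     /* psrlq $1  */
    Xhi = _mm_xor_si128(Xhi, T2);
    T2 = _mm_xor_si128(T2, Xi);
    Xi = _mm_srli_epi64(Xi, 5);     /* psrlq $5  */
    Xi = _mm_xor_si128(Xi, T2);
    Xi = _mm_srli_epi64(Xi, 1);     /* psrlq $1  */
    return _mm_xor_si128(Xi, Xhi);
}

In the sketch the multiply and the reduction are sequential; the whole point of the patch is that in the real loop they need not be, since the reduction of block i only depends on block i's products, so the shifts and xors of reduction_alg9 can be threaded between the pclmulqdqs of block i+1. On Silvermont, where pclmulqdq is reportedly slow, hiding that independent work between the multiplies is what buys the quoted ~15%.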