projects
/
oweals
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
x86_64 assembly pack: refine clang detection.
[oweals/openssl.git]
/
crypto
/
modes
/
asm
/
ghash-x86_64.pl
diff --git
a/crypto/modes/asm/ghash-x86_64.pl
b/crypto/modes/asm/ghash-x86_64.pl
index f4af85d21b10fbf1e31a8954ed1371b0341bf9dd..1e7922733821b581c730b887f4bd5ce59c75d974 100644
(file)
--- a/
crypto/modes/asm/ghash-x86_64.pl
+++ b/
crypto/modes/asm/ghash-x86_64.pl
@@
-59,10
+59,11
@@
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
#
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
#
-# Westmere 1.76(+14%)
-# Sandy Bridge 1.79(+9%)
-# Ivy Bridge 1.79(+8%)
-# Bulldozer 1.52(+25%)
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
# March 2013
#
# March 2013
#
@@
-70,9
+71,8
@@
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
-# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
-# it will perform better on upcoming Haswell processor. [Exact
-# performance numbers to be added at launch.]
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
@@
-102,6
+102,10
@@
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
$avx = ($1>=10) + ($1>=11);
}
$avx = ($1>=10) + ($1>=11);
}
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
@@
-214,6
+218,7
@@
___
$code=<<___;
.text
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
@@
-597,7
+602,8
@@
___
}
\f
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
}
\f
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
$code.=<<___;
.globl gcm_ghash_clmul
@@
-624,7
+630,6
@@
$code.=<<___ if ($win64);
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
@@
-640,10
+645,16
@@
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
cmp \$0x30,$len
jb .Lskip4x
cmp \$0x30,$len
jb .Lskip4x
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
sub \$0x30,$len
sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
@@
-666,8
+677,8
@@
$code.=<<___;
pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey2,$Xl
pclmulqdq \$0x11,$Hkey2,$Xh
pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey2,$Xl
pclmulqdq \$0x11,$Hkey2,$Xh
- xorps $Xl,$Xln
pclmulqdq \$0x10,$HK,$Xm
pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
xorps $Xh,$Xhn
movups 0x50($Htbl),$HK
xorps $Xm,$Xmn
xorps $Xh,$Xhn
movups 0x50($Htbl),$HK
xorps $Xm,$Xmn
@@
-685,8
+696,8
@@
$code.=<<___;
pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1
pclmulqdq \$0x11,$Hkey3,$Xh
pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1
pclmulqdq \$0x11,$Hkey3,$Xh
- xorps $Xl,$Xln
pclmulqdq \$0x00,$HK,$Xm
pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
xorps $Xh,$Xhn
lea 0x40($inp),$inp
xorps $Xh,$Xhn
lea 0x40($inp),$inp
@@
-704,23
+715,23
@@
$code.=<<___;
xorps $Xln,$Xi
movdqu 0x20($inp),$Xln
movdqa $Xl,$Xh
xorps $Xln,$Xi
movdqu 0x20($inp),$Xln
movdqa $Xl,$Xh
- pshufd \$0b01001110,$Xl,$Xm
pclmulqdq \$0x10,$HK,$T1
pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
xorps $Xhn,$Xhi
pxor $Xl,$Xm
pshufb $T3,$Xln
movups 0x20($Htbl),$HK
xorps $Xhn,$Xhi
pxor $Xl,$Xm
pshufb $T3,$Xln
movups 0x20($Htbl),$HK
- pclmulqdq \$0x00,$Hkey,$Xl
xorps $Xmn,$T1
xorps $Xmn,$T1
- movdqa $Xln,$Xhn
+ pclmulqdq \$0x00,$Hkey,$Xl
pshufd \$0b01001110,$Xln,$Xmn
pxor $Xi,$T1 # aggregated Karatsuba post-processing
pshufd \$0b01001110,$Xln,$Xmn
pxor $Xi,$T1 # aggregated Karatsuba post-processing
-
pxor $Xln,$Xm
n
+
movdqa $Xln,$Xh
n
pxor $Xhi,$T1 #
pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
movdqa $T1,$T2 #
movdqa $T1,$T2 #
- pslldq \$8,$T1
pclmulqdq \$0x11,$Hkey,$Xh
pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
psrldq \$8,$T2 #
pxor $T1,$Xi
movdqa .L7_mask(%rip),$T1
psrldq \$8,$T2 #
pxor $T1,$Xi
movdqa .L7_mask(%rip),$T1
@@
-729,8
+740,8
@@
$code.=<<___;
pand $Xi,$T1 # 1st phase
pshufb $T1,$T2 #
pand $Xi,$T1 # 1st phase
pshufb $T1,$T2 #
- pclmulqdq \$0x00,$HK,$Xm
pxor $Xi,$T2 #
pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
psllq \$57,$T2 #
movdqa $T2,$T1 #
pslldq \$8,$T2
psllq \$57,$T2 #
movdqa $T2,$T1 #
pslldq \$8,$T2
@@
-757,32
+768,31
@@
$code.=<<___;
movdqa $Xl,$Xh
pxor $Xm,$Xmn
pshufd \$0b01001110,$Xl,$Xm
movdqa $Xl,$Xh
pxor $Xm,$Xmn
pshufd \$0b01001110,$Xl,$Xm
- pxor $Xl,$Xm
- pclmulqdq \$0x00,$Hkey3,$Xl
pxor $T2,$Xi #
pxor $T1,$Xhi
pxor $T2,$Xi #
pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
psrlq \$1,$Xi #
psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
pclmulqdq \$0x11,$Hkey3,$Xh
xorps $Xl,$Xln
pclmulqdq \$0x11,$Hkey3,$Xh
xorps $Xl,$Xln
- pxor $Xhi,$Xi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
pclmulqdq \$0x00,$HK,$Xm
xorps $Xh,$Xhn
pclmulqdq \$0x00,$HK,$Xm
xorps $Xh,$Xhn
- movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T1
- pxor $Xi,$T1
-
lea 0x40($inp),$inp
sub \$0x40,$len
jnc .Lmod4_loop
.Ltail4x:
pclmulqdq \$0x00,$Hkey4,$Xi
lea 0x40($inp),$inp
sub \$0x40,$len
jnc .Lmod4_loop
.Ltail4x:
pclmulqdq \$0x00,$Hkey4,$Xi
- xorps $Xm,$Xmn
pclmulqdq \$0x11,$Hkey4,$Xhi
pclmulqdq \$0x11,$Hkey4,$Xhi
- xorps $Xln,$Xi
pclmulqdq \$0x10,$HK,$T1
pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
xorps $Xhn,$Xhi
pxor $Xi,$Xhi # aggregated Karatsuba post-processing
pxor $Xmn,$T1
xorps $Xhn,$Xhi
pxor $Xi,$Xhi # aggregated Karatsuba post-processing
pxor $Xmn,$T1
@@
-819,51
+829,54
@@
$code.=<<___;
pxor $T1,$Xi # Ii+Xi
movdqa $Xln,$Xhn
pxor $T1,$Xi # Ii+Xi
movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$
T1
- pxor $Xln,$
T1
+ pshufd \$0b01001110,$Xln,$
Xmn
+ pxor $Xln,$
Xmn
pclmulqdq \$0x00,$Hkey,$Xln
pclmulqdq \$0x11,$Hkey,$Xhn
pclmulqdq \$0x00,$Hkey,$Xln
pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$
T1
+ pclmulqdq \$0x00,$HK,$
Xmn
lea 32($inp),$inp # i+=2
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
sub \$0x20,$len
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa $Xi,$Xhi
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$
T2
+ pclmulqdq \$0x10,$HK,$
Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
- movdqu ($inp),$Xhn # Ii
- pshufb $T3,$Xhn
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
movdqu 16($inp),$Xln # Ii+1
movdqu 16($inp),$Xln # Ii+1
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
pxor $Xhi,$T1
pxor $Xhi,$T1
- pxor $
Xhn,$Xhi
# "Ii+Xi", consume early
- pxor $T1,$
T2
+ pxor $
T2,$Xhi
# "Ii+Xi", consume early
+ pxor $T1,$
Xmn
pshufb $T3,$Xln
pshufb $T3,$Xln
- movdqa $
T2,$T1
#
+ movdqa $
Xmn,$T1
#
psrldq \$8,$T1
psrldq \$8,$T1
- pslldq \$8,$
T2
#
+ pslldq \$8,$
Xmn
#
pxor $T1,$Xhi
pxor $T1,$Xhi
- pxor $
T2,$Xi
#
+ pxor $
Xmn,$Xi
#
movdqa $Xln,$Xhn #
movdqa $Xi,$T2 # 1st phase
movdqa $Xi,$T1
psllq \$5,$Xi
movdqa $Xln,$Xhn #
movdqa $Xi,$T2 # 1st phase
movdqa $Xi,$T1
psllq \$5,$Xi
- pclmulqdq \$0x00,$Hkey,$Xln #######
pxor $Xi,$T1 #
pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$57,$Xi #
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$57,$Xi #
@@
-871,44
+884,45
@@
$code.=<<___;
pslldq \$8,$Xi
psrldq \$8,$T1 #
pxor $T2,$Xi
pslldq \$8,$Xi
psrldq \$8,$T1 #
pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
pxor $T1,$Xhi #
- pshufd \$0b01001110,$Xhn,$T1
- pxor $Xhn,$T1 #
+ pxor $Xhn,$Xmn #
- pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
psrlq \$1,$Xi
movdqa $Xi,$T2 # 2nd phase
psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
pxor $T2,$Xhi #
pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
pxor $T2,$Xhi #
pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
psrlq \$1,$Xi #
- pclmulqdq \$0x00,$HK,$
T1
#######
+ pclmulqdq \$0x00,$HK,$
Xmn
#######
pxor $Xhi,$Xi #
pxor $Xhi,$Xi #
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
movdqa $Xi,$Xhi
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$
T2
+ pclmulqdq \$0x10,$HK,$
Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xi,$T1
pxor $Xhi,$T1
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xi,$T1
pxor $Xhi,$T1
- pxor $T1,$
T2
- movdqa $
T2,$T1
#
+ pxor $T1,$
Xmn
+ movdqa $
Xmn,$T1
#
psrldq \$8,$T1
psrldq \$8,$T1
- pslldq \$8,$
T2
#
+ pslldq \$8,$
Xmn
#
pxor $T1,$Xhi
pxor $T1,$Xhi
- pxor $
T2,$Xi
#
+ pxor $
Xmn,$Xi
#
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;