&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
&movdqa ($Xhn,$Xn);
&pxor ($T1,$Xn); #
+ &lea ($inp,&DWP(32,$inp)); # i+=2
&pclmulqdq ($Xn,$Hkey,0x00); #######
&pclmulqdq ($Xhn,$Hkey,0x11); #######
- &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
&pclmulqdq ($T1,$T3,0x00); #######
+ &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
+ &nop ();
- &lea ($inp,&DWP(32,$inp)); # i+=2
&sub ($len,0x20);
&jbe (&label("even_tail"));
&jmp (&label("mod_loop"));
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
&movdqa ($Xhi,$Xi);
&pxor ($T2,$Xi); #
+ &nop ();
&pclmulqdq ($Xi,$Hkey,0x00); #######
&pclmulqdq ($Xhi,$Hkey,0x11); #######
- &movups ($Hkey,&QWP(0,$Htbl)); # load H
&pclmulqdq ($T2,$T3,0x10); #######
- &movdqa ($T3,&QWP(0,$const));
+ &movups ($Hkey,&QWP(0,$Htbl)); # load H
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
+ &movdqa ($T3,&QWP(0,$const));
&xorps ($Xhi,$Xhn);
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
&pxor ($T1,$Xhi); #
- &pxor ($T2,$T1); #
&pshufb ($Xhn,$T3);
+ &pxor ($T2,$T1); #
&movdqa ($T1,$T2); #
&psrldq ($T2,8);
&pxor ($T1,$Xi); #
&psllq ($Xi,1);
&pxor ($Xi,$T1); #
- &movups ($T3,&QWP(32,$Htbl));
&pclmulqdq ($Xn,$Hkey,0x00); #######
+ &movups ($T3,&QWP(32,$Htbl));
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&movdqa ($T2,$Xi); # 2nd phase
&psrlq ($Xi,1);
&pxor ($T1,$Xhn);
+ &pxor ($Xhi,$T2); #
&pclmulqdq ($Xhn,$Hkey,0x11); #######
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
- &pxor ($Xhi,$T2); #
&pxor ($T2,$Xi);
&psrlq ($Xi,5);
&pxor ($Xi,$T2); #
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
}
\f
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
cmp \$0x30,$len
jb .Lskip4x
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
pxor $T1,$Xi # Ii+Xi
movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$T1
- pxor $Xln,$T1
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
pclmulqdq \$0x00,$Hkey,$Xln
pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$T1
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
movdqu ($inp),$Xhn # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
pshufb $T3,$Xhn
movdqu 16($inp),$Xln # Ii+1
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
pxor $Xhi,$T1
pxor $Xhn,$Xhi # "Ii+Xi", consume early
- pxor $T1,$T2
+ pxor $T1,$Xmn
pshufb $T3,$Xln
- movdqa $T2,$T1 #
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
movdqa $Xln,$Xhn #
movdqa $Xi,$T2 # 1st phase
movdqa $Xi,$T1
psllq \$5,$Xi
- pclmulqdq \$0x00,$Hkey,$Xln #######
pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$57,$Xi #
pslldq \$8,$Xi
psrldq \$8,$T1 #
pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
- pshufd \$0b01001110,$Xhn,$T1
- pxor $Xhn,$T1 #
+ pxor $Xhn,$Xmn #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pclmulqdq \$0x00,$HK,$T1 #######
+ pclmulqdq \$0x00,$HK,$Xmn #######
pxor $Xhi,$Xi #
+ .byte 0x66,0x90
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xi,$T1
pxor $Xhi,$T1
- pxor $T1,$T2
- movdqa $T2,$T1 #
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;