From 273a808180e8bff35fb5113f022f8c7c966ab8d1 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 14 Feb 2013 16:28:09 +0100 Subject: [PATCH] ghash-x86[_64].pl: code refresh. --- crypto/modes/asm/ghash-x86.pl | 165 +++++++---- crypto/modes/asm/ghash-x86_64.pl | 472 +++++++++++++++++++++++++------ crypto/modes/gcm128.c | 16 ++ 3 files changed, 499 insertions(+), 154 deletions(-) diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl index d31fbae0d8..e6b9663c13 100644 --- a/crypto/modes/asm/ghash-x86.pl +++ b/crypto/modes/asm/ghash-x86.pl @@ -119,6 +119,12 @@ # For reference, AMD Bulldozer processes one byte in 1.98 cycles in # 32-bit mode and 1.89 in 64-bit. +# February 2013 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9. Resulting performance is 1.96 cycles per byte on +# Westmere, 1.95 - on Sandy/Ivy Bridge, 1.76 - on Bulldozer. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; @@ -828,17 +834,18 @@ $len="ebx"; &static_label("bswap"); sub clmul64x64_T2 { # minimal "register" pressure -my ($Xhi,$Xi,$Hkey)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_; &movdqa ($Xhi,$Xi); # &pshufd ($T1,$Xi,0b01001110); - &pshufd ($T2,$Hkey,0b01001110); + &pshufd ($T2,$Hkey,0b01001110) if (!defined($HK)); &pxor ($T1,$Xi); # - &pxor ($T2,$Hkey); + &pxor ($T2,$Hkey) if (!defined($HK)); + $HK=$T2 if (!defined($HK)); &pclmulqdq ($Xi,$Hkey,0x00); ####### &pclmulqdq ($Xhi,$Hkey,0x11); ####### - &pclmulqdq ($T1,$T2,0x00); ####### + &pclmulqdq ($T1,$HK,0x00); ####### &xorps ($T1,$Xi); # &xorps ($T1,$Xhi); # @@ -885,31 +892,32 @@ if (1) { # Algorithm 9 with <<1 twist. # below. Algorithm 9 was therefore chosen for # further optimization... -sub reduction_alg9 { # 17/13 times faster than Intel version +sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; # 1st phase - &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # - &psllq ($Xi,5); # - &pxor ($Xi,$T1); # &psllq ($Xi,57); # - &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T2,8); # - &pxor ($Xi,$T1); - &pxor ($Xhi,$T2); # + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # # 2nd phase &movdqa ($T2,$Xi); + &psrlq ($Xi,1); + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - &pxor ($T2,$Xhi); - &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # + &pxor ($Xi,$Xhi) # } &function_begin_B("gcm_init_clmul"); @@ -943,8 +951,14 @@ my ($Xhi,$Xi) = @_; &clmul64x64_T2 ($Xhi,$Xi,$Hkey); &reduction_alg9 ($Xhi,$Xi); + &pshufd ($T1,$Hkey,0b01001110); + &pshufd ($T2,$Xi,0b01001110); + &pxor ($T1,$Hkey); # Karatsuba pre-processing &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &pxor ($T2,$Xi); # Karatsuba pre-processing &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + &palignr ($T2,$T1,8); # low part is H.lo^H.hi + &movdqu (&QWP(32,$Htbl),$T2); # save Karatsuba "salt" &ret (); &function_end_B("gcm_init_clmul"); @@ -962,8 +976,9 @@ my ($Xhi,$Xi) = @_; &movdqa ($T3,&QWP(0,$const)); &movups ($Hkey,&QWP(0,$Htbl)); &pshufb ($Xi,$T3); + &movups ($T2,&QWP(32,$Htbl)); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); &reduction_alg9 ($Xhi,$Xi); &pshufb ($Xi,$T3); @@ -1000,79 +1015,107 @@ my ($Xhi,$Xi) = @_; &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 &pshufb ($T1,$T3); &pshufb ($Xn,$T3); + &movdqu ($T3,&QWP(32,$Htbl)); &pxor ($Xi,$T1); # Ii+Xi - &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # 
H*Ii+1 + &pshufd ($T1,$Xn,0b01001110); # H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($T1,$Xn); # + + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &pclmulqdq ($Xhn,$Hkey,0x11); ####### &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pclmulqdq ($T1,$T3,0x00); ####### &lea ($inp,&DWP(32,$inp)); # i+=2 &sub ($len,0x20); &jbe (&label("even_tail")); + &jmp (&label("mod_loop")); -&set_label("mod_loop"); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) - &movdqu ($T1,&QWP(0,$inp)); # Ii - &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("mod_loop",32); + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(0,$Htbl)); # load H + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); - &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 - &pshufb ($T1,$T3); - &pshufb ($Xn,$T3); + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &movdqu ($Xhn,&QWP(0,$inp)); # Ii + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pxor ($T1,$Xhi); # - &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 - &movdqa ($Xhn,$Xn); - &pxor ($Xhi,$T1); # "Ii+Xi", consume early + &pxor ($T2,$T1); # + &pshufb ($Xhn,$T3); - &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # + &pshufb ($Xn,$T3); + &pxor ($Xhi,$Xhn); # "Ii+Xi", consume early + + &movdqa ($Xhn,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($T2,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase + &movdqa ($T1,$Xi); + &psllq ($Xi,5); + &pxor ($T1,$Xi); # &psllq ($Xi,1); &pxor ($Xi,$T1); # - &psllq ($Xi,5); # - &pxor ($Xi,$T1); # + &movups ($T3,&QWP(32,$Htbl)); &pclmulqdq ($Xn,$Hkey,0x00); ####### &psllq ($Xi,57); # - &movdqa ($T2,$Xi); # + &movdqa ($T1,$Xi); # &pslldq ($Xi,8); - &psrldq ($T2,8); # - &pxor ($Xi,$T1); - &pshufd ($T1,$T3,0b01001110); - &pxor ($Xhi,$T2); # - &pxor ($T1,$T3); - &pshufd ($T3,$Hkey,0b01001110); - &pxor ($T3,$Hkey); # - - &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &psrldq ($T1,8); # + &pxor ($Xi,$T2); + &pxor ($Xhi,$T1); # + &pshufd ($T1,$Xhn,0b01001110); &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,1); + &pxor ($T1,$Xhn); + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &pxor ($Xhi,$T2); # + &pxor ($T2,$Xi); &psrlq ($Xi,5); &pxor ($Xi,$T2); # &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - &pxor ($T2,$Xhi); - &psrlq ($Xi,1); # - &pxor ($Xi,$T2); # - + &pxor ($Xi,$Xhi) # &pclmulqdq ($T1,$T3,0x00); ####### - &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 - &xorps ($T1,$Xn); # - &xorps ($T1,$Xhn); # - - &movdqa ($T3,$T1); # - &psrldq ($T1,8); - &pslldq ($T3,8); # - &pxor ($Xhn,$T1); - &pxor ($Xn,$T3); # - &movdqa ($T3,&QWP(0,$const)); &lea ($inp,&DWP(32,$inp)); &sub ($len,0x20); &ja (&label("mod_loop")); &set_label("even_tail"); - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi) + &movdqa ($Xhi,$Xi); + &pxor ($T2,$Xi); # - &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) - &pxor ($Xhi,$Xhn); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T2,$T3,0x10); ####### + &movdqa ($T3,&QWP(0,$const)); + + &xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &xorps ($Xhi,$Xhn); + &pxor ($T1,$Xi); # aggregated Karatsuba post-processing + &pxor ($T1,$Xhi); # + + &pxor ($T2,$T1); # + + &movdqa 
($T1,$T2); # + &psrldq ($T2,8); + &pslldq ($T1,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T1); # &reduction_alg9 ($Xhi,$Xi); diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index cd93c0f95e..4bbd4ab5d6 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -41,6 +41,29 @@ # providing access to a Westmere-based system on behalf of Intel # Open Source Technology Centre. +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.76(+14%) +# Sandy Bridge 1.79(+9%) +# Ivy Bridge 1.79(+8%) +# Bulldozer 1.52(+25%) + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -55,6 +78,8 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +$do4xaggr=1; + # common register layout $nlo="%rax"; $nhi="%rbx"; @@ -354,19 +379,27 @@ ___ ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); sub clmul64x64_T2 { # minimal register pressure -my ($Xhi,$Xi,$Hkey,$modulo)=@_; +my ($Xhi,$Xi,$Hkey,$HK)=@_; -$code.=<<___ if (!defined($modulo)); +if (!defined($HK)) { $HK = $T2; +$code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Hkey,$T2 pxor $Xi,$T1 # pxor $Hkey,$T2 ___ +} else { +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 # +___ +} $code.=<<___; pclmulqdq \$0x00,$Hkey,$Xi ####### pclmulqdq \$0x11,$Hkey,$Xhi ####### - pclmulqdq \$0x00,$T2,$T1 ####### + pclmulqdq \$0x00,$HK,$T1 ####### pxor $Xi,$T1 # pxor $Xhi,$T1 # @@ -378,32 +411,33 @@ $code.=<<___; ___ } -sub reduction_alg9 { # 17/13 times faster than Intel version +sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; $code.=<<___; # 1st phase - movdqa $Xi,$T1 # + movdqa $Xi,$T2 # + movdqa $Xi,$T1 + psllq \$5,$Xi + pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # - psllq \$5,$Xi # - pxor $T1,$Xi # psllq \$57,$Xi # - movdqa $Xi,$T2 # + movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T2 # - pxor $T1,$Xi - pxor $T2,$Xhi # + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # # 2nd phase movdqa $Xi,$T2 + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # - pxor $T2,$Xi # - pxor $Xhi,$T2 - psrlq \$1,$Xi # - pxor $T2,$Xi # + pxor $Xhi,$Xi # ___ } @@ -437,8 +471,35 @@ ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey); &reduction_alg9 ($Xhi,$Xi); $code.=<<___; - movdqu $Hkey,($Htbl) # save H - movdqu $Xi,16($Htbl) # save H^2 + pshufd \$0b01001110,$Hkey,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $Hkey,$T1 # Karatsuba pre-processing + movdqu $Hkey,0x00($Htbl) # save H + pxor 
$Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x10($Htbl) # save H^2 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x20($Htbl) # save Karatsuba "salt" +___ +if ($do4xaggr) { + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^3 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqa $Xi,$T3 +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^4 + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufd \$0b01001110,$T3,$T1 + pshufd \$0b01001110,$Xi,$T2 + pxor $T3,$T1 # Karatsuba pre-processing + movdqu $T3,0x30($Htbl) # save H^3 + pxor $Xi,$T2 # Karatsuba pre-processing + movdqu $Xi,0x40($Htbl) # save H^4 + palignr \$8,$T1,$T2 # low part is H.lo^H.hi... + movdqu $T2,0x50($Htbl) # save Karatsuba "salt" +___ +} +$code.=<<___; ret .size gcm_init_clmul,.-gcm_init_clmul ___ @@ -454,10 +515,34 @@ gcm_gmult_clmul: movdqu ($Xip),$Xi movdqa .Lbswap_mask(%rip),$T3 movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$T2 pshufb $T3,$Xi ___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); - &reduction_alg9 ($Xhi,$Xi); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); +$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); + # experimental alternative. special thing about is that there + # no dependency between the two multiplications... + mov \$`0xE1<<1`,%eax + mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff + mov \$0x07,%r11d + movq %rax,$T1 + movq %r10,$T2 + movq %r11,$T3 # borrow $T3 + pand $Xi,$T3 + pshufb $T3,$T2 # ($Xi&7)·0xE0 + movq %rax,$T3 + pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) + pxor $Xi,$T2 + pslldq \$15,$T2 + paddd $T2,$T2 # <<(64+56+1) + pxor $T2,$Xi + pclmulqdq \$0x01,$T3,$Xi + movdqa .Lbswap_mask(%rip),$T3 # reload $T3 + psrldq \$1,$T1 + pxor $T1,$Xhi + pslldq \$7,$Xi + pxor $Xhi,$Xi +___ $code.=<<___; pshufb $T3,$Xi movdqu $Xi,($Xip) @@ -467,129 +552,316 @@ ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; - my $Xn="%xmm6"; - my $Xhn="%xmm7"; - my $Hkey2="%xmm8"; - my $T1n="%xmm9"; - my $T2n="%xmm10"; + my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10)); $code.=<<___; .globl gcm_ghash_clmul .type gcm_ghash_clmul,\@abi-omnipotent -.align 16 +.align 32 gcm_ghash_clmul: ___ $code.=<<___ if ($win64); + lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) - .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) - .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) - .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) + .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp + .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) + .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) + .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) + .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) + .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) + .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) + .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) + .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) + .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) + .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 + mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey + movdqu 0x20($Htbl),$HK pshufb $T3,$Xi sub \$0x10,$len jz .Lodd_tail - movdqu 16($Htbl),$Hkey2 + movdqu 0x10($Htbl),$Hkey2 +___ +if ($do4xaggr) { +my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); + +$code.=<<___; + cmp \$0x30,$len + jb .Lskip4x + 
+ sub \$0x30,$len + movdqu 0x30($Htbl),$Hkey3 + movdqu 0x40($Htbl),$Hkey4 + + ####### + # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P + # + movdqu 0x30($inp),$Xln + movdqu 0x20($inp),$Xl + pshufb $T3,$Xln + pshufb $T3,$Xl + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey2,$Xl + pclmulqdq \$0x11,$Hkey2,$Xh + xorps $Xl,$Xln + pclmulqdq \$0x10,$HK,$Xm + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + xorps $Xm,$Xmn + + movdqu 0x10($inp),$Xl + movdqu 0($inp),$T1 + pshufb $T3,$Xl + pshufb $T3,$T1 + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pxor $T1,$Xi + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + lea 0x40($inp),$inp + sub \$0x40,$len + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + movdqu 0x30($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + movdqu 0x20($inp),$Xln + movdqa $Xl,$Xh + pshufd \$0b01001110,$Xl,$Xm + pclmulqdq \$0x10,$HK,$T1 + xorps $Xhn,$Xhi + pxor $Xl,$Xm + pshufb $T3,$Xln + movups 0x20($Htbl),$HK + pclmulqdq \$0x00,$Hkey,$Xl + xorps $Xmn,$T1 + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + + pxor $Xi,$T1 # aggregated Karatsuba post-processing + pxor $Xln,$Xmn + pxor $Xhi,$T1 # + movdqa $T1,$T2 # + pslldq \$8,$T1 + pclmulqdq \$0x11,$Hkey,$Xh + psrldq \$8,$T2 # + pxor $T1,$Xi + movdqa .L7_mask(%rip),$T1 + pxor $T2,$Xhi # + movq %rax,$T2 + + pand $Xi,$T1 # 1st phase + pshufb $T1,$T2 # + pclmulqdq \$0x00,$HK,$Xm + pxor $Xi,$T2 # + psllq \$57,$T2 # + movdqa $T2,$T1 # + pslldq \$8,$T2 + pclmulqdq \$0x00,$Hkey2,$Xln + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # + movdqu 0($inp),$T1 + + movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhn + xorps $Xl,$Xln + movdqu 0x10($inp),$Xl + pshufb $T3,$Xl + pclmulqdq \$0x10,$HK,$Xmn + xorps $Xh,$Xhn + movups 0x50($Htbl),$HK + pshufb $T3,$T1 + pxor $T2,$Xhi # + pxor $Xi,$T2 + psrlq \$5,$Xi + + movdqa $Xl,$Xh + pxor $Xm,$Xmn + pshufd \$0b01001110,$Xl,$Xm + pxor $Xl,$Xm + pclmulqdq \$0x00,$Hkey3,$Xl + pxor $T2,$Xi # + pxor $T1,$Xhi + psrlq \$1,$Xi # + pclmulqdq \$0x11,$Hkey3,$Xh + xorps $Xl,$Xln + pxor $Xhi,$Xi # + + pclmulqdq \$0x00,$HK,$Xm + xorps $Xh,$Xhn + + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 + pxor $Xi,$T1 + + lea 0x40($inp),$inp + sub \$0x40,$len + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq \$0x00,$Hkey4,$Xi + xorps $Xm,$Xmn + pclmulqdq \$0x11,$Hkey4,$Xhi + xorps $Xln,$Xi + pclmulqdq \$0x10,$HK,$T1 + xorps $Xhn,$Xhi + pxor $Xi,$Xhi # aggregated Karatsuba post-processing + pxor $Xmn,$T1 + + pxor $Xhi,$T1 # + pxor $Xi,$Xhi + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ + &reduction_alg9($Xhi,$Xi); +$code.=<<___; + add \$0x40,$len + jz .Ldone + sub \$0x10,$len + movdqu 0x20($Htbl),$HK +.Lskip4x: +___ +} +$code.=<<___; ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # movdqu ($inp),$T1 # Ii - movdqu 16($inp),$Xn # Ii+1 + movdqu 16($inp),$Xln # Ii+1 pshufb $T3,$T1 - pshufb $T3,$Xn + pshufb $T3,$Xln pxor $T1,$Xi # Ii+Xi -___ - &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 -$code.=<<___; - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pshufd 
\$0b01001110,$Hkey2,$T2 + + movdqa $Xln,$Xhn + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn + pclmulqdq \$0x00,$Hkey,$Xln + pclmulqdq \$0x11,$Hkey,$Xhn + pclmulqdq \$0x00,$HK,$Xmn + + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 # pxor $Xi,$T1 # - pxor $Hkey2,$T2 lea 32($inp),$inp # i+=2 sub \$0x20,$len jbe .Leven_tail + jmp .Lmod_loop +.align 32 .Lmod_loop: -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) -$code.=<<___; - movdqu ($inp),$T1 # Ii - pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + movdqu ($inp),$T2 # Ii + pclmulqdq \$0x10,$HK,$T1 + pshufb $T3,$T2 + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + movdqu 16($inp),$Xln # Ii+1 pxor $Xhn,$Xhi - movdqu 16($inp),$Xn # Ii+1 - pshufb $T3,$T1 - pshufb $T3,$Xn + pxor $Xi,$Xmn # aggregated Karatsuba post-processing + pxor $Xhi,$Xmn + pxor $T2,$Xhi # "Ii+Xi", consume early + pxor $Xmn,$T1 + pshufb $T3,$Xln + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # - movdqa $Xn,$Xhn # - pshufd \$0b01001110,$Xn,$T1n - pshufd \$0b01001110,$Hkey,$T2n - pxor $Xn,$T1n # - pxor $Hkey,$T2n - pxor $T1,$Xhi # "Ii+Xi", consume early + movdqa $Xln,$Xhn # + pshufd \$0b01001110,$Xln,$Xmn + pxor $Xln,$Xmn # - movdqa $Xi,$T1 # 1st phase + movdqa $Xi,$T2 # 1st phase + movdqa $Xi,$T1 + psllq \$5,$Xi + pclmulqdq \$0x00,$Hkey,$Xln ####### + pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # - psllq \$5,$Xi # - pxor $T1,$Xi # - pclmulqdq \$0x00,$Hkey,$Xn ####### psllq \$57,$Xi # - movdqa $Xi,$T2 # + movdqa $Xi,$T1 # pslldq \$8,$Xi - psrldq \$8,$T2 # - pxor $T1,$Xi - pxor $T2,$Xhi # + psrldq \$8,$T1 # + pxor $T2,$Xi + pxor $T1,$Xhi # pclmulqdq \$0x11,$Hkey,$Xhn ####### movdqa $Xi,$T2 # 2nd phase + psrlq \$1,$Xi + pxor $T2,$Xhi # + pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # - pxor $T2,$Xi # - pxor $Xhi,$T2 - psrlq \$1,$Xi # - pxor $T2,$Xi # + pclmulqdq \$0x00,$HK,$Xmn ####### + pxor $Xhi,$Xi # - pclmulqdq \$0x00,$T2n,$T1n ####### - movdqa $Xi,$Xhi # - pshufd \$0b01001110,$Xi,$T1 - pshufd \$0b01001110,$Hkey2,$T2 + movdqa $Xi,$Xhi + pshufd \$0b01001110,$Xi,$T1 # pxor $Xi,$T1 # - pxor $Hkey2,$T2 - - pxor $Xn,$T1n # - pxor $Xhn,$T1n # - movdqa $T1n,$T2n # - psrldq \$8,$T1n - pslldq \$8,$T2n # - pxor $T1n,$Xhn - pxor $T2n,$Xn # lea 32($inp),$inp sub \$0x20,$len ja .Lmod_loop .Leven_tail: -___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) -$code.=<<___; - pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pclmulqdq \$0x00,$Hkey2,$Xi + pclmulqdq \$0x11,$Hkey2,$Xhi + pclmulqdq \$0x10,$HK,$T1 + + pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi + pxor $Xi,$Xmn + pxor $Xhi,$Xmn + pxor $Xmn,$T1 + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; @@ -601,7 +873,7 @@ $code.=<<___; pshufb $T3,$T1 pxor $T1,$Xi # Ii+Xi ___ - &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); $code.=<<___; .Ldone: @@ -614,7 +886,12 @@ $code.=<<___ if ($win64); movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 - add \$0x58,%rsp + movaps 0x50(%rsp),%xmm11 + movaps 0x60(%rsp),%xmm12 + movaps 0x70(%rsp),%xmm13 + movaps 0x80(%rsp),%xmm14 + movaps 0x90(%rsp),%xmm15 + lea 0xa8(%rsp),%rsp ___ $code.=<<___; ret @@ -629,6 +906,10 @@ $code.=<<___; .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: + .long 7,0,7,0 +.L7_mask_poly: + .long 7,0,`0xE1<<1`,0 .align 64 .type 
.Lrem_4bit,\@object .Lrem_4bit: @@ -791,13 +1072,18 @@ se_handler: .rva se_handler .rva .Lghash_prologue,.Lghash_epilogue # HandlerData .LSEH_info_gcm_ghash_clmul: - .byte 0x01,0x1f,0x0b,0x00 - .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 - .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 - .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 + .byte 0x01,0x33,0x16,0x00 + .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 + .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 + .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 + .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 + .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 + .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 + .byte 0x04,0x01,0x15,0x00 #sub 0xa8,rsp ___ } diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c index e7d1736786..a6e2af1b96 100644 --- a/crypto/modes/gcm128.c +++ b/crypto/modes/gcm128.c @@ -1703,6 +1703,21 @@ static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f}, T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a}; +/* Test Case 19 */ +#define K19 K1 +#define P19 P1 +#define IV19 IV1 +#define C19 C1 +static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55, + 0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, + 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, + 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, + 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad}, + T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92}; + #define TEST_CASE(n) do { \ u8 out[sizeof(P##n)]; \ AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ @@ -1747,6 +1762,7 @@ int main() TEST_CASE(16); TEST_CASE(17); TEST_CASE(18); + TEST_CASE(19); #ifdef OPENSSL_CPUID_OBJ { -- 2.25.1
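
For reference, the 4x code path added above implements the aggregated recurrence quoted in the patch comments, Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P, with H^2..H^4 precomputed by gcm_init_clmul. The standalone C sketch below only illustrates that algebraic identity against a plain bit-at-a-time GF(2^128) reference multiply; it is not the pclmulqdq/Karatsuba code from the patch (which sums unreduced halves and reduces once per aggregate), and the u128, gf128_mul and ghash_* names are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* 128-bit field element; hi holds the leftmost (most significant) 64 bits. */
typedef struct { uint64_t hi, lo; } u128;

static u128 xor128(u128 a, u128 b)
{
    u128 r = { a.hi ^ b.hi, a.lo ^ b.lo };
    return r;
}

/*
 * Reference GF(2^128) multiplication in the GCM bit order: scan the bits
 * of Y from the leftmost one down, accumulate V into Z where a bit is set,
 * and multiply V by x via a right shift plus conditional xor with the
 * reduction constant 0xE1 << 120.  Every product is fully reduced here,
 * unlike the assembly, which defers reduction.
 */
static u128 gf128_mul(u128 X, u128 Y)
{
    u128 Z = { 0, 0 }, V = X;
    int i;

    for (i = 0; i < 128; i++) {
        uint64_t bit = (i < 64) ? (Y.hi >> (63 - i)) & 1
                                : (Y.lo >> (127 - i)) & 1;
        uint64_t lsb = V.lo & 1;

        if (bit)
            Z = xor128(Z, V);
        V.lo = (V.lo >> 1) | (V.hi << 63);      /* V *= x */
        V.hi >>= 1;
        if (lsb)
            V.hi ^= 0xe100000000000000ULL;
    }
    return Z;
}

/* Plain GHASH: Xi = (Xi-1 ^ Ii) * H, one multiplication per block. */
static u128 ghash_1x(u128 H, const u128 *blk, size_t n)
{
    u128 Xi = { 0, 0 };
    size_t i;

    for (i = 0; i < n; i++)
        Xi = gf128_mul(xor128(Xi, blk[i]), H);
    return Xi;
}

/*
 * 4x aggregated GHASH:
 *   Xi+4 = (H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)
 * with H^2, H^3, H^4 precomputed, falling back to the 1x recurrence for
 * leftover blocks.  Expanding the 1x recurrence four times shows the two
 * functions compute the same Xi.
 */
static u128 ghash_4x(u128 H, const u128 *blk, size_t n)
{
    u128 H2 = gf128_mul(H, H);
    u128 H3 = gf128_mul(H2, H);
    u128 H4 = gf128_mul(H3, H);
    u128 Xi = { 0, 0 };
    size_t i = 0;

    for (; i + 4 <= n; i += 4) {
        u128 t = gf128_mul(xor128(Xi, blk[i]), H4);
        t = xor128(t, gf128_mul(blk[i + 1], H3));
        t = xor128(t, gf128_mul(blk[i + 2], H2));
        Xi = xor128(t, gf128_mul(blk[i + 3], H));
    }
    for (; i < n; i++)
        Xi = gf128_mul(xor128(Xi, blk[i]), H);
    return Xi;
}

int main(void)
{
    /* Arbitrary values, not related to the test vectors in gcm128.c. */
    u128 H = { 0x0123456789abcdefULL, 0x0f1e2d3c4b5a6978ULL };
    u128 blk[6], a, b;
    size_t i;

    for (i = 0; i < 6; i++) {
        blk[i].hi = 0x1111111111111111ULL * (i + 1);
        blk[i].lo = 0xfedcba9876543210ULL ^ (uint64_t)i;
    }
    a = ghash_1x(H, blk, 6);
    b = ghash_4x(H, blk, 6);
    printf("1x: %016llx%016llx\n4x: %016llx%016llx\n",
           (unsigned long long)a.hi, (unsigned long long)a.lo,
           (unsigned long long)b.hi, (unsigned long long)b.lo);
    return (a.hi != b.hi) || (a.lo != b.lo);    /* 0 when the two paths agree */
}

As the December 2012 comment in ghash-x86_64.pl explains, the aggregated pclmulqdq triplets for the extra blocks are laid on top of the critical path (three independent pclmulqdq's, Karatsuba post-processing and reduction), so the higher aggregate factor pays off only as long as the pclmulqdq issue rate is not the bottleneck.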