From afebe623c52d2067d1c34d4461ee92924371621d Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 16 May 2011 17:46:45 +0000 Subject: [PATCH] x86_64 assembler pack: add x86_64-gf2m module. --- Configure | 2 +- TABLE | 20 +-- crypto/bn/Makefile | 2 + crypto/bn/asm/x86_64-gf2m.pl | 281 +++++++++++++++++++++++++++++++++++ 4 files changed, 294 insertions(+), 11 deletions(-) create mode 100644 crypto/bn/asm/x86_64-gf2m.pl diff --git a/Configure b/Configure index 1b2e85f31f..a8b3b35e80 100755 --- a/Configure +++ b/Configure @@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5 my $x86_elf_asm="$x86_asm:elf"; -my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o"; +my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o"; my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void"; my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void"; my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void"; diff --git a/TABLE b/TABLE index a3ffe2c257..ccaee9b14f 100644 --- a/TABLE +++ b/TABLE @@ -297,7 +297,7 @@ $sys_id = $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -738,7 +738,7 @@ $multilib = *** VC-WIN32 $cc = cl -$cflags = -W3 -WX -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE +$cflags = -W3 -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE $unistd = $thread_cflag = $sys_id = WIN32 @@ -1385,7 +1385,7 @@ $sys_id = MACOSX $lflags = -Wl,-search_paths_first% $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -2313,7 +2313,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -2505,7 +2505,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -2569,7 +2569,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -4073,7 +4073,7 @@ $sys_id = $lflags = -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -4233,7 +4233,7 @@ $sys_id = MINGW64 $lflags = -lws2_32 -lgdi32 -lcrypt32 $bn_ops = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -5193,7 +5193,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = @@ -5225,7 +5225,7 @@ $sys_id = $lflags = -lsocket -lnsl -ldl $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL $cpuid_obj = x86_64cpuid.o -$bn_obj = x86_64-gcc.o x86_64-mont.o +$bn_obj = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o $des_obj = $aes_obj = aes-x86_64.o aesni-x86_64.o $bf_obj = diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile index 61b6b702b2..74f1ac4b10 100644 --- a/crypto/bn/Makefile +++ b/crypto/bn/Makefile @@ -96,6 +96,8 @@ x86_64-gcc.o: asm/x86_64-gcc.c $(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c x86_64-mont.s: asm/x86_64-mont.pl $(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@ +x86_64-gf2m.s: asm/x86_64-gf2m.pl + $(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@ bn-ia64.s: asm/ia64.S $(CC) $(CFLAGS) -E asm/ia64.S > $@ diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 0000000000..dfbb17772c --- /dev/null +++ b/crypto/bn/asm/x86_64-gf2m.pl @@ -0,0 +1,281 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has two code paths: code suitable +# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and +# later. Improvement varies from one benchmark and µ-arch to another. +# Vanilla code path is at most 20% faster than compiler-generated code +# [not very impressive], while PCLMULQDQ - whole 85%-160% better on +# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that +# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not +# all CPU time is burnt in it... + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +($lo,$hi)=("%rax","%rdx"); $a=$lo; +($i0,$i1)=("%rsi","%rdi"); +($t0,$t1)=("%rbx","%rcx"); +($b,$mask)=("%rbp","%r8"); +($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); +($R,$Tx)=("%xmm0","%xmm1"); + +$code.=<<___; +.text + +.type _mul_1x1,\@abi-omnipotent +.align 16 +_mul_1x1: + sub \$128+8,%rsp + mov \$-1,$a1 + lea ($a,$a),$i0 + shr \$3,$a1 + lea (,$a,4),$i1 + and $a,$a1 # a1=a&0x1fffffffffffffff + lea (,$a,8),$a8 + sar \$63,$a # broadcast 63rd bit + lea ($a1,$a1),$a2 + sar \$63,$i0 # broadcast 62nd bit + lea (,$a1,4),$a4 + and $b,$a + sar \$63,$i1 # boardcast 61st bit + mov $a,$hi # $a is $lo + shl \$63,$lo + and $b,$i0 + shr \$1,$hi + mov $i0,$t1 + shl \$62,$i0 + and $b,$i1 + shr \$2,$t1 + xor $i0,$lo + mov $i1,$t0 + shl \$61,$i1 + xor $t1,$hi + shr \$3,$t0 + xor $i1,$lo + xor $t0,$hi + + mov $a1,$a12 + movq \$0,0(%rsp) # tab[0]=0 + xor $a2,$a12 # a1^a2 + mov $a1,8(%rsp) # tab[1]=a1 + mov $a4,$a48 + mov $a2,16(%rsp) # tab[2]=a2 + xor $a8,$a48 # a4^a8 + mov $a12,24(%rsp) # tab[3]=a1^a2 + + xor $a4,$a1 + mov $a4,32(%rsp) # tab[4]=a4 + xor $a4,$a2 + mov $a1,40(%rsp) # tab[5]=a1^a4 + xor $a4,$a12 + mov $a2,48(%rsp) # tab[6]=a2^a4 + xor $a48,$a1 # a1^a4^a4^a8=a1^a8 + mov $a12,56(%rsp) # tab[7]=a1^a2^a4 + xor $a48,$a2 # a2^a4^a4^a8=a1^a8 + + mov $a8,64(%rsp) # tab[8]=a8 + xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 + mov $a1,72(%rsp) # tab[9]=a1^a8 + xor $a4,$a1 # a1^a8^a4 + mov $a2,80(%rsp) # tab[10]=a2^a8 + xor $a4,$a2 # a2^a8^a4 + mov $a12,88(%rsp) # tab[11]=a1^a2^a8 + + xor $a4,$a12 # a1^a2^a8^a4 + mov $a48,96(%rsp) # tab[12]=a4^a8 + mov $mask,$i0 + mov $a1,104(%rsp) # tab[13]=a1^a4^a8 + and $b,$i0 + mov $a2,112(%rsp) # tab[14]=a2^a4^a8 + shr \$4,$b + mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 + mov $mask,$i1 + and $b,$i1 + shr \$4,$b + + movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 + mov $mask,$i0 + and $b,$i0 + shr \$4,$b +___ + for ($n=1;$n<8;$n++) { + $code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $mask,$i1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + and $b,$i1 + movq (%rsp,$i0,8),$Tx + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + pslldq \$$n,$Tx + mov $mask,$i0 + shr \$4,$b + xor $t0,$hi + and $b,$i0 + shr \$4,$b + pxor $Tx,$R +___ + } +$code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + movq $R,$i0 + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + psrldq \$8,$R + xor $t0,$hi + movq $R,$i1 + xor $i0,$lo + xor $i1,$hi + + add \$128+8,%rsp + ret +.size _mul_1x1,.-_mul_1x1 +___ + +($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,\@abi-omnipotent +.align 16 +bn_GF2m_mul_2x2: + mov OPENSSL_ia32cap_P(%rip),%rax + bt \$33,%rax + jnc .Lvanilla + + movq $a1,%xmm0 + movq $b1,%xmm1 + movq $a0,%xmm2 +___ +$code.=<<___ if ($win64); + movq 40(%rsp),%xmm3 +___ +$code.=<<___ if (!$win64); + movq $b0,%xmm3 +___ +$code.=<<___; + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 + pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 + pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 + pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 + movdqa %xmm4,%xmm5 + pslldq \$8,%xmm4 + psrldq \$8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0($rp) + movdqu %xmm0,16($rp) + ret + +.align 16 +.Lvanilla: + lea -8*17(%rsp),%rsp +___ +$code.=<<___ if ($win64); + mov `8*17+40`(%rsp),$b0 + mov %rdi,8*15(%rsp) + mov %rsi,8*16(%rsp) +___ +$code.=<<___; + mov %r14,8*10(%rsp) + mov %r13,8*11(%rsp) + mov %r12,8*12(%rsp) + mov %rbp,8*13(%rsp) + mov %rbx,8*14(%rsp) +.Lbody: + mov $rp,32(%rsp) # save the arguments + mov $a1,40(%rsp) + mov $a0,48(%rsp) + mov $b1,56(%rsp) + mov $b0,64(%rsp) + + mov \$0xf,$mask + mov $a1,$a + mov $b1,$b + call _mul_1x1 # a1·b1 + mov $lo,16(%rsp) + mov $hi,24(%rsp) + + mov 48(%rsp),$a + mov 64(%rsp),$b + call _mul_1x1 # a0·b0 + mov $lo,0(%rsp) + mov $hi,8(%rsp) + + mov 40(%rsp),$a + mov 56(%rsp),$b + xor 48(%rsp),$a + xor 64(%rsp),$b + call _mul_1x1 # (a0+a1)·(b0+b1) +___ + @r=("%rbx","%rcx","%rdi","%rsi"); +$code.=<<___; + mov 0(%rsp),@r[0] + mov 8(%rsp),@r[1] + mov 16(%rsp),@r[2] + mov 24(%rsp),@r[3] + mov 32(%rsp),%rbp + + xor $hi,$lo + xor @r[1],$hi + xor @r[0],$lo + mov @r[0],0(%rbp) + xor @r[2],$hi + mov @r[3],24(%rbp) + xor @r[3],$lo + xor @r[3],$hi + xor $hi,$lo + mov $hi,16(%rbp) + mov $lo,8(%rbp) + + mov 8*10(%rsp),%r14 + mov 8*11(%rsp),%r13 + mov 8*12(%rsp),%r12 + mov 8*13(%rsp),%rbp + mov 8*14(%rsp),%rbx +___ +$code.=<<___ if ($win64); + mov 8*15(%rsp),%rdi + mov 8*16(%rsp),%rsi +___ +$code.=<<___; + lea 8*17(%rsp),%rsp + ret +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; -- 2.25.1