From b7838586545c88c44132de1d9c153ee811c5433b Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 3 Oct 2013 00:18:58 +0200 Subject: [PATCH] x86_64 assembly pack: add multi-block AES-NI, SHA1 and SHA256. --- crypto/aes/asm/aesni-mb-x86_64.pl | 1195 ++++++++++++++++++++++++++++ crypto/sha/asm/sha1-mb-x86_64.pl | 1026 ++++++++++++++++++++++++ crypto/sha/asm/sha256-mb-x86_64.pl | 934 ++++++++++++++++++++++ 3 files changed, 3155 insertions(+) create mode 100644 crypto/aes/asm/aesni-mb-x86_64.pl create mode 100644 crypto/sha/asm/sha1-mb-x86_64.pl create mode 100644 crypto/sha/asm/sha256-mb-x86_64.pl diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl new file mode 100644 index 0000000000..17ed3b2ca9 --- /dev/null +++ b/crypto/aes/asm/aesni-mb-x86_64.pl @@ -0,0 +1,1195 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer AES-NI procedures process several independent buffers +# in parallel by interleaving independent instructions. +# +# Cycles per byte for interleave factor 4: +# +# asymptotic measured +# --------------------------- +# Westmere 5.00/4=1.25 5.13/4=1.28 +# Atom 15.0/4=3.75 15.7/4=3.93 +# Sandy Bridge 5.06/4=1.27 5.15/4=1.29 +# Ivy Bridge 5.06/4=1.27 5.14/4=1.29 +# Haswell 4.44/4=1.11 4.44/4=1.11 +# Bulldozer 5.75/4=1.44 5.76/4=1.44 +# +# Cycles per byte for interleave factor 8 (not implemented for +# pre-AVX processors, where higher interleave factor incidentally +# doesn't result in improvement): +# +# asymptotic measured +# --------------------------- +# Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*) +# Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*) +# Haswell 5.00/8=0.63 5.00/8=0.63 +# Bulldozer 5.75/8=0.72 5.77/8=0.72 +# +# (*) Sandy/Ivy Bridge are known to handle high interleave factors +# suboptimally; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void aesni_multi_cbc_encrypt ( +# struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; +# const AES_KEY *key, +# int num); /* 1 or 2 */ +# +$inp="%rdi"; # 1st arg +$key="%rsi"; # 2nd arg +$num="%edx"; + +@inptr=map("%r$_",(8..11)); +@outptr=map("%r$_",(12..15)); + +($rndkey0,$rndkey1)=("%xmm0","%xmm1"); +@out=map("%xmm$_",(2..5)); +@inp=map("%xmm$_",(6..9)); +($counters,$mask,$zero)=map("%xmm$_",(10..12)); + +($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl aesni_multi_cbc_encrypt +.type aesni_multi_cbc_encrypt,\@function,3 +.align 32 +aesni_multi_cbc_encrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Lenc_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x78(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Lenc4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Lenc4x_done + + movups 0x10-0x78($key),$rndkey1 + pxor $zero,@out[0] + movups 0x20-0x78($key),$rndkey0 + pxor $zero,@out[1] + mov 0xf0-0x78($key),$rounds + pxor $zero,@out[2] + movdqu (@inptr[0]),@inp[0] # load inputs + pxor $zero,@out[3] + movdqu (@inptr[1]),@inp[1] + pxor @inp[0],@out[0] + movdqu (@inptr[2]),@inp[2] + pxor @inp[1],@out[1] + movdqu (@inptr[3]),@inp[3] + pxor @inp[2],@out[2] + pxor @inp[3],@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_enc4x + +.align 32 +.Loop_enc4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesenc $rndkey,@out[0] + aesenc $rndkey,@out[1] + cmovge $sink,@inptr[$i] # cancel input + aesenc $rndkey,@out[2] + cmovg $sink,@outptr[$i] # sink output + aesenc $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesenc $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesenc $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Lenc4x_tail + + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + aesenc $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesenc $rndkey0,@out[0] + aesenc $rndkey0,@out[1] + aesenc $rndkey0,@out[2] + aesenc $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + +.Lenc4x_tail: + aesenc $rndkey1,@out[0] + aesenc $rndkey1,@out[1] + aesenc $rndkey1,@out[2] + movdqu (@inptr[0],$offset),@inp[0] + aesenc $rndkey1,@out[3] + movdqu 0x10-0x78($key),$rndkey1 + + aesenclast $rndkey0,@out[0] + movdqu (@inptr[1],$offset),@inp[1] + pxor $zero,@inp[0] + aesenclast $rndkey0,@out[1] + movdqu (@inptr[2],$offset),@inp[2] + pxor $zero,@inp[1] + aesenclast $rndkey0,@out[2] + movdqu (@inptr[3],$offset),@inp[3] + pxor $zero,@inp[2] + aesenclast $rndkey0,@out[3] + movdqu 0x20-0x78($key),$rndkey0 + pxor $zero,@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + pxor @inp[0],@out[0] + movups @out[1],-16(@outptr[1],$offset) + pxor @inp[1],@out[1] + movups @out[2],-16(@outptr[2],$offset) + pxor @inp[2],@out[2] + movups @out[3],-16(@outptr[3],$offset) + pxor @inp[3],@out[3] + + dec $num + jnz .Loop_enc4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + #pxor @inp[0],@out[0] + #pxor @inp[1],@out[1] + #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! + #pxor @inp[2],@out[2] + #movdqu @out[1],`40*1+24-40*2`($inp) + #pxor @inp[3],@out[3] + #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller + #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... + + lea `40*4`($inp),$inp + dec $num + jnz .Lenc4x_loop_grande + +.Lenc4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%rax),%xmm6 + movaps -0x98(%rax),%xmm7 + movaps -0x88(%rax),%xmm8 + movaps -0x78(%rax),%xmm9 + movaps -0x68(%rax),%xmm10 + movaps -0x58(%rax),%xmm11 + movaps -0x48(%rax),%xmm12 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt + +.globl aesni_multi_cbc_decrypt +.type aesni_multi_cbc_decrypt,\@function,3 +.align 32 +aesni_multi_cbc_decrypt: +___ +$code.=<<___ if ($avx); + cmp \$2,$num + jb .Ldec_non_avx + mov OPENSSL_ia32cap_P+4(%rip),%ecx + test \$`1<<28`,%ecx # AVX bit + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x78(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,0x60(%rsp) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + + sub \$48,%rsp + and \$-64,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec4x_body: + movdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*2($inp),$inp + +.Ldec4x_loop_grande: + mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*2`($inp),@inptr[$i] + cmp $num,$one + mov `40*$i+8-40*2`($inp),@outptr[$i] + cmovg $one,$num # find maximum + test $one,$one + movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@inptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldec4x_done + + movups 0x10-0x78($key),$rndkey1 + movups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + movdqu (@inptr[0]),@out[0] # load inputs + movdqu (@inptr[1]),@out[1] + pxor $zero,@out[0] + movdqu (@inptr[2]),@out[2] + pxor $zero,@out[1] + movdqu (@inptr[3]),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + movdqa 32(%rsp),$counters # load counters + xor $offset,$offset + jmp .Loop_dec4x + +.align 32 +.Loop_dec4x: + add \$16,$offset + lea 16(%rsp),$sink # sink pointer + mov \$1,$one # constant of 1 + sub $offset,$sink + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x30-0x78($key),$rndkey1 +___ +for($i=0;$i<4;$i++) { +my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; +$code.=<<___; + cmp `32+4*$i`(%rsp),$one + aesdec $rndkey,@out[0] + aesdec $rndkey,@out[1] + cmovge $sink,@inptr[$i] # cancel input + aesdec $rndkey,@out[2] + cmovg $sink,@outptr[$i] # sink output + aesdec $rndkey,@out[3] + movups `0x40+16*$i-0x78`($key),$rndkey +___ +} +$code.=<<___; + movdqa $counters,$mask + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0x80-0x78($key),$rndkey0 + pxor $zero,$zero + + aesdec $rndkey1,@out[0] + pcmpgtd $zero,$mask + movdqu -0x78($key),$zero # reload 0-round key + aesdec $rndkey1,@out[1] + paddd $mask,$counters # decrement counters + movdqa $counters,32(%rsp) # update counters + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0x90-0x78($key),$rndkey1 + + cmp \$11,$rounds + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xa0-0x78($key),$rndkey0 + + jb .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xb0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xc0-0x78($key),$rndkey0 + + je .Ldec4x_tail + + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + aesdec $rndkey1,@out[3] + movups 0xd0-0x78($key),$rndkey1 + + aesdec $rndkey0,@out[0] + aesdec $rndkey0,@out[1] + aesdec $rndkey0,@out[2] + aesdec $rndkey0,@out[3] + movups 0xe0-0x78($key),$rndkey0 + +.Ldec4x_tail: + aesdec $rndkey1,@out[0] + aesdec $rndkey1,@out[1] + aesdec $rndkey1,@out[2] + pxor $rndkey0,@inp[0] + pxor $rndkey0,@inp[1] + aesdec $rndkey1,@out[3] + movdqu 0x10-0x78($key),$rndkey1 + pxor $rndkey0,@inp[2] + pxor $rndkey0,@inp[3] + movdqu 0x20-0x78($key),$rndkey0 + + aesdeclast @inp[0],@out[0] + movdqu -16(@inptr[0],$offset),@inp[0] # load next IV + aesdeclast @inp[1],@out[1] + movdqu -16(@inptr[1],$offset),@inp[1] + aesdeclast @inp[2],@out[2] + movdqu -16(@inptr[2],$offset),@inp[2] + aesdeclast @inp[3],@out[3] + movdqu -16(@inptr[3],$offset),@inp[3] + + movups @out[0],-16(@outptr[0],$offset) + movdqu (@inptr[0],$offset),@out[0] + movups @out[1],-16(@outptr[1],$offset) + movdqu (@inptr[1],$offset),@out[1] + pxor $zero,@out[0] + movups @out[2],-16(@outptr[2],$offset) + movdqu (@inptr[2],$offset),@out[2] + pxor $zero,@out[1] + movups @out[3],-16(@outptr[3],$offset) + movdqu (@inptr[3],$offset),@out[3] + pxor $zero,@out[2] + pxor $zero,@out[3] + + dec $num + jnz .Loop_dec4x + + mov 16(%rsp),%rax # original %rsp + mov 24(%rsp),$num + + lea `40*4`($inp),$inp + dec $num + jnz .Ldec4x_loop_grande + +.Ldec4x_done: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%rax),%xmm6 + movaps -0x98(%rax),%xmm7 + movaps -0x88(%rax),%xmm8 + movaps -0x78(%rax),%xmm9 + movaps -0x68(%rax),%xmm10 + movaps -0x58(%rax),%xmm11 + movaps -0x48(%rax),%xmm12 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +___ + + if ($avx) {{{ +my @ptr=map("%r$_",(8..15)); +my $offload=$sink; + +my @out=map("%xmm$_",(2..9)); +my @inp=map("%xmm$_",(10..13)); +my ($counters,$zero)=("%xmm14","%xmm15"); + +$code.=<<___; +.type aesni_multi_cbc_encrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_encrypt_avx: +_avx_cbc_enc_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + + sub \$192,%rsp + and \$-128,%rsp + mov %rax,16(%rsp) # original %rsp + +.Lenc8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Lenc8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances +___ +} +$code.=<<___; + test $num,$num + jz .Lenc8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + + vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round + lea 128(%rsp),$offload # offload area + vpxor (@ptr[1]),$zero,@inp[1] + vpxor (@ptr[2]),$zero,@inp[2] + vpxor (@ptr[3]),$zero,@inp[3] + vpxor @inp[0],@out[0],@out[0] + vpxor (@ptr[4]),$zero,@inp[0] + vpxor @inp[1],@out[1],@out[1] + vpxor (@ptr[5]),$zero,@inp[1] + vpxor @inp[2],@out[2],@out[2] + vpxor (@ptr[6]),$zero,@inp[2] + vpxor @inp[3],@out[3],@out[3] + vpxor (@ptr[7]),$zero,@inp[3] + vpxor @inp[0],@out[4],@out[4] + mov \$1,$one # constant of 1 + vpxor @inp[1],@out[5],@out[5] + vpxor @inp[2],@out[6],@out[6] + vpxor @inp[3],@out[7],@out[7] + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesenc $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesenc $rndkey,@out[1],@out[1] + vaesenc $rndkey,@out[2],@out[2] + vaesenc $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesenc $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesenc $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesenc $rndkey,@out[6],@out[6] + vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round + mov $offset,64+8*$i(%rsp) + vaesenc $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4) + vmovdqu @inp[$i%4],`16*$i`($offload) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + cmp \$11,$rounds + jb .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Lenc8x_tail + + vaesenc $rndkey1,@out[0],@out[0] + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vaesenc $rndkey1,@out[5],@out[5] + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesenc $rndkey0,@out[0],@out[0] + vaesenc $rndkey0,@out[1],@out[1] + vaesenc $rndkey0,@out[2],@out[2] + vaesenc $rndkey0,@out[3],@out[3] + vaesenc $rndkey0,@out[4],@out[4] + vaesenc $rndkey0,@out[5],@out[5] + vaesenc $rndkey0,@out[6],@out[6] + vaesenc $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Lenc8x_tail: + vaesenc $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesenc $rndkey1,@out[1],@out[1] + vaesenc $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenc $rndkey1,@out[3],@out[3] + vaesenc $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesenc $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesenc $rndkey1,@out[6],@out[6] + vaesenc $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesenclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesenclast $rndkey0,@out[1],@out[1] + vaesenclast $rndkey0,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesenclast $rndkey0,@out[3],@out[3] + vaesenclast $rndkey0,@out[4],@out[4] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesenclast $rndkey0,@out[5],@out[5] + vaesenclast $rndkey0,@out[6],@out[6] + vmovdqa $counters,48(%rsp) # update counters + vaesenclast $rndkey0,@out[7],@out[7] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vpxor 0x00($offload),@out[0],@out[0] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vpxor 0x10($offload),@out[1],@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vpxor 0x20($offload),@out[2],@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vpxor 0x30($offload),@out[3],@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vpxor @inp[0],@out[4],@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vpxor @inp[1],@out[5],@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vpxor @inp[2],@out[6],@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vpxor @inp[3],@out[7],@out[7] + + dec $num + jnz .Loop_enc8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Lenc8x_loop_grande + +.Lenc8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx + +.type aesni_multi_cbc_decrypt_avx,\@function,3 +.align 32 +aesni_multi_cbc_decrypt_avx: +_avx_cbc_dec_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + # stack layout + # + # +0 output sink + # +16 input sink [original %rsp and $num] + # +32 counters + # +64 distances between inputs and outputs + # +128 off-load area for @inp[0..3] + # +192 IV/input offload + + sub \$256,%rsp + and \$-256,%rsp + sub \$192,%rsp + mov %rax,16(%rsp) # original %rsp + +.Ldec8x_body: + vzeroupper + vmovdqu ($key),$zero # 0-round key + lea 0x78($key),$key # size optimization + lea 40*4($inp),$inp + shr \$1,$num + +.Ldec8x_loop_grande: + #mov $num,24(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<8;$i++) { + my $temp = $i ? $offload : $offset; + $code.=<<___; + mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks + mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer + cmp $num,$one + mov `40*$i+8-40*4`($inp),$temp # output pointer + cmovg $one,$num # find maximum + test $one,$one + vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV + mov $one,`32+4*$i`(%rsp) # initialize counters + cmovle %rsp,@ptr[$i] # cancel input + sub @ptr[$i],$temp # distance between input and output + mov $temp,`64+8*$i`(%rsp) # initialize distances + vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV +___ +} +$code.=<<___; + test $num,$num + jz .Ldec8x_done + + vmovups 0x10-0x78($key),$rndkey1 + vmovups 0x20-0x78($key),$rndkey0 + mov 0xf0-0x78($key),$rounds + lea 192+128(%rsp),$offload # offload area + + vmovdqu (@ptr[0]),@out[0] # load inputs + vmovdqu (@ptr[1]),@out[1] + vmovdqu (@ptr[2]),@out[2] + vmovdqu (@ptr[3]),@out[3] + vmovdqu (@ptr[4]),@out[4] + vmovdqu (@ptr[5]),@out[5] + vmovdqu (@ptr[6]),@out[6] + vmovdqu (@ptr[7]),@out[7] + vmovdqu @out[0],0x00($offload) # offload inputs + vpxor $zero,@out[0],@out[0] # xor inputs with 0-round + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @out[4],0x40($offload) + vpxor $zero,@out[4],@out[4] + vmovdqu @out[5],0x50($offload) + vpxor $zero,@out[5],@out[5] + vmovdqu @out[6],0x60($offload) + vpxor $zero,@out[6],@out[6] + vmovdqu @out[7],0x70($offload) + vpxor $zero,@out[7],@out[7] + xor \$0x80,$offload + mov \$1,$one # constant of 1 + jmp .Loop_dec8x + +.align 32 +.Loop_dec8x: +___ +for($i=0;$i<8;$i++) { +my $rndkey=($i&1)?$rndkey0:$rndkey1; +$code.=<<___; + vaesdec $rndkey,@out[0],@out[0] + cmp 32+4*$i(%rsp),$one +___ +$code.=<<___ if ($i); + mov 64+8*$i(%rsp),$offset +___ +$code.=<<___; + vaesdec $rndkey,@out[1],@out[1] + vaesdec $rndkey,@out[2],@out[2] + vaesdec $rndkey,@out[3],@out[3] + lea (@ptr[$i],$offset),$offset + cmovge %rsp,@ptr[$i] # cancel input + vaesdec $rndkey,@out[4],@out[4] + cmovg %rsp,$offset # sink output + vaesdec $rndkey,@out[5],@out[5] + sub @ptr[$i],$offset + vaesdec $rndkey,@out[6],@out[6] + vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input + mov $offset,64+8*$i(%rsp) + vaesdec $rndkey,@out[7],@out[7] + vmovups `16*(3+$i)-0x78`($key),$rndkey + lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output +___ +$code.=<<___ if ($i<4); + vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load +___ +} +$code.=<<___; + vmovdqu 32(%rsp),$counters + cmp \$11,$rounds + jb .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xb0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xc0-0x78($key),$rndkey0 + je .Ldec8x_tail + + vaesdec $rndkey1,@out[0],@out[0] + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vaesdec $rndkey1,@out[5],@out[5] + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0xd0-0x78($key),$rndkey1 + + vaesdec $rndkey0,@out[0],@out[0] + vaesdec $rndkey0,@out[1],@out[1] + vaesdec $rndkey0,@out[2],@out[2] + vaesdec $rndkey0,@out[3],@out[3] + vaesdec $rndkey0,@out[4],@out[4] + vaesdec $rndkey0,@out[5],@out[5] + vaesdec $rndkey0,@out[6],@out[6] + vaesdec $rndkey0,@out[7],@out[7] + vmovups 0xe0-0x78($key),$rndkey0 + +.Ldec8x_tail: + vaesdec $rndkey1,@out[0],@out[0] + vpxor $zero,$zero,$zero + vaesdec $rndkey1,@out[1],@out[1] + vaesdec $rndkey1,@out[2],@out[2] + vpcmpgtd $zero,$counters,$zero + vaesdec $rndkey1,@out[3],@out[3] + vaesdec $rndkey1,@out[4],@out[4] + vpaddd $counters,$zero,$zero # decrement counters + vmovdqu 48(%rsp),$counters + vaesdec $rndkey1,@out[5],@out[5] + mov 64(%rsp),$offset # pre-load 1st offset + vaesdec $rndkey1,@out[6],@out[6] + vaesdec $rndkey1,@out[7],@out[7] + vmovups 0x10-0x78($key),$rndkey1 + + vaesdeclast $rndkey0,@out[0],@out[0] + vmovdqa $zero,32(%rsp) # update counters + vpxor $zero,$zero,$zero + vaesdeclast $rndkey0,@out[1],@out[1] + vpxor 0x00($offload),@out[0],@out[0] # xor with IV + vaesdeclast $rndkey0,@out[2],@out[2] + vpxor 0x10($offload),@out[1],@out[1] + vpcmpgtd $zero,$counters,$zero + vaesdeclast $rndkey0,@out[3],@out[3] + vpxor 0x20($offload),@out[2],@out[2] + vaesdeclast $rndkey0,@out[4],@out[4] + vpxor 0x30($offload),@out[3],@out[3] + vpaddd $zero,$counters,$counters # decrement counters + vmovdqu -0x78($key),$zero # 0-round + vaesdeclast $rndkey0,@out[5],@out[5] + vpxor 0x40($offload),@out[4],@out[4] + vaesdeclast $rndkey0,@out[6],@out[6] + vpxor 0x50($offload),@out[5],@out[5] + vmovdqa $counters,48(%rsp) # update counters + vaesdeclast $rndkey0,@out[7],@out[7] + vpxor 0x60($offload),@out[6],@out[6] + vmovups 0x20-0x78($key),$rndkey0 + + vmovups @out[0],-16(@ptr[0]) # write output + sub $offset,@ptr[0] # switch to input + vmovdqu 128+0(%rsp),@out[0] + vpxor 0x70($offload),@out[7],@out[7] + vmovups @out[1],-16(@ptr[1]) + sub `64+1*8`(%rsp),@ptr[1] + vmovdqu @out[0],0x00($offload) + vpxor $zero,@out[0],@out[0] + vmovdqu 128+16(%rsp),@out[1] + vmovups @out[2],-16(@ptr[2]) + sub `64+2*8`(%rsp),@ptr[2] + vmovdqu @out[1],0x10($offload) + vpxor $zero,@out[1],@out[1] + vmovdqu 128+32(%rsp),@out[2] + vmovups @out[3],-16(@ptr[3]) + sub `64+3*8`(%rsp),@ptr[3] + vmovdqu @out[2],0x20($offload) + vpxor $zero,@out[2],@out[2] + vmovdqu 128+48(%rsp),@out[3] + vmovups @out[4],-16(@ptr[4]) + sub `64+4*8`(%rsp),@ptr[4] + vmovdqu @out[3],0x30($offload) + vpxor $zero,@out[3],@out[3] + vmovdqu @inp[0],0x40($offload) + vpxor @inp[0],$zero,@out[4] + vmovups @out[5],-16(@ptr[5]) + sub `64+5*8`(%rsp),@ptr[5] + vmovdqu @inp[1],0x50($offload) + vpxor @inp[1],$zero,@out[5] + vmovups @out[6],-16(@ptr[6]) + sub `64+6*8`(%rsp),@ptr[6] + vmovdqu @inp[2],0x60($offload) + vpxor @inp[2],$zero,@out[6] + vmovups @out[7],-16(@ptr[7]) + sub `64+7*8`(%rsp),@ptr[7] + vmovdqu @inp[3],0x70($offload) + vpxor @inp[3],$zero,@out[7] + + xor \$128,$offload + dec $num + jnz .Loop_dec8x + + mov 16(%rsp),%rax # original %rsp + #mov 24(%rsp),$num + #lea `40*8`($inp),$inp + #dec $num + #jnz .Ldec8x_loop_grande + +.Ldec8x_done: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx +___ + }}} + +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + rex(\@opcode,$4,$3); + push @opcode,0x0f,0x3a,0xdf; + push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M + my $c=$2; + push @opcode,$c=~/^0/?oct($c):$c; + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesimc" => 0xdb, + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd, + "aesdec" => 0xde, "aesdeclast" => 0xdf + ); + return undef if (!defined($opcodelet{$1})); + my $off = $2; + push @opcode,0x44 if ($3>=8); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M + push @opcode,($off=~/^0/?oct($off):$off)&0xff; + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; +close STDOUT; diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl new file mode 100644 index 0000000000..fff96547da --- /dev/null +++ b/crypto/sha/asm/sha1-mb-x86_64.pl @@ -0,0 +1,1026 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer SHA1 procedure processes n buffers in parallel by +# placing buffer data to designated lane of SIMD register. n is +# naturally limited to 4 on pre-AVX2 processors and to 8 on +# AVX2-capable processors such as Haswell. +# +# this +aesni(*) sha1 aesni-sha1 gain +# ------------------------------------------------------------------- +# Westmere(**) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70% +# Atom(**) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62% +# Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% +# Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68% +# Haswell(***) (8.96 +5.00=14.0)/n 3.57 4.55 +160% +# Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64% +# +# (*) multi-block CBC encrypt with 128-bit key; +# (**) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, +# because of lower AES-NI instruction throughput; +# (***) "this" is for n=8, when we gather twice as much data, result +# for n=4 is 7.98+4.44=12.4; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void sha1_multi_block ( +# struct { unsigned int A[8]; +# unsigned int B[8]; +# unsigned int C[8]; +# unsigned int D[8]; +# unsigned int E[8]; } *ctx, +# struct { void *ptr; int blocks; } inp[8], +# int num); /* 1 or 2 */ +# +$ctx="%rdi"; # 1st arg +$inp="%rsi"; # 2nd arg +$num="%edx"; +@ptr=map("%r$_",(8..11)); +$Tbl="%rbp"; + +@V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); +($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); +@Xi=map("%xmm$_",(10..14)); +$K="%xmm15"; + +$REG_SZ=16; + +sub Xi_off { +my $off = shift; + + $off %= 16; $off *= $REG_SZ; + $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; +} + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $k=$i+2; + +$code.=<<___ if ($i==0); + movd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + movd (@ptr[1]),@Xi[2] # borrow @Xi[2] + lea `16*4`(@ptr[1]),@ptr[1] + movd (@ptr[2]),@Xi[3] # borrow @Xi[3] + lea `16*4`(@ptr[2]),@ptr[2] + movd (@ptr[3]),@Xi[4] # borrow @Xi[4] + lea `16*4`(@ptr[3]),@ptr[3] + punpckldq @Xi[3],@Xi[0] + movd `4*$j-16*4`(@ptr[0]),@Xi[1] + punpckldq @Xi[4],@Xi[2] + movd `4*$j-16*4`(@ptr[1]),$t3 + punpckldq @Xi[2],@Xi[0] + movd `4*$j-16*4`(@ptr[2]),$t2 + pshufb $tx,@Xi[0] +___ +$code.=<<___ if ($i<14); # just load input + movd `4*$j-16*4`(@ptr[3]),$t1 + punpckldq $t2,@Xi[1] + movdqa $a,$t2 + paddd $K,$e # e+=K_00_19 + punpckldq $t1,$t3 + movdqa $b,$t1 + movdqa $b,$t0 + pslld \$5,$t2 + pandn $d,$t1 + pand $c,$t0 + punpckldq $t3,@Xi[1] + movdqa $a,$t3 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + movd `4*$k-16*4`(@ptr[0]),@Xi[2] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + movdqa $b,$t1 + + por $t3,$t2 # rol(a,5) + movd `4*$k-16*4`(@ptr[1]),$t3 + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + movd `4*$j-16*4`(@ptr[2]),$t2 + pshufb $tx,@Xi[1] + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==14); # just load input + movd `4*$j-16*4`(@ptr[3]),$t1 + punpckldq $t2,@Xi[1] + movdqa $a,$t2 + paddd $K,$e # e+=K_00_19 + punpckldq $t1,$t3 + movdqa $b,$t1 + movdqa $b,$t0 + pslld \$5,$t2 + pandn $d,$t1 + pand $c,$t0 + punpckldq $t3,@Xi[1] + movdqa $a,$t3 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + movdqa $b,$t1 + + por $t3,$t2 # rol(a,5) + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + pshufb $tx,@Xi[1] + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i>=13 && $i<15); + movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" +___ +$code.=<<___ if ($i>=15); # apply Xupdate + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + pxor `&Xi_off($j+8)`,@Xi[1] + paddd $K,$e # e+=K_00_19 + movdqa $b,$t1 + pslld \$5,$t2 + pxor @Xi[3],@Xi[1] + movdqa $b,$t0 + pandn $d,$t1 + movdqa @Xi[1],$tx + pand $c,$t0 + movdqa $a,$t3 + psrld \$31,$tx + paddd @Xi[1],@Xi[1] + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + pxor $t1,$t0 # Ch(b,c,d) + + movdqa $b,$t1 + por $t3,$t2 # rol(a,5) + pslld \$30,$t1 + paddd $t0,$e # e+=Ch(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol \$1,@Xi[1] + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if ($i<79); + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + movdqa $d,$t0 + pxor `&Xi_off($j+8)`,@Xi[1] + paddd $K,$e # e+=K_20_39 + pslld \$5,$t2 + pxor $b,$t0 + + movdqa $a,$t3 +___ +$code.=<<___ if ($i<72); + movdqa @Xi[0],`&Xi_off($i)` +___ +$code.=<<___ if ($i<79); + paddd @Xi[0],$e # e+=X[i] + pxor @Xi[3],@Xi[1] + psrld \$27,$t3 + pxor $c,$t0 # Parity(b,c,d) + movdqa $b,$t1 + + pslld \$30,$t1 + movdqa @Xi[1],$tx + por $t3,$t2 # rol(a,5) + psrld \$31,$tx + paddd $t0,$e # e+=Parity(b,c,d) + paddd @Xi[1],@Xi[1] + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol(@Xi[1],1) + por $t1,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==79); + movdqa $a,$t2 + paddd $K,$e # e+=K_20_39 + movdqa $d,$t0 + pslld \$5,$t2 + pxor $b,$t0 + + movdqa $a,$t3 + paddd @Xi[0],$e # e+=X[i] + psrld \$27,$t3 + movdqa $b,$t1 + pxor $c,$t0 # Parity(b,c,d) + + pslld \$30,$t1 + por $t3,$t2 # rol(a,5) + paddd $t0,$e # e+=Parity(b,c,d) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___; + pxor @Xi[-2],@Xi[1] # "X[13]" + movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + movdqa $a,$t2 + movdqa $d,$t1 + pxor `&Xi_off($j+8)`,@Xi[1] + pxor @Xi[3],@Xi[1] + paddd $K,$e # e+=K_40_59 + pslld \$5,$t2 + movdqa $a,$t3 + pand $c,$t1 + + movdqa $d,$t0 + movdqa @Xi[1],$tx + psrld \$27,$t3 + paddd $t1,$e + pxor $c,$t0 + + movdqa @Xi[0],`&Xi_off($i)` + paddd @Xi[0],$e # e+=X[i] + por $t3,$t2 # rol(a,5) + psrld \$31,$tx + pand $b,$t0 + movdqa $b,$t1 + + pslld \$30,$t1 + paddd @Xi[1],@Xi[1] + paddd $t0,$e # e+=Maj(b,d,c) + + psrld \$2,$b + paddd $t2,$e # e+=rol(a,5) + por $tx,@Xi[1] # rol(@X[1],1) + por $t1,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl sha1_multi_block +.type sha1_multi_block,\@function,3 +.align 32 +sha1_multi_block: +___ +$code.=<<___ if ($avx); + mov OPENSSL_ia32cap_P+4(%rip),%rcx + test \$`1<<28`,%ecx + jnz _avx_shortcut +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`,%rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K_XX_XX(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + +.Loop_grande: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone + + movdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + movdqu 0x20($ctx),$B + movdqu 0x40($ctx),$C + movdqu 0x60($ctx),$D + movdqu 0x80($ctx),$E + movdqa 0x60($Tbl),$tx # pbswap_mask + jmp .Loop + +.align 32 +.Loop: +___ +$code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19 +for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + movdqa (%rbx),@Xi[0] # pull counters + mov \$1,%ecx + cmp 4*0(%rbx),%ecx # examinte counters + pxor $t2,$t2 + cmovge $Tbl,@ptr[0] # cancel input + cmp 4*1(%rbx),%ecx + movdqa @Xi[0],@Xi[1] + cmovge $Tbl,@ptr[1] + cmp 4*2(%rbx),%ecx + pcmpgtd $t2,@Xi[1] # mask value + cmovge $Tbl,@ptr[2] + cmp 4*3(%rbx),%ecx + paddd @Xi[1],@Xi[0] # counters-- + cmovge $Tbl,@ptr[3] + + movdqu 0x00($ctx),$t0 + pand @Xi[1],$A + movdqu 0x20($ctx),$t1 + pand @Xi[1],$B + paddd $t0,$A + movdqu 0x40($ctx),$t2 + pand @Xi[1],$C + paddd $t1,$B + movdqu 0x60($ctx),$t3 + pand @Xi[1],$D + paddd $t2,$C + movdqu 0x80($ctx),$tx + pand @Xi[1],$E + movdqu $A,0x00($ctx) + paddd $t3,$D + movdqu $B,0x20($ctx) + paddd $tx,$E + movdqu $C,0x40($ctx) + movdqu $D,0x60($ctx) + movdqu $E,0x80($ctx) + + movdqa @Xi[0],(%rbx) # save counters + movdqa 0x60($Tbl),$tx # pbswap_mask + dec $num + jnz .Loop + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande + +.Ldone: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha1_multi_block,.-sha1_multi_block +___ + + if ($avx) {{{ +sub BODY_00_19_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $k=$i+2; +my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; +my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; + +$code.=<<___ if ($i==0 && $REG_SZ==16); + vmovd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] + lea `16*4`(@ptr[1]),@ptr[1] + vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] + lea `16*4`(@ptr[3]),@ptr[3] + vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] + vpunpckldq @Xi[2],@Xi[0],@Xi[0] + vmovd `4*$j-16*4`($ptr_n),$t3 + vpshufb $tx,@Xi[0],@Xi[0] +___ +$code.=<<___ if ($i<15 && $REG_SZ==16); # just load input + vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 +___ +$code.=<<___ if ($i==0 && $REG_SZ==32); + vmovd (@ptr[0]),@Xi[0] + lea `16*4`(@ptr[0]),@ptr[0] + vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] + lea `16*4`(@ptr[4]),@ptr[4] + vmovd (@ptr[1]),$t2 + lea `16*4`(@ptr[1]),@ptr[1] + vmovd (@ptr[5]),$t1 + lea `16*4`(@ptr[5]),@ptr[5] + vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] + lea `16*4`(@ptr[6]),@ptr[6] + vpinsrd \$1,(@ptr[3]),$t2,$t2 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t2,@Xi[0],@Xi[0] + vpinsrd \$1,(@ptr[7]),$t1,$t1 + lea `16*4`(@ptr[7]),@ptr[7] + vpunpckldq $t1,@Xi[2],@Xi[2] + vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] + vinserti128 @Xi[2],@Xi[0],@Xi[0] + vmovd `4*$j-16*4`($ptr_n),$t3 + vpshufb $tx,@Xi[0],@Xi[0] +___ +$code.=<<___ if ($i<15 && $REG_SZ==32); # just load input + vmovd `4*$j-16*4`(@ptr[1]),$t2 + vmovd `4*$j-16*4`(@ptr[5]),$t1 + vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 + vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 + vpunpckldq $t2,@Xi[1],@Xi[1] + vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 + vpunpckldq $t1,$t3,$t3 +___ +$code.=<<___ if ($i<14); + vpaddd $K,$e,$e # e+=K_00_19 + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + $vpack $t3,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $t1,$t0,$t0 # Ch(b,c,d) + vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vmovd `4*$k-16*4`($ptr_n),$t3 + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpshufb $tx,@Xi[1],@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==14); + vpaddd $K,$e,$e # e+=K_00_19 + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + $vpack $t3,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $t1,$t0,$t0 # Ch(b,c,d) + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpshufb $tx,@Xi[1],@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i>=13 && $i<15); + vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" +___ +$code.=<<___ if ($i>=15); # apply Xupdate + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpaddd $K,$e,$e # e+=K_00_19 + vpslld \$5,$a,$t2 + vpandn $d,$b,$t1 + vpand $c,$b,$t0 + + vmovdqa @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $t1,$t0,$t0 # Ch(b,c,d) + vpxor @Xi[3],@Xi[1],@Xi[1] + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Ch(b,c,d) + vpsrld \$31,@Xi[1],$tx + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_20_39_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if ($i<79); + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpslld \$5,$a,$t2 + vpaddd $K,$e,$e # e+=K_20_39 + vpxor $b,$d,$t0 +___ +$code.=<<___ if ($i<72); + vmovdqa @Xi[0],`&Xi_off($i)` +___ +$code.=<<___ if ($i<79); + vpaddd @Xi[0],$e,$e # e+=X[i] + vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + vpsrld \$27,$a,$t3 + vpxor $c,$t0,$t0 # Parity(b,c,d) + vpxor @Xi[3],@Xi[1],@Xi[1] + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Parity(b,c,d) + vpsrld \$31,@Xi[1],$tx + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) + vpor $t1,$b,$b # b=rol(b,30) +___ +$code.=<<___ if ($i==79); + vpslld \$5,$a,$t2 + vpaddd $K,$e,$e # e+=K_20_39 + vpxor $b,$d,$t0 + + vpsrld \$27,$a,$t3 + vpaddd @Xi[0],$e,$e # e+=X[i] + vpxor $c,$t0,$t0 # Parity(b,c,d) + + vpslld \$30,$b,$t1 + vpor $t3,$t2,$t2 # rol(a,5) + vpaddd $t0,$e,$e # e+=Parity(b,c,d) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +sub BODY_40_59_avx { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___; + vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" + vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" + + vpaddd $K,$e,$e # e+=K_40_59 + vpslld \$5,$a,$t2 + vpand $c,$d,$t1 + vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] + + vpaddd $t1,$e,$e + vpsrld \$27,$a,$t3 + vpxor $c,$d,$t0 + vpxor @Xi[3],@Xi[1],@Xi[1] + + vmovdqu @Xi[0],`&Xi_off($i)` + vpaddd @Xi[0],$e,$e # e+=X[i] + vpor $t3,$t2,$t2 # rol(a,5) + vpsrld \$31,@Xi[1],$tx + vpand $b,$t0,$t0 + vpaddd @Xi[1],@Xi[1],@Xi[1] + + vpslld \$30,$b,$t1 + vpaddd $t0,$e,$e # e+=Maj(b,d,c) + + vpsrld \$2,$b,$b + vpaddd $t2,$e,$e # e+=rol(a,5) + vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) + vpor $t1,$b,$b # b=rol(b,30) +___ +push(@Xi,shift(@Xi)); +} + +$code.=<<___; +.type sha1_multi_block_avx,\@function,3 +.align 32 +sha1_multi_block_avx: +_avx_shortcut: +___ +$code.=<<___ if ($avx>1); + shr \$32,%rcx + cmp \$2,$num + jb .Lavx + test \$`1<<5`,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K_XX_XX(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + + vzeroupper +.Loop_grande_avx: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_avx + + vmovdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20($ctx),$B + vmovdqu 0x40($ctx),$C + vmovdqu 0x60($ctx),$D + vmovdqu 0x80($ctx),$E + vmovdqu 0x60($Tbl),$tx # pbswap_mask + jmp .Loop_avx + +.align 32 +.Loop_avx: +___ +$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 +for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov \$1,%ecx +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu (%rbx),$t0 # pull counters + vpxor $t2,$t2,$t2 + vmovdqa $t0,$t1 + vpcmpgtd $t2,$t1,$t1 # mask value + vpaddd $t1,$t0,$t0 # counters-- + + vpand $t1,$A,$A + vpand $t1,$B,$B + vpaddd 0x00($ctx),$A,$A + vpand $t1,$C,$C + vpaddd 0x20($ctx),$B,$B + vpand $t1,$D,$D + vpaddd 0x40($ctx),$C,$C + vpand $t1,$E,$E + vpaddd 0x60($ctx),$D,$D + vpaddd 0x80($ctx),$E,$E + vmovdqu $A,0x00($ctx) + vmovdqu $B,0x20($ctx) + vmovdqu $C,0x40($ctx) + vmovdqu $D,0x60($ctx) + vmovdqu $E,0x80($ctx) + + vmovdqu $t0,(%rbx) # save counters + vmovdqu 0x60($Tbl),$tx # pbswap_mask + dec $num + jnz .Loop_avx + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande_avx + +.Ldone_avx: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha1_multi_block_avx,.-sha1_multi_block_avx +___ + + if ($avx>1) { +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$REG_SZ=32; + +@ptr=map("%r$_",(12..15,8..11)); + +@V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); +($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); +@Xi=map("%ymm$_",(10..14)); +$K="%ymm15"; + +$code.=<<___; +.type sha1_multi_block_avx2,\@function,3 +.align 32 +sha1_multi_block_avx2: +_avx2_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K_XX_XX(%rip),$Tbl + shr \$1,$num + + vzeroupper +.Loop_grande_avx2: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu 0x00($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20($ctx),$B + lea 256+128(%rsp),%rbx + vmovdqu 0x40($ctx),$C + vmovdqu 0x60($ctx),$D + vmovdqu 0x80($ctx),$E + vmovdqu 0x60($Tbl),$tx # pbswap_mask + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: +___ +$code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 +for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 +for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 +for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } +$code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 +for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov \$1,%ecx + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu (%rbx),$t0 # pull counters + vpxor $t2,$t2,$t2 + vmovdqa $t0,$t1 + vpcmpgtd $t2,$t1,$t1 # mask value + vpaddd $t1,$t0,$t0 # counters-- + + vpand $t1,$A,$A + vpand $t1,$B,$B + vpaddd 0x00($ctx),$A,$A + vpand $t1,$C,$C + vpaddd 0x20($ctx),$B,$B + vpand $t1,$D,$D + vpaddd 0x40($ctx),$C,$C + vpand $t1,$E,$E + vpaddd 0x60($ctx),$D,$D + vpaddd 0x80($ctx),$E,$E + vmovdqu $A,0x00($ctx) + vmovdqu $B,0x20($ctx) + vmovdqu $C,0x40($ctx) + vmovdqu $D,0x60($ctx) + vmovdqu $E,0x80($ctx) + + vmovdqu $t0,(%rbx) # save counters + lea 256+128(%rsp),%rbx + vmovdqu 0x60($Tbl),$tx # pbswap_mask + dec $num + jnz .Loop_avx2 + + #mov `$REG_SZ*17+8`(%rsp),$num + #lea $REG_SZ($ctx),$ctx + #lea `16*$REG_SZ/4`($inp),$inp + #dec $num + #jnz .Loop_grande_avx2 + +.Ldone_avx2: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 +___ + } }}} +$code.=<<___; + +.align 256 + .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 + .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +K_XX_XX: + .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 + .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 + .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 + .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 + .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 + .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or + s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or + s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; + print $_,"\n"; +} + +close STDOUT; diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl new file mode 100644 index 0000000000..5bed43b60c --- /dev/null +++ b/crypto/sha/asm/sha256-mb-x86_64.pl @@ -0,0 +1,934 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# Multi-buffer SHA256 procedure processes n buffers in parallel by +# placing buffer data to designated lane of SIMD register. n is +# naturally limited to 4 on pre-AVX2 processors and to 8 on +# AVX2-capable processors such as Haswell. +# +# this +aesni(*) sha256 aesni-sha256 gain +# ------------------------------------------------------------------- +# Westmere(**) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% +# Atom(**) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% +# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% +# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% +# Haswell(***) (21.0 +5.00=26.0)/n 7.80 8.79 +170% +# Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% +# +# (*) multi-block CBC encrypt with 128-bit key; +# (**) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, +# because of lower AES-NI instruction throughput, nor is there +# AES-NI-SHA256 stitch for these processors; +# (***) "this" is for n=8, when we gather twice as much data, result +# for n=4 is 20.3+4.44=24.7; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=0; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; + +# void sha256_multi_block ( +# struct { unsigned int A[8]; +# unsigned int B[8]; +# unsigned int C[8]; +# unsigned int D[8]; +# unsigned int E[8]; +# unsigned int F[8]; +# unsigned int G[8]; +# unsigned int H[8]; } *ctx, +# struct { void *ptr; int blocks; } inp[8], +# int num); /* 1 or 2 */ +# +$ctx="%rdi"; # 1st arg +$inp="%rsi"; # 2nd arg +$num="%edx"; # 3rd arg +@ptr=map("%r$_",(8..11)); +$Tbl="%rbp"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); +($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); + +$REG_SZ=16; + +sub Xi_off { +my $off = shift; + + $off %= 16; $off *= $REG_SZ; + $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; +} + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + +$code.=<<___ if ($i<15); + movd `4*$i`(@ptr[0]),$Xi + movd `4*$i`(@ptr[1]),$t1 + movd `4*$i`(@ptr[2]),$t2 + movd `4*$i`(@ptr[3]),$t3 + punpckldq $t2,$Xi + punpckldq $t3,$t1 + punpckldq $t1,$Xi + pshufb $Xn,$Xi +___ +$code.=<<___ if ($i==15); + movd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + movd `4*$i`(@ptr[1]),$t1 + lea `16*4`(@ptr[1]),@ptr[1] + movd `4*$i`(@ptr[2]),$t2 + lea `16*4`(@ptr[2]),@ptr[2] + movd `4*$i`(@ptr[3]),$t3 + lea `16*4`(@ptr[3]),@ptr[3] + punpckldq $t2,$Xi + punpckldq $t3,$t1 + punpckldq $t1,$Xi + pshufb $Xn,$Xi +___ +$code.=<<___; + movdqa $e,$sigma + movdqa $e,$t3 + psrld \$6,$sigma + movdqa $e,$t2 + pslld \$7,$t3 + movdqa $Xi,`&Xi_off($i)` + paddd $h,$Xi # Xi+=h + + psrld \$11,$t2 + pxor $t3,$sigma + pslld \$21-7,$t3 + paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] + pxor $t2,$sigma + + psrld \$25-11,$t2 + movdqa $e,$t1 + pxor $t3,$sigma + movdqa $e,$axb # borrow $axb + pslld \$26-21,$t3 + pandn $g,$t1 + pand $f,$axb + pxor $t2,$sigma + + movdqa $a,$t2 + pxor $t3,$sigma # Sigma1(e) + movdqa $a,$t3 + psrld \$2,$t2 + paddd $sigma,$Xi # Xi+=Sigma1(e) + pxor $axb,$t1 # Ch(e,f,g) + movdqa $b,$axb + movdqa $a,$sigma + pslld \$10,$t3 + pxor $a,$axb # a^b, b^c in next round + + psrld \$13,$sigma + pxor $t3,$t2 + paddd $t1,$Xi # Xi+=Ch(e,f,g) + pslld \$19-10,$t3 + pand $axb,$bxc + pxor $sigma,$t2 + + psrld \$22-13,$sigma + pxor $t3,$t2 + movdqa $b,$h + pslld \$30-19,$t3 + pxor $t2,$sigma + pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + paddd $Xi,$d # d+=Xi + pxor $t3,$sigma # Sigma0(a) + + paddd $Xi,$h # h+=Xi + paddd $sigma,$h # h+=Sigma0(a) +___ +$code.=<<___ if (($i%8)==7); + lea `32*8`($Tbl),$Tbl +___ + ($axb,$bxc)=($bxc,$axb); +} + +sub ROUND_16_XX { +my $i=shift; + +$code.=<<___; + movdqa `&Xi_off($i+1)`,$Xn + paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] + + movdqa $Xn,$sigma + movdqa $Xn,$t2 + psrld \$3,$sigma + movdqa $Xn,$t3 + + psrld \$7,$t2 + movdqa `&Xi_off($i+14)`,$t1 + pslld \$14,$t3 + pxor $t2,$sigma + psrld \$18-7,$t2 + movdqa $t1,$axb # borrow $axb + pxor $t3,$sigma + pslld \$25-14,$t3 + pxor $t2,$sigma + psrld \$10,$t1 + movdqa $axb,$t2 + + psrld \$17,$axb + pxor $t3,$sigma # sigma0(X[i+1]) + pslld \$13,$t2 + paddd $sigma,$Xi # Xi+=sigma0(e) + pxor $axb,$t1 + psrld \$19-17,$axb + pxor $t2,$t1 + pslld \$15-13,$t2 + pxor $axb,$t1 + pxor $t2,$t1 # sigma0(X[i+14]) + paddd $t1,$Xi # Xi+=sigma1(X[i+14]) +___ + &ROUND_00_15($i,@_); + ($Xi,$Xn)=($Xn,$Xi); +} + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl sha256_multi_block +.type sha256_multi_block,\@function,3 +.align 32 +sha256_multi_block: +___ +$code.=<<___ if ($avx); + mov OPENSSL_ia32cap_P+4(%rip),%rcx + test \$`1<<28`,%ecx + jnz _avx_shortcut +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K256+128(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone + + movdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + movdqu 0x20-0x80($ctx),$B + movdqu 0x40-0x80($ctx),$C + movdqu 0x60-0x80($ctx),$D + movdqu 0x80-0x80($ctx),$E + movdqu 0xa0-0x80($ctx),$F + movdqu 0xc0-0x80($ctx),$G + movdqu 0xe0-0x80($ctx),$H + movdqu .Lpbswap(%rip),$Xn + jmp .Loop + +.align 32 +.Loop: + movdqa $C,$bxc + pxor $B,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + movdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx +.align 32 +.Loop_16_xx: +___ +for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx + + mov \$1,%ecx + lea K256+128(%rip),$Tbl + + movdqa (%rbx),$sigma # pull counters + cmp 4*0(%rbx),%ecx # examine counters + pxor $t1,$t1 + cmovge $Tbl,@ptr[0] # cancel input + cmp 4*1(%rbx),%ecx + movdqa $sigma,$Xn + cmovge $Tbl,@ptr[1] + cmp 4*2(%rbx),%ecx + pcmpgtd $t1,$Xn # mask value + cmovge $Tbl,@ptr[2] + cmp 4*3(%rbx),%ecx + paddd $Xn,$sigma # counters-- + cmovge $Tbl,@ptr[3] + + movdqu 0x00-0x80($ctx),$t1 + pand $Xn,$A + movdqu 0x20-0x80($ctx),$t2 + pand $Xn,$B + movdqu 0x40-0x80($ctx),$t3 + pand $Xn,$C + movdqu 0x60-0x80($ctx),$Xi + pand $Xn,$D + paddd $t1,$A + movdqu 0x80-0x80($ctx),$t1 + pand $Xn,$E + paddd $t2,$B + movdqu 0xa0-0x80($ctx),$t2 + pand $Xn,$F + paddd $t3,$C + movdqu 0xc0-0x80($ctx),$t3 + pand $Xn,$G + paddd $Xi,$D + movdqu 0xe0-0x80($ctx),$Xi + pand $Xn,$H + paddd $t1,$E + paddd $t2,$F + movdqu $A,0x00-0x80($ctx) + paddd $t3,$G + movdqu $B,0x20-0x80($ctx) + paddd $Xi,$H + movdqu $C,0x40-0x80($ctx) + movdqu $D,0x60-0x80($ctx) + movdqu $E,0x80-0x80($ctx) + movdqu $F,0xa0-0x80($ctx) + movdqu $G,0xc0-0x80($ctx) + movdqu $H,0xe0-0x80($ctx) + + movdqa $sigma,(%rbx) # save counters + movdqa .Lpbswap(%rip),$Xn + dec $num + jnz .Loop + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande + +.Ldone: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha256_multi_block,.-sha256_multi_block +___ + if ($avx) {{{ +sub ROUND_00_15_avx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; + +$code.=<<___ if ($i<15 && $REG_SZ==16); + vmovd `4*$i`(@ptr[0]),$Xi + vmovd `4*$i`(@ptr[1]),$t1 + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 + vpunpckldq $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i==15 && $REG_SZ==16); + vmovd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + vmovd `4*$i`(@ptr[1]),$t1 + lea `16*4`(@ptr[1]),@ptr[1] + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i<15 && $REG_SZ==32); + vmovd `4*$i`(@ptr[0]),$Xi + vmovd `4*$i`(@ptr[4]),$t1 + vmovd `4*$i`(@ptr[1]),$t2 + vmovd `4*$i`(@ptr[5]),$t3 + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 + vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 + vpunpckldq $t2,$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 + vpunpckldq $t3,$t1,$t1 + vinserti128 $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___ if ($i==15 && $REG_SZ==32); + vmovd `4*$i`(@ptr[0]),$Xi + lea `16*4`(@ptr[0]),@ptr[0] + vmovd `4*$i`(@ptr[4]),$t1 + lea `16*4`(@ptr[4]),@ptr[4] + vmovd `4*$i`(@ptr[1]),$t2 + lea `16*4`(@ptr[1]),@ptr[1] + vmovd `4*$i`(@ptr[5]),$t3 + lea `16*4`(@ptr[5]),@ptr[5] + vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi + lea `16*4`(@ptr[2]),@ptr[2] + vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 + lea `16*4`(@ptr[6]),@ptr[6] + vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 + lea `16*4`(@ptr[3]),@ptr[3] + vpunpckldq $t2,$Xi,$Xi + vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 + lea `16*4`(@ptr[7]),@ptr[7] + vpunpckldq $t3,$t1,$t1 + vinserti128 $t1,$Xi,$Xi + vpshufb $Xn,$Xi,$Xi +___ +$code.=<<___; + vpsrld \$6,$e,$sigma + vpslld \$26,$e,$t3 + vmovdqu $Xi,`&Xi_off($i)` + vpaddd $h,$Xi,$Xi # Xi+=h + + vpsrld \$11,$e,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$21,$e,$t3 + vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] + vpxor $t2,$sigma,$sigma + + vpsrld \$25,$e,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$7,$e,$t3 + vpandn $g,$e,$t1 + vpand $f,$e,$axb # borrow $axb + vpxor $t2,$sigma,$sigma + + vpsrld \$2,$a,$h # borrow $h + vpxor $t3,$sigma,$sigma # Sigma1(e) + vpslld \$30,$a,$t2 + vpxor $axb,$t1,$t1 # Ch(e,f,g) + vpxor $a,$b,$axb # a^b, b^c in next round + vpxor $t2,$h,$h + vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) + + vpsrld \$13,$a,$t2 + vpslld \$19,$a,$t3 + vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) + vpand $axb,$bxc,$bxc + vpxor $t2,$h,$sigma + + vpsrld \$22,$a,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$10,$a,$t3 + vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + vpaddd $Xi,$d,$d # d+=Xi + vpxor $t2,$sigma,$sigma + vpxor $t3,$sigma,$sigma # Sigma0(a) + + vpaddd $Xi,$h,$h # h+=Xi + vpaddd $sigma,$h,$h # h+=Sigma0(a) +___ +$code.=<<___ if (($i%8)==7); + add \$`32*8`,$Tbl +___ + ($axb,$bxc)=($bxc,$axb); +} + +sub ROUND_16_XX_avx { +my $i=shift; + +$code.=<<___; + vmovdqu `&Xi_off($i+1)`,$Xn + vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] + + vpsrld \$3,$Xn,$sigma + vpsrld \$7,$Xn,$t2 + vpslld \$25,$Xn,$t3 + vpxor $t2,$sigma,$sigma + vpsrld \$18,$Xn,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$14,$Xn,$t3 + vmovdqu `&Xi_off($i+14)`,$t1 + vpsrld \$10,$t1,$axb # borrow $axb + + vpxor $t2,$sigma,$sigma + vpsrld \$17,$t1,$t2 + vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) + vpslld \$15,$t1,$t3 + vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e) + vpxor $t2,$axb,$sigma + vpsrld \$19,$t1,$t2 + vpxor $t3,$sigma,$sigma + vpslld \$13,$t1,$t3 + vpxor $t2,$sigma,$sigma + vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) + vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) +___ + &ROUND_00_15_avx($i,@_); + ($Xi,$Xn)=($Xn,$Xi); +} + +$code.=<<___; +.type sha256_multi_block_avx,\@function,3 +.align 32 +sha256_multi_block_avx: +_avx_shortcut: +___ +$code.=<<___ if ($avx>1); + shr \$32,%rcx + cmp \$2,$num + jb .Lavx + test \$`1<<5`,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: +___ +$code.=<<___; + mov %rsp,%rax + push %rbx + push %rbp +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,-0x78(%rax) + movaps %xmm11,-0x68(%rax) + movaps %xmm12,-0x58(%rax) + movaps %xmm13,-0x48(%rax) + movaps %xmm14,-0x38(%rax) + movaps %xmm15,-0x28(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K256+128(%rip),$Tbl + lea `$REG_SZ*16`(%rsp),%rbx + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande_avx: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + test $num,$num + jz .Ldone_avx + + vmovdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20-0x80($ctx),$B + vmovdqu 0x40-0x80($ctx),$C + vmovdqu 0x60-0x80($ctx),$D + vmovdqu 0x80-0x80($ctx),$E + vmovdqu 0xa0-0x80($ctx),$F + vmovdqu 0xc0-0x80($ctx),$G + vmovdqu 0xe0-0x80($ctx),$H + vmovdqu .Lpbswap(%rip),$Xn + jmp .Loop_avx + +.align 32 +.Loop_avx: + vpxor $B,$C,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + vmovdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx_avx +.align 32 +.Loop_16_xx_avx: +___ +for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx_avx + + mov \$1,%ecx + lea K256+128(%rip),$Tbl +___ +for($i=0;$i<4;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqa (%rbx),$sigma # pull counters + vpxor $t1,$t1,$t1 + vmovdqa $sigma,$Xn + vpcmpgtd $t1,$Xn,$Xn # mask value + vpaddd $Xn,$sigma,$sigma # counters-- + + vmovdqu 0x00-0x80($ctx),$t1 + vpand $Xn,$A,$A + vmovdqu 0x20-0x80($ctx),$t2 + vpand $Xn,$B,$B + vmovdqu 0x40-0x80($ctx),$t3 + vpand $Xn,$C,$C + vmovdqu 0x60-0x80($ctx),$Xi + vpand $Xn,$D,$D + vpaddd $t1,$A,$A + vmovdqu 0x80-0x80($ctx),$t1 + vpand $Xn,$E,$E + vpaddd $t2,$B,$B + vmovdqu 0xa0-0x80($ctx),$t2 + vpand $Xn,$F,$F + vpaddd $t3,$C,$C + vmovdqu 0xc0-0x80($ctx),$t3 + vpand $Xn,$G,$G + vpaddd $Xi,$D,$D + vmovdqu 0xe0-0x80($ctx),$Xi + vpand $Xn,$H,$H + vpaddd $t1,$E,$E + vpaddd $t2,$F,$F + vmovdqu $A,0x00-0x80($ctx) + vpaddd $t3,$G,$G + vmovdqu $B,0x20-0x80($ctx) + vpaddd $Xi,$H,$H + vmovdqu $C,0x40-0x80($ctx) + vmovdqu $D,0x60-0x80($ctx) + vmovdqu $E,0x80-0x80($ctx) + vmovdqu $F,0xa0-0x80($ctx) + vmovdqu $G,0xc0-0x80($ctx) + vmovdqu $H,0xe0-0x80($ctx) + + vmovdqu $sigma,(%rbx) # save counters + vmovdqu .Lpbswap(%rip),$Xn + dec $num + jnz .Loop_avx + + mov `$REG_SZ*17+8`(%rsp),$num + lea $REG_SZ($ctx),$ctx + lea `16*$REG_SZ/4`($inp),$inp + dec $num + jnz .Loop_grande_avx + +.Ldone_avx: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xb8(%rax),%xmm6 + movaps -0xa8(%rax),%xmm7 + movaps -0x98(%rax),%xmm8 + movaps -0x88(%rax),%xmm9 + movaps -0x78(%rax),%xmm10 + movaps -0x68(%rax),%xmm11 + movaps -0x58(%rax),%xmm12 + movaps -0x48(%rax),%xmm13 + movaps -0x38(%rax),%xmm14 + movaps -0x28(%rax),%xmm15 +___ +$code.=<<___; + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha256_multi_block_avx,.-sha256_multi_block_avx +___ + if ($avx>1) { +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +$REG_SZ=32; +@ptr=map("%r$_",(12..15,8..11)); + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); +($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); + +$code.=<<___; +.type sha256_multi_block_avx2,\@function,3 +.align 32 +sha256_multi_block_avx2: +_avx2_shortcut: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0xa8(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) + movaps %xmm10,0x40(%rsp) + movaps %xmm11,0x50(%rsp) + movaps %xmm12,-0x78(%rax) + movaps %xmm13,-0x68(%rax) + movaps %xmm14,-0x58(%rax) + movaps %xmm15,-0x48(%rax) +___ +$code.=<<___; + sub \$`$REG_SZ*18`, %rsp + and \$-256,%rsp + mov %rax,`$REG_SZ*17`(%rsp) # original %rsp + lea K256+128(%rip),$Tbl + lea 0x80($ctx),$ctx # size optimization + +.Loop_grande_avx2: + mov $num,`$REG_SZ*17+8`(%rsp) # original $num + xor $num,$num + lea `$REG_SZ*16`(%rsp),%rbx +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + mov `16*$i+0`($inp),@ptr[$i] # input pointer + mov `16*$i+8`($inp),%ecx # number of blocks + cmp $num,%ecx + cmovg %ecx,$num # find maximum + test %ecx,%ecx + mov %ecx,`4*$i`(%rbx) # initialize counters + cmovle $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqu 0x00-0x80($ctx),$A # load context + lea 128(%rsp),%rax + vmovdqu 0x20-0x80($ctx),$B + lea 256+128(%rsp),%rbx + vmovdqu 0x40-0x80($ctx),$C + vmovdqu 0x60-0x80($ctx),$D + vmovdqu 0x80-0x80($ctx),$E + vmovdqu 0xa0-0x80($ctx),$F + vmovdqu 0xc0-0x80($ctx),$G + vmovdqu 0xe0-0x80($ctx),$H + vmovdqu .Lpbswap(%rip),$Xn + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vpxor $B,$C,$bxc # magic seed +___ +for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + vmovdqu `&Xi_off($i)`,$Xi + mov \$3,%ecx + jmp .Loop_16_xx_avx2 +.align 32 +.Loop_16_xx_avx2: +___ +for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + dec %ecx + jnz .Loop_16_xx_avx2 + + mov \$1,%ecx + lea `$REG_SZ*16`(%rsp),%rbx + lea K256+128(%rip),$Tbl +___ +for($i=0;$i<8;$i++) { + $code.=<<___; + cmp `4*$i`(%rbx),%ecx # examine counters + cmovge $Tbl,@ptr[$i] # cancel input +___ +} +$code.=<<___; + vmovdqa (%rbx),$sigma # pull counters + vpxor $t1,$t1,$t1 + vmovdqa $sigma,$Xn + vpcmpgtd $t1,$Xn,$Xn # mask value + vpaddd $Xn,$sigma,$sigma # counters-- + + vmovdqu 0x00-0x80($ctx),$t1 + vpand $Xn,$A,$A + vmovdqu 0x20-0x80($ctx),$t2 + vpand $Xn,$B,$B + vmovdqu 0x40-0x80($ctx),$t3 + vpand $Xn,$C,$C + vmovdqu 0x60-0x80($ctx),$Xi + vpand $Xn,$D,$D + vpaddd $t1,$A,$A + vmovdqu 0x80-0x80($ctx),$t1 + vpand $Xn,$E,$E + vpaddd $t2,$B,$B + vmovdqu 0xa0-0x80($ctx),$t2 + vpand $Xn,$F,$F + vpaddd $t3,$C,$C + vmovdqu 0xc0-0x80($ctx),$t3 + vpand $Xn,$G,$G + vpaddd $Xi,$D,$D + vmovdqu 0xe0-0x80($ctx),$Xi + vpand $Xn,$H,$H + vpaddd $t1,$E,$E + vpaddd $t2,$F,$F + vmovdqu $A,0x00-0x80($ctx) + vpaddd $t3,$G,$G + vmovdqu $B,0x20-0x80($ctx) + vpaddd $Xi,$H,$H + vmovdqu $C,0x40-0x80($ctx) + vmovdqu $D,0x60-0x80($ctx) + vmovdqu $E,0x80-0x80($ctx) + vmovdqu $F,0xa0-0x80($ctx) + vmovdqu $G,0xc0-0x80($ctx) + vmovdqu $H,0xe0-0x80($ctx) + + vmovdqu $sigma,(%rbx) # save counters + lea 256+128(%rsp),%rbx + vmovdqu .Lpbswap(%rip),$Xn + dec $num + jnz .Loop_avx2 + + #mov `$REG_SZ*17+8`(%rsp),$num + #lea $REG_SZ($ctx),$ctx + #lea `16*$REG_SZ/4`($inp),$inp + #dec $num + #jnz .Loop_grande_avx2 + +.Ldone_avx2: + mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xd8(%rax),%xmm6 + movaps -0xc8(%rax),%xmm7 + movaps -0xb8(%rax),%xmm8 + movaps -0xa8(%rax),%xmm9 + movaps -0x98(%rax),%xmm10 + movaps -0x88(%rax),%xmm11 + movaps -0x78(%rax),%xmm12 + movaps -0x68(%rax),%xmm13 + movaps -0x58(%rax),%xmm14 + movaps -0x48(%rax),%xmm15 +___ +$code.=<<___; + mov -48(%rax),%r15 + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp + ret +.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 +___ + } }}} +$code.=<<___; +.align 256 +K256: +___ +sub TABLE { + foreach (@_) { + $code.=<<___; + .long $_,$_,$_,$_ + .long $_,$_,$_,$_ +___ + } +} +&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, + 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, + 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, + 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, + 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, + 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, + 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, + 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, + 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, + 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, + 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, + 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, + 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, + 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, + 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, + 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); +$code.=<<___; +.Lpbswap: + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or + s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or + s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or + s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; + print $_,"\n"; +} + +close STDOUT; -- 2.25.1