X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fx86_64-mont5.pl;h=5e70547704f87f1077cf5f1eaf540f1dda7f5cce;hb=9bb3e5fd87905e3e9f5f7edcc2e22d98360510ab;hp=265ee7d13a5e259b99656b193b5858758c9ccdac;hpb=ec9cc70f72454b8d4a84247c86159613cee83b81;p=oweals%2Fopenssl.git diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 265ee7d13a..5e70547704 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -24,9 +31,10 @@ # the np argument is not just modulus value, but one interleaved # with 0. This is to optimize post-condition... -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -35,7 +43,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour $output"; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -50,7 +59,12 @@ if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $addx = ($1>=11); + $addx = ($1>=12); +} + +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { + my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 + $addx = ($ver>=3.03); } # int bn_mul_mont_gather5( @@ -81,6 +95,10 @@ $code=<<___; .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: +.cfi_startproc + mov ${num}d,${num}d + mov %rsp,%rax +.cfi_def_cfa_register %rax test \$7,${num}d jnz .Lmul_enter ___ @@ -92,60 +110,143 @@ $code.=<<___; .align 16 .Lmul_enter: - mov ${num}d,${num}d - mov %rsp,%rax - mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - lea 2($num),%r11 - neg %r11 - lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) - and \$-1024,%rsp # minimize TLB usage +.cfi_push %r15 + neg $num + mov %rsp,%r11 + lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) + neg $num # restore $num + and \$-1024,%r10 # minimize TLB usage + + # An OS-agnostic version of __chkstk. + # + # Some OSes (Windows) insist on stack being "wired" to + # physical memory in strictly sequential manner, i.e. if stack + # allocation spans two pages, then reference to farmost one can + # be punishable by SEGV. But page walking can do good even on + # other OSes, because it guarantees that villain thread hits + # the guard page before it can make damage to innocent one... + sub %r10,%r11 + and \$-4096,%r11 + lea (%r10,%r11),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.Lmul_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r11 + cmp %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + lea .Linc(%rip),%r10 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: - mov $bp,%r12 # reassign $bp + + lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 + lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) + and \$-16,%r10 + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + .byte 0x67 + movdqa %xmm4,%xmm3 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($k+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($k+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($k+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($k+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($k+0)+112`(%r10) + + paddd %xmm2,%xmm3 + .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($k+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($k+2)+112`(%r10) + pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($k+1)-128`($bp),%xmm1 + pand `16*($k+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($k+3)+112`(%r10) + pand `16*($k+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($k=0;$k<$STRIDE/16-4;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm4 + movdqa `16*($k+1)-128`($bp),%xmm5 + movdqa `16*($k+2)-128`($bp),%xmm2 + pand `16*($k+0)+112`(%r10),%xmm4 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($k+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($k+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value @@ -154,29 +255,14 @@ $code.=<<___; xor $i,$i # i=0 xor $j,$j # j=0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -207,16 +293,14 @@ $code.=<<___; mulq $m1 # np[j]*m1 cmp $num,$j - jne .L1st - - movq %xmm0,$m0 # bp[1] + jne .L1st # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 mov $lo0,$hi0 @@ -230,33 +314,48 @@ $code.=<<___; jmp .Louter .align 16 .Louter: + lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) + and \$-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($k=0;$k<$STRIDE/16;$k+=4) { +$code.=<<___; + movdqa `16*($k+0)-128`($bp),%xmm0 + movdqa `16*($k+1)-128`($bp),%xmm1 + movdqa `16*($k+2)-128`($bp),%xmm2 + movdqa `16*($k+3)-128`($bp),%xmm3 + pand `16*($k+0)-128`(%rdx),%xmm0 + pand `16*($k+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($k+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($k+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + + mov ($ap),%rax # ap[0] + movq %xmm0,$m0 # m0=bp[i] + xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 - por %xmm2,%xmm0 - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax @@ -290,17 +389,14 @@ $code.=<<___; mulq $m1 # np[j]*m1 cmp $num,$j - jne .Linner - - movq %xmm0,$m0 # bp[i+1] - + jne .Linner # note that upon exit $j==$num, so + # they can be used interchangeably add %rax,$hi1 - mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] - mov (%rsp,$j,8),$lo0 + mov (%rsp,$num,8),$lo0 adc \$0,%rdx - mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 xor %rdx,%rdx @@ -325,43 +421,48 @@ $code.=<<___; mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - dec $j # doesnn't affect CF! + dec $j # doesn't affect CF! jnz .Lsub sbb \$0,%rax # handle upmost overflow bit + mov \$-1,%rbx + xor %rax,%rbx xor $i,$i - and %rax,$ap - not %rax - mov $rp,$np - and %rax,$np mov $num,$j # j=num - or $np,$ap # ap=borrow?tp:rp -.align 16 -.Lcopy: # copy or in-place refresh - mov ($ap,$i,8),%rax + +.Lcopy: # conditional copy + mov ($rp,$i,8),%rcx + mov (%rsp,$i,8),%rdx + and %rbx,%rcx + and %rax,%rdx mov $i,(%rsp,$i,8) # zap temporary vector - mov %rax,($rp,$i,8) # rp[i]=tp[i] + or %rcx,%rdx + mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: ret +.cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 ___ {{{ @@ -371,93 +472,121 @@ $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: +.cfi_startproc + .byte 0x67 + mov %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; - .byte 0x67 - mov %rsp,%rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; +.cfi_push %r15 +.Lmul4x_prologue: + .byte 0x67 - mov ${num}d,%r10d - shl \$3,${num}d - shl \$3+2,%r10d # 4*$num + shl \$3,${num}d # convert $num to bytes + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt - sub %r11,%rsp # align with $ap - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + sub %r11,%rbp # align with $rp + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - lea 4096-64(,$num,2),%r10 - lea -64(%rsp,$num,2),%rsp # alloca(128+num*8) + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lmul4xsp_done: - and \$-64,%rsp + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + neg $num mov %rax,40(%rsp) +.cfi_cfa_expression %rsp+40,deref,+8 .Lmul4x_body: call mul4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret +.cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: - shl \$5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument - lea 256(%rdx,$num),%r13 +.cfi_startproc + shl \$5,$num # $num was in bytes + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index + lea .Linc(%rip),%rax + lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; @@ -465,44 +594,92 @@ ___ $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bp),%xmm0 - lea $STRIDE($bp),$tp # borrow $tp - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - pand %xmm6,%xmm2 - .byte 0x67 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tp),%xmm1 - .byte 0x67 - pand %xmm7,%xmm3 - .byte 0x67 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tp),%xmm2 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128(%rdx),$bp # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67,0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 - pand %xmm4,%xmm1 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 - por %xmm3,%xmm0 - movq `2*$STRIDE/4-96`($tp),%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register + + pand `16*($i+1)-128`($bp),%xmm1 + pand `16*($i+2)-128`($bp),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bp),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm4 + movdqa `16*($i+1)-128`($bp),%xmm5 + movdqa `16*($i+2)-128`($bp),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + por %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] - movq `3*$STRIDE/4-96`($tp),%xmm0 + mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp @@ -516,26 +693,10 @@ $code.=<<___; mov %rax,$A[0] mov ($np),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq $A[0],$m1 # "tp[0]"*n0 - ############################################################## - # $tp is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8(%rsp,%r11,8),$tp + lea 64+8(%rsp),$tp mov %rdx,$A[1] - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - lea 2*$STRIDE($bp),$bp - por %xmm1,%xmm0 - mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax @@ -544,7 +705,7 @@ $code.=<<___; mulq $m0 add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -554,7 +715,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] @@ -564,7 +725,7 @@ $code.=<<___; .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -580,7 +741,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -595,7 +756,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] @@ -610,7 +771,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -619,7 +780,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] @@ -629,7 +790,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] @@ -645,7 +806,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[0] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] @@ -658,8 +819,7 @@ $code.=<<___; mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -670,6 +830,33 @@ $code.=<<___; .align 32 .Louter4x: + lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bp),%xmm0 + movdqa `16*($i+1)-128`($bp),%xmm1 + movdqa `16*($i+2)-128`($bp),%xmm2 + movdqa `16*($i+3)-128`($bp),%xmm3 + pand `16*($i+0)-128`(%rdx),%xmm0 + pand `16*($i+1)-128`(%rdx),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rdx),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bp),$bp + movq %xmm0,$m0 # m0=bp[i] + mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] @@ -677,25 +864,11 @@ $code.=<<___; mov ($np),%rax adc \$0,%rdx - movq `0*$STRIDE/4-96`($bp),%xmm0 - movq `1*$STRIDE/4-96`($bp),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bp),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bp),%xmm3 - imulq $A[0],$m1 # tp[0]*n0 - .byte 0x67 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 lea ($tp,$num),$tp # rewind $tp - lea $STRIDE($bp),$bp - por %xmm3,%xmm0 mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded @@ -705,7 +878,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax # interleaved with 0, therefore 16*n + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx @@ -717,7 +890,7 @@ $code.=<<___; adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x @@ -726,7 +899,7 @@ $code.=<<___; .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -744,7 +917,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov -16*1($np),%rax + mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -761,7 +934,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov 16*0($np),%rax + mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx @@ -778,7 +951,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] - mov 16*1($np),%rax + mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx @@ -789,7 +962,7 @@ $code.=<<___; mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] - lea 16*4($np),$np + lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] @@ -799,7 +972,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[0] - mov -16*2($np),%rax + mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp @@ -818,7 +991,7 @@ $code.=<<___; mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax - mov -16*1($np),$m1 + mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx @@ -833,9 +1006,8 @@ $code.=<<___; mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] - movq %xmm0,$m0 # bp[i+1] mov $N[1],-16($tp) # tp[j-1] - lea ($np,$num,2),$np # rewind $np + lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] @@ -849,16 +1021,23 @@ $code.=<<___; ___ if (1) { $code.=<<___; + xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] - xor \$1,$N[1] + sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub - lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub + mov ($np),%r12 + lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx - sar \$3+2,%rcx # cf=0 + sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub - jmp .Lsqr4x_sub + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1(%rbp),%r13 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); @@ -898,6 +1077,7 @@ $code.=<<___ ___ } $code.=<<___; +.cfi_endproc .size mul4x_internal,.-mul4x_internal ___ }}} @@ -910,7 +1090,7 @@ my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 - # int pwr + # int pwr my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); @@ -922,60 +1102,80 @@ $code.=<<___; .type bn_power5,\@function,6 .align 32 bn_power5: +.cfi_startproc + mov %rsp,%rax +.cfi_def_cfa_register %rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; - mov %rsp,%rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - mov ${num}d,%r10d +.cfi_push %r15 +.Lpower5_prologue: + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwr_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -989,19 +1189,25 @@ $code.=<<___; # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lpower5_body: - movq $rptr,%xmm1 # save $rptr + movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr - movq %r10, %xmm3 # -$num + movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal - mov %xmm2,$nptr + movq %xmm2,$nptr movq %xmm4,$bptr mov $aptr,$rptr mov 40(%rsp),%rax @@ -1010,16 +1216,25 @@ $code.=<<___; call mul4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpower5_epilogue: ret +.cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal @@ -1028,6 +1243,7 @@ $code.=<<___; .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: +.cfi_startproc ############################################################## # Squaring part: # @@ -1560,9 +1776,9 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); $code.=<<___; movq %xmm2,$nptr -sqr8x_reduction: +__bn_sqr8x_reduction: xor %rax,%rax - lea ($nptr,$num,2),%rcx # end of n[] + lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window @@ -1588,21 +1804,21 @@ sqr8x_reduction: .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 - mov 16*1($nptr),%rax # n[1] + mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] @@ -1611,7 +1827,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry @@ -1620,7 +1836,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 @@ -1629,7 +1845,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1637,7 +1853,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1645,7 +1861,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1654,7 +1870,7 @@ sqr8x_reduction: mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 - mov 16*0($nptr),%rax # n[0] + mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 @@ -1663,7 +1879,7 @@ sqr8x_reduction: dec %ecx jnz .L8x_reduce - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? @@ -1682,21 +1898,21 @@ sqr8x_reduction: mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx - mov 16*0($nptr),%rax + mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 - mov 16*1($nptr),%rax + mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 - mov 16*2($nptr),%rax + mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ @@ -1705,7 +1921,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r10 - mov 16*3($nptr),%rax + mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 @@ -1713,7 +1929,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r11 - mov 16*4($nptr),%rax + mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 @@ -1721,7 +1937,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r12 - mov 16*5($nptr),%rax + mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 @@ -1729,7 +1945,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r13 - mov 16*6($nptr),%rax + mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 @@ -1737,7 +1953,7 @@ sqr8x_reduction: mulq $m0 add %rax,%r14 - mov 16*7($nptr),%rax + mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 @@ -1748,14 +1964,14 @@ sqr8x_reduction: add %rax,%r15 adc \$0,%rdx add %r15,%r14 - mov 16*0($nptr),%rax # pull n[0] + mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop @@ -1778,8 +1994,16 @@ sqr8x_reduction: .align 32 .L8x_tail_done: - add (%rdx),%r8 # can this overflow? xor %rax,%rax + add (%rdx),%r8 # can this overflow? + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax neg $carry .L8x_no_tail: @@ -1792,7 +2016,7 @@ sqr8x_reduction: adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry - mov -16($nptr),%rcx # np[num-1] + mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr @@ -1810,6 +2034,9 @@ sqr8x_reduction: cmp %rdx,$tptr # end of t[]? jb .L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## @@ -1818,111 +2045,148 @@ ___ { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; - #xor %rsi,%rsi # %rsi was $carry above - sub %r15,%rcx # compare top-most words +.type __bn_post4x_internal,\@abi-omnipotent +.align 32 +__bn_post4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above - adc %rsi,%rsi mov $num,%rcx - or %rsi,%rax movq %xmm1,$rptr # restore $rptr - xor \$1,%rax + neg %rax movq %xmm1,$aptr # prepare for back-to-back call - lea ($nptr,%rax,8),$nptr - sar \$3+2,%rcx # cf=0 - jmp .Lsqr4x_sub + sar \$3+2,%rcx + dec %r12 # so that after 'not' we get -n[0] + xor %r10,%r10 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqr4x_sub_entry -.align 32 +.align 16 .Lsqr4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqr4x_sub_entry: + lea 8*4($nptr),$nptr + not %r12 + not %r13 + not %r14 + not %r15 + and %rax,%r12 + and %rax,%r13 + and %rax,%r14 + and %rax,%r15 + + neg %r10 # mov %r10,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub -___ -} -$code.=<<___; + mov $num,%r10 # prepare for back-to-back call - neg $num # restore $num + neg $num # restore $num ret -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal ___ +} { $code.=<<___; .globl bn_from_montgomery .type bn_from_montgomery,\@abi-omnipotent .align 32 bn_from_montgomery: +.cfi_startproc testl \$7,`($win64?"48(%rsp)":"%r9d")` jz bn_from_mont8x xor %eax,%eax ret +.cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,\@function,6 .align 32 bn_from_mont8x: +.cfi_startproc .byte 0x67 mov %rsp,%rax +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d +.cfi_push %r15 +.Lfrom_prologue: + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). The stack is allocated to aligned with + # bn_power5's frame, and as bn_from_montgomery happens to be + # last operation, we use the opportunity to cleanse it. # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lfrom_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + +.Lfrom_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -1936,6 +2200,7 @@ $code.=<<___; # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lfrom_body: mov $num,%r11 lea 48(%rsp),%rax @@ -1969,31 +2234,33 @@ $code.=<<___; ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d - and \$0x80100,%r11d - cmp \$0x80100,%r11d + and \$0x80108,%r11d + cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr - call sqrx8x_reduction + call __bn_sqrx8x_reduction + call __bn_postx4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax - mov 40(%rsp),%rsi # restore %rsp jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_nox: ___ $code.=<<___; - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax - mov 40(%rsp),%rsi # restore %rsp jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: + mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 movdqa %xmm0,16*0(%rax) movdqa %xmm0,16*1(%rax) movdqa %xmm0,16*2(%rax) @@ -2004,14 +2271,22 @@ $code.=<<___; mov \$1,%rax mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lfrom_epilogue: ret +.cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x ___ } @@ -2024,55 +2299,73 @@ $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: -.Lmulx4x_enter: - .byte 0x67 +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d +.cfi_push %r15 +.Lmulx4x_prologue: + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers a[num], ret[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. [excessive frame is allocated in order - # to allow bn_from_mont8x to clear it.] + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra [num] is allocated in order + # to align with bn_power5's frame, which is cleansed after + # completing exponentiation. Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $ap,%r11 + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done -.align 32 .Lmulx4xsp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp -.Lmulx4xsp_done: - and \$-64,%rsp # ensure alignment + sub %r11,%rbp +.Lmulx4xsp_done: + and \$-64,%rbp # ensure alignment + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + ############################################################## # Stack layout # +0 -num @@ -2087,39 +2380,47 @@ $code.=<<___; # mov $n0, 32(%rsp) # save *n0 mov %rax,40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lmulx4x_body: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret +.cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: - .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num - .byte 0x67 +.cfi_startproc + mov $num,8(%rsp) # save -$num (it was in bytes) + mov $num,%r10 neg $num # restore $num shl \$5,$num - lea 256($bp,$num),%r13 + neg %r10 # restore $num + lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num - mov `($win64?56:8)`(%rax),%r10d # load 7th argument + movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num + lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp @@ -2130,52 +2431,92 @@ my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; - mov %r10,%r11 - shr \$`log($N/8)/log(2)`,%r10 - and \$`$N/8-1`,%r11 - not %r10 - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" - lea 96($bp,%r11,8),$bptr # pointer within 1st cache line - movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which - movq 8(%rax,%r10,8),%xmm5 # cache line contains element - add \$7,%r11 - movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument - movq 24(%rax,%r10,8),%xmm7 - and \$7,%r11 - - movq `0*$STRIDE/4-96`($bptr),%xmm0 - lea $STRIDE($bptr),$tptr # borrow $tptr - movq `1*$STRIDE/4-96`($bptr),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - movq `0*$STRIDE/4-96`($tptr),%xmm1 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 - movq `1*$STRIDE/4-96`($tptr),%xmm2 - por %xmm3,%xmm0 - .byte 0x67,0x67 - pand %xmm4,%xmm1 - movq `2*$STRIDE/4-96`($tptr),%xmm3 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) + lea 128($bp),$bptr # size optimization + pshufd \$0,%xmm5,%xmm5 # broadcast index + movdqa %xmm1,%xmm4 + .byte 0x67 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to index and save result to stack +# +$code.=<<___; + .byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 + movdqa %xmm4,%xmm3 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)+112`(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)+112`(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)+112`(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,`16*($i+3)+112`(%r10) + movdqa %xmm4,%xmm3 +___ +} +$code.=<<___; # last iteration can be optimized + .byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,`16*($i+0)+112`(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,`16*($i+1)+112`(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,`16*($i+2)+112`(%r10) + + pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register + pand `16*($i+1)-128`($bptr),%xmm1 + pand `16*($i+2)-128`($bptr),%xmm2 + movdqa %xmm3,`16*($i+3)+112`(%r10) + pand `16*($i+3)-128`($bptr),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +for($i=0;$i<$STRIDE/16-4;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm4 + movdqa `16*($i+1)-128`($bptr),%xmm5 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+112`(%r10),%xmm4 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+112`(%r10),%xmm5 + por %xmm4,%xmm0 + pand `16*($i+2)+112`(%r10),%xmm2 + por %xmm5,%xmm1 + pand `16*($i+3)+112`(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 +___ +} +$code.=<<___; + pxor %xmm1,%xmm0 + pshufd \$0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] - movq `3*$STRIDE/4-96`($tptr),%xmm0 - lea 2*$STRIDE($bptr),$bptr # next &b[i] - pand %xmm5,%xmm2 - .byte 0x67,0x67 - pand %xmm6,%xmm3 - ############################################################## - # $tptr is chosen so that writing to top-most element of the - # vector occurs just "above" references to powers table, - # "above" modulo cache-line size, which effectively precludes - # possibility of memory disambiguation logic failure when - # accessing the table. - # - lea 64+8*4+8(%rsp,%r11,8),$tptr + lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] @@ -2191,37 +2532,31 @@ $code.=<<___; xor $zero,$zero # cf=0, of=0 mov $mi,%rdx - por %xmm2,%xmm1 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 mov $bptr,8+8(%rsp) # off-load &b[i] - por %xmm1,%xmm0 - .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value - .byte 0x66 mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 - .byte 0x67,0x67 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 - .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) - #jmp .Lmulx4x_1st + jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: @@ -2241,30 +2576,29 @@ $code.=<<___; lea 4*8($tptr),$tptr adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[1] adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 @@ -2275,6 +2609,34 @@ $code.=<<___; .align 32 .Lmulx4x_outer: + lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) + pxor %xmm4,%xmm4 + .byte 0x67,0x67 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`($bptr),%xmm0 + movdqa `16*($i+1)-128`($bptr),%xmm1 + movdqa `16*($i+2)-128`($bptr),%xmm2 + pand `16*($i+0)+256`(%r10),%xmm0 + movdqa `16*($i+3)-128`($bptr),%xmm3 + pand `16*($i+1)+256`(%r10),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)+256`(%r10),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)+256`(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + lea $STRIDE($bptr),$bptr + movq %xmm0,%rdx # m0=bp[i] + mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] @@ -2289,54 +2651,37 @@ $code.=<<___; mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 - lea ($nptr,$num,2),$nptr # rewind $nptr + lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 - .byte 0x67 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 - movq `0*$STRIDE/4-96`($bptr),%xmm0 - .byte 0x67,0x67 mov $mi,%rdx - movq `1*$STRIDE/4-96`($bptr),%xmm1 - .byte 0x67 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-96`($bptr),%xmm2 - .byte 0x67 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-96`($bptr),%xmm3 - add \$$STRIDE,$bptr # next &b[i] - .byte 0x67 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] - mulx 0*16($nptr),%rax,%r10 + mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 - mulx 1*16($nptr),%rax,%r11 + mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - mulx 2*16($nptr),%rax,%r12 + mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - por %xmm2,%xmm0 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) - por %xmm3,%xmm0 adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 @@ -2361,20 +2706,20 @@ $code.=<<___; adcx $zero,%r14 # cf=0 adox %r15,%r10 - mulx 0*16($nptr),%rax,%r15 + mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 - mulx 1*16($nptr),%rax,%r15 + mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 - mulx 2*16($nptr),%rax,%r15 + mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) - mulx 3*16($nptr),%rax,%r15 + mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx - lea 4*16($nptr),$nptr + lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 @@ -2384,7 +2729,6 @@ $code.=<<___; jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num - movq %xmm0,%rdx # bp[i+1] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] @@ -2397,20 +2741,27 @@ $code.=<<___; cmp %r10,$bptr jb .Lmulx4x_outer - mov -16($nptr),%r10 + mov -8($nptr),%r10 + mov $zero,%r8 + mov ($nptr,$num),%r12 + lea ($nptr,$num),%rbp # rewind $nptr + mov $num,%rcx + lea ($tptr,$num),%rdi # rewind $tptr + xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 - or %r15,$zero - xor \$1,$zero - lea ($tptr,$num),%rdi # rewind $tptr - lea ($nptr,$num,2),$nptr # rewind $nptr - .byte 0x67,0x67 - sar \$3+2,$num # cf=0 - lea ($nptr,$zero,8),%rbp + or %r15,%r8 + sar \$3+2,%rcx + sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp - mov $num,%rcx - jmp .Lsqrx4x_sub # common post-condition + dec %r12 # so that after 'not' we get -n[0] + mov 8*1(%rbp),%r13 + xor %r8,%r8 + mov 8*2(%rbp),%r14 + mov 8*3(%rbp),%r15 + jmp .Lsqrx4x_sub_entry # common post-condition +.cfi_endproc .size mulx4x_internal,.-mulx4x_internal ___ } { @@ -2433,55 +2784,73 @@ $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: -.Lpowerx5_enter: - .byte 0x67 +.cfi_startproc mov %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 -___ -$code.=<<___ if ($win64); - lea -0x28(%rsp),%rsp - movaps %xmm6,(%rsp) - movaps %xmm7,0x10(%rsp) -___ -$code.=<<___; - .byte 0x67 - mov ${num}d,%r10d +.cfi_push %r15 +.Lpowerx5_prologue: + shl \$3,${num}d # convert $num to bytes - shl \$3+2,%r10d # 4*$num + lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## - # ensure that stack frame doesn't alias with $aptr+4*$num - # modulo 4096, which covers ret[num], am[num] and n[2*num] - # (see bn_exp.c). this is done to allow memory disambiguation - # logic do its magic. + # Ensure that stack frame doesn't alias with $rptr+3*$num + # modulo 4096, which covers ret[num], am[num] and n[num] + # (see bn_exp.c). This is done to allow memory disambiguation + # logic do its magic. [Extra 256 bytes is for power mask + # calculated from 7th argument, the index.] # - lea -64(%rsp,$num,2),%r11 - sub $aptr,%r11 + lea -320(%rsp,$num,2),%r11 + mov %rsp,%rbp + sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt - sub %r11,%rsp # align with $aptr - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + sub %r11,%rbp # align with $aptr + lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: - lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num - lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num) + lea 4096-320(,$num,2),%r10 + lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 - sub %r11,%rsp + sub %r11,%rbp .Lpwrx_sp_done: - and \$-64,%rsp - mov $num,%r10 + and \$-64,%rbp + mov %rsp,%r11 + sub %rbp,%r11 + and \$-4096,%r11 + lea (%rbp,%r11),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + lea -4096(%rsp),%rsp + mov (%rsp),%r10 + cmp %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + mov $num,%r10 neg $num ############################################################## @@ -2502,13 +2871,19 @@ $code.=<<___; movq $bptr,%xmm4 mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp +.cfi_cfa_expression %rsp+40,deref,+8 .Lpowerx5_body: call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal call __bn_sqrx8x_internal + call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr @@ -2519,22 +2894,26 @@ $code.=<<___; call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp +.cfi_def_cfa %rsi,8 mov \$1,%rax -___ -$code.=<<___ if ($win64); - movaps -88(%rsi),%xmm6 - movaps -72(%rsi),%xmm7 -___ -$code.=<<___; + mov -48(%rsi),%r15 +.cfi_restore %r15 mov -40(%rsi),%r14 +.cfi_restore %r14 mov -32(%rsi),%r13 +.cfi_restore %r13 mov -24(%rsi),%r12 +.cfi_restore %r12 mov -16(%rsi),%rbp +.cfi_restore %rbp mov -8(%rsi),%rbx +.cfi_restore %rbx lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret +.cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal @@ -2543,6 +2922,7 @@ $code.=<<___; .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: +.cfi_startproc ################################################################## # Squaring part: # @@ -2840,11 +3220,19 @@ $code.=<<___; .align 32 .Lsqrx8x_break: - sub 16+8(%rsp),%r8 # consume last carry + xor $zero,$zero + sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf + adcx $zero,%r8 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry + adcx $zero,%r9 mov 0*8($aptr),%rdx # a[8], modulo-scheduled - xor %ebp,%ebp # xor $zero,$zero + adc \$0,%r10 mov %r8,0*8($tptr) + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop @@ -2959,11 +3347,11 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr -sqrx8x_reduction: +__bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) - lea -128($nptr,$num,2),%rcx # end of n[] + lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] @@ -2992,23 +3380,23 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_reduce: mov %r8, %rbx - mulx 16*0($nptr),%rax,%r8 # n[0] + mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 - mulx 16*1($nptr),%rbx,%r9 # n[1] + mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rbx,%r10 + mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rbx,%r11 + mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12 + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 @@ -3018,15 +3406,15 @@ sqrx8x_reduction: mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 @@ -3042,7 +3430,7 @@ sqrx8x_reduction: mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 @@ -3061,35 +3449,35 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_tail: mov %r8,%rbx - mulx 16*0($nptr),%rax,%r8 + mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 - mulx 16*1($nptr),%rax,%r9 + mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 - mulx 16*2($nptr),%rax,%r10 + mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 - mulx 16*3($nptr),%rax,%r11 + mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 - .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12 + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 - mulx 16*5($nptr),%rax,%r13 + mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 - mulx 16*6($nptr),%rax,%r14 + mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 - mulx 16*7($nptr),%rax,%r15 + mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 @@ -3105,7 +3493,7 @@ sqrx8x_reduction: sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] - lea 16*8($nptr),$nptr + lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 @@ -3124,15 +3512,23 @@ sqrx8x_reduction: .align 32 .Lsqrx8x_tail_done: + xor %rax,%rax add 24+8(%rsp),%r8 # can this overflow? - mov $carry,%rax # xor %rax,%rax + adc \$0,%r9 + adc \$0,%r10 + adc \$0,%r11 + adc \$0,%r12 + adc \$0,%r13 + adc \$0,%r14 + adc \$0,%r15 + adc \$0,%rax sub 16+8(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 - mov 16*7($nptr),$carry + mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 @@ -3140,7 +3536,7 @@ sqrx8x_reduction: adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 - adc %rax,%rax # top-most carry + adc \$0,%rax # top-most carry mov 32+8(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" @@ -3158,6 +3554,9 @@ sqrx8x_reduction: lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## @@ -3165,54 +3564,61 @@ ___ # { my ($rptr,$nptr)=("%rdx","%rbp"); -my @ri=map("%r$_",(10..13)); -my @ni=map("%r$_",(14..15)); $code.=<<___; - xor %rbx,%rbx - sub %r15,%rsi # compare top-most words - adc %rbx,%rbx +.align 32 +__bn_postx4x_internal: +.cfi_startproc + mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num - .byte 0x67 - or %rbx,%rax - .byte 0x67 mov %rcx,%r9 # -$num - xor \$1,%rax - sar \$3+2,%rcx # cf=0 + neg %rax + sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr - lea ($nptr,%rax,8),$nptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call - jmp .Lsqrx4x_sub + dec %r12 # so that after 'not' we get -n[0] + mov 8*1($nptr),%r13 + xor %r8,%r8 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 + jmp .Lsqrx4x_sub_entry -.align 32 +.align 16 .Lsqrx4x_sub: - .byte 0x66 - mov 8*0($tptr),%r12 - mov 8*1($tptr),%r13 - sbb 16*0($nptr),%r12 - mov 8*2($tptr),%r14 - sbb 16*1($nptr),%r13 - mov 8*3($tptr),%r15 - lea 8*4($tptr),$tptr - sbb 16*2($nptr),%r14 + mov 8*0($nptr),%r12 + mov 8*1($nptr),%r13 + mov 8*2($nptr),%r14 + mov 8*3($nptr),%r15 +.Lsqrx4x_sub_entry: + andn %rax,%r12,%r12 + lea 8*4($nptr),$nptr + andn %rax,%r13,%r13 + andn %rax,%r14,%r14 + andn %rax,%r15,%r15 + + neg %r8 # mov %r8,%cf + adc 8*0($tptr),%r12 + adc 8*1($tptr),%r13 + adc 8*2($tptr),%r14 + adc 8*3($tptr),%r15 mov %r12,8*0($rptr) - sbb 16*3($nptr),%r15 - lea 16*4($nptr),$nptr + lea 8*4($tptr),$tptr mov %r13,8*1($rptr) + sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub -___ -} -$code.=<<___; + neg %r9 # restore $num ret -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal ___ +} }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order @@ -3226,20 +3632,28 @@ $code.=<<___; .type bn_get_bits5,\@abi-omnipotent .align 16 bn_get_bits5: - mov $inp,%r10 +.cfi_startproc + lea 0($inp),%r10 + lea 1($inp),%r11 mov $num,%ecx - shr \$3,$num - movzw (%r10,$num),%eax - and \$7,%ecx + shr \$4,$num + and \$15,%ecx + lea -8(%ecx),%eax + cmp \$11,%ecx + cmova %r11,%r10 + cmova %eax,%ecx + movzw (%r10,$num,2),%eax shrl %cl,%eax and \$31,%eax ret +.cfi_endproc .size bn_get_bits5,.-bn_get_bits5 .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 bn_scatter5: +.cfi_startproc cmp \$0, $num jz .Lscatter_epilogue lea ($tbl,$idx,8),$tbl @@ -3252,70 +3666,108 @@ bn_scatter5: jnz .Lscatter .Lscatter_epilogue: ret +.cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .type bn_gather5,\@abi-omnipotent -.align 16 +.align 32 bn_gather5: -___ -$code.=<<___ if ($win64); -.LSEH_begin_bn_gather5: +.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases +.cfi_startproc # I can't trust assembler to use specific encoding:-( - .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp - .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) - .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 + .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp + lea .Linc(%rip),%rax + and \$-16,%rsp # shouldn't be formally required + + movd $idx,%xmm5 + movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 + movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 + lea 128($tbl),%r11 # size optimization + lea 128(%rsp),%rax # size optimization + + pshufd \$0,%xmm5,%xmm5 # broadcast $idx + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 +___ +######################################################################## +# calculate mask by comparing 0..31 to $idx and save result to stack +# +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 # compare to 1,0 +___ +$code.=<<___ if ($i); + movdqa %xmm3,`16*($i-1)-128`(%rax) ___ $code.=<<___; - mov $idx,%r11d - shr \$`log($N/8)/log(2)`,$idx - and \$`$N/8-1`,%r11 - not $idx - lea .Lmagic_masks(%rip),%rax - and \$`2**5/($N/8)-1`,$idx # 5 is "window size" - lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line - movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which - movq 8(%rax,$idx,8),%xmm5 # cache line contains element - movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument - movq 24(%rax,$idx,8),%xmm7 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 # compare to 3,2 + movdqa %xmm0,`16*($i+0)-128`(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 # compare to 5,4 + movdqa %xmm1,`16*($i+1)-128`(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 # compare to 7,6 + movdqa %xmm2,`16*($i+2)-128`(%rax) + movdqa %xmm4,%xmm2 +___ +} +$code.=<<___; + movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather -.align 16 -.Lgather: - movq `0*$STRIDE/4-128`($tbl),%xmm0 - movq `1*$STRIDE/4-128`($tbl),%xmm1 - pand %xmm4,%xmm0 - movq `2*$STRIDE/4-128`($tbl),%xmm2 - pand %xmm5,%xmm1 - movq `3*$STRIDE/4-128`($tbl),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - .byte 0x67,0x67 - por %xmm2,%xmm0 - lea $STRIDE($tbl),$tbl - por %xmm3,%xmm0 +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 +___ +for($i=0;$i<$STRIDE/16;$i+=4) { +$code.=<<___; + movdqa `16*($i+0)-128`(%r11),%xmm0 + movdqa `16*($i+1)-128`(%r11),%xmm1 + movdqa `16*($i+2)-128`(%r11),%xmm2 + pand `16*($i+0)-128`(%rax),%xmm0 + movdqa `16*($i+3)-128`(%r11),%xmm3 + pand `16*($i+1)-128`(%rax),%xmm1 + por %xmm0,%xmm4 + pand `16*($i+2)-128`(%rax),%xmm2 + por %xmm1,%xmm5 + pand `16*($i+3)-128`(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 +___ +} +$code.=<<___; + por %xmm5,%xmm4 + lea $STRIDE(%r11),%r11 + pshufd \$0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather -___ -$code.=<<___ if ($win64); - movaps (%rsp),%xmm6 - movaps 0x10(%rsp),%xmm7 - lea 0x28(%rsp),%rsp -___ -$code.=<<___; + + lea (%r10),%rsp ret .LSEH_end_bn_gather5: +.cfi_endproc .size bn_gather5,.-bn_gather5 ___ } $code.=<<___; .align 64 -.Lmagic_masks: - .long 0,0, 0,0, 0,0, -1,-1 - .long 0,0, 0,0, 0,0, 0,0 +.Linc: + .long 0,0, 1,1 + .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ @@ -3354,28 +3806,30 @@ mul_handler: cmp %r10,%rbx # context->RipRipRsp - mov 4(%r11),%r10d # HandlerData[1] + mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea .Lmul_epilogue(%rip),%r10 cmp %r10,%rbx - jb .Lbody_40 + ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer - jmp .Lbody_proceed + + jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer -.Lbody_proceed: - - movaps -88(%rax),%xmm0 - movaps -72(%rax),%xmm1 - +.Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 @@ -3388,8 +3842,6 @@ mul_handler: mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 - movups %xmm0,512($context) # restore context->Xmm6 - movups %xmm1,528($context) # restore context->Xmm7 .Lcommon_seh_tail: mov 8(%rax),%rdi @@ -3468,42 +3920,41 @@ $code.=<<___; .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul_body,.Lmul_epilogue # HandlerData[] + .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] + .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[] + .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler - .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] + .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler - .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] + .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler - .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] + .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: - .byte 0x01,0x0d,0x05,0x00 - .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 - .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 - .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 + .byte 0x01,0x0b,0x03,0x0a + .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 + .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ }