X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fx86_64-mont5.pl;h=5e70547704f87f1077cf5f1eaf540f1dda7f5cce;hb=9bb3e5fd87905e3e9f5f7edcc2e22d98360510ab;hp=8f49391727f5f3a894fbd9756a82dde30a23563e;hpb=609b0852e4d50251857dbbac3141ba042e35a9ae;p=oweals%2Fopenssl.git

diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 8f49391727..5e70547704 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
-# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License"). You may not use
+# Licensed under the Apache License 2.0 (the "License"). You may not use
 # this file except in compliance with the License. You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
@@ -31,9 +31,10 @@
 # the np argument is not just modulus value, but one interleaved
 # with 0. This is to optimize post-condition...
 
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
@@ -42,7 +43,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
@@ -60,7 +62,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 	$addx = ($1>=12);
 }
 
-if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
+if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
 	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
 	$addx = ($ver>=3.03);
 }
@@ -93,8 +95,10 @@ $code=<<___;
 .type bn_mul_mont_gather5,\@function,6
 .align 64
 bn_mul_mont_gather5:
+.cfi_startproc
 mov ${num}d,${num}d
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 test \$7,${num}d
 jnz .Lmul_enter
___
@@ -108,11 +112,17 @@ $code.=<<___;
 .Lmul_enter:
 movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 
 neg $num
 mov %rsp,%r11
@@ -145,6 +155,7 @@ $code.=<<___;
 lea .Linc(%rip),%r10
 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
 lea 128($bp),%r12 # reassign $bp (+size optimization)
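
The hunk below rewrites the post-condition of bn_mul_mont_gather5. The old code selected a source pointer with AND/NOT/OR ("ap = borrow ? tp : rp") and copied from whichever array won, so the addresses touched leaked the secret-dependent borrow; the new .Lcopy loop reads both tp[i] and rp[i] on every iteration and merges them under the complementary masks set up in %rax/%rbx. A minimal Perl sketch of that masked select, assuming 64-bit limbs (the helper name ct_copy and its interface are ours, not the patch's):

    use strict;
    use warnings;

    # $mask is all-ones when the subtraction borrowed (keep tp[]),
    # all-zero otherwise (keep rp[], which holds tp - np).
    sub ct_copy {
        my ($mask, $tp, $rp) = @_;
        my $inv = ~$mask & 0xFFFFFFFFFFFFFFFF;   # the "xor %rax,%rbx" mask
        for my $i (0 .. $#$tp) {
            # Both arrays are read on every iteration, so the memory
            # access pattern is independent of the secret borrow.
            $rp->[$i] = ($tp->[$i] & $mask) | ($rp->[$i] & $inv);
            $tp->[$i] = $i;   # zap the temporary vector, as the code does
        }
    }
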
@@ -410,38 +421,48 @@ $code.=<<___;
 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
 mov 8($ap,$i,8),%rax # tp[i+1]
 lea 1($i),$i # i++
- dec $j # doesnn't affect CF!
+ dec $j # doesn't affect CF!
 jnz .Lsub
 
 sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx
 xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
 mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
 mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
 lea 1($i),$i
 sub \$1,$j
 jnz .Lcopy
 
 mov 8(%rsp,$num,8),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 mov \$1,%rax
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lmul_epilogue:
 ret
+.cfi_endproc
 .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
 ___
 {{{
@@ -451,8 +472,10 @@ $code.=<<___;
 .type bn_mul4x_mont_gather5,\@function,6
 .align 32
 bn_mul4x_mont_gather5:
+.cfi_startproc
 .byte 0x67
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -462,11 +485,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 .Lmul4x_prologue:
 
 .byte 0x67
@@ -522,27 +551,38 @@ $code.=<<___;
 neg $num
 
 mov %rax,40(%rsp)
+.cfi_cfa_expression %rsp+40,deref,+8
 .Lmul4x_body:
 
 call mul4x_internal
 
 mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 mov \$1,%rax
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lmul4x_epilogue:
 ret
+.cfi_endproc
 .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 
 .type mul4x_internal,\@abi-omnipotent
 .align 32
 mul4x_internal:
+.cfi_startproc
 shl \$5,$num # $num was in bytes
 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
 lea .Linc(%rip),%rax
@@ -1037,6 +1077,7 @@ $code.=<<___
 ___
 }
 $code.=<<___;
+.cfi_endproc
 .size mul4x_internal,.-mul4x_internal
 ___
 }}}
@@ -1061,7 +1102,9 @@ $code.=<<___;
 .type bn_power5,\@function,6
 .align 32
 bn_power5:
+.cfi_startproc
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 ___
 $code.=<<___ if ($addx);
 mov OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1071,11 +1114,17 @@ $code.=<<___ if ($addx);
 ___
 $code.=<<___;
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 .Lpower5_prologue:
 
 shl \$3,${num}d # convert $num to bytes
@@ -1140,6 +1189,7 @@ $code.=<<___;
 # mov $n0, 32(%rsp)
 mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
 .Lpower5_body:
 movq $rptr,%xmm1 # save $rptr, used in sqr8x
 movq $nptr,%xmm2 # save $nptr
@@ -1166,16 +1216,25 @@ $code.=<<___;
 call mul4x_internal
 
 mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 mov \$1,%rax
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lpower5_epilogue:
 ret
+.cfi_endproc
 .size bn_power5,.-bn_power5
 
 .globl bn_sqr8x_internal
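
The .cfi_* directives added throughout these prologues and epilogues exist because the functions park the original %rsp in memory (at 40(%rsp), or at 8(%rsp,$num,8) in the gather5 case) instead of keeping a frame pointer. A conceptual Perl model of the unwind rule that ".cfi_cfa_expression %rsp+40,deref,+8" installs (the function and callback names are ours; this is not OpenSSL code):

    use strict;
    use warnings;

    # CFA = *(%rsp + 40) + 8: load the saved stack pointer from the
    # frame, then step over the return address.
    sub cfa_from_frame {
        my ($rsp, $read_u64) = @_;          # $read_u64 reads 8 bytes of stack
        my $saved_rsp = $read_u64->($rsp + 40);   # "deref"
        return $saved_rsp + 8;                    # "+8"
    }

The epilogues agree with this rule: they reload the saved pointer into %rsi and switch to ".cfi_def_cfa %rsi,8" before popping the callee-saved registers.
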
@@ -1184,6 +1243,7 @@ $code.=<<___;
 .align 32
 bn_sqr8x_internal:
 __bn_sqr8x_internal:
+.cfi_startproc
 ##############################################################
 # Squaring part:
 #
@@ -1934,6 +1994,7 @@ __bn_sqr8x_reduction:
 .align 32
 .L8x_tail_done:
+ xor %rax,%rax
 add (%rdx),%r8 # can this overflow?
 adc \$0,%r9
 adc \$0,%r10
@@ -1941,10 +2002,8 @@ __bn_sqr8x_reduction:
 adc \$0,%r12
 adc \$0,%r13
 adc \$0,%r14
- adc \$0,%r15 # can't overflow, because we
- # started with "overhung" part
- # of multiplication
- xor %rax,%rax
+ adc \$0,%r15
+ adc \$0,%rax
 
 neg $carry
 .L8x_no_tail:
@@ -1976,6 +2035,7 @@ __bn_sqr8x_reduction:
 cmp %rdx,$tptr # end of t[]?
 jb .L8x_reduction_loop
 ret
+.cfi_endproc
 .size bn_sqr8x_internal,.-bn_sqr8x_internal
 ___
 }
@@ -1988,6 +2048,7 @@ $code.=<<___;
 .type __bn_post4x_internal,\@abi-omnipotent
 .align 32
 __bn_post4x_internal:
+.cfi_startproc
 mov 8*0($nptr),%r12
 lea (%rdi,$num),$tptr # %rdi was $tptr above
 mov $num,%rcx
@@ -2038,6 +2099,7 @@ __bn_post4x_internal:
 mov $num,%r10 # prepare for back-to-back call
 neg $num # restore $num
 ret
+.cfi_endproc
 .size __bn_post4x_internal,.-__bn_post4x_internal
 ___
 }
@@ -2047,23 +2109,33 @@ $code.=<<___;
 .type bn_from_montgomery,\@abi-omnipotent
 .align 32
 bn_from_montgomery:
+.cfi_startproc
 testl \$7,`($win64?"48(%rsp)":"%r9d")`
 jz bn_from_mont8x
 xor %eax,%eax
 ret
+.cfi_endproc
 .size bn_from_montgomery,.-bn_from_montgomery
 
 .type bn_from_mont8x,\@function,6
 .align 32
 bn_from_mont8x:
+.cfi_startproc
 .byte 0x67
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 .Lfrom_prologue:
 
 shl \$3,${num}d # convert $num to bytes
@@ -2128,6 +2200,7 @@ bn_from_mont8x:
 # mov $n0, 32(%rsp)
 mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
 .Lfrom_body:
 mov $num,%r11
 lea 48(%rsp),%rax
@@ -2171,7 +2244,6 @@ $code.=<<___ if ($addx);
 
 pxor %xmm0,%xmm0
 lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
 jmp .Lfrom_mont_zero
 
 .align 32
@@ -2183,11 +2255,12 @@ $code.=<<___;
 
 pxor %xmm0,%xmm0
 lea 48(%rsp),%rax
- mov 40(%rsp),%rsi # restore %rsp
 jmp .Lfrom_mont_zero
 
 .align 32
 .Lfrom_mont_zero:
+ mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 movdqa %xmm0,16*0(%rax)
 movdqa %xmm0,16*1(%rax)
 movdqa %xmm0,16*2(%rax)
@@ -2198,14 +2271,22 @@ $code.=<<___;
 
 mov \$1,%rax
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lfrom_epilogue:
 ret
+.cfi_endproc
 .size bn_from_mont8x,.-bn_from_mont8x
 ___
 }
@@ -2218,14 +2299,22 @@ $code.=<<___;
 .type bn_mulx4x_mont_gather5,\@function,6
 .align 32
 bn_mulx4x_mont_gather5:
+.cfi_startproc
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 .Lmulx4x_enter:
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 .Lmulx4x_prologue:
 
 shl \$3,${num}d # convert $num to bytes
@@ -2291,26 +2380,37 @@ bn_mulx4x_mont_gather5:
 # mov $n0, 32(%rsp) # save *n0
 mov %rax,40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
 .Lmulx4x_body:
 call mulx4x_internal
 
 mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 mov \$1,%rax
 
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lmulx4x_epilogue:
 ret
+.cfi_endproc
 .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
 
 .type mulx4x_internal,\@abi-omnipotent
 .align 32
 mulx4x_internal:
+.cfi_startproc
 mov $num,8(%rsp) # save -$num (it was in bytes)
 mov $num,%r10
 neg $num # restore $num
@@ -2333,7 +2433,7 @@ my $N=$STRIDE/4; # should match cache line size
 $code.=<<___;
 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
- lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
+ lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization)
 lea 128($bp),$bptr # size optimization
 
 pshufd \$0,%xmm5,%xmm5 # broadcast index
@@ -2661,6 +2761,7 @@ $code.=<<___;
 mov 8*2(%rbp),%r14
 mov 8*3(%rbp),%r15
 jmp .Lsqrx4x_sub_entry # common post-condition
+.cfi_endproc
 .size mulx4x_internal,.-mulx4x_internal
 ___
 } {
@@ -2683,14 +2784,22 @@ $code.=<<___;
 .type bn_powerx5,\@function,6
 .align 32
 bn_powerx5:
+.cfi_startproc
 mov %rsp,%rax
+.cfi_def_cfa_register %rax
 .Lpowerx5_enter:
 push %rbx
+.cfi_push %rbx
 push %rbp
+.cfi_push %rbp
 push %r12
+.cfi_push %r12
 push %r13
+.cfi_push %r13
 push %r14
+.cfi_push %r14
 push %r15
+.cfi_push %r15
 .Lpowerx5_prologue:
 
 shl \$3,${num}d # convert $num to bytes
@@ -2762,6 +2871,7 @@ bn_powerx5:
 movq $bptr,%xmm4
 mov $n0, 32(%rsp)
 mov %rax, 40(%rsp) # save original %rsp
+.cfi_cfa_expression %rsp+40,deref,+8
 .Lpowerx5_body:
 
 call __bn_sqrx8x_internal
@@ -2784,17 +2894,26 @@ bn_powerx5:
 call mulx4x_internal
 
 mov 40(%rsp),%rsi # restore %rsp
+.cfi_def_cfa %rsi,8
 mov \$1,%rax
 
 mov -48(%rsi),%r15
+.cfi_restore %r15
 mov -40(%rsi),%r14
+.cfi_restore %r14
 mov -32(%rsi),%r13
+.cfi_restore %r13
 mov -24(%rsi),%r12
+.cfi_restore %r12
 mov -16(%rsi),%rbp
+.cfi_restore %rbp
 mov -8(%rsi),%rbx
+.cfi_restore %rbx
 lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
 .Lpowerx5_epilogue:
 ret
+.cfi_endproc
 .size bn_powerx5,.-bn_powerx5
 
 .globl bn_sqrx8x_internal
@@ -2803,6 +2922,7 @@ bn_powerx5:
 .align 32
 bn_sqrx8x_internal:
 __bn_sqrx8x_internal:
+.cfi_startproc
 ##################################################################
 # Squaring part:
 #
@@ -3100,11 +3220,19 @@ $code.=<<___;
 
 .align 32
 .Lsqrx8x_break:
- sub 16+8(%rsp),%r8 # consume last carry
+ xor $zero,$zero
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+ adcx $zero,%r8
 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+ adcx $zero,%r9
 mov 0*8($aptr),%rdx # a[8], modulo-scheduled
- xor %ebp,%ebp # xor $zero,$zero
+ adc \$0,%r10
 mov %r8,0*8($tptr)
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15
 cmp $carry,$tptr # cf=0, of=0
 je .Lsqrx8x_outer_loop
@@ -3384,6 +3512,7 @@ __bn_sqrx8x_reduction:
 .align 32
 .Lsqrx8x_tail_done:
+ xor %rax,%rax
 add 24+8(%rsp),%r8 # can this overflow?
 adc \$0,%r9
 adc \$0,%r10
@@ -3391,10 +3520,8 @@ __bn_sqrx8x_reduction:
 adc \$0,%r12
 adc \$0,%r13
 adc \$0,%r14
- adc \$0,%r15 # can't overflow, because we
- # started with "overhung" part
- # of multiplication
- mov $carry,%rax # xor %rax,%rax
+ adc \$0,%r15
+ adc \$0,%rax
 
 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
 .Lsqrx8x_no_tail: # %cf is 0 if jumped here
@@ -3409,7 +3536,7 @@ __bn_sqrx8x_reduction:
 adc 8*5($tptr),%r13
 adc 8*6($tptr),%r14
 adc 8*7($tptr),%r15
- adc %rax,%rax # top-most carry
+ adc \$0,%rax # top-most carry
 mov 32+8(%rsp),%rbx # n0
 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
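
The .L8x_tail_done and .Lsqrx8x_tail_done hunks above retire the old "can't overflow" reasoning: %rax is now zeroed before the tail addition and catches a genuine final carry, and the outer loop accumulates it with "adc \$0,%rax" instead of "adc %rax,%rax". A scaled-down Perl model of why that last carry must be kept, using 8-bit limbs so the arithmetic stays exact in plain Perl (the variable names echo the registers but are ours):

    use strict;
    use warnings;

    my @limb = (0xFF) x 8;    # worst-case tail: every limb all-ones
    my $cf   = 1;             # incoming carry flag
    my $rax  = 0;             # zeroed up front, like "xor %rax,%rax"
    for my $i (0 .. 7) {
        my $sum = $limb[$i] + $cf;    # one step of the adc chain
        $limb[$i] = $sum & 0xFF;
        $cf       = $sum >> 8;        # carry ripples to the next limb
    }
    $rax += $cf;                      # "adc \$0,%rax": the carry survives
    print "top-most carry = $rax\n";  # prints 1, the case the old code dismissed
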
@@ -3428,6 +3555,7 @@ __bn_sqrx8x_reduction:
 cmp 8+8(%rsp),%r8 # end of t[]?
 jb .Lsqrx8x_reduction_loop
 ret
+.cfi_endproc
 .size bn_sqrx8x_internal,.-bn_sqrx8x_internal
 ___
 }
@@ -3439,6 +3567,7 @@ my ($rptr,$nptr)=("%rdx","%rbp");
 $code.=<<___;
 .align 32
 __bn_postx4x_internal:
+.cfi_startproc
 mov 8*0($nptr),%r12
 mov %rcx,%r10 # -$num
 mov %rcx,%r9 # -$num
@@ -3486,6 +3615,7 @@ __bn_postx4x_internal:
 neg %r9 # restore $num
 
 ret
+.cfi_endproc
 .size __bn_postx4x_internal,.-__bn_postx4x_internal
 ___
 }
@@ -3502,6 +3632,7 @@ $code.=<<___;
 .type bn_get_bits5,\@abi-omnipotent
 .align 16
 bn_get_bits5:
+.cfi_startproc
 lea 0($inp),%r10
 lea 1($inp),%r11
 mov $num,%ecx
@@ -3515,12 +3646,14 @@ bn_get_bits5:
 shrl %cl,%eax
 and \$31,%eax
 ret
+.cfi_endproc
 .size bn_get_bits5,.-bn_get_bits5
 
 .globl bn_scatter5
 .type bn_scatter5,\@abi-omnipotent
 .align 16
 bn_scatter5:
+.cfi_startproc
 cmp \$0, $num
 jz .Lscatter_epilogue
 lea ($tbl,$idx,8),$tbl
@@ -3533,6 +3666,7 @@ bn_scatter5:
 jnz .Lscatter
 .Lscatter_epilogue:
 ret
+.cfi_endproc
 .size bn_scatter5,.-bn_scatter5
 
 .globl bn_gather5
@@ -3540,6 +3674,7 @@ bn_scatter5:
 .align 32
 bn_gather5:
 .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
+.cfi_startproc
 # I can't trust assembler to use specific encoding:-(
 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
@@ -3624,6 +3759,7 @@ $code.=<<___;
 lea (%r10),%rsp
 ret
 .LSEH_end_bn_gather5:
+.cfi_endproc
 .size bn_gather5,.-bn_gather5
 ___
 }
@@ -3671,8 +3807,8 @@ mul_handler:
 jb .Lcommon_seh_tail
 
 mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
 
 mov 152($context),%rax # pull context->Rsp
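
For reference, the bn_scatter5/bn_gather5 pair annotated above implements the power-table access for the 5-bit exponent window: scatter5 stores into one of 32 stride-spaced slots, and gather5 reads every slot, keeping the wanted entry under pcmpeqd-built masks so the access pattern is independent of the secret index. A conceptual Perl sketch of that gather (the helper name and plain-integer limbs are ours; the real code forms the mask with SSE2 compares, not a Perl branch):

    use strict;
    use warnings;

    sub gather5 {
        my ($table, $idx) = @_;       # $table: ref to 32 refs of limb arrays
        my @out = (0) x scalar @{ $table->[0] };
        for my $i (0 .. 31) {         # touch all 32 entries, always
            my $mask = ($i == $idx) ? 0xFFFFFFFFFFFFFFFF : 0;
            for my $j (0 .. $#out) {
                $out[$j] |= $table->[$i][$j] & $mask;   # pand/por accumulate
            }
        }
        return \@out;
    }
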