X-Git-Url: https://git.librecmc.org/?a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fx86_64-mont5.pl;h=5e70547704f87f1077cf5f1eaf540f1dda7f5cce;hb=9bb3e5fd87905e3e9f5f7edcc2e22d98360510ab;hp=1666fbd7a2d4a92b30754d30286a9e005e091f5a;hpb=668a709a8d7ea374ee72ad2d43ac72ec60a80eee;p=oweals%2Fopenssl.git diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 1666fbd7a2..5e70547704 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -1,7 +1,7 @@ #! /usr/bin/env perl -# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved. # -# Licensed under the OpenSSL license (the "License"). You may not use +# Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html @@ -31,9 +31,10 @@ # the np argument is not just modulus value, but one interleaved # with 0. This is to optimize post-condition... -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -42,7 +43,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` @@ -60,7 +62,7 @@ if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } @@ -419,22 +421,23 @@ $code.=<<___; mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - dec $j # doesnn't affect CF! + dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit + mov \$-1,%rbx + xor %rax,%rbx xor $i,$i - and %rax,$ap - not %rax - mov $rp,$np - and %rax,$np mov $num,$j # j=num - or $np,$ap # ap=borrow?tp:rp -.align 16 -.Lcopy: # copy or in-place refresh - mov ($ap,$i,8),%rax + +.Lcopy: # conditional copy + mov ($rp,$i,8),%rcx + mov (%rsp,$i,8),%rdx + and %rbx,%rcx + and %rax,%rdx mov $i,(%rsp,$i,8) # zap temporary vector - mov %rax,($rp,$i,8) # rp[i]=tp[i] + or %rcx,%rdx + mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy @@ -579,6 +582,7 @@ $code.=<<___; .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: +.cfi_startproc shl \$5,$num # $num was in bytes movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index lea .Linc(%rip),%rax @@ -1073,6 +1077,7 @@ $code.=<<___ ___ } $code.=<<___; +.cfi_endproc .size mul4x_internal,.-mul4x_internal ___ }}} @@ -1238,6 +1243,7 @@ $code.=<<___; .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: +.cfi_startproc ############################################################## # Squaring part: # @@ -2029,6 +2035,7 @@ __bn_sqr8x_reduction: cmp %rdx,$tptr # end of t[]? jb .L8x_reduction_loop ret +.cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } @@ -2041,6 +2048,7 @@ $code.=<<___; .type __bn_post4x_internal,\@abi-omnipotent .align 32 __bn_post4x_internal: +.cfi_startproc mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above mov $num,%rcx @@ -2091,6 +2099,7 @@ __bn_post4x_internal: mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret +.cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal ___ } @@ -2100,10 +2109,12 @@ $code.=<<___; .type bn_from_montgomery,\@abi-omnipotent .align 32 bn_from_montgomery: +.cfi_startproc testl \$7,`($win64?"48(%rsp)":"%r9d")` jz bn_from_mont8x xor %eax,%eax ret +.cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,\@function,6 @@ -2399,6 +2410,7 @@ bn_mulx4x_mont_gather5: .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: +.cfi_startproc mov $num,8(%rsp) # save -$num (it was in bytes) mov $num,%r10 neg $num # restore $num @@ -2421,7 +2433,7 @@ my $N=$STRIDE/4; # should match cache line size $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 - lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton) + lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128($bp),$bptr # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index @@ -2749,6 +2761,7 @@ $code.=<<___; mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqrx4x_sub_entry # common post-condition +.cfi_endproc .size mulx4x_internal,.-mulx4x_internal ___ } { @@ -2909,6 +2922,7 @@ bn_powerx5: .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: +.cfi_startproc ################################################################## # Squaring part: # @@ -3541,6 +3555,7 @@ __bn_sqrx8x_reduction: cmp 8+8(%rsp),%r8 # end of t[]? 
jb .Lsqrx8x_reduction_loop ret +.cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } @@ -3552,6 +3567,7 @@ my ($rptr,$nptr)=("%rdx","%rbp"); $code.=<<___; .align 32 __bn_postx4x_internal: +.cfi_startproc mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num mov %rcx,%r9 # -$num @@ -3599,6 +3615,7 @@ __bn_postx4x_internal: neg %r9 # restore $num ret +.cfi_endproc .size __bn_postx4x_internal,.-__bn_postx4x_internal ___ } @@ -3615,6 +3632,7 @@ $code.=<<___; .type bn_get_bits5,\@abi-omnipotent .align 16 bn_get_bits5: +.cfi_startproc lea 0($inp),%r10 lea 1($inp),%r11 mov $num,%ecx @@ -3628,12 +3646,14 @@ bn_get_bits5: shrl %cl,%eax and \$31,%eax ret +.cfi_endproc .size bn_get_bits5,.-bn_get_bits5 .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 bn_scatter5: +.cfi_startproc cmp \$0, $num jz .Lscatter_epilogue lea ($tbl,$idx,8),$tbl @@ -3646,6 +3666,7 @@ bn_scatter5: jnz .Lscatter .Lscatter_epilogue: ret +.cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 @@ -3653,6 +3674,7 @@ bn_scatter5: .align 32 bn_gather5: .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases +.cfi_startproc # I can't trust assembler to use specific encoding:-( .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp @@ -3737,6 +3759,7 @@ $code.=<<___; lea (%r10),%rsp ret .LSEH_end_bn_gather5: +.cfi_endproc .size bn_gather5,.-bn_gather5 ___ }
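
Note on the main functional change above: besides the license header, argument handling, clang-version detection and the added .cfi_startproc/.cfi_endproc markers, the substantive change is the rewrite of the .Lsub/.Lcopy tail. The old code picked a source pointer with data-dependent AND/NOT/OR on the addresses ("ap = borrow ? tp : rp") and then did a plain copy; the new .Lcopy loop instead selects every limb through a borrow-derived mask, so the memory-access pattern of the final conditional subtraction no longer depends on the secret comparison t < n. Below is a minimal C sketch of that masking idiom only; the function name, parameters and the toy values are illustrative assumptions, not OpenSSL's API and not the code the perlasm actually emits.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/*
 * Illustrative sketch (assumed names, not OpenSSL API): after the .Lsub loop,
 * rp[] holds t - n and tp[] still holds the raw t; "borrow" is 1 if the
 * subtraction underflowed (t < n).  Each limb of the result is then chosen
 * with masks instead of a data-dependent pointer, as the new .Lcopy loop does.
 */
void ct_copy_result(uint64_t *rp, uint64_t *tp, size_t num, uint64_t borrow)
{
    uint64_t keep_t  = 0 - borrow;   /* all-ones when the subtraction borrowed: keep t        */
    uint64_t keep_rp = ~keep_t;      /* all-ones when t >= n: keep the reduced value t - n    */

    for (size_t i = 0; i < num; i++) {
        rp[i] = (rp[i] & keep_rp) | (tp[i] & keep_t);
        tp[i] = (uint64_t)i;         /* zap the temporary vector, mirroring the asm's store of $i */
    }
}

int main(void)
{
    /* toy 2-limb example: rp holds the wrapped t - n, tp holds the raw t, borrow = 1 means t < n */
    uint64_t rp[2] = { 0xfffffffffffffffeULL, 0xffffffffffffffffULL };
    uint64_t tp[2] = { 0x0000000000000003ULL, 0x0000000000000000ULL };

    ct_copy_result(rp, tp, 2, 1);    /* borrow set, so rp[] ends up equal to the original tp[] */
    printf("%016llx %016llx\n", (unsigned long long)rp[1], (unsigned long long)rp[0]);
    return 0;
}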