+#! /usr/bin/env perl
+# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# the np argument is not just modulus value, but one interleaved
# with 0. This is to optimize post-condition...
# the np argument is not just modulus value, but one interleaved
# with 0. This is to optimize post-condition...
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
- lea 2($num),%r11
- neg %r11
- lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
- and \$-1024,%rsp # minimize TLB usage
+ neg $num
+ mov %rsp,%r11
+ lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8)
+ neg $num # restore $num
+ and \$-1024,%r10 # minimize TLB usage
- mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
-.Lmul_body:
- # Some OSes, *cough*-dows, insist on stack being "wired" to
+ # An OS-agnostic version of __chkstk.
+ #
+ # Some OSes (Windows) insist on stack being "wired" to
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
# physical memory in strictly sequential manner, i.e. if stack
# allocation spans two pages, then reference to farmost one can
# be punishable by SEGV. But page walking can do good even on
# other OSes, because it guarantees that villain thread hits
# the guard page before it can make damage to innocent one...
- mov (%rsp,%rax),%r11
- sub \$4096,%rax
- .byte 0x2e # predict non-taken
- jnc .Lmul_page_walk
+ lea -4096(%rsp),%rsp
+ mov (%rsp),%r11
+ cmp %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ lea .Linc(%rip),%r10
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
+.Lmul_body:
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
mov $i,(%rsp,$i,8) # zap temporary vector
mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
- sub %r11,%rsp # align with $rp
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $rp
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
shl \$5,$num # $num was in bytes
movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
lea .Linc(%rip),%rax
shl \$5,$num # $num was in bytes
movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
lea .Linc(%rip),%rax
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10d # 3*$num
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
jmp .Lpwr_sp_done
.align 32
.Lpwr_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
.Lpower5_body:
movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
.Lpower5_body:
movq $rptr,%xmm1 # save $rptr, used in sqr8x
movq $nptr,%xmm2 # save $nptr
.size bn_from_montgomery,.-bn_from_montgomery
.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
.size bn_from_montgomery,.-bn_from_montgomery
.type bn_from_mont8x,\@function,6
.align 32
bn_from_mont8x:
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
# last operation, we use the opportunity to cleanse it.
#
lea -320(%rsp,$num,2),%r11
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
jmp .Lfrom_sp_done
.align 32
.Lfrom_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
jmp .Lmulx4xsp_done
.Lmulx4xsp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
#
mov $n0, 32(%rsp) # save *n0
mov %rax,40(%rsp) # save original %rsp
#
mov $n0, 32(%rsp) # save *n0
mov %rax,40(%rsp) # save original %rsp
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
.type mulx4x_internal,\@abi-omnipotent
.align 32
mulx4x_internal:
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
lea 128($bp),$bptr # size optimization
pshufd \$0,%xmm5,%xmm5 # broadcast index
lea 128($bp),$bptr # size optimization
pshufd \$0,%xmm5,%xmm5 # broadcast index
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
- sub %r11,%rsp # align with $aptr
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ sub %r11,%rbp # align with $aptr
+ lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
lea 4096-320(,$num,2),%r10
- lea -320(%rsp,$num,2),%rsp # alloca(frame+2*$num*8+256)
+ lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
.size bn_powerx5,.-bn_powerx5
.globl bn_sqrx8x_internal
.size bn_powerx5,.-bn_powerx5
.globl bn_sqrx8x_internal
- adc \$0,%r15 # can't overflow, because we
- # started with "overhung" part
- # of multiplication
- mov $carry,%rax # xor %rax,%rax
+ adc \$0,%r15
+ adc \$0,%rax
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
.Lsqrx8x_no_tail: # %cf is 0 if jumped here
sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
.Lsqrx8x_no_tail: # %cf is 0 if jumped here
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
ret
cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop
ret
.size bn_get_bits5,.-bn_get_bits5
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
bn_scatter5:
.size bn_get_bits5,.-bn_get_bits5
.globl bn_scatter5
.type bn_scatter5,\@abi-omnipotent
.align 16
bn_scatter5:
# I can't trust assembler to use specific encoding:-(
.byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
# I can't trust assembler to use specific encoding:-(
.byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # beginning of body label
+ cmp %r10,%rbx # context->Rip<body label
+ jb .Lcommon_pop_regs
+
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer