From: Andy Polyakov <appro@openssl.org>
Date: Fri, 3 Feb 2017 11:07:16 +0000 (+0100)
Subject: x86_64 assembly pack: Win64 SEH face-lift.
X-Git-Tag: OpenSSL_1_1_1-pre1~2474
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=384e6de4c7e35e37fb3d6fbeb32ddcb5eb0d3d3f;p=oweals%2Fopenssl.git

x86_64 assembly pack: Win64 SEH face-lift.

- harmonize handlers with guidelines and themselves;
- fix some bugs in handlers;
- add missing handlers in chacha and ecp_nistz256 modules;

Reviewed-by: Rich Salz <rsalz@openssl.org>
---

diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl
index ae7fde20fe..5eecfdf8cf 100755
--- a/crypto/aes/asm/aes-x86_64.pl
+++ b/crypto/aes/asm/aes-x86_64.pl
@@ -599,6 +599,7 @@ $code.=<<___;
 .hidden asm_AES_encrypt
 asm_AES_encrypt:
 AES_encrypt:
+ mov %rsp,%rax
 push %rbx
 push %rbp
 push %r12
@@ -607,7 +608,6 @@ AES_encrypt:
 push %r15

 # allocate frame "above" key schedule
- mov %rsp,%r10
 lea -63(%rdx),%rcx # %rdx is key argument
 and \$-64,%rsp
 sub %rsp,%rcx
@@ -617,7 +617,7 @@ AES_encrypt:
 sub \$32,%rsp

 mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Lenc_prologue:

 mov %rdx,$key
@@ -649,13 +649,13 @@ AES_encrypt:
 mov $s2,8($out)
 mov $s3,12($out)

- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Lenc_epilogue:
 ret
.size AES_encrypt,.-AES_encrypt
@@ -1197,6 +1197,7 @@ $code.=<<___;
 .hidden asm_AES_decrypt
 asm_AES_decrypt:
 AES_decrypt:
+ mov %rsp,%rax
 push %rbx
 push %rbp
 push %r12
@@ -1205,7 +1206,6 @@ AES_decrypt:
 push %r15

 # allocate frame "above" key schedule
- mov %rsp,%r10
 lea -63(%rdx),%rcx # %rdx is key argument
 and \$-64,%rsp
 sub %rsp,%rcx
@@ -1215,7 +1215,7 @@ AES_decrypt:
 sub \$32,%rsp

 mov %rsi,16(%rsp) # save out
- mov %r10,24(%rsp) # save real stack pointer
+ mov %rax,24(%rsp) # save original stack pointer
.Ldec_prologue:

 mov %rdx,$key
@@ -1249,13 +1249,13 @@ AES_decrypt:
 mov $s2,8($out)
 mov $s3,12($out)

- mov (%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
.Ldec_epilogue:
 ret
.size AES_decrypt,.-AES_decrypt
@@ -1675,10 +1675,9 @@ AES_cbc_encrypt:
 mov %r9d,%r9d # clear upper half of enc

 lea .LAES_Te(%rip),$sbox
+ lea .LAES_Td(%rip),%r10
 cmp \$0,%r9
- jne .Lcbc_picked_te
- lea .LAES_Td(%rip),$sbox
-.Lcbc_picked_te:
+ cmoveq %r10,$sbox

 mov OPENSSL_ia32cap_P(%rip),%r10d
 cmp \$$speed_limit,%rdx
@@ -2580,7 +2579,6 @@ block_se_handler:
 jae .Lin_block_prologue

 mov 24(%rax),%rax # pull saved real stack pointer
- lea 48(%rax),%rax # adjust...
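#
# All hunks above follow the prologue/epilogue convention this face-lift
# installs across the pack; a minimal sketch (labels and the %rax/%rsi
# choice mirror AES_encrypt above, they are not mandated):
#
#	func:
#		mov	%rsp,%rax		# capture %rsp before any push
#		push	%rbx			# %rbx lands at -8(%rax)
#		...				# and so on, down to %r15 at -48(%rax)
#		mov	%rax,24(%rsp)		# stash ORIGINAL stack pointer
#	epilogue:
#		mov	24(%rsp),%rsi		# reload it
#		mov	-48(%rsi),%r15		# restore at negative offsets
#		...
#		mov	-8(%rsi),%rbx
#		lea	(%rsi),%rsp		# one step back to caller's frame
#
# Because the saved value points above the pushed registers, the epilogue
# and the SEH handler share one addressing scheme, which is what lets the
# "lea 48(%rax),%rax" re-bias be dropped here.
#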
mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/crypto/aes/asm/aesni-sha256-x86_64.pl b/crypto/aes/asm/aesni-sha256-x86_64.pl index ba4964a850..2d6424fecd 100644 --- a/crypto/aes/asm/aesni-sha256-x86_64.pl +++ b/crypto/aes/asm/aesni-sha256-x86_64.pl @@ -341,13 +341,13 @@ $code.=<<___; ${func}_xop: .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -363,7 +363,7 @@ ${func}_xop: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -617,13 +617,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_xop: ret .size ${func}_xop,.-${func}_xop @@ -639,13 +639,13 @@ $code.=<<___; ${func}_avx: .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame @@ -661,7 +661,7 @@ ${func}_avx: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -868,13 +868,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx: ret .size ${func}_avx,.-${func}_avx @@ -935,13 +935,13 @@ $code.=<<___; ${func}_avx2: .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter + mov %rsp,%rax # copy %rsp push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 - mov %rsp,%r11 # copy %rsp sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp @@ -958,7 +958,7 @@ ${func}_avx2: mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 - mov %r11,$_rsp + mov %rax,$_rsp ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) @@ -1205,13 +1205,13 @@ $code.=<<___ if ($win64); movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lepilogue_avx2: ret .size ${func}_avx2,.-${func}_avx2 @@ -1569,7 +1569,6 @@ ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+7*8(%rax),%rax # pull $_rsp - lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index 443f2f7542..8ae6dbfa4f 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -1172,7 +1172,7 @@ ___ # with zero-round key xor. 
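#
# The ctr32/xts/cbc code below is the harder case: these functions also
# align the stack (and \$-16,%rsp), so the caller's %rsp cannot be
# recomputed at exit and has to stay live in a register. The pattern used
# from here on dedicates a frame-pointer register per function ($key_,
# i.e. %r11, for ctr32; %r11 explicitly for xts and cbc; %r9 in the chacha
# module) and keys every save slot off it; a sketch with %r11:
#
#	lea	(%rsp),%r11		# frame pointer = incoming %rsp
#	push	%rbp
#	sub	\$frame_size,%rsp
#	and	\$-16,%rsp
#	movaps	%xmm6,-0xa8(%r11)	# xmm save area at fixed offsets
#	...
#	movaps	%xmm15,-0x18(%r11)
#	...
#	mov	-8(%r11),%rbp		# epilogue: reload, do not pop
#	lea	(%r11),%rsp
#
# A handler can then rebuild the frame from the context's copy of that one
# register ("mov 208($context),%rax # pull context->R11" further down)
# instead of trusting %rbp, which is also why the xmm slots move from
# -0xa0(%rbp), a base that was biased by 8, to -0xa8 off the true frame
# pointer.
#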
{ my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); -my ($key0,$ctr)=("${key_}d","${ivp}d"); +my ($key0,$ctr)=("%ebp","${ivp}d"); my $frame_size = 0x80 + ($win64?160:0); $code.=<<___; @@ -1201,26 +1201,25 @@ $code.=<<___; .align 16 .Lctr32_bulk: - lea (%rsp),%rax + lea (%rsp),$key_ # use $key_ as frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8($key_) # offload everything + movaps %xmm7,-0x98($key_) + movaps %xmm8,-0x88($key_) + movaps %xmm9,-0x78($key_) + movaps %xmm10,-0x68($key_) + movaps %xmm11,-0x58($key_) + movaps %xmm12,-0x48($key_) + movaps %xmm13,-0x38($key_) + movaps %xmm14,-0x28($key_) + movaps %xmm15,-0x18($key_) .Lctr32_body: ___ $code.=<<___; - lea -8(%rax),%rbp # 8 16-byte words on top of stack are counter values # xor-ed with zero-round key @@ -1692,26 +1691,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8($key_),%xmm6 + movaps %xmm0,-0xa8($key_) # clear stack + movaps -0x98($key_),%xmm7 + movaps %xmm0,-0x98($key_) + movaps -0x88($key_),%xmm8 + movaps %xmm0,-0x88($key_) + movaps -0x78($key_),%xmm9 + movaps %xmm0,-0x78($key_) + movaps -0x68($key_),%xmm10 + movaps %xmm0,-0x68($key_) + movaps -0x58($key_),%xmm11 + movaps %xmm0,-0x58($key_) + movaps -0x48($key_),%xmm12 + movaps %xmm0,-0x48($key_) + movaps -0x38($key_),%xmm13 + movaps %xmm0,-0x38($key_) + movaps -0x28($key_),%xmm14 + movaps %xmm0,-0x28($key_) + movaps -0x18($key_),%xmm15 + movaps %xmm0,-0x18($key_) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -1722,8 +1721,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x70(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8($key_),%rbp + lea ($key_),%rsp .Lctr32_epilogue: ret .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -1740,32 +1739,32 @@ my @tweak=map("%xmm$_",(10..15)); my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); my $frame_size = 0x70 + ($win64?160:0); +my $key_ = "%rbp"; # override so that we can use %r11 as FP $code.=<<___; .globl aesni_xts_encrypt .type aesni_xts_encrypt,\@function,6 .align 16 aesni_xts_encrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - 
movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_enc_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240(%r8),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds @@ -2183,26 +2182,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps %xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2212,8 +2211,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_enc_epilogue: ret .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -2224,26 +2223,25 @@ $code.=<<___; .type aesni_xts_decrypt,\@function,6 .align 16 aesni_xts_decrypt: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded ___ $code.=<<___ if ($win64); - movaps %xmm6,-0xa8(%rax) # offload everything - movaps %xmm7,-0x98(%rax) - movaps %xmm8,-0x88(%rax) - movaps %xmm9,-0x78(%rax) - movaps %xmm10,-0x68(%rax) - movaps %xmm11,-0x58(%rax) - movaps %xmm12,-0x48(%rax) - movaps %xmm13,-0x38(%rax) - movaps %xmm14,-0x28(%rax) - movaps %xmm15,-0x18(%rax) + movaps %xmm6,-0xa8(%r11) # offload everything + movaps %xmm7,-0x98(%r11) + movaps %xmm8,-0x88(%r11) + movaps %xmm9,-0x78(%r11) + movaps %xmm10,-0x68(%r11) + movaps %xmm11,-0x58(%r11) + movaps %xmm12,-0x48(%r11) + movaps %xmm13,-0x38(%r11) + movaps %xmm14,-0x28(%r11) + movaps %xmm15,-0x18(%r11) .Lxts_dec_body: ___ $code.=<<___; - lea -8(%rax),%rbp movups ($ivp),$inout0 # load clear-text tweak mov 240($key2),$rounds # key2->rounds mov 240($key),$rnds_ # key1->rounds @@ -2687,26 +2685,26 @@ $code.=<<___ if (!$win64); pxor %xmm15,%xmm15 ___ $code.=<<___ if ($win64); - movaps -0xa0(%rbp),%xmm6 - movaps %xmm0,-0xa0(%rbp) # clear stack - movaps -0x90(%rbp),%xmm7 - movaps %xmm0,-0x90(%rbp) - movaps -0x80(%rbp),%xmm8 - movaps %xmm0,-0x80(%rbp) - movaps -0x70(%rbp),%xmm9 - movaps %xmm0,-0x70(%rbp) - movaps -0x60(%rbp),%xmm10 - movaps %xmm0,-0x60(%rbp) - movaps -0x50(%rbp),%xmm11 - movaps %xmm0,-0x50(%rbp) - movaps -0x40(%rbp),%xmm12 - movaps 
%xmm0,-0x40(%rbp) - movaps -0x30(%rbp),%xmm13 - movaps %xmm0,-0x30(%rbp) - movaps -0x20(%rbp),%xmm14 - movaps %xmm0,-0x20(%rbp) - movaps -0x10(%rbp),%xmm15 - movaps %xmm0,-0x10(%rbp) + movaps -0xa8(%r11),%xmm6 + movaps %xmm0,-0xa8(%r11) # clear stack + movaps -0x98(%r11),%xmm7 + movaps %xmm0,-0x98(%r11) + movaps -0x88(%r11),%xmm8 + movaps %xmm0,-0x88(%r11) + movaps -0x78(%r11),%xmm9 + movaps %xmm0,-0x78(%r11) + movaps -0x68(%r11),%xmm10 + movaps %xmm0,-0x68(%r11) + movaps -0x58(%r11),%xmm11 + movaps %xmm0,-0x58(%r11) + movaps -0x48(%r11),%xmm12 + movaps %xmm0,-0x48(%r11) + movaps -0x38(%r11),%xmm13 + movaps %xmm0,-0x38(%r11) + movaps -0x28(%r11),%xmm14 + movaps %xmm0,-0x28(%r11) + movaps -0x18(%r11),%xmm15 + movaps %xmm0,-0x18(%r11) movaps %xmm0,0x00(%rsp) movaps %xmm0,0x10(%rsp) movaps %xmm0,0x20(%rsp) @@ -2716,8 +2714,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0x60(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lxts_dec_epilogue: ret .size aesni_xts_decrypt,.-aesni_xts_decrypt @@ -2943,6 +2941,7 @@ $code.=<<___ if (!$win64); pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 @@ -2967,14 +2966,14 @@ $code.=<<___ if ($win64); movaps %xmm0,0x90(%rsp) lea 0xa0+0x28(%rsp),%rax .Locb_enc_pop: - lea 0xa0(%rsp),%rsp ___ $code.=<<___; - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp .Locb_enc_epilogue: ret .size aesni_ocb_encrypt,.-aesni_ocb_encrypt @@ -3410,6 +3409,7 @@ $code.=<<___ if (!$win64); pxor %xmm13,%xmm13 pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 + lea 0x28(%rsp),%rax ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 @@ -3434,14 +3434,14 @@ $code.=<<___ if ($win64); movaps %xmm0,0x90(%rsp) lea 0xa0+0x28(%rsp),%rax .Locb_dec_pop: - lea 0xa0(%rsp),%rsp ___ $code.=<<___; - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + mov -40(%rax),%r14 + mov -32(%rax),%r13 + mov -24(%rax),%r12 + mov -16(%rax),%rbp + mov -8(%rax),%rbx + lea (%rax),%rsp .Locb_dec_epilogue: ret .size aesni_ocb_decrypt,.-aesni_ocb_decrypt @@ -3650,7 +3650,6 @@ ___ { my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); -my $inp_=$key_; $code.=<<___; .globl ${PREFIX}_cbc_encrypt @@ -3732,7 +3731,7 @@ $code.=<<___; jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - lea (%rsp),%rax + lea (%rsp),%r11 # frame pointer push %rbp sub \$$frame_size,%rsp and \$-16,%rsp # Linux kernel stack can be incorrectly seeded @@ -3750,8 +3749,11 @@ $code.=<<___ if ($win64); movaps %xmm15,0xa0(%rsp) .Lcbc_decrypt_body: ___ + +my $inp_=$key_="%rbp"; # reassign $key_ + $code.=<<___; - lea -8(%rax),%rbp + mov $key,$key_ # [re-]backup $key [after reassignment] movups ($ivp),$iv mov $rnds_,$rounds cmp \$0x50,$len @@ -3791,7 +3793,7 @@ $code.=<<___; pxor $rndkey0,$inout1 $movkey 0x10-0x70($key),$rndkey1 pxor $rndkey0,$inout2 - xor $inp_,$inp_ + mov \$-1,$inp_ cmp \$0x70,$len # is there at least 0x60 bytes ahead? 
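#
# The "mov \$-1,$inp_" above and the "adc/and" a few lines below replace
# the old xor/setnc/shl sequence; the two possible traces are:
#
#	mov	\$-1,$inp_		# $inp_ = 0xff..ff, flags untouched
#	cmp	\$0x70,$len		# CF = ($len < 0x70)
#	...				# aesdec/pxor do not disturb CF
#	adc	\$0,$inp_		# CF=1: -1+1 = 0;  CF=0: still -1
#	and	\$128,$inp_		# 0 or 128
#	...
#	add	$inp,$inp_		# $inp_ = $inp, or $inp+128
#
# i.e. the same 0-or-128 displacement that "setnc ${inp_}b; shl \$7,$inp_"
# produced, but with no byte-subregister write, presumably preferable now
# that $inp_ aliases %rbp.
#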
pxor $rndkey0,$inout3 pxor $rndkey0,$inout4 @@ -3807,8 +3809,8 @@ $code.=<<___; aesdec $rndkey1,$inout4 aesdec $rndkey1,$inout5 aesdec $rndkey1,$inout6 - setnc ${inp_}b - shl \$7,$inp_ + adc \$0,$inp_ + and \$128,$inp_ aesdec $rndkey1,$inout7 add $inp,$inp_ $movkey 0x30-0x70($key),$rndkey1 @@ -4172,8 +4174,8 @@ $code.=<<___ if ($win64); movaps %xmm0,0xa0(%rsp) ___ $code.=<<___; - lea (%rbp),%rsp - pop %rbp + mov -8(%r11),%rbp + lea (%r11),%rsp .Lcbc_ret: ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt @@ -4744,13 +4746,16 @@ ctr_xts_se_handler: cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail - mov 160($context),%rax # pull context->Rbp - lea -0xa0(%rax),%rsi # %xmm save area + mov 208($context),%rax # pull context->R11 + + lea -0xa8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - jmp .Lcommon_rbp_tail + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp + jmp .Lcommon_seh_tail .size ctr_xts_se_handler,.-ctr_xts_se_handler .type ocb_se_handler,\@abi-omnipotent @@ -4834,9 +4839,13 @@ cbc_se_handler: cmp %r10,%rbx # context->Rip<"prologue" label jb .Lcommon_seh_tail + mov 120($context),%rax # pull context->Rax + lea .Lcbc_decrypt_body(%rip),%r10 cmp %r10,%rbx # context->RipRsp lea .Lcbc_ret(%rip),%r10 cmp %r10,%rbx # context->Rip>="epilogue" label @@ -4847,15 +4856,10 @@ cbc_se_handler: mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq -.Lcommon_rbp_tail: - mov 160($context),%rax # pull context->Rbp - mov (%rax),%rbp # restore saved %rbp - lea 8(%rax),%rax # adjust stack pointer - mov %rbp,160($context) # restore context->Rbp - jmp .Lcommon_seh_tail + mov 208($context),%rax # pull context->R11 -.Lrestore_cbc_rax: - mov 120($context),%rax + mov -8(%rax),%rbp # restore saved %rbp + mov %rbp,160($context) # restore context->Rbp .Lcommon_seh_tail: mov 8(%rax),%rdi diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl index 0d49947d81..cce5795302 100644 --- a/crypto/aes/asm/bsaes-x86_64.pl +++ b/crypto/aes/asm/bsaes-x86_64.pl @@ -1334,7 +1334,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1347,17 +1347,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_enc_epilogue: ret .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks @@ -1536,7 +1536,7 @@ $code.=<<___; cmp %rax, %rbp jb .Lecb_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1549,17 +1549,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lecb_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + 
mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lecb_dec_epilogue: ret .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks @@ -1826,7 +1826,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lcbc_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -1839,17 +1839,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lcbc_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lcbc_dec_epilogue: ret .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -2058,7 +2058,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lctr_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2071,17 +2071,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lctr_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lctr_enc_epilogue: ret .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -2448,7 +2448,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_enc_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2461,17 +2461,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_enc_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_enc_epilogue: ret .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2855,7 +2855,7 @@ $code.=<<___; cmp %rax, %rbp ja .Lxts_dec_bzero - lea (%rbp),%rsp # restore %rsp + lea 0x78(%rbp),%rax ___ $code.=<<___ if ($win64); movaps 0x40(%rbp), %xmm6 @@ -2868,17 +2868,17 @@ $code.=<<___ if ($win64); movaps 0xb0(%rbp), %xmm13 movaps 0xc0(%rbp), %xmm14 movaps 0xd0(%rbp), %xmm15 - lea 0xa0(%rbp), %rsp + lea 0xa0(%rax), %rax +.Lxts_dec_tail: ___ $code.=<<___; - mov 0x48(%rsp), %r15 - mov 0x50(%rsp), %r14 - mov 0x58(%rsp), %r13 - mov 0x60(%rsp), %r12 - mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rax - lea 0x78(%rsp), %rsp - mov %rax, %rbp + mov -48(%rax), %r15 + mov -40(%rax), %r14 + mov -32(%rax), %r13 + mov -24(%rax), %r12 + mov -16(%rax), %rbx + mov -8(%rax), %rbp + lea (%rax), %rsp # restore %rsp .Lxts_dec_epilogue: ret .size bsaes_xts_decrypt,.-bsaes_xts_decrypt @@ -2974,31 +2974,34 @@ se_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 
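#
# bsaes' se_handler is about to grow a third range check; the new dispatch,
# in outline (mirroring the rewritten code that follows):
#
#	mov	0(%r11),%r10d		# HandlerData[0] = prologue label
#	cmp	%r10,%rbx		# Rip <= prologue? frame not built yet
#	jbe	.Lin_prologue
#	mov	4(%r11),%r10d		# HandlerData[1] = epilogue label
#	cmp	%r10,%rbx		# Rip >= epilogue? frame already gone
#	jae	.Lin_prologue
#	mov	8(%r11),%r10d		# HandlerData[2] = new .L*_tail label
#	cmp	%r10,%rbx		# Rip >= tail? xmm already restored,
#	jae	.Lin_tail		# so skip the xmm copy-out
#	...				# copy xmm6-15 out of the frame
#	...				# then reload GPRs at -48..-8(%rax)
#
# Correspondingly, each .xdata record gains a third .rva (.L*_tail) and a
# .long 0 of padding, as added at the bottom of this file's hunks.
#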
# prologue label - cmp %r10,%rbx # context->RipRsp + cmp %r10,%rbx # context->Rip<=prologue label + jbe .Lin_prologue mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=tail label + jae .Lin_tail + mov 160($context),%rax # pull context->Rbp lea 0x40(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq - lea 0xa0(%rax),%rax # adjust stack pointer - - mov 0x70(%rax),%rbp - mov 0x68(%rax),%rbx - mov 0x60(%rax),%r12 - mov 0x58(%rax),%r13 - mov 0x50(%rax),%r14 - mov 0x48(%rax),%r15 - lea 0x78(%rax),%rax # adjust stack pointer + lea 0xa0+0x78(%rax),%rax # adjust stack pointer + +.Lin_tail: + mov -48(%rax),%rbp + mov -40(%rax),%rbx + mov -32(%rax),%r12 + mov -24(%rax),%r13 + mov -16(%rax),%r14 + mov -8(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 @@ -3079,28 +3082,40 @@ $code.=<<___ if ($ecb); .byte 9,0,0,0 .rva se_handler .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] + .rva .Lecb_enc_tail + .long 0 .Lecb_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] + .rva .Lecb_dec_tail + .long 0 ___ $code.=<<___; .Lcbc_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] + .rva .Lcbc_dec_tail + .long 0 .Lctr_enc_info: .byte 9,0,0,0 .rva se_handler .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] + .rva .Lctr_enc_tail + .long 0 .Lxts_enc_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] + .rva .Lxts_enc_tail + .long 0 .Lxts_dec_info: .byte 9,0,0,0 .rva se_handler .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] + .rva .Lxts_dec_tail + .long 0 ___ } diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl index f34c84f452..e620285e61 100755 --- a/crypto/bn/asm/rsaz-avx2.pl +++ b/crypto/bn/asm/rsaz-avx2.pl @@ -1738,11 +1738,11 @@ $code.=<<___ if ($win64); movaps -0x38(%r11),%xmm13 movaps -0x28(%r11),%xmm14 movaps -0x18(%r11),%xmm15 -.LSEH_end_rsaz_1024_gather5: ___ $code.=<<___; lea (%r11),%rsp ret +.LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ } diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl index d962f62033..d237c1e3d2 100644 --- a/crypto/bn/asm/x86_64-gf2m.pl +++ b/crypto/bn/asm/x86_64-gf2m.pl @@ -174,8 +174,9 @@ $code.=<<___; .type bn_GF2m_mul_2x2,\@abi-omnipotent .align 16 bn_GF2m_mul_2x2: - mov OPENSSL_ia32cap_P(%rip),%rax - bt \$33,%rax + mov %rsp,%rax + mov OPENSSL_ia32cap_P(%rip),%r10 + bt \$33,%r10 jnc .Lvanilla_mul_2x2 movq $a1,%xmm0 @@ -280,6 +281,7 @@ $code.=<<___ if ($win64); ___ $code.=<<___; lea 8*17(%rsp),%rsp +.Lepilogue_mul_2x2: ret .Lend_mul_2x2: .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 @@ -312,13 +314,19 @@ se_handler: pushfq sub \$64,%rsp - mov 152($context),%rax # pull context->Rsp + mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lbody_mul_2x2(%rip),%r10 cmp %r10,%rbx # context->Rip<"prologue" label jb .Lin_prologue + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue_mul_2x2(%rip),%r10 + cmp %r10,%rbx # context->Rip>="epilogue" label + jae .Lin_prologue + mov 8*10(%rax),%r14 # mimic epilogue mov 8*11(%rax),%r13 mov 
8*12(%rax),%r12 @@ -335,8 +343,9 @@ se_handler: mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 -.Lin_prologue: lea 8*17(%rax),%rax + +.Lin_prologue: mov %rax,152($context) # restore context->Rsp mov 40($disp),%rdi # disp->ContextRecord diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index df4cca5bfe..7b5e88547b 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -695,10 +695,11 @@ ___ my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp + lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] pxor %xmm0,%xmm0 mov 8(%rsp),@ri[1] # tp[1] - shr \$2,$num # num/=4 + shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! @@ -706,7 +707,6 @@ $code.=<<___; mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] - lea -1($num),$j # j=num/4-1 jmp .Lsub4x .align 16 .Lsub4x: @@ -740,8 +740,9 @@ $code.=<<___; not @ri[0] mov $rp,$np and @ri[0],$np - lea -1($num),$j + lea -4($num),$j or $np,$ap # ap=borrow?tp:rp + shr \$2,$j # j=num/4-1 movdqu ($ap),%xmm1 movdqa %xmm0,(%rsp) @@ -759,7 +760,6 @@ $code.=<<___; dec $j jnz .Lcopy4x - shl \$2,$num movdqu 16($ap,$i),%xmm2 movdqa %xmm0,16(%rsp,$i) movdqu %xmm2,16($rp,$i) @@ -1401,12 +1401,12 @@ sqr_handler: mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label - cmp %r10,%rbx # context->Rip<.Lsqr_body + cmp %r10,%rbx # context->Rip<.Lsqr_prologue jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # body label - cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_pop_regs mov 152($context),%rax # pull context->Rsp diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index d041d738cf..226f4360d6 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -3669,8 +3669,8 @@ mul_handler: jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label + lea (%rsi,%r10),%r10 # beginning of body label + cmp %r10,%rbx # context->RipRsp diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index ac169ee33c..7fc1749f53 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -261,6 +261,7 @@ $code.=<<___; push %r14 push %r15 sub \$64+24,%rsp +.Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 movdqu ($key),%xmm1 @@ -399,13 +400,14 @@ $code.=<<___; jnz .Loop_tail .Ldone: - add \$64+24,%rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx + lea 64+24+48(%rsp),%rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbp + mov -8(%rsi),%rbx + lea (%rsi),%rsp .Lno_data: ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -440,13 +442,14 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &por ($b,$t); } -my $xframe = $win64 ? 32+32+8 : 24; +my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; .type ChaCha20_ssse3,\@function,5 .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: + mov %rsp,%r9 # frame pointer ___ $code.=<<___ if ($avx); test \$`1<<(43-32)`,%r10d @@ -457,18 +460,12 @@ $code.=<<___; ja .LChaCha20_4x # but overall it won't be slower .Ldo_sse3_after_all: - push %rbx # just to share SEH handler, no pops - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,64+32(%rsp) - movaps %xmm7,64+48(%rsp) + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lssse3_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a @@ -563,11 +560,12 @@ $code.=<<___; .Ldone_ssse3: ___ $code.=<<___ if ($win64); - movaps 64+32(%rsp),%xmm6 - movaps 64+48(%rsp),%xmm7 + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 ___ $code.=<<___; - add \$64+$xframe+48,%rsp + lea (%r9),%rsp +.Lssse3_epilogue: ret .size ChaCha20_ssse3,.-ChaCha20_ssse3 ___ @@ -704,13 +702,14 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xa0 : 0; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_4x,\@function,5 .align 32 ChaCha20_4x: .LChaCha20_4x: + mov %rsp,%r9 # frame pointer mov %r10,%r11 ___ $code.=<<___ if ($avx>1); @@ -727,8 +726,7 @@ $code.=<<___; je .Ldo_sse3_after_all # to detect Atom .Lproceed4x: - lea -0x78(%rsp),%r11 - sub \$0x148+$xframe,%rsp + sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] @@ -739,16 +737,17 @@ ___ # ... # +0x140 $code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$xa3 # key[0] @@ -1137,20 +1136,20 @@ $code.=<<___; .Ldone4x: ___ $code.=<<___ if ($win64); - lea 0x140+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - add \$0x148+$xframe,%rsp + lea (%r9),%rsp +.L4x_epilogue: ret .size ChaCha20_4x,.-ChaCha20_4x ___ @@ -1232,15 +1231,15 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xa0 : 0; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_4xop,\@function,5 .align 32 ChaCha20_4xop: .LChaCha20_4xop: - lea -0x78(%rsp),%r11 - sub \$0x148+$xframe,%rsp + mov %rsp,%r9 # frame pointer + sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] @@ -1251,16 +1250,17 @@ ___ # ... 
# +0x140 $code.=<<___ if ($win64); - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4xop_body: ___ $code.=<<___; vzeroupper @@ -1588,20 +1588,20 @@ $code.=<<___; vzeroupper ___ $code.=<<___ if ($win64); - lea 0x140+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - add \$0x148+$xframe,%rsp + lea (%r9),%rsp +.L4xop_epilogue: ret .size ChaCha20_4xop,.-ChaCha20_4xop ___ @@ -1729,33 +1729,32 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xb0 : 8; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_8x,\@function,5 .align 32 ChaCha20_8x: .LChaCha20_8x: - mov %rsp,%r10 + mov %rsp,%r9 # frame register sub \$0x280+$xframe,%rsp and \$-32,%rsp ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: ___ $code.=<<___; vzeroupper - mov %r10,0x280(%rsp) ################ stack layout # +0x00 SIMD equivalent of @x[8-12] @@ -1764,7 +1763,7 @@ $code.=<<___; # ... # +0x200 SIMD counters (with nonce smashed by lanes) # ... - # +0x280 saved %rsp + # +0x280 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] @@ -2230,20 +2229,20 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - mov 0x280(%rsp),%rsp + lea (%r9),%rsp +.L8x_epilogue: ret .size ChaCha20_8x,.-ChaCha20_8x ___ @@ -2275,28 +2274,23 @@ sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round &vprold ($b,$b,7); } -my $xframe = $win64 ? 32+32+8 : 24; +my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; .type ChaCha20_avx512,\@function,5 .align 32 ChaCha20_avx512: .LChaCha20_avx512: + mov %rsp,%r9 # frame pointer cmp \$512,$len ja .LChaCha20_16x - push %rbx # just to share SEH handler, no pops - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); - movaps %xmm6,64+32(%rsp) - movaps %xmm7,64+48(%rsp) + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lavx512_body: ___ $code.=<<___; vbroadcasti32x4 .Lsigma(%rip),$a @@ -2462,11 +2456,12 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - movaps 64+32(%rsp),%xmm6 - movaps 64+48(%rsp),%xmm7 + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 ___ $code.=<<___; - add \$64+$xframe+48,%rsp + lea (%r9),%rsp +.Lavx512_epilogue: ret .size ChaCha20_avx512,.-ChaCha20_avx512 ___ @@ -2543,29 +2538,29 @@ my @x=map("\"$_\"",@xx); ); } -my $xframe = $win64 ? 0xb0 : 8; +my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_16x,\@function,5 .align 32 ChaCha20_16x: .LChaCha20_16x: - mov %rsp,%r11 + mov %rsp,%r9 # frame register sub \$64+$xframe,%rsp and \$-64,%rsp ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps %xmm6,-0x30(%r11) - movaps %xmm7,-0x20(%r11) - movaps %xmm8,-0x10(%r11) - movaps %xmm9,0x00(%r11) - movaps %xmm10,0x10(%r11) - movaps %xmm11,0x20(%r11) - movaps %xmm12,0x30(%r11) - movaps %xmm13,0x40(%r11) - movaps %xmm14,0x50(%r11) - movaps %xmm15,0x60(%r11) + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L16x_body: ___ $code.=<<___; vzeroupper @@ -2955,25 +2950,275 @@ $code.=<<___; vzeroall ___ $code.=<<___ if ($win64); - lea 0x290+0x30(%rsp),%r11 - movaps -0x30(%r11),%xmm6 - movaps -0x20(%r11),%xmm7 - movaps -0x10(%r11),%xmm8 - movaps 0x00(%r11),%xmm9 - movaps 0x10(%r11),%xmm10 - movaps 0x20(%r11),%xmm11 - movaps 0x30(%r11),%xmm12 - movaps 0x40(%r11),%xmm13 - movaps 0x50(%r11),%xmm14 - movaps 0x60(%r11),%xmm15 + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 ___ $code.=<<___; - mov %r11,%rsp + lea (%r9),%rsp +.L16x_epilogue: ret .size ChaCha20_16x,.-ChaCha20_16x ___ } +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + 
mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R14 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0x28(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$4,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size ssse3_handler,.-ssse3_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipR9 + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea -0xa8(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size full_handler,.-full_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32 + .rva .LSEH_end_ChaCha20_ctr32 + .rva .LSEH_info_ChaCha20_ctr32 + + .rva .LSEH_begin_ChaCha20_ssse3 + .rva .LSEH_end_ChaCha20_ssse3 + .rva .LSEH_info_ChaCha20_ssse3 + + .rva .LSEH_begin_ChaCha20_4x + .rva .LSEH_end_ChaCha20_4x + .rva .LSEH_info_ChaCha20_4x +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_ChaCha20_4xop + .rva .LSEH_end_ChaCha20_4xop + .rva .LSEH_info_ChaCha20_4xop +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_8x + .rva .LSEH_end_ChaCha20_8x + .rva .LSEH_info_ChaCha20_8x +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_ChaCha20_avx512 + .rva .LSEH_end_ChaCha20_avx512 + .rva 
.LSEH_info_ChaCha20_avx512 + + .rva .LSEH_begin_ChaCha20_16x + .rva .LSEH_end_ChaCha20_16x + .rva .LSEH_info_ChaCha20_16x +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lssse3_body,.Lssse3_epilogue + +.LSEH_info_ChaCha20_4x: + .byte 9,0,0,0 + .rva full_handler + .rva .L4x_body,.L4x_epilogue +___ +$code.=<<___ if ($avx); +.LSEH_info_ChaCha20_4xop: + .byte 9,0,0,0 + .rva full_handler + .rva .L4xop_body,.L4xop_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_8x: + .byte 9,0,0,0 + .rva full_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_ChaCha20_avx512: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] + +.LSEH_info_ChaCha20_16x: + .byte 9,0,0,0 + .rva full_handler + .rva .L16x_body,.L16x_epilogue # HandlerData[] +___ +} + foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl index adb49f37dd..1028c09c06 100755 --- a/crypto/ec/asm/ecp_nistz256-x86_64.pl +++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl @@ -133,6 +133,7 @@ $code.=<<___; ecp_nistz256_mul_by_2: push %r12 push %r13 +.Lmul_by_2_body: mov 8*0($a_ptr), $a0 xor $t4,$t4 @@ -165,8 +166,10 @@ ecp_nistz256_mul_by_2: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Lmul_by_2_epilogue: ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -178,6 +181,7 @@ ecp_nistz256_mul_by_2: ecp_nistz256_div_by_2: push %r12 push %r13 +.Ldiv_by_2_body: mov 8*0($a_ptr), $a0 mov 8*1($a_ptr), $a1 @@ -225,8 +229,10 @@ ecp_nistz256_div_by_2: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Ldiv_by_2_epilogue: ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -238,6 +244,7 @@ ecp_nistz256_div_by_2: ecp_nistz256_mul_by_3: push %r12 push %r13 +.Lmul_by_3_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -291,8 +298,10 @@ ecp_nistz256_mul_by_3: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Lmul_by_3_epilogue: ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -304,6 +313,7 @@ ecp_nistz256_mul_by_3: ecp_nistz256_add: push %r12 push %r13 +.Ladd_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -337,8 +347,10 @@ ecp_nistz256_add: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Ladd_epilogue: ret .size ecp_nistz256_add,.-ecp_nistz256_add @@ -350,6 +362,7 @@ ecp_nistz256_add: ecp_nistz256_sub: push %r12 push %r13 +.Lsub_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 @@ -383,8 +396,10 @@ ecp_nistz256_sub: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Lsub_epilogue: ret .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -396,6 +411,7 @@ ecp_nistz256_sub: ecp_nistz256_neg: push %r12 push %r13 +.Lneg_body: xor $a0, $a0 xor $a1, $a1 @@ -429,8 +445,10 @@ ecp_nistz256_neg: mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Lneg_epilogue: ret .size ecp_nistz256_neg,.-ecp_nistz256_neg ___ @@ -483,6 +501,7 @@ $code.=<<___; push %r13 push %r14 push %r15 +.Lmul_body: ___ $code.=<<___ if ($addx); cmp 
\$0x80100, %ecx @@ -515,12 +534,14 @@ $code.=<<___ if ($addx); ___ $code.=<<___; .Lmul_mont_done: - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + mov 0(%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r13 + mov 24(%rsp),%r12 + mov 32(%rsp),%rbx + mov 40(%rsp),%rbp + lea 48(%rsp),%rsp +.Lmul_epilogue: ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont @@ -763,6 +784,7 @@ $code.=<<___; push %r13 push %r14 push %r15 +.Lsqr_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx @@ -791,12 +813,14 @@ $code.=<<___ if ($addx); ___ $code.=<<___; .Lsqr_mont_done: - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + mov 0(%rsp),%r15 + mov 8(%rsp),%r14 + mov 16(%rsp),%r13 + mov 24(%rsp),%r12 + mov 32(%rsp),%rbx + mov 40(%rsp),%rbp + lea 48(%rsp),%rsp +.Lsqr_epilogue: ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont @@ -1284,6 +1308,7 @@ $code.=<<___; ecp_nistz256_from_mont: push %r12 push %r13 +.Lfrom_body: mov 8*0($in_ptr), %rax mov .Lpoly+8*3(%rip), $t2 @@ -1364,8 +1389,10 @@ ecp_nistz256_from_mont: mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) - pop %r13 - pop %r12 + mov 0(%rsp),%r13 + mov 8(%rsp),%r12 + lea 16(%rsp),%rsp +.Lfrom_epilogue: ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont ___ @@ -1492,10 +1519,10 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_gather_w5: ___ $code.=<<___; ret +.LSEH_end_ecp_nistz256_gather_w5: .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ################################################################################ @@ -1597,10 +1624,10 @@ $code.=<<___ if ($win64); movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_gather_w7: ___ $code.=<<___; ret +.LSEH_end_ecp_nistz256_gather_w7: .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ } @@ -1621,18 +1648,19 @@ ecp_nistz256_avx2_gather_w5: ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax + mov %rsp,%r11 .LSEH_begin_ecp_nistz256_avx2_gather_w5: - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LTwo(%rip), $TWO @@ -1698,11 +1726,11 @@ $code.=<<___ if ($win64); movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 - lea 
0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_gather_w5: + lea (%r11), %rsp ___ $code.=<<___; ret +.LSEH_end_ecp_nistz256_avx2_gather_w5: .size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 ___ } @@ -1725,19 +1753,20 @@ ecp_nistz256_avx2_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); + mov %rsp,%r11 lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_avx2_gather_w7: - .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp - .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax) - .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax) - .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax) - .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax) - .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax) - .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax) - .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax) - .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax) - .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax) - .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax) + .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp + .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) + .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) + .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) + .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) + .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) + .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) + .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) + .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) + .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) + .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LThree(%rip), $THREE @@ -1818,11 +1847,11 @@ $code.=<<___ if ($win64); movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 - lea 0xa8(%rsp), %rsp -.LSEH_end_ecp_nistz256_avx2_gather_w7: + lea (%r11), %rsp ___ $code.=<<___; ret +.LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } else { @@ -2053,6 +2082,7 @@ $code.=<<___; push %r14 push %r15 sub \$32*5+8, %rsp +.Lpoint_double${x}_body: .Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x @@ -2223,13 +2253,15 @@ $code.=<<___; movq %xmm1, $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); - add \$32*5+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*5+56(%rsp), %rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbx + mov -8(%rsi),%rbp + lea (%rsi),%rsp +.Lpoint_double${x}_epilogue: ret .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx ___ @@ -2283,6 +2315,7 @@ $code.=<<___; push %r14 push %r15 sub \$32*18+8, %rsp +.Lpoint_add${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr movdqu 0x10($a_ptr), %xmm1 @@ -2591,13 +2624,15 @@ $code.=<<___; movdqu %xmm3, 0x30($r_ptr) .Ladd_done$x: - add \$32*18+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*18+56(%rsp), %rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbx + mov -8(%rsi),%rbp + lea (%rsi),%rsp +.Lpoint_add${x}_epilogue: ret .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx ___ @@ -2650,6 +2685,7 @@ $code.=<<___; push %r14 push %r15 sub \$32*15+8, %rsp +.Ladd_affine${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr mov $b_org, 
$b_ptr # reassign @@ -2894,13 +2930,15 @@ $code.=<<___; movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) - add \$32*15+8, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbx - pop %rbp + lea 32*15+56(%rsp), %rsi + mov -48(%rsi),%r15 + mov -40(%rsi),%r14 + mov -32(%rsi),%r13 + mov -24(%rsi),%r12 + mov -16(%rsi),%rbx + mov -8(%rsi),%rbp + lea (%rsi),%rsp +.Ladd_affine${x}_epilogue: ret .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx ___ @@ -3052,6 +3090,348 @@ ___ } }}} +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type short_handler,\@abi-omnipotent +.align 16 +short_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 16(%rax),%rax + + mov -8(%rax),%r12 + mov -16(%rax),%r13 + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size short_handler,.-short_handler + +.type full_handler,\@abi-omnipotent +.align 16 +full_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rax,%r10),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov 
+
+.type	full_handler,\@abi-omnipotent
+.align	16
+full_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rax,%r10),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	full_handler,.-full_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_ecp_nistz256_mul_by_2
+	.rva	.LSEH_end_ecp_nistz256_mul_by_2
+	.rva	.LSEH_info_ecp_nistz256_mul_by_2
+
+	.rva	.LSEH_begin_ecp_nistz256_div_by_2
+	.rva	.LSEH_end_ecp_nistz256_div_by_2
+	.rva	.LSEH_info_ecp_nistz256_div_by_2
+
+	.rva	.LSEH_begin_ecp_nistz256_mul_by_3
+	.rva	.LSEH_end_ecp_nistz256_mul_by_3
+	.rva	.LSEH_info_ecp_nistz256_mul_by_3
+
+	.rva	.LSEH_begin_ecp_nistz256_add
+	.rva	.LSEH_end_ecp_nistz256_add
+	.rva	.LSEH_info_ecp_nistz256_add
+
+	.rva	.LSEH_begin_ecp_nistz256_sub
+	.rva	.LSEH_end_ecp_nistz256_sub
+	.rva	.LSEH_info_ecp_nistz256_sub
+
+	.rva	.LSEH_begin_ecp_nistz256_neg
+	.rva	.LSEH_end_ecp_nistz256_neg
+	.rva	.LSEH_info_ecp_nistz256_neg
+
+	.rva	.LSEH_begin_ecp_nistz256_to_mont
+	.rva	.LSEH_end_ecp_nistz256_to_mont
+	.rva	.LSEH_info_ecp_nistz256_to_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_mul_mont
+	.rva	.LSEH_end_ecp_nistz256_mul_mont
+	.rva	.LSEH_info_ecp_nistz256_mul_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
+	.rva	.LSEH_end_ecp_nistz256_sqr_mont
+	.rva	.LSEH_info_ecp_nistz256_sqr_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_from_mont
+	.rva	.LSEH_end_ecp_nistz256_from_mont
+	.rva	.LSEH_info_ecp_nistz256_from_mont
+
+	.rva	.LSEH_begin_ecp_nistz256_gather_w5
+	.rva	.LSEH_end_ecp_nistz256_gather_w5
+	.rva	.LSEH_info_ecp_nistz256_gather_wX
+
+	.rva	.LSEH_begin_ecp_nistz256_gather_w7
+	.rva	.LSEH_end_ecp_nistz256_gather_w7
+	.rva	.LSEH_info_ecp_nistz256_gather_wX
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w5
+	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w5
+	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
+
+	.rva	.LSEH_begin_ecp_nistz256_avx2_gather_w7
+	.rva	.LSEH_end_ecp_nistz256_avx2_gather_w7
+	.rva	.LSEH_info_ecp_nistz256_avx2_gather_wX
+___
+$code.=<<___;
+	.rva	.LSEH_begin_ecp_nistz256_point_double
+	.rva	.LSEH_end_ecp_nistz256_point_double
+	.rva	.LSEH_info_ecp_nistz256_point_double
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add
+	.rva	.LSEH_end_ecp_nistz256_point_add
+	.rva	.LSEH_info_ecp_nistz256_point_add
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
+	.rva	.LSEH_end_ecp_nistz256_point_add_affine
+	.rva	.LSEH_info_ecp_nistz256_point_add_affine
+___
+$code.=<<___ if ($addx);
+	.rva	.LSEH_begin_ecp_nistz256_point_doublex
+	.rva	.LSEH_end_ecp_nistz256_point_doublex
+	.rva	.LSEH_info_ecp_nistz256_point_doublex
+
+	.rva	.LSEH_begin_ecp_nistz256_point_addx
+	.rva	.LSEH_end_ecp_nistz256_point_addx
+	.rva	.LSEH_info_ecp_nistz256_point_addx
+
+	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
+	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
+___
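Each .rva triplet in the .pdata section above is one RUNTIME_FUNCTION entry: the image-relative start and end of a routine plus the image-relative address of its unwind descriptor. winnt.h defines the real type; a sketch for orientation:

    #include <windows.h>

    /* What one ".rva begin/end/info" group in .pdata denotes
     * (RUNTIME_FUNCTION in winnt.h): */
    typedef struct {
        DWORD BeginAddress;  /* .rva .LSEH_begin_...          */
        DWORD EndAddress;    /* .rva .LSEH_end_...            */
        DWORD UnwindData;    /* .rva .LSEH_info_... in .xdata */
    } runtime_function_sketch;

Note that the .LSEH_end labels were moved after the ret in the hunks above, so the epilogue itself now falls inside the covered range.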
+$code.=<<___;
+
+.section	.xdata
+.align	8
+.LSEH_info_ecp_nistz256_mul_by_2:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lmul_by_2_body,.Lmul_by_2_epilogue	# HandlerData[]
+.LSEH_info_ecp_nistz256_div_by_2:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Ldiv_by_2_body,.Ldiv_by_2_epilogue	# HandlerData[]
+.LSEH_info_ecp_nistz256_mul_by_3:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lmul_by_3_body,.Lmul_by_3_epilogue	# HandlerData[]
+.LSEH_info_ecp_nistz256_add:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Ladd_body,.Ladd_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_sub:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lsub_body,.Lsub_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_neg:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_to_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_mul_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_sqr_mont:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
+	.long	48,0
+.LSEH_info_ecp_nistz256_from_mont:
+	.byte	9,0,0,0
+	.rva	short_handler
+	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
+.LSEH_info_ecp_nistz256_gather_wX:
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
+	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
+	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
+	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
+	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
+	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+	.align	8
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_ecp_nistz256_avx2_gather_wX:
+	.byte	0x01,0x36,0x17,0x0b
+	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	# sub	rsp,0xa8
+	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
+	.align	8
+___
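The two .LSEH_info_..._gather_wX blobs above are raw UNWIND_INFO rather than a handler pointer, since the gather routines only save XMM registers and (in the AVX2 case) a frame register; the codes are listed in reverse prologue order, last operation first. How one slot decodes, sketched in C (the bit-field layout is for readability only — the authoritative definition is the UNWIND_CODE union in the Microsoft x64 unwind documentation):

    #include <stdint.h>

    /* E.g. the pair 0x33,0xf8 followed by 0x09,0x00 reads: at prologue
     * offset 0x33, UWOP_SAVE_XMM128 (op 8) of xmm15 (info 0xf) at
     * rsp + 9*16 = 0x90.  0x04,0x01 + 0x15,0x00 is UWOP_ALLOC_LARGE of
     * 0x15*8 = 0xa8 bytes, and 0x00,0xb3 is UWOP_SET_FPREG with the
     * frame register (r11) taken from header byte 0x0b. */
    typedef union {
        struct {
            uint8_t code_offset;   /* prologue offset of the op */
            uint8_t unwind_op : 4; /* 1=ALLOC_LARGE, 3=SET_FPREG,
                                      8=SAVE_XMM128 */
            uint8_t op_info   : 4; /* register number / selector */
        } s;
        uint16_t frame_offset;     /* scaled offset, next slot */
    } unwind_code_sketch;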
+$code.=<<___;
+.LSEH_info_ecp_nistz256_point_double:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
+	.long	32*5+56,0
+.LSEH_info_ecp_nistz256_point_add:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
+	.long	32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affine:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
+	.long	32*15+56,0
+___
+$code.=<<___ if ($addx);
+.align	8
+.LSEH_info_ecp_nistz256_point_doublex:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
+	.long	32*5+56,0
+.LSEH_info_ecp_nistz256_point_addx:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
+	.long	32*18+56,0
+.LSEH_info_ecp_nistz256_point_add_affinex:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
+	.long	32*15+56,0
+___
+}
+
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index c782edcbb8..caa9ced696 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -237,8 +237,12 @@ $code=<<___;
.align	16
gcm_gmult_4bit:
	push	%rbx
-	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%rbp		# %rbp and others are pushed exclusively in
	push	%r12		# order to reuse Win64 exception handler...
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$280,%rsp
.Lgmult_prologue:

	movzb	15($Xi),$Zlo
@@ -249,8 +253,9 @@ $code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

-	mov	16(%rsp),%rbx
-	lea	24(%rsp),%rsp
+	lea	280+48(%rsp),%rsi
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lgmult_epilogue:
	ret
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -400,14 +405,14 @@ $code.=<<___;
	mov	$Zlo,8($Xi)
	mov	$Zhi,($Xi)

-	lea	280(%rsp),%rsi
-	mov	0(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	lea	280+48(%rsp),%rsi
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	0(%rsi),%rsp
.Lghash_epilogue:
	ret
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -1648,14 +1653,20 @@ se_handler:
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

-	lea	24(%rax),%rax		# adjust "rsp"
+	lea	48+280(%rax),%rax	# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
.Lin_prologue:
	mov	8(%rax),%rdi
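The new 48+280 constant is pure frame arithmetic, worth spelling out once since the same fix pattern recurs in the other handlers. A hypothetical helper making the computation explicit:

    #include <stdint.h>

    /* Assumed model of the "lea 48+280(%rax),%rax" adjustment: from the
     * fault-time stack pointer inside the body, skip the 280-byte local
     * frame and the six 8-byte pushes to land on the caller's %rsp; the
     * pushed registers then sit at -8..-48 from the result. */
    static uint64_t ghash_unwound_sp(uint64_t rsp_at_fault)
    {
        return rsp_at_fault + 280 + 6 * 8;
    }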
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index f06fa515a2..c0cc7b3898 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -462,7 +462,8 @@ my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
@@ -483,7 +484,7 @@ $code.=<<___;
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
@@ -492,16 +493,15 @@ _ssse3_shortcut:
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
-	movaps	%xmm6,-40-6*16(%rax)
-	movaps	%xmm7,-40-5*16(%rax)
-	movaps	%xmm8,-40-4*16(%rax)
-	movaps	%xmm9,-40-3*16(%rax)
-	movaps	%xmm10,-40-2*16(%rax)
-	movaps	%xmm11,-40-1*16(%rax)
+	movaps	%xmm6,-40-6*16($fp)
+	movaps	%xmm7,-40-5*16($fp)
+	movaps	%xmm8,-40-4*16($fp)
+	movaps	%xmm9,-40-3*16($fp)
+	movaps	%xmm10,-40-2*16($fp)
+	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
-	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
@@ -908,21 +908,20 @@ $code.=<<___;
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -945,7 +944,7 @@ $code.=<<___;
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
@@ -955,16 +954,15 @@ _avx_shortcut:
	vzeroupper
___
$code.=<<___ if ($win64);
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
-	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
@@ -1272,21 +1270,20 @@ $code.=<<___;
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
@@ -1312,7 +1309,7 @@ $code.=<<___;
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
@@ -1322,16 +1319,15 @@ _avx2_shortcut:
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
-	mov	%rax,%r14	# original %rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
@@ -1751,21 +1747,20 @@ $code.=<<___;
	vzeroupper
___
$code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
@@ -1908,15 +1903,13 @@ ssse3_handler:
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

-	mov	152($context),%rax	# pull context->Rsp
+	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

-	mov	232($context),%rax	# pull context->R14
-
	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
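Those bare numeric offsets (152, 208, 232, ...) index straight into the Win64 CONTEXT record; since sha1's frame pointer moved from %r14 to %r11, the handler now reads offset 208 instead of 232. The constants used across this commit, collected as a C enum for reference (values match winnt.h's x64 CONTEXT layout):

    /* Byte offsets into CONTEXT as used by the handlers above. */
    enum context_offset {
        CTX_RAX = 120, CTX_RBX = 144, CTX_RSP = 152, CTX_RBP = 160,
        CTX_RSI = 168, CTX_RDI = 176, CTX_R11 = 208, CTX_R12 = 216,
        CTX_R13 = 224, CTX_R14 = 232, CTX_R15 = 240, CTX_RIP = 248
    };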
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 5a1cbcf0ca..7a8ed7c7f2 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -301,13 +301,13 @@ $code.=<<___ if ($SZ==4);
	jnz	.Lssse3_shortcut
___
$code.=<<___;
+	mov	%rsp,%rax	# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
-	mov	%rsp,%r11	# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -315,7 +315,7 @@ $code.=<<___;
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
.Lprologue:

	mov	$SZ*0($ctx),$A
@@ -382,13 +382,13 @@ $code.=<<___;
	jb	.Lloop

	mov	$_rsp,%rsi
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
@@ -761,13 +761,13 @@ $code.=<<___;
.align	64
${func}_ssse3:
.Lssse3_shortcut:
+	mov	%rsp,%rax	# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
-	mov	%rsp,%r11	# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -775,7 +775,7 @@ ${func}_ssse3:
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -1082,13 +1082,13 @@ $code.=<<___ if ($win64);
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	${func}_ssse3,.-${func}_ssse3
@@ -1105,13 +1105,13 @@ $code.=<<___;
.align	64
${func}_xop:
.Lxop_shortcut:
+	mov	%rsp,%rax	# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
-	mov	%rsp,%r11	# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -1119,7 +1119,7 @@ ${func}_xop:
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -1459,13 +1459,13 @@ $code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue_xop:
	ret
.size	${func}_xop,.-${func}_xop
@@ -1481,13 +1481,13 @@ $code.=<<___;
.align	64
${func}_avx:
.Lavx_shortcut:
+	mov	%rsp,%rax	# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
-	mov	%rsp,%r11	# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -1495,7 +1495,7 @@ ${func}_avx:
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
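The $_rsp slot written above is part of sha512's fixed frame header, and it is what the module's unwind handler reads back (the "pull $_rsp" at 16*$SZ+3*8 in the hunk below). A hypothetical C picture of that header, per the perl definitions of $_ctx..$_rsp:

    #include <stdint.h>

    /* Sketch of the frame header sha512-x86_64.pl keeps above its
     * 16*$SZ bytes of message-schedule scratch; the fourth slot ($_rsp)
     * now holds the pre-push %rsp rather than the post-push one, which
     * is why the "lea 48(%rax),%rax" adjustment disappears. */
    struct sha512_frame_header {   /* at %rsp + 16*SZ */
        void     *ctx;             /* $_ctx: 1st argument  */
        void     *inp;             /* $_inp: 2nd argument  */
        uint64_t  end;             /* $_end: input end ptr */
        uint64_t  rsp;             /* $_rsp: caller %rsp   */
    };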
@@ -1767,13 +1767,13 @@ $code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	${func}_avx,.-${func}_avx
@@ -1832,13 +1832,13 @@ $code.=<<___;
.align	64
${func}_avx2:
.Lavx2_shortcut:
+	mov	%rsp,%rax	# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
-	mov	%rsp,%r11	# copy %rsp
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
@@ -1847,7 +1847,7 @@ ${func}_avx2:
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arh
	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -2141,13 +2141,13 @@ $code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
@@ -2209,7 +2209,6 @@ ___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
-	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl
index c0b21d13ed..d0b7ecc9e7 100644
--- a/crypto/whrlpool/asm/wp-x86_64.pl
+++ b/crypto/whrlpool/asm/wp-x86_64.pl
@@ -66,6 +66,7 @@ $code=<<___;
.type	$func,\@function,3
.align	16
$func:
+	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
@@ -73,7 +74,6 @@ $func:
	push	%r14
	push	%r15

-	mov	%rsp,%r11
	sub	\$128+40,%rsp
	and	\$-64,%rsp

@@ -81,7 +81,7 @@ $func:
	mov	%rdi,0(%r10)		# save parameter block
	mov	%rsi,8(%r10)
	mov	%rdx,16(%r10)
-	mov	%r11,32(%r10)		# saved stack pointer
+	mov	%rax,32(%r10)		# saved stack pointer
.Lprologue:

	mov	%r10,%rbx
@@ -205,13 +205,13 @@ $code.=<<___;
	jmp	.Louterloop
.Lalldone:
	mov	32(%rbx),%rsi		# restore saved pointer
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
@@ -526,7 +526,6 @@ se_handler:
	jae	.Lin_prologue

	mov	128+32(%rax),%rax	# pull saved stack pointer
-	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp