3 ##############################################################################
5 # Copyright (c) 2012, Intel Corporation #
7 # All rights reserved. #
9 # Redistribution and use in source and binary forms, with or without #
10 # modification, are permitted provided that the following conditions are #
13 # * Redistributions of source code must retain the above copyright #
14 # notice, this list of conditions and the following disclaimer. #
16 # * Redistributions in binary form must reproduce the above copyright #
17 # notice, this list of conditions and the following disclaimer in the #
18 # documentation and/or other materials provided with the #
21 # * Neither the name of the Intel Corporation nor the names of its #
22 # contributors may be used to endorse or promote products derived from #
23 # this software without specific prior written permission. #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY   #
27 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
38 ##############################################################################
39 # Developers and authors: #
40 # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41 # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
42 # Israel Development Center, Haifa, Israel #
43 # (2) University of Haifa #
44 ##############################################################################
46 # [1] S. Gueron, "Efficient Software Implementations of Modular #
47 # Exponentiation", http://eprint.iacr.org/2011/239 #
48 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
49 # IEEE Proceedings of 9th International Conference on Information #
50 # Technology: New Generations (ITNG 2012), 821-823 (2012). #
51 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
52 # Journal of Cryptographic Engineering 2:31-43 (2012). #
53 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
54 # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
55 # RSA1024 and RSA2048 on x86_64 platforms", #
56 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
57 ##############################################################################
59 # While original submission covers 512- and 1024-bit exponentiation,
60 # this module is limited to 512-bit version only (and as such
61 # accelerates RSA1024 sign). This is because improvement for longer
62 # keys is not high enough to justify the effort, highest measured
63 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
64 # for the moment of this writing!] Nor does this module implement
65 # "monolithic" complete exponentiation jumbo-subroutine, but adheres
66 # to more modular mixture of C and assembly. And it's optimized even
67 # for processors other than Intel Core family (see table below for
68 # improvement coefficients).
71 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
72 # ----------------+---------------------------
73 # Opteron +13% |+5% +20%
74 # Bulldozer -0% |-1% +10%
76 # Westmere +5% |+14% +17%
77 # Sandy Bridge +2% |+12% +29%
78 # Ivy Bridge +1% |+11% +35%
79 # Haswell(**) -0% |+12% +39%
81 # VIA Nano +70% |+9% +25%
83 # (*) rsax engine and fips numbers are presented for reference
85 # (**) MULX was attempted, but found to give only marginal improvement;
# If the first argument contains a dot it is an output file name rather
# than an assembler flavour (elf, macosx, mingw64, nasm, masm, ...).
# NOTE(review): $flavour/$output are presumably shifted off @ARGV in lines
# elided from this view — confirm against the full file.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows targets are recognized by assembler flavour or an .asm suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script's own directory,
# trying the local directory first, then the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Pipe the generated source through the perlasm translator.  Quote the
# interpreter and translator paths so build trees containing spaces work,
# and die on failure instead of silently producing empty output.
# ($output is deliberately left unquoted: it may be empty, in which case
# the translator writes to stdout.)
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
# Probe the toolchain for ADCX/ADOX and MULX support (the $addx code
# paths): GNU as, NASM, or MSVC ml64 of sufficiently recent version is
# required for these mnemonics.  NOTE(review): the conditional bodies
# assigning $addx are elided from this view — confirm the exact version
# thresholds against the full file.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
# When the GNU-as probe failed on a Win64/nasm build, ask nasm directly.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
# Likewise for Microsoft's ml64 on masm builds.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
116 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
118 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
123 .extern OPENSSL_ia32cap_P
126 .type rsaz_512_sqr,\@function,5
128 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
138 movq $mod, %rbp # common argument
143 $code.=<<___ if ($addx);
145 andl OPENSSL_ia32cap_P+8(%rip),%r11d
146 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
154 movl $times,128+8(%rsp)
198 addq %r8, %r8 #shlq \$1, %r8
200 adcq %r9, %r9 #shld \$1, %r8, %r9
261 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
263 adcq %r11, %r11 #shld \$1, %r10, %r11
301 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
319 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
349 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
367 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
392 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
407 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
431 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
439 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
460 adcq %r12, %r12 #shld \$1, %rbx, %r12
461 adcq %r13, %r13 #shld \$1, %r12, %r13
462 adcq %r14, %r14 #shld \$1, %r13, %r14
492 call __rsaz_512_reduce
504 call __rsaz_512_subtract
508 movl 128+8(%rsp), $times
520 movl $times,128+8(%rsp)
521 movq $out, %xmm0 # off-load
522 movq %rbp, %xmm1 # off-load
526 mulx 16($inp), %rcx, %r10
527 xor %rbp, %rbp # cf=0, of=0
529 mulx 24($inp), %rax, %r11
532 mulx 32($inp), %rcx, %r12
535 mulx 40($inp), %rax, %r13
538 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
542 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
544 adcx %rbp, %r15 # %rbp is 0
551 mulx %rdx, %rax, %rdx
560 mulx 16($inp), %rax, %rbx
564 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
568 mulx 32($inp), %rax, %rbx
572 mulx 40($inp), $out, %r8
576 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
580 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
590 mulx %rdx, %rax, %rcx
597 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
600 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
604 mulx 32($inp), %rax, %rcx
608 mulx 40($inp), $out, %r9
612 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
616 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
626 mulx %rdx, %rax, %rdx
633 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
636 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
640 mulx 40($inp), $out, %r10
644 mulx 48($inp), %rax, %rbx
648 mulx 56($inp), $out, %r10
659 mulx %rdx, %rax, %rdx
669 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
673 mulx 48($inp), %rax, %rcx
677 mulx 56($inp), $out, %r11
687 mulx %rdx, %rax, %rdx
697 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
701 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
711 mulx %rdx, %rax, %rdx
721 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
731 mulx %rdx, %rax, %rdx
737 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
738 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
741 mulx %rdx, %rax, %rdx
753 movq 128(%rsp), %rdx # pull $n0
763 call __rsaz_512_reducex
775 call __rsaz_512_subtract
779 movl 128+8(%rsp), $times
790 leaq 128+24+48(%rsp), %rax
800 .size rsaz_512_sqr,.-rsaz_512_sqr
804 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
807 .type rsaz_512_mul,\@function,5
819 movq $out, %xmm0 # off-load arguments
823 $code.=<<___ if ($addx);
825 andl OPENSSL_ia32cap_P+8(%rip),%r11d
826 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
830 movq ($bp), %rbx # pass b[0]
831 movq $bp, %rbp # pass argument
846 call __rsaz_512_reduce
848 $code.=<<___ if ($addx);
853 movq $bp, %rbp # pass argument
854 movq ($bp), %rdx # pass b[0]
860 movq 128(%rsp), %rdx # pull $n0
870 call __rsaz_512_reducex
884 call __rsaz_512_subtract
886 leaq 128+24+48(%rsp), %rax
896 .size rsaz_512_mul,.-rsaz_512_mul
900 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
902 .globl rsaz_512_mul_gather4
903 .type rsaz_512_mul_gather4,\@function,6
905 rsaz_512_mul_gather4:
917 $code.=<<___ if ($addx);
919 andl OPENSSL_ia32cap_P+8(%rip),%r11d
920 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
924 movl 64($bp,$pwr,4), %eax
925 movq $out, %xmm0 # off-load arguments
926 movl ($bp,$pwr,4), %ebx
934 leaq 128($bp,$pwr,4), %rbp
935 mulq %rbx # 0 iteration
1011 movd 64(%rbp), %xmm5
1062 leaq 128(%rbp), %rbp
1066 jnz .Loop_mul_gather
1089 call __rsaz_512_reduce
1091 $code.=<<___ if ($addx);
1092 jmp .Lmul_gather_tail
1096 mov 64($bp,$pwr,4), %eax
1097 movq $out, %xmm0 # off-load arguments
1098 lea 128($bp,$pwr,4), %rbp
1099 mov ($bp,$pwr,4), %edx
1105 mulx ($ap), %rbx, %r8 # 0 iteration
1107 xor %edi, %edi # cf=0, of=0
1109 mulx 8($ap), %rax, %r9
1112 mulx 16($ap), %rbx, %r10
1113 movd 64(%rbp), %xmm5
1116 mulx 24($ap), %rax, %r11
1120 mulx 32($ap), %rbx, %r12
1124 mulx 40($ap), %rax, %r13
1127 mulx 48($ap), %rbx, %r14
1131 mulx 56($ap), %rax, %r15
1136 adcx %rdi, %r15 # %rdi is 0
1139 jmp .Loop_mulx_gather
1143 mulx ($ap), %rax, %r8
1147 mulx 8($ap), %rax, %r9
1148 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
1152 mulx 16($ap), %rax, %r10
1153 movd 64(%rbp), %xmm5
1158 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1164 mulx 32($ap), %rax, %r12
1168 mulx 40($ap), %rax, %r13
1172 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1176 mulx 56($ap), %rax, %r15
1178 mov %rbx, 64(%rsp,%rcx,8)
1182 adcx %rdi, %r15 # cf=0
1185 jnz .Loop_mulx_gather
1189 mov %r10, 64+16(%rsp)
1190 mov %r11, 64+24(%rsp)
1191 mov %r12, 64+32(%rsp)
1192 mov %r13, 64+40(%rsp)
1193 mov %r14, 64+48(%rsp)
1194 mov %r15, 64+56(%rsp)
1199 mov 128(%rsp), %rdx # pull $n0
1209 call __rsaz_512_reducex
1219 adcq 104(%rsp), %r13
1220 adcq 112(%rsp), %r14
1221 adcq 120(%rsp), %r15
1224 call __rsaz_512_subtract
1226 leaq 128+24+48(%rsp), %rax
1227 movq -48(%rax), %r15
1228 movq -40(%rax), %r14
1229 movq -32(%rax), %r13
1230 movq -24(%rax), %r12
1231 movq -16(%rax), %rbp
1234 .Lmul_gather4_epilogue:
1236 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
1240 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1242 .globl rsaz_512_mul_scatter4
1243 .type rsaz_512_mul_scatter4,\@function,6
1245 rsaz_512_mul_scatter4:
1255 .Lmul_scatter4_body:
1256 leaq ($tbl,$pwr,4), $tbl
1257 movq $out, %xmm0 # off-load arguments
1264 $code.=<<___ if ($addx);
1265 movl \$0x80100,%r11d
1266 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1267 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
1271 movq ($out),%rbx # pass b[0]
1286 call __rsaz_512_reduce
1288 $code.=<<___ if ($addx);
1289 jmp .Lmul_scatter_tail
1293 movq ($out), %rdx # pass b[0]
1294 call __rsaz_512_mulx
1299 movq 128(%rsp), %rdx # pull $n0
1309 call __rsaz_512_reducex
1319 adcq 104(%rsp), %r13
1320 adcq 112(%rsp), %r14
1321 adcq 120(%rsp), %r15
1325 call __rsaz_512_subtract
1327 movl %r8d, 64*0($inp) # scatter
1329 movl %r9d, 64*2($inp)
1331 movl %r10d, 64*4($inp)
1333 movl %r11d, 64*6($inp)
1335 movl %r12d, 64*8($inp)
1337 movl %r13d, 64*10($inp)
1339 movl %r14d, 64*12($inp)
1341 movl %r15d, 64*14($inp)
1343 movl %r8d, 64*1($inp)
1344 movl %r9d, 64*3($inp)
1345 movl %r10d, 64*5($inp)
1346 movl %r11d, 64*7($inp)
1347 movl %r12d, 64*9($inp)
1348 movl %r13d, 64*11($inp)
1349 movl %r14d, 64*13($inp)
1350 movl %r15d, 64*15($inp)
1352 leaq 128+24+48(%rsp), %rax
1353 movq -48(%rax), %r15
1354 movq -40(%rax), %r14
1355 movq -32(%rax), %r13
1356 movq -24(%rax), %r12
1357 movq -16(%rax), %rbp
1360 .Lmul_scatter4_epilogue:
1362 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
1366 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1368 .globl rsaz_512_mul_by_one
1369 .type rsaz_512_mul_by_one,\@function,4
1371 rsaz_512_mul_by_one:
1382 $code.=<<___ if ($addx);
1383 movl OPENSSL_ia32cap_P+8(%rip),%eax
1386 movq $mod, %rbp # reassign argument
1399 movdqa %xmm0, (%rsp)
1400 movdqa %xmm0, 16(%rsp)
1401 movdqa %xmm0, 32(%rsp)
1402 movdqa %xmm0, 48(%rsp)
1403 movdqa %xmm0, 64(%rsp)
1404 movdqa %xmm0, 80(%rsp)
1405 movdqa %xmm0, 96(%rsp)
1407 $code.=<<___ if ($addx);
1409 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1413 call __rsaz_512_reduce
1415 $code.=<<___ if ($addx);
1419 movq 128(%rsp), %rdx # pull $n0
1420 call __rsaz_512_reducex
1433 leaq 128+24+48(%rsp), %rax
1434 movq -48(%rax), %r15
1435 movq -40(%rax), %r14
1436 movq -32(%rax), %r13
1437 movq -24(%rax), %r12
1438 movq -16(%rax), %rbp
1441 .Lmul_by_one_epilogue:
1443 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1446 { # __rsaz_512_reduce
1448 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1450 # clobbers: everything except %rbp and %rdi
1452 .type __rsaz_512_reduce,\@abi-omnipotent
1456 imulq 128+8(%rsp), %rbx
1459 jmp .Lreduction_loop
1490 movq 128+8(%rsp), %rsi
1531 jne .Lreduction_loop
1534 .size __rsaz_512_reduce,.-__rsaz_512_reduce
1538 # __rsaz_512_reducex
1540 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1542 # clobbers: everything except %rbp and %rdi
1544 .type __rsaz_512_reducex,\@abi-omnipotent
1547 #movq 128+8(%rsp), %rdx # pull $n0
1549 xorq %rsi, %rsi # cf=0,of=0
1551 jmp .Lreduction_loopx
1556 mulx 0(%rbp), %rax, %r8
1560 mulx 8(%rbp), %rax, %r9
1564 mulx 16(%rbp), %rbx, %r10
1568 mulx 24(%rbp), %rbx, %r11
1572 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
1578 mulx 128+8(%rsp), %rbx, %rdx
1581 mulx 40(%rbp), %rax, %r13
1585 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1589 mulx 56(%rbp), %rax, %r15
1592 adox %rsi, %r15 # %rsi is 0
1593 adcx %rsi, %r15 # cf=0
1596 jne .Lreduction_loopx
1599 .size __rsaz_512_reducex,.-__rsaz_512_reducex
1602 { # __rsaz_512_subtract
1603 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1605 # clobbers: everything but %rdi, %rsi and %rbp
1607 .type __rsaz_512_subtract,\@abi-omnipotent
1609 __rsaz_512_subtract:
1663 .size __rsaz_512_subtract,.-__rsaz_512_subtract
1668 # input: %rsi - ap, %rbp - bp
1670 # clobbers: everything
1671 my ($ap,$bp) = ("%rsi","%rbp");
1673 .type __rsaz_512_mul,\@abi-omnipotent
1814 .size __rsaz_512_mul,.-__rsaz_512_mul
1820 # input: %rsi - ap, %rbp - bp
1822 # clobbers: everything
1823 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
1825 .type __rsaz_512_mulx,\@abi-omnipotent
1828 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
1831 mulx 8($ap), %rax, %r9
1834 mulx 16($ap), %rbx, %r10
1837 mulx 24($ap), %rax, %r11
1840 mulx 32($ap), %rbx, %r12
1843 mulx 40($ap), %rax, %r13
1846 mulx 48($ap), %rbx, %r14
1849 mulx 56($ap), %rax, %r15
1855 xor $zero, $zero # cf=0,of=0
1861 mulx ($ap), %rax, %r8
1865 mulx 8($ap), %rax, %r9
1869 mulx 16($ap), %rax, %r10
1873 mulx 24($ap), %rax, %r11
1877 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
1881 mulx 40($ap), %rax, %r13
1885 mulx 48($ap), %rax, %r14
1889 mulx 56($ap), %rax, %r15
1890 movq 64($bp,%rcx,8), %rdx
1891 movq %rbx, 8+64-8(%rsp,%rcx,8)
1894 adcx $zero, %r15 # cf=0
1900 mulx ($ap), %rax, %r8
1904 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
1908 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
1912 mulx 24($ap), %rax, %r11
1916 mulx 32($ap), %rax, %r12
1920 mulx 40($ap), %rax, %r13
1924 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1928 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
1933 mov %rbx, 8+64-8(%rsp)
1935 mov %r9, 8+64+8(%rsp)
1936 mov %r10, 8+64+16(%rsp)
1937 mov %r11, 8+64+24(%rsp)
1938 mov %r12, 8+64+32(%rsp)
1939 mov %r13, 8+64+40(%rsp)
1940 mov %r14, 8+64+48(%rsp)
1941 mov %r15, 8+64+56(%rsp)
1944 .size __rsaz_512_mulx,.-__rsaz_512_mulx
1948 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1950 .globl rsaz_512_scatter4
1951 .type rsaz_512_scatter4,\@abi-omnipotent
1954 leaq ($out,$power,4), $out
1964 leaq 128($out), $out
1968 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1970 .globl rsaz_512_gather4
1971 .type rsaz_512_gather4,\@abi-omnipotent
1974 leaq ($inp,$power,4), $inp
1981 leaq 128($inp), $inp
1989 .size rsaz_512_gather4,.-rsaz_512_gather4
1993 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1994 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2002 .extern __imp_RtlVirtualUnwind
2003 .type se_handler,\@abi-omnipotent
2017 mov 120($context),%rax # pull context->Rax
2018 mov 248($context),%rbx # pull context->Rip
2020 mov 8($disp),%rsi # disp->ImageBase
2021 mov 56($disp),%r11 # disp->HandlerData
2023 mov 0(%r11),%r10d # HandlerData[0]
2024 lea (%rsi,%r10),%r10 # end of prologue label
2025 cmp %r10,%rbx # context->Rip<end of prologue label
2026 jb .Lcommon_seh_tail
2028 mov 152($context),%rax # pull context->Rsp
2030 mov 4(%r11),%r10d # HandlerData[1]
2031 lea (%rsi,%r10),%r10 # epilogue label
2032 cmp %r10,%rbx # context->Rip>=epilogue label
2033 jae .Lcommon_seh_tail
2035 lea 128+24+48(%rax),%rax
2043 mov %rbx,144($context) # restore context->Rbx
2044 mov %rbp,160($context) # restore context->Rbp
2045 mov %r12,216($context) # restore context->R12
2046 mov %r13,224($context) # restore context->R13
2047 mov %r14,232($context) # restore context->R14
2048 mov %r15,240($context) # restore context->R15
2053 mov %rax,152($context) # restore context->Rsp
2054 mov %rsi,168($context) # restore context->Rsi
2055 mov %rdi,176($context) # restore context->Rdi
2057 mov 40($disp),%rdi # disp->ContextRecord
2058 mov $context,%rsi # context
2059 mov \$154,%ecx # sizeof(CONTEXT)
2060 .long 0xa548f3fc # cld; rep movsq
2063 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2064 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2065 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2066 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2067 mov 40(%rsi),%r10 # disp->ContextRecord
2068 lea 56(%rsi),%r11 # &disp->HandlerData
2069 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2070 mov %r10,32(%rsp) # arg5
2071 mov %r11,40(%rsp) # arg6
2072 mov %r12,48(%rsp) # arg7
2073 mov %rcx,56(%rsp) # arg8, (NULL)
2074 call *__imp_RtlVirtualUnwind(%rip)
2076 mov \$1,%eax # ExceptionContinueSearch
2088 .size sqr_handler,.-sqr_handler
2092 .rva .LSEH_begin_rsaz_512_sqr
2093 .rva .LSEH_end_rsaz_512_sqr
2094 .rva .LSEH_info_rsaz_512_sqr
2096 .rva .LSEH_begin_rsaz_512_mul
2097 .rva .LSEH_end_rsaz_512_mul
2098 .rva .LSEH_info_rsaz_512_mul
2100 .rva .LSEH_begin_rsaz_512_mul_gather4
2101 .rva .LSEH_end_rsaz_512_mul_gather4
2102 .rva .LSEH_info_rsaz_512_mul_gather4
2104 .rva .LSEH_begin_rsaz_512_mul_scatter4
2105 .rva .LSEH_end_rsaz_512_mul_scatter4
2106 .rva .LSEH_info_rsaz_512_mul_scatter4
2108 .rva .LSEH_begin_rsaz_512_mul_by_one
2109 .rva .LSEH_end_rsaz_512_mul_by_one
2110 .rva .LSEH_info_rsaz_512_mul_by_one
2114 .LSEH_info_rsaz_512_sqr:
2117 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2118 .LSEH_info_rsaz_512_mul:
2121 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2122 .LSEH_info_rsaz_512_mul_gather4:
2125 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2126 .LSEH_info_rsaz_512_mul_scatter4:
2129 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2130 .LSEH_info_rsaz_512_mul_by_one:
2133 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
# Expand backtick-quoted snippets accumulated in $code: each `...` span is
# evaluated as Perl (/e) and replaced by its result — the perlasm idiom
# for computed constants and offsets embedded in the assembly text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;