2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
5 # Licensed under the Apache License 2.0 (the "License"). You may not use
6 # this file except in compliance with the License. You can obtain a copy
7 # in the file LICENSE in the source distribution or at
8 # https://www.openssl.org/source/license.html
10 # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11 # (1) Intel Corporation, Israel Development Center, Haifa, Israel
12 # (2) University of Haifa, Israel
15 # [1] S. Gueron, "Efficient Software Implementations of Modular
16 # Exponentiation", http://eprint.iacr.org/2011/239
17 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
18 # IEEE Proceedings of 9th International Conference on Information
19 # Technology: New Generations (ITNG 2012), 821-823 (2012).
20 # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation
21 # Journal of Cryptographic Engineering 2:31-43 (2012).
22 # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
23 # resistant 512-bit and 1024-bit modular exponentiation for optimizing
24 # RSA1024 and RSA2048 on x86_64 platforms",
25 # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
27 # While the original submission covers 512- and 1024-bit exponentiation,
28 # this module is limited to the 512-bit version only (and as such
29 # accelerates RSA1024 sign). This is because the improvement for longer
30 # keys is not high enough to justify the effort; the highest measured
31 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
32 # at the time of this writing!] Nor does this module implement a
33 # "monolithic" complete exponentiation jumbo-subroutine; instead it adheres
34 # to a more modular mixture of C and assembly. And it's optimized even
35 # for processors other than the Intel Core family (see table below for
36 # improvement coefficients).
39 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
40 # ----------------+---------------------------
41 # Opteron +13% |+5% +20%
42 # Bulldozer -0% |-1% +10%
44 # Westmere +5% |+14% +17%
45 # Sandy Bridge +2% |+12% +29%
46 # Ivy Bridge +1% |+11% +35%
47 # Haswell(**) -0% |+12% +39%
49 # VIA Nano +70% |+9% +25%
51 # (*) rsax engine and fips numbers are presented for reference
53 # (**) MULX was attempted, but found to give only marginal improvement;
# ---- Command-line handling and hand-off to the perlasm translator ----
55 # $output is the last argument if it looks like a file (it has an extension)
56 # $flavour is the first argument if it doesn't look like a file
57 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
58 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows x64 conventions are selected by an nasm/masm/mingw64 flavour
# or by an output file name ending in .asm.
60 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Directory part of this script's own path (including the trailing
# separator); it anchors the translator lookup below.
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the translator next to this script first, then under
# ../../perlasm/ relative to it; give up if neither file exists.
63 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
64 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
65 die "can't locate x86_64-xlate.pl";
# Open a pipe to the translator, run under the current perl interpreter,
# passing the flavour and the quoted output file name.
67 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
68 or die "can't call $xlate: $!";
# ---- Toolchain probes ----
# Only the conditions are visible in this excerpt. The bodies presumably
# set the addx flag when the assembler is recent enough to encode
# MULX/ADCX/ADOX -- TODO confirm against the full file. Every probe after
# the first is guarded so it is skipped once addx is already true.
#
# GNU assembler, queried through the compiler driver named in the CC
# environment variable.
71 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
72 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
# NASM: only for Win64 builds whose flavour or ASM variable names nasm.
76 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
77 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
# Microsoft ml64: only for Win64 builds that select masm/ml64.
81 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
82 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
# clang/LLVM, recognised from the version banner printed by the compiler.
# The minor number is scaled to hundredths so that 3.10 sorts above 3.1.
86 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
87 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
# Package-level register aliases used where no narrower lexical is declared.
91 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
#
# rsaz_512_sqr -- declared below as a five-argument function; out, inp,
# mod, n0 and times arrive in the registers listed in the my-list.
# The visible code parks the times argument on the stack, forms the square
# with MULQ or (when generated with addx and the CPU reports MULX plus
# ADCX/ADOX) with MULX, then calls the matching reduce helper followed by
# __rsaz_512_subtract. Presumably this is a repeated Montgomery squaring of
# a 512-bit operand; the loop control is not in this excerpt -- TODO confirm.
93 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
98 .extern OPENSSL_ia32cap_P
101 .type rsaz_512_sqr,\@function,5
103 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
119 .cfi_adjust_cfa_offset 128+24
121 movq $mod, %xmm1 # common off-load
# Runtime capability check, generated only when addx is set.
126 $code.=<<___ if ($addx);
128 andl OPENSSL_ia32cap_P+8(%rip),%r11d
129 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
# ---- MULQ path ----
137 movl $times,128+8(%rsp)
139 movq %rdx, %rbx # 0($inp)
140 mov %rax, %rbp # 8($inp)
181 xorq %rcx,%rcx # rcx:r8 = r8 << 1
240 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
247 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
296 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
303 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
315 mov %rax, %r11 # 32($inp)
322 mov %rax, %r12 # 40($inp)
331 mov %rax, %rbp # 48($inp)
347 xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
354 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
357 movq %r12, %rax # 40($inp)
367 movq %rbp, %rax # 48($inp)
379 mov %rax, %r14 # 56($inp)
387 xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
394 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
397 movq %rbp, %rax # 48($inp)
407 movq %r14, %rax # 56($inp)
418 xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
425 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
428 movq %r14, %rax # 56($inp)
441 xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
448 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
451 movq %r14, %rax # 56($inp)
459 xorq %rbx, %rbx # rbx:r13 = r13 << 1
464 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
# Reduce, run the masked subtract helper, then reload the times argument.
482 call __rsaz_512_reduce
494 call __rsaz_512_subtract
498 movl 128+8(%rsp), $times
# ---- MULX path ----
# The .byte lines are hand-encoded instructions; the intended mnemonic is
# given in each trailing comment.
510 movl $times,128+8(%rsp)
511 movq $out, %xmm0 # off-load
516 mulx 16($inp), %rcx, %r10
517 xor %rbp, %rbp # cf=0, of=0
519 mulx 24($inp), %rax, %r11
522 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
525 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
528 mulx 48($inp), %rcx, %r14
532 mulx 56($inp), %rax, %r15
534 adcx %rbp, %r15 # %rbp is 0
536 mulx %rdx, %rax, $out
537 mov %rbx, %rdx # 8($inp)
548 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
552 mulx 24($inp), $out, %r8
557 mulx 32($inp), %rax, %rbx
561 mulx 40($inp), $out, %r8
565 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
572 mulx %rdx, %rax, $out
574 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
578 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
587 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
590 mulx 24($inp), $out, %r9
594 mulx 32($inp), %rax, %rcx
598 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
602 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
606 mulx 56($inp), $out, %r9
609 mulx %rdx, %rax, $out
615 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
627 mulx 32($inp), %rax, %rbx
631 mulx 40($inp), $out, %r10
635 mulx 48($inp), %rax, %rbx
639 mulx 56($inp), $out, %r10
642 mulx %rdx, %rax, $out
648 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
660 mulx 40($inp), $out, %r11
664 mulx 48($inp), %rax, %rcx
668 mulx 56($inp), $out, %r11
671 mulx %rdx, %rax, $out
677 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
689 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
693 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
696 mulx %rdx, %rax, $out
702 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
714 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
718 mulx %rdx, %rax, $out
722 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
730 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
731 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
734 mulx %rdx, %rax, %rdx
737 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
# The MULX reduce helper takes n0 in rdx, so it is reloaded from the frame.
746 movq 128(%rsp), %rdx # pull $n0
759 call __rsaz_512_reducex
771 call __rsaz_512_subtract
775 movl 128+8(%rsp), $times
# Epilogue: rax = rsp + local frame (128+24) + 48; the 48 presumably covers
# six pushed registers, whose restores are not in this excerpt.
786 leaq 128+24+48(%rsp), %rax
801 .cfi_def_cfa_register %rsp
805 .size rsaz_512_sqr,.-rsaz_512_sqr
# rsaz_512_mul -- declared below as a five-argument function; out, ap, bp,
# mod and n0 arrive in the registers listed in the my-list. The visible
# code off-loads arguments, hands b[0] and the bp pointer to a multiply
# helper (presumably __rsaz_512_mul or __rsaz_512_mulx; the calls are not in
# this excerpt), then runs the matching reduce helper and the subtract
# helper.
809 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
812 .type rsaz_512_mul,\@function,5
830 .cfi_adjust_cfa_offset 128+24
832 movq $out, %xmm0 # off-load arguments
# Runtime check for MULX and ADCX/ADOX, generated only when addx is set.
836 $code.=<<___ if ($addx);
838 andl OPENSSL_ia32cap_P+8(%rip),%r11d
839 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
# MULQ path: b[0] travels in rbx, the bp pointer in rbp.
843 movq ($bp), %rbx # pass b[0]
844 movq $bp, %rbp # pass argument
859 call __rsaz_512_reduce
# MULX path: b[0] travels in rdx instead, and n0 is reloaded into rdx
# before the reduce helper is called.
861 $code.=<<___ if ($addx);
866 movq $bp, %rbp # pass argument
867 movq ($bp), %rdx # pass b[0]
873 movq 128(%rsp), %rdx # pull $n0
883 call __rsaz_512_reducex
# Tail reached after either reduce helper.
897 call __rsaz_512_subtract
# rax = rsp + local frame (128+24) + 48, used as the base for the epilogue.
899 leaq 128+24+48(%rsp), %rax
914 .cfi_def_cfa_register %rsp
918 .size rsaz_512_mul,.-rsaz_512_mul
# rsaz_512_mul_gather4 -- declared below with six arguments; out, ap, bp,
# mod, n0 and pwr arrive in the registers listed in the my-list.
# The visible code builds compare masks from the power index, reads the
# table at bp in 128-byte rows through xmm8-xmm15, multiplies with either a
# MULQ loop or a MULX loop, then runs the matching reduce helper and the
# subtract helper. On Win64 the frame grows by 0xb0 bytes so that
# xmm6-xmm15 can be saved and restored.
922 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
924 .globl rsaz_512_mul_gather4
925 .type rsaz_512_mul_gather4,\@function,6
927 rsaz_512_mul_gather4:
# The backquoted expressions are evaluated at generation time, so the
# frame size is a plain constant in the emitted assembly.
942 subq \$`128+24+($win64?0xb0:0)`, %rsp
943 .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
# Win64 only: save xmm6-xmm15 at offsets 0xa0 through 0x130 from rsp.
945 $code.=<<___ if ($win64);
946 movaps %xmm6,0xa0(%rsp)
947 movaps %xmm7,0xb0(%rsp)
948 movaps %xmm8,0xc0(%rsp)
949 movaps %xmm9,0xd0(%rsp)
950 movaps %xmm10,0xe0(%rsp)
951 movaps %xmm11,0xf0(%rsp)
952 movaps %xmm12,0x100(%rsp)
953 movaps %xmm13,0x110(%rsp)
954 movaps %xmm14,0x120(%rsp)
955 movaps %xmm15,0x130(%rsp)
# Counter seeds for the mask computation; the power index is broadcast to
# all four lanes of xmm8 for the compares.
960 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
961 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
963 pshufd \$0,%xmm8,%xmm8 # broadcast $power
967 ########################################################################
968 # calculate mask by comparing 0..15 to $power
970 for($i=0;$i<4;$i++) {
972 paddd %xmm`$i`,%xmm`$i+1`
973 pcmpeqd %xmm8,%xmm`$i`
974 movdqa %xmm7,%xmm`$i+3`
979 paddd %xmm`$i`,%xmm`$i+1`
980 pcmpeqd %xmm8,%xmm`$i`
# First 128-byte table row, read straight from the bp argument.
986 movdqa 16*0($bp),%xmm8
987 movdqa 16*1($bp),%xmm9
988 movdqa 16*2($bp),%xmm10
989 movdqa 16*3($bp),%xmm11
991 movdqa 16*4($bp),%xmm12
993 movdqa 16*5($bp),%xmm13
995 movdqa 16*6($bp),%xmm14
997 movdqa 16*7($bp),%xmm15
1011 pshufd \$0x4e,%xmm8,%xmm9
# Runtime check for MULX and ADCX/ADOX, generated only when addx is set.
1014 $code.=<<___ if ($addx);
1015 movl \$0x80100,%r11d
1016 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1017 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
# ---- MULQ path ----
1023 movq $n0, 128(%rsp) # off-load arguments
1024 movq $out, 128+8(%rsp)
1025 movq $mod, 128+16(%rsp)
1029 mulq %rbx # 0 iteration
1078 jmp .Loop_mul_gather
# Following table rows are read through rbp, which advances 128 bytes per
# row.
1082 movdqa 16*0(%rbp),%xmm8
1083 movdqa 16*1(%rbp),%xmm9
1084 movdqa 16*2(%rbp),%xmm10
1085 movdqa 16*3(%rbp),%xmm11
1087 movdqa 16*4(%rbp),%xmm12
1089 movdqa 16*5(%rbp),%xmm13
1091 movdqa 16*6(%rbp),%xmm14
1093 movdqa 16*7(%rbp),%xmm15
1094 leaq 128(%rbp), %rbp
1107 pshufd \$0x4e,%xmm8,%xmm9
1177 jnz .Loop_mul_gather
# Recover out and mod (the latter into rbp, where the reduce helper
# expects it) from the frame.
1188 movq 128+8(%rsp), $out
1189 movq 128+16(%rsp), %rbp
1200 call __rsaz_512_reduce
# ---- MULX path (generated only when addx is set) ----
1202 $code.=<<___ if ($addx);
1203 jmp .Lmul_gather_tail
1209 mov $n0, 128(%rsp) # off-load arguments
1210 mov $out, 128+8(%rsp)
1211 mov $mod, 128+16(%rsp)
1213 mulx ($ap), %rbx, %r8 # 0 iteration
1215 xor %edi, %edi # cf=0, of=0
1217 mulx 8($ap), %rax, %r9
1219 mulx 16($ap), %rbx, %r10
1222 mulx 24($ap), %rax, %r11
1225 mulx 32($ap), %rbx, %r12
1228 mulx 40($ap), %rax, %r13
1231 mulx 48($ap), %rbx, %r14
1234 mulx 56($ap), %rax, %r15
1239 adcx %rdi, %r15 # %rdi is 0
1242 jmp .Loop_mulx_gather
1246 movdqa 16*0(%rbp),%xmm8
1247 movdqa 16*1(%rbp),%xmm9
1248 movdqa 16*2(%rbp),%xmm10
1249 movdqa 16*3(%rbp),%xmm11
1251 movdqa 16*4(%rbp),%xmm12
1253 movdqa 16*5(%rbp),%xmm13
1255 movdqa 16*6(%rbp),%xmm14
1257 movdqa 16*7(%rbp),%xmm15
1258 leaq 128(%rbp), %rbp
1271 pshufd \$0x4e,%xmm8,%xmm9
1275 .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
1279 mulx 8($ap), %rax, %r9
1283 mulx 16($ap), %rax, %r10
1287 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
1291 mulx 32($ap), %rax, %r12
1295 mulx 40($ap), %rax, %r13
1299 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
1304 mulx 56($ap), %rax, %r15
1305 mov %rbx, 64(%rsp,%rcx,8)
1309 adcx %rdi, %r15 # cf=0
1312 jnz .Loop_mulx_gather
# Spill the remaining words to the frame starting at offset 64+16.
1316 mov %r10, 64+16(%rsp)
1317 mov %r11, 64+24(%rsp)
1318 mov %r12, 64+32(%rsp)
1319 mov %r13, 64+40(%rsp)
1320 mov %r14, 64+48(%rsp)
1321 mov %r15, 64+56(%rsp)
1323 mov 128(%rsp), %rdx # pull arguments
1324 mov 128+8(%rsp), $out
1325 mov 128+16(%rsp), %rbp
1336 call __rsaz_512_reducex
# ---- Tail reached after either reduce helper ----
1346 adcq 104(%rsp), %r13
1347 adcq 112(%rsp), %r14
1348 adcq 120(%rsp), %r15
1351 call __rsaz_512_subtract
1353 leaq 128+24+48(%rsp), %rax
# Win64 only: restore xmm6-xmm15. rax is rsp+0xc8 (128+24+48 = 200), so
# the -0xc8 bias addresses the same slots that were saved relative to rsp.
1355 $code.=<<___ if ($win64);
1356 movaps 0xa0-0xc8(%rax),%xmm6
1357 movaps 0xb0-0xc8(%rax),%xmm7
1358 movaps 0xc0-0xc8(%rax),%xmm8
1359 movaps 0xd0-0xc8(%rax),%xmm9
1360 movaps 0xe0-0xc8(%rax),%xmm10
1361 movaps 0xf0-0xc8(%rax),%xmm11
1362 movaps 0x100-0xc8(%rax),%xmm12
1363 movaps 0x110-0xc8(%rax),%xmm13
1364 movaps 0x120-0xc8(%rax),%xmm14
1365 movaps 0x130-0xc8(%rax),%xmm15
# Reload saved general-purpose registers from just below rax.
1370 movq -48(%rax), %r15
1372 movq -40(%rax), %r14
1374 movq -32(%rax), %r13
1376 movq -24(%rax), %r12
1378 movq -16(%rax), %rbp
1383 .cfi_def_cfa_register %rsp
1384 .Lmul_gather4_epilogue:
1387 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
# rsaz_512_mul_scatter4 -- declared below with six arguments; out, ap, mod,
# n0, tbl and pwr arrive in the registers listed in the my-list.
# The visible code multiplies (the first factor word is read through the
# out pointer), reduces, runs the subtract helper and then scatters the
# eight result words into the table with a 128-byte stride.
# NOTE(review): this my-list declares no inp, so the inp used by the
# scatter stores presumably resolves to the package-level alias set in the
# common internal API line -- confirm against the full file.
1391 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1393 .globl rsaz_512_mul_scatter4
1394 .type rsaz_512_mul_scatter4,\@function,6
1396 rsaz_512_mul_scatter4:
1413 .cfi_adjust_cfa_offset 128+24
1414 .Lmul_scatter4_body:
# tbl += pwr * 8: select the 8-byte column for this power.
1415 leaq ($tbl,$pwr,8), $tbl
1416 movq $out, %xmm0 # off-load arguments
# Runtime check for MULX and ADCX/ADOX, generated only when addx is set.
1423 $code.=<<___ if ($addx);
1424 movl \$0x80100,%r11d
1425 andl OPENSSL_ia32cap_P+8(%rip),%r11d
1426 cmpl \$0x80100,%r11d # check for MULX and ADO/CX
# MULQ path: b[0] travels in rbx.
1430 movq ($out),%rbx # pass b[0]
1445 call __rsaz_512_reduce
# MULX path: b[0] travels in rdx, and n0 is reloaded into rdx before the
# reduce helper is called.
1447 $code.=<<___ if ($addx);
1448 jmp .Lmul_scatter_tail
1452 movq ($out), %rdx # pass b[0]
1453 call __rsaz_512_mulx
1458 movq 128(%rsp), %rdx # pull $n0
1468 call __rsaz_512_reducex
# ---- Tail reached after either reduce helper ----
1478 adcq 104(%rsp), %r13
1479 adcq 112(%rsp), %r14
1480 adcq 120(%rsp), %r15
1484 call __rsaz_512_subtract
# One 8-byte word per 128-byte table row.
1486 movq %r8, 128*0($inp) # scatter
1487 movq %r9, 128*1($inp)
1488 movq %r10, 128*2($inp)
1489 movq %r11, 128*3($inp)
1490 movq %r12, 128*4($inp)
1491 movq %r13, 128*5($inp)
1492 movq %r14, 128*6($inp)
1493 movq %r15, 128*7($inp)
# Epilogue: rax = rsp + local frame (128+24) + 48; saved registers are
# reloaded from just below it.
1495 leaq 128+24+48(%rsp), %rax
1497 movq -48(%rax), %r15
1499 movq -40(%rax), %r14
1501 movq -32(%rax), %r13
1503 movq -24(%rax), %r12
1505 movq -16(%rax), %rbp
1510 .cfi_def_cfa_register %rsp
1511 .Lmul_scatter4_epilogue:
1514 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
# rsaz_512_mul_by_one -- declared below with four arguments; out, inp, mod
# and n0 arrive in the registers listed in the my-list. The visible code
# moves mod into rbp, fills seven 16-byte frame slots from xmm0, and calls
# one of the reduce helpers. Presumably this converts a value out of
# Montgomery form by reducing it with an implicit multiplier of one -- TODO
# confirm.
1518 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1520 .globl rsaz_512_mul_by_one
1521 .type rsaz_512_mul_by_one,\@function,4
1523 rsaz_512_mul_by_one:
1539 .cfi_adjust_cfa_offset 128+24
# The capability word is fetched early and compared later, just before the
# choice of reduce helper.
1542 $code.=<<___ if ($addx);
1543 movl OPENSSL_ia32cap_P+8(%rip),%eax
1546 movq $mod, %rbp # reassign argument
# Store xmm0 to frame offsets 0 through 96 (112 bytes). The value of xmm0
# is set outside this excerpt; presumably it is zero.
1559 movdqa %xmm0, (%rsp)
1560 movdqa %xmm0, 16(%rsp)
1561 movdqa %xmm0, 32(%rsp)
1562 movdqa %xmm0, 48(%rsp)
1563 movdqa %xmm0, 64(%rsp)
1564 movdqa %xmm0, 80(%rsp)
1565 movdqa %xmm0, 96(%rsp)
1567 $code.=<<___ if ($addx);
1569 cmpl \$0x80100,%eax # check for MULX and ADO/CX
1573 call __rsaz_512_reduce
# MULX-capable alternative: n0 is passed to the helper in rdx.
1575 $code.=<<___ if ($addx);
1579 movq 128(%rsp), %rdx # pull $n0
1580 call __rsaz_512_reducex
# Epilogue: rax = rsp + local frame (128+24) + 48; saved registers are
# reloaded from just below it.
1593 leaq 128+24+48(%rsp), %rax
1595 movq -48(%rax), %r15
1597 movq -40(%rax), %r14
1599 movq -32(%rax), %r13
1601 movq -24(%rax), %r12
1603 movq -16(%rax), %rbp
1608 .cfi_def_cfa_register %rsp
1609 .Lmul_by_one_epilogue:
1612 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
# Reduction helper called from the MULQ code paths of the public entry
# points in this file.
1615 { # __rsaz_512_reduce
1617 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1619 # clobbers: everything except %rbp and %rdi
1621 .type __rsaz_512_reduce,\@abi-omnipotent
# n0 is read at 128+8(%rsp) here, while callers store it at 128(%rsp); the
# extra 8 bytes presumably skip the return address pushed by the call --
# TODO confirm.
1626 imulq 128+8(%rsp), %rbx
1629 jmp .Lreduction_loop
1660 movq 128+8(%rsp), %rsi
1701 jne .Lreduction_loop
1705 .size __rsaz_512_reduce,.-__rsaz_512_reduce
# MULX/ADCX/ADOX counterpart of the reduction helper. Every visible caller
# loads n0 into rdx from 128(%rsp) just before the call, and the in-helper
# load below is commented out, so n0 is expected in rdx on entry.
1709 # __rsaz_512_reducex
1711 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1713 # clobbers: everything except %rbp and %rdi
1715 .type __rsaz_512_reducex,\@abi-omnipotent
1719 #movq 128+8(%rsp), %rdx # pull $n0
# rsi is kept at zero and serves as the zero addend for the closing
# adox/adcx pair of each pass.
1721 xorq %rsi, %rsi # cf=0,of=0
1723 jmp .Lreduction_loopx
# One pass multiplies rdx by each of the eight modulus words at rbp.
1728 mulx 0(%rbp), %rax, %r8
1732 mulx 8(%rbp), %rax, %r9
1736 mulx 16(%rbp), %rbx, %r10
1740 mulx 24(%rbp), %rbx, %r11
1744 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
# Multiply by the n0 slot in the caller frame (128+8 as seen from inside
# the helper).
1750 mulx 128+8(%rsp), %rbx, %rdx
1753 mulx 40(%rbp), %rax, %r13
1757 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
1761 mulx 56(%rbp), %rax, %r15
1764 adox %rsi, %r15 # %rsi is 0
1765 adcx %rsi, %r15 # cf=0
1768 jne .Lreduction_loopx
1772 .size __rsaz_512_reducex,.-__rsaz_512_reducex
# Subtract helper called at the tail of the public multiply and square
# entry points. Its body is not in this excerpt; going by the interface
# comment below, the mask in rcx presumably selects whether the modulus is
# subtracted, without a data-dependent branch -- TODO confirm.
1775 { # __rsaz_512_subtract
1776 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1778 # clobbers: everything but %rdi, %rsi and %rbp
1780 .type __rsaz_512_subtract,\@abi-omnipotent
1782 __rsaz_512_subtract:
1838 .size __rsaz_512_subtract,.-__rsaz_512_subtract
# MULQ-based multiply helper. Only its interface is visible in this
# excerpt: operand pointers arrive in rsi (ap) and rbp (bp), and the MULQ
# callers in this file additionally preload b[0] into rbx.
1843 # input: %rsi - ap, %rbp - bp
1845 # clobbers: everything
1846 my ($ap,$bp) = ("%rsi","%rbp");
1848 .type __rsaz_512_mul,\@abi-omnipotent
1991 .size __rsaz_512_mul,.-__rsaz_512_mul
# MULX-based multiply helper. Operand pointers arrive in rsi (ap) and rbp
# (bp); the caller preloads the first multiplier word into rdx, and rdi
# serves as a zero register once it has been cleared below. The visible
# code shows three groups of eight MULX instructions over the words of ap;
# later multiplier words are fetched from bp indexed by rcx, and result
# words are spilled to the frame.
1997 # input: %rsi - ap, %rbp - bp
1999 # clobbers: everything
2000 my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
2002 .type __rsaz_512_mulx,\@abi-omnipotent
# First group: products of the preloaded rdx with all eight words of ap.
2006 mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
2009 mulx 8($ap), %rax, %r9
2012 mulx 16($ap), %rbx, %r10
2015 mulx 24($ap), %rax, %r11
2018 mulx 32($ap), %rbx, %r12
2021 mulx 40($ap), %rax, %r13
2024 mulx 48($ap), %rbx, %r14
2027 mulx 56($ap), %rax, %r15
2033 xor $zero, $zero # cf=0,of=0
# Second group: at its end the next multiplier word is loaded from bp and
# the finished low word is stored to the frame, both indexed by rcx.
2039 mulx ($ap), %rax, %r8
2043 mulx 8($ap), %rax, %r9
2047 mulx 16($ap), %rax, %r10
2051 mulx 24($ap), %rax, %r11
2055 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
2059 mulx 40($ap), %rax, %r13
2063 mulx 48($ap), %rax, %r14
2067 mulx 56($ap), %rax, %r15
2068 movq 64($bp,%rcx,8), %rdx
2069 movq %rbx, 8+64-8(%rsp,%rcx,8)
2072 adcx $zero, %r15 # cf=0
# Third group, partly hand-encoded; the intended mnemonics are in the
# trailing comments.
2078 mulx ($ap), %rax, %r8
2082 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
2086 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
2090 mulx 24($ap), %rax, %r11
2094 mulx 32($ap), %rax, %r12
2098 mulx 40($ap), %rax, %r13
2102 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
2106 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
# Spill result words to the frame. Every offset carries a leading 8+,
# presumably for the return address of this helper -- TODO confirm.
2111 mov %rbx, 8+64-8(%rsp)
2113 mov %r9, 8+64+8(%rsp)
2114 mov %r10, 8+64+16(%rsp)
2115 mov %r11, 8+64+24(%rsp)
2116 mov %r12, 8+64+32(%rsp)
2117 mov %r13, 8+64+40(%rsp)
2118 mov %r14, 8+64+48(%rsp)
2119 mov %r15, 8+64+56(%rsp)
2123 .size __rsaz_512_mulx,.-__rsaz_512_mulx
# rsaz_512_scatter4 -- typed abi-omnipotent, so argument registers are
# chosen here by hand: the Win64 set or the System V set, depending on the
# win64 flag. The visible code offsets out by power*8 and then steps it in
# 128-byte strides; the stores themselves are not in this excerpt.
2127 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
2129 .globl rsaz_512_scatter4
2130 .type rsaz_512_scatter4,\@abi-omnipotent
# out += power * 8: select the 8-byte column for this power.
2134 leaq ($out,$power,8), $out
# Step to the next 128-byte table row.
2142 leaq 128($out), $out
2147 .size rsaz_512_scatter4,.-rsaz_512_scatter4
# rsaz_512_gather4 -- typed abi-omnipotent. The visible code builds compare
# masks from the power index and reads the table at inp in 128-byte rows
# through xmm8-xmm15. Presumably the masks select the requested entry
# without a power-dependent address -- TODO confirm against the full file.
2149 .globl rsaz_512_gather4
2150 .type rsaz_512_gather4,\@abi-omnipotent
# Win64 prologue, hand-encoded so that every instruction has a fixed
# length. The cumulative lengths (0x07, 0x0b, 0x10, 0x16 ... 0x3d, 0x46)
# are the prologue offsets recorded in the unwind table
# .LSEH_info_rsaz_512_gather4.
2155 $code.=<<___ if ($win64);
2156 .LSEH_begin_rsaz_512_gather4:
2157 .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
2158 .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
2159 .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
2160 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
2161 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
2162 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
2163 .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
2164 .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
2165 .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
2166 .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
2167 .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
# Counter seeds for the mask computation; the power index is broadcast to
# all four lanes of xmm8 for the compares.
2171 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
2172 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
2174 pshufd \$0,%xmm8,%xmm8 # broadcast $power
2178 ########################################################################
2179 # calculate mask by comparing 0..15 to $power
2181 for($i=0;$i<4;$i++) {
2183 paddd %xmm`$i`,%xmm`$i+1`
2184 pcmpeqd %xmm8,%xmm`$i`
2185 movdqa %xmm7,%xmm`$i+3`
2190 paddd %xmm`$i`,%xmm`$i+1`
2191 pcmpeqd %xmm8,%xmm`$i`
# Read one 128-byte table row, then advance inp to the next row.
2200 movdqa 16*0($inp),%xmm8
2201 movdqa 16*1($inp),%xmm9
2202 movdqa 16*2($inp),%xmm10
2203 movdqa 16*3($inp),%xmm11
2205 movdqa 16*4($inp),%xmm12
2207 movdqa 16*5($inp),%xmm13
2209 movdqa 16*6($inp),%xmm14
2211 movdqa 16*7($inp),%xmm15
2212 leaq 128($inp), $inp
2225 pshufd \$0x4e,%xmm8,%xmm9
# Win64 epilogue: restore xmm6-xmm15 from the slots used by the prologue.
2232 $code.=<<___ if ($win64);
2233 movaps 0x00(%rsp),%xmm6
2234 movaps 0x10(%rsp),%xmm7
2235 movaps 0x20(%rsp),%xmm8
2236 movaps 0x30(%rsp),%xmm9
2237 movaps 0x40(%rsp),%xmm10
2238 movaps 0x50(%rsp),%xmm11
2239 movaps 0x60(%rsp),%xmm12
2240 movaps 0x70(%rsp),%xmm13
2241 movaps 0x80(%rsp),%xmm14
2242 movaps 0x90(%rsp),%xmm15
2247 .LSEH_end_rsaz_512_gather4:
2249 .size rsaz_512_gather4,.-rsaz_512_gather4
# Win64 structured-exception handler shared by the entry points whose
# unwind info carries a HandlerData pair (body label, epilogue label).
# Visible logic: if the faulting Rip lies before the body label or at or
# after the epilogue label, the frame is treated as not set up and the
# register recovery is skipped. Otherwise the handler computes the frame
# top from context Rsp, recovers the saved registers into the context
# record, and lets RtlVirtualUnwind continue the unwind.
2258 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2259 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2267 .extern __imp_RtlVirtualUnwind
2268 .type se_handler,\@abi-omnipotent
2282 mov 120($context),%rax # pull context->Rax
2283 mov 248($context),%rbx # pull context->Rip
2285 mov 8($disp),%rsi # disp->ImageBase
2286 mov 56($disp),%r11 # disp->HandlerData
# HandlerData entries are image-relative, so ImageBase is added before
# comparing against Rip.
2288 mov 0(%r11),%r10d # HandlerData[0]
2289 lea (%rsi,%r10),%r10 # end of prologue label
2290 cmp %r10,%rbx # context->Rip<end of prologue label
2291 jb .Lcommon_seh_tail
2293 mov 152($context),%rax # pull context->Rsp
2295 mov 4(%r11),%r10d # HandlerData[1]
2296 lea (%rsi,%r10),%r10 # epilogue label
2297 cmp %r10,%rbx # context->Rip>=epilogue label
2298 jae .Lcommon_seh_tail
# Same frame-top computation as the function epilogues: Rsp + 128+24+48.
2300 lea 128+24+48(%rax),%rax
# Special case for rsaz_512_mul_gather4, which also saves XMM registers.
# The comparison that sets the flags for the jne is not in this excerpt.
2302 lea .Lmul_gather4_epilogue(%rip),%rbx
2304 jne .Lse_not_in_mul_gather4
# Copy the saved XMM area into the context record at offset 512
# (presumably the Xmm6 slot; the count load is not in this excerpt).
2308 lea -48-0xa8(%rax),%rsi
2309 lea 512($context),%rdi
2311 .long 0xa548f3fc # cld; rep movsq
2313 .Lse_not_in_mul_gather4:
2320 mov %rbx,144($context) # restore context->Rbx
2321 mov %rbp,160($context) # restore context->Rbp
2322 mov %r12,216($context) # restore context->R12
2323 mov %r13,224($context) # restore context->R13
2324 mov %r14,232($context) # restore context->R14
2325 mov %r15,240($context) # restore context->R15
2330 mov %rax,152($context) # restore context->Rsp
2331 mov %rsi,168($context) # restore context->Rsi
2332 mov %rdi,176($context) # restore context->Rdi
# Copy the updated context into the ContextRecord of the dispatcher. ecx is
# a quadword count for rep movsq: 154 * 8 = 1232 bytes.
2334 mov 40($disp),%rdi # disp->ContextRecord
2335 mov $context,%rsi # context
2336 mov \$154,%ecx # sizeof(CONTEXT)
2337 .long 0xa548f3fc # cld; rep movsq
# Call RtlVirtualUnwind: four register arguments, and arguments five to
# eight on the stack at offsets 32 through 56.
2340 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2341 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2342 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2343 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2344 mov 40(%rsi),%r10 # disp->ContextRecord
2345 lea 56(%rsi),%r11 # &disp->HandlerData
2346 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2347 mov %r10,32(%rsp) # arg5
2348 mov %r11,40(%rsp) # arg6
2349 mov %r12,48(%rsp) # arg7
2350 mov %rcx,56(%rsp) # arg8, (NULL)
2351 call *__imp_RtlVirtualUnwind(%rip)
2353 mov \$1,%eax # ExceptionContinueSearch
2365 .size se_handler,.-se_handler
# Win64 function table: one (begin, end, unwind-info) triple of
# image-relative addresses per exported function.
2369 .rva .LSEH_begin_rsaz_512_sqr
2370 .rva .LSEH_end_rsaz_512_sqr
2371 .rva .LSEH_info_rsaz_512_sqr
2373 .rva .LSEH_begin_rsaz_512_mul
2374 .rva .LSEH_end_rsaz_512_mul
2375 .rva .LSEH_info_rsaz_512_mul
2377 .rva .LSEH_begin_rsaz_512_mul_gather4
2378 .rva .LSEH_end_rsaz_512_mul_gather4
2379 .rva .LSEH_info_rsaz_512_mul_gather4
2381 .rva .LSEH_begin_rsaz_512_mul_scatter4
2382 .rva .LSEH_end_rsaz_512_mul_scatter4
2383 .rva .LSEH_info_rsaz_512_mul_scatter4
2385 .rva .LSEH_begin_rsaz_512_mul_by_one
2386 .rva .LSEH_end_rsaz_512_mul_by_one
2387 .rva .LSEH_info_rsaz_512_mul_by_one
2389 .rva .LSEH_begin_rsaz_512_gather4
2390 .rva .LSEH_end_rsaz_512_gather4
2391 .rva .LSEH_info_rsaz_512_gather4
# Unwind info records. For the first five functions HandlerData holds the
# body and epilogue labels that se_handler reads as HandlerData[0] and
# HandlerData[1].
2395 .LSEH_info_rsaz_512_sqr:
2398 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
2399 .LSEH_info_rsaz_512_mul:
2402 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
2403 .LSEH_info_rsaz_512_mul_gather4:
2406 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
2407 .LSEH_info_rsaz_512_mul_scatter4:
2410 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
2411 .LSEH_info_rsaz_512_mul_by_one:
2414 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
# rsaz_512_gather4 uses plain unwind codes instead of a language handler.
# Header bytes: 0x01, prologue size 0x46, unwind-code slot count 0x16
# (ten XMM saves at two slots each plus a two-slot stack allocation = 22).
# The first byte of each entry is the prologue offset just after the
# corresponding instruction.
# NOTE(review): the trailing comments read vmovaps with load operand order,
# but the hand-encoded prologue uses movaps stores of the XMM registers to
# the stack; the codes describe those register saves.
2415 .LSEH_info_rsaz_512_gather4:
2416 .byte 0x01,0x46,0x16,0x00
2417 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
2418 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
2419 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
2420 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
2421 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
2422 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
2423 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
2424 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
2425 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
2426 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
2427 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
# Post-process the accumulated assembly text: every backquoted span is
# evaluated as a Perl expression and replaced by its value.
2431 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# A failed close is fatal, so a write or translator failure is not silently
# ignored. Presumably STDOUT was aliased to the translator pipe in a line
# not shown in this excerpt -- TODO confirm.
2433 close STDOUT or die "error closing STDOUT";