3 #******************************************************************************#
4 #* Copyright(c) 2012, Intel Corp. *#
5 #* Developers and authors: *#
6 #* Shay Gueron (1, 2), and Vlad Krasnov (1) *#
7 #* (1) Intel Architecture Group, Microprocessor and Chipset Development, *#
8 #* Israel Development Center, Haifa, Israel *#
9 #* (2) University of Haifa *#
10 #******************************************************************************#
11 #* This submission to OpenSSL is to be made available under the OpenSSL *#
12 #* license, and only to the OpenSSL project, in order to allow integration *#
13 #* into the publicly distributed code. ? *#
14 #* The use of this code, or portions of this code, or concepts embedded in *#
15 #* this code, or modification of this code and/or algorithm(s) in it, or the *#
16 #* use of this code for any other purpose than stated above, requires special *#
18 #******************************************************************************#
19 #******************************************************************************#
21 #* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS *#
22 #* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
23 #* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
24 #* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
25 #* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
26 #* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF *#
27 #* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS *#
28 #* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN *#
29 #* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) *#
30 #* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
31 #* POSSIBILITY OF SUCH DAMAGE. *#
32 #******************************************************************************#
34 #* [1] S. Gueron, "Efficient Software Implementations of Modular *#
35 #* Exponentiation", http://eprint.iacr.org/2011/239 *#
36 #* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". *#
37 #* IEEE Proceedings of 9th International Conference on Information *#
38 #* Technology: New Generations (ITNG 2012), 821-823 (2012). *#
39 #* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
40 #* Journal of Cryptographic Engineering 2:31-43 (2012). *#
41 #* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis *#
42 #* resistant 512-bit and 1024-bit modular exponentiation for optimizing *#
43 #* RSA1024 and RSA2048 on x86_64 platforms", *#
44 #* http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
45 ################################################################################
47 # While the original submission covers 512- and 1024-bit exponentiation,
48 # this module is limited to the 512-bit version only (and as such
49 # accelerates RSA1024 sign). This is because the improvement for longer
50 # keys is not high enough to justify the effort; the highest measured
51 # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was
52 # upcoming at the time of this writing!] Nor does this module implement
53 # a "monolithic" complete exponentiation jumbo-subroutine; it adheres
54 # to a more modular mixture of C and assembly. And it's optimized even
55 # for processors other than the Intel Core family (see table below for
56 # improvement coefficients).
59 # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*)
60 # ----------------+---------------------------
61 # Opteron +13% |+5% +20%
62 # Bulldozer -0% |-1% +10%
64 # Westmere +5% |+14% +17%
65 # Sandy Bridge +2% |+12% +29%
66 # Ivy Bridge +1% |+11% +35%
67 # Haswell(**) -0% |+12% +39%
69 # VIA Nano +70% |+9% +25%
71 # (*) rsax engine and fips numbers are presented for reference
73 # (**) you might notice the MULX code below; strangely enough, the gain
74 # is marginal, which is why the code remains disabled.
# Argument handling: a first argument that contains a dot is taken to be the
# output file name, in which case no flavour was given.
78 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 output is selected either by the flavour (nasm/masm/mingw64) or by an
# output file name ending in .asm.
80 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Look for the perlasm translator next to this script first, then under
# ../../perlasm/ relative to it; give up if neither file exists.
82 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
83 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
84 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
85 die "can't locate x86_64-xlate.pl";
# Pipe the generated code through the translator. The open is checked so that
# a failure to start the pipe is reported at once instead of being ignored.
87 open OUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# File-level register names, used by code that does not declare its own.
90 ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
# rsaz_512_sqr -- public entry point. Five arguments are named below
# (out, inp, mod, n0, times), so the .type directive declares five as well.
# With a smaller count, times would be left unloaded wherever the translator
# sets up argument registers from the declared count (NOTE(review): that is
# the Win64 path of x86_64-xlate.pl -- confirm against the translator).
92 my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
98 .type rsaz_512_sqr,\@function,5
100 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
110 movq $mod, %rbp # common argument
# times is kept in the stack slot 128+8(%rsp) and is reloaded from that same
# slot near the end of the function.
118 movl $times,128+8(%rsp)
# The trailing shlq/shld notes name the shift that each add/adc/lea appears
# to stand in for (presumably the doubling of the cross products).
165 addq %r8, %r8 #shlq \$1, %r8
167 adcq %r9, %r9 #shld \$1, %r8, %r9
228 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
230 adcq %r11, %r11 #shld \$1, %r10, %r11
268 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
286 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
316 leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
334 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
359 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
374 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
398 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
406 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
427 adcq %r12, %r12 #shld \$1, %rbx, %r12
428 adcq %r13, %r13 #shld \$1, %r12, %r13
429 adcq %r14, %r14 #shld \$1, %r13, %r14
# MULX-based lines (see the (**) note in the file header). out is copied to
# xmm0 first; its register then serves as a mulx destination further down.
452 movq $out, %xmm0 # off-load
456 mulx 16($inp), %rcx, %r10
458 mulx 24($inp), %rax, %r11
461 mulx 32($inp), %rcx, %r12
464 mulx 40($inp), %rax, %r13
467 mulx 48($inp), %rcx, %r14
470 mulx 56($inp), %rax, %r15
479 mulx %rdx, %rax, %rdx
488 mulx 16($inp), %rax, %rbx
490 mulx 24($inp), $out, %r8
495 mulx 32($inp), %rax, %rbx
500 mulx 40($inp), $out, %r8
505 mulx 48($inp), %rax, %rbx
510 mulx 56($inp), $out, %r8
521 mulx %rdx, %rax, %rcx
531 mulx 24($inp), $out, %r9
533 mulx 32($inp), %rax, %rcx
538 mulx 40($inp), $out, %r9
543 mulx 48($inp), %rax, %rcx
548 mulx 56($inp), $out, %r9
559 mulx %rdx, %rax, %rdx
569 mulx 32($inp), %rax, %rbx
571 mulx 40($inp), $out, %r10
576 mulx 48($inp), %rax, %rbx
581 mulx 56($inp), $out, %r10
593 mulx %rdx, %rax, %rdx
603 mulx 40($inp), $out, %r11
605 mulx 48($inp), %rax, %rcx
610 mulx 56($inp), $out, %r11
621 mulx %rdx, %rax, %rdx
631 mulx 48($inp), %rax, %rbx
633 mulx 56($inp), $out, %r12
644 mulx %rdx, %rax, %rdx
654 mulx 56($inp), %rax, %r13
663 mulx %rdx, %rax, %rdx
673 mulx %rdx, %rax, %rdx
# Reduction first, then the subtract helper.
694 call _rsaz_512_reduce
706 call _rsaz_512_subtract
# Reload times from the stack slot it was stored in at the top.
710 movl 128+8(%rsp), $times
# rax = rsp + 128+24+48, the same offset the other entry points compute
# before reloading their saved registers.
716 leaq 128+24+48(%rsp), %rax
726 .size rsaz_512_sqr,.-rsaz_512_sqr
# rsaz_512_mul -- public entry point; the list below names the registers of
# its five arguments (out, ap, bp, mod, n0).
730 my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
733 .type rsaz_512_mul,\@function,5
# out is copied into xmm0; bp is copied into rbp, the register that the
# __rsaz_512_mul header lists as its bp input.
745 movq $out, %xmm0 # off-load arguments
749 movq $bp, %rbp # pass argument
# Reduction first, then the subtract helper.
764 call _rsaz_512_reduce
776 call _rsaz_512_subtract
# rax = rsp + 128+24+48. NOTE(review): the other entry points reload their
# saved registers relative to rax right after this; confirm the same here.
778 leaq 128+24+48(%rsp), %rax
788 .size rsaz_512_mul,.-rsaz_512_mul
# rsaz_512_mul_gather4 -- public entry point with six arguments
# (out, ap, bp, mod, n0, pwr).
792 my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
794 .globl rsaz_512_mul_gather4
795 .type rsaz_512_mul_gather4,\@function,6
797 rsaz_512_mul_gather4:
# Two 32-bit loads from bp + pwr*4, at offsets 64 and 0, go to eax and ebx
# (presumably the two halves of one 64-bit operand word -- TODO confirm).
807 movl 64($bp,$pwr,4), %eax
808 movq $out, %xmm0 # off-load arguments
809 movl ($bp,$pwr,4), %ebx
# rbp = bp + pwr*4 + 128, i.e. 128 bytes past the slot that was just read.
817 leaq 128($bp,$pwr,4), %rbp
818 mulq %rbx # 0 iteration
# Reduction first, then the subtract helper.
972 call _rsaz_512_reduce
984 call _rsaz_512_subtract
# rax = rsp + 128+24+48, the offset also used by the other entry points.
986 leaq 128+24+48(%rsp), %rax
# Epilogue label; it is listed in the HandlerData[] entry for this function.
994 .Lmul_gather4_epilogue:
996 .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
# rsaz_512_mul_scatter4 -- public entry point with six arguments
# (out, ap, mod, n0, tbl, pwr). inp is not declared in this list; the stores
# below presumably use the file-level inp register -- TODO confirm.
1000 my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1002 .globl rsaz_512_mul_scatter4
1003 .type rsaz_512_mul_scatter4,\@function,6
1005 rsaz_512_mul_scatter4:
# Body label; it is listed in the HandlerData[] entry for this function.
1014 .Lmul_scatter4_body:
# tbl = tbl + pwr*4: select the 32-bit column of the table to write to.
1015 leaq ($tbl,$pwr,4), $tbl
1016 movq $out, %xmm0 # off-load arguments
1036 call _rsaz_512_reduce
# Tail of an add-with-carry chain that folds stack words 104..120(%rsp) into
# r13..r15 (the start of the chain is not part of this excerpt).
1043 adcq 104(%rsp), %r13
1044 adcq 112(%rsp), %r14
1045 adcq 120(%rsp), %r15
1049 call _rsaz_512_subtract
# Scatter: the low 32 bits of r8..r15 go to the even 64-byte slots
# (64*0, 64*2, ... 64*14) ...
1051 movl %r8d, 64*0($inp) # scatter
1053 movl %r9d, 64*2($inp)
1055 movl %r10d, 64*4($inp)
1057 movl %r11d, 64*6($inp)
1059 movl %r12d, 64*8($inp)
1061 movl %r13d, 64*10($inp)
1063 movl %r14d, 64*12($inp)
1065 movl %r15d, 64*14($inp)
# ... and then to the odd slots (64*1, 64*3, ... 64*15).
# NOTE(review): for these to hold the upper halves, each register has to be
# shifted right by 32 in between; those instructions are not shown here.
1067 movl %r8d, 64*1($inp)
1068 movl %r9d, 64*3($inp)
1069 movl %r10d, 64*5($inp)
1070 movl %r11d, 64*7($inp)
1071 movl %r12d, 64*9($inp)
1072 movl %r13d, 64*11($inp)
1073 movl %r14d, 64*13($inp)
1074 movl %r15d, 64*15($inp)
# Epilogue: rax = rsp + 128+24+48, then r15, r14, r13, r12 and rbp are
# reloaded from the slots at -48 .. -16 below rax.
1076 leaq 128+24+48(%rsp), %rax
1077 movq -48(%rax), %r15
1078 movq -40(%rax), %r14
1079 movq -32(%rax), %r13
1080 movq -24(%rax), %r12
1081 movq -16(%rax), %rbp
1084 .Lmul_scatter4_epilogue:
1086 .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
# rsaz_512_mul_by_one -- public entry point with four arguments
# (out, inp, mod, n0).
1090 my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
1092 .globl rsaz_512_mul_by_one
1093 .type rsaz_512_mul_by_one,\@function,4
1095 rsaz_512_mul_by_one:
# mod is moved to rbp, where _rsaz_512_reduce expects it (see its header).
1105 movq $mod, %rbp # reassign argument
# xmm0 is written over the first 112 bytes of the stack frame
# (presumably xmm0 was zeroed just before, clearing this area -- the
# instruction that sets xmm0 is not shown here).
1118 movdqa %xmm0, (%rsp)
1119 movdqa %xmm0, 16(%rsp)
1120 movdqa %xmm0, 32(%rsp)
1121 movdqa %xmm0, 48(%rsp)
1122 movdqa %xmm0, 64(%rsp)
1123 movdqa %xmm0, 80(%rsp)
1124 movdqa %xmm0, 96(%rsp)
1126 call _rsaz_512_reduce
# Epilogue: rax = rsp + 128+24+48, then r15, r14, r13, r12 and rbp are
# reloaded from the slots at -48 .. -16 below rax.
1137 leaq 128+24+48(%rsp), %rax
1138 movq -48(%rax), %r15
1139 movq -40(%rax), %r14
1140 movq -32(%rax), %r13
1141 movq -24(%rax), %r12
1142 movq -16(%rax), %rbp
1145 .Lmul_by_one_epilogue:
1147 .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
1150 { # _rsaz_512_reduce
1152 # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
1154 # clobbers: everything except %rbp and %rdi
# Inside the routine n0 is read at 128+8(%rsp): it is the slot the callers
# refer to as 128(%rsp), seen after the call pushed an 8-byte return address.
1156 .type _rsaz_512_reduce,\@abi-omnipotent
# rbx is multiplied by n0 before the loop is entered.
1163 imulq 128+8(%rsp), %rbx
1166 jmp .Lreduction_loop
# n0 is fetched again inside the loop, this time into rsi.
1197 movq 128+8(%rsp), %rsi
1236 jne .Lreduction_loop
# MULX form: n0 is pulled into rdx, which mulx uses as its implicit source
# operand, and the words of mod at 0..56(%rbp) are multiplied in turn.
# NOTE(review): in this excerpt both the mulq and the mulx variants branch
# to the same .Lreduction_loop label; confirm only one of them is assembled.
1240 movq 128+8(%rsp), %rdx # pull $n0
1243 jmp .Lreduction_loop
1248 mulx 0(%rbp), %rax, %r8
1251 mulx 8(%rbp), %rax, %r9
1256 mulx 16(%rbp), %rax, %r10
1258 mov 128+8(%rsp), %rbx # pull $n0
1263 mulx 24(%rbp), %rax, %r11
1268 mulx 32(%rbp), %rax, %r12
1273 mulx 40(%rbp), %rax, %r13
1278 mulx 48(%rbp), %rax, %r14
1283 mulx 56(%rbp), %rax, %r15
1290 jne .Lreduction_loop
1295 .size _rsaz_512_reduce,.-_rsaz_512_reduce
1298 { # _rsaz_512_subtract
1299 # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
1301 # clobbers: everything but %rdi, %rsi and %rbp
# Internal helper called by the public entry points after the reduction.
# Declared abi-omnipotent; its body is not part of this excerpt.
1303 .type _rsaz_512_subtract,\@abi-omnipotent
1359 .size _rsaz_512_subtract,.-_rsaz_512_subtract
1364 # input: %rsi - ap, %rbp - bp
1366 # clobbers: everything
# Internal multiplication helper: ap is taken in rsi and bp in rbp, matching
# the register names declared on the next line. Body not shown here.
1367 my ($ap,$bp) = ("%rsi","%rbp");
1369 .type __rsaz_512_mul,\@abi-omnipotent
1511 .size __rsaz_512_mul,.-__rsaz_512_mul
# rsaz_512_scatter4 / rsaz_512_gather4 -- both are declared abi-omnipotent,
# and the argument registers are picked here per calling convention
# (rcx, rdx, r8d when win64 is set; rdi, rsi, edx otherwise).
1515 my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1517 .globl rsaz_512_scatter4
1518 .type rsaz_512_scatter4,\@abi-omnipotent
# out = out + power*4, then stepped by 128 bytes (presumably once per word
# stored, inside a loop that is not shown here).
1521 leaq ($out,$power,4), $out
1531 leaq 128($out), $out
1535 .size rsaz_512_scatter4,.-rsaz_512_scatter4
1537 .globl rsaz_512_gather4
1538 .type rsaz_512_gather4,\@abi-omnipotent
# Same addressing on the read side: inp = inp + power*4, then +128 per step.
1541 leaq ($inp,$power,4), $inp
1548 leaq 128($inp), $inp
1556 .size rsaz_512_gather4,.-rsaz_512_gather4
# se_handler -- Win64 exception handler with the prototype given below.
# It compares context->Rip with the two labels stored in HandlerData[]
# (end of prologue, epilogue). Only when Rip lies between them does it
# recover the saved registers from the frame. In every case it then copies
# the context, calls RtlVirtualUnwind and returns ExceptionContinueSearch.
1560 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1561 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1569 .extern __imp_RtlVirtualUnwind
1570 .type se_handler,\@abi-omnipotent
1584 mov 120($context),%rax # pull context->Rax
1585 mov 248($context),%rbx # pull context->Rip
1587 mov 8($disp),%rsi # disp->ImageBase
1588 mov 56($disp),%r11 # disp->HandlerData
# HandlerData[] holds image-relative addresses, hence the ImageBase add.
1590 mov 0(%r11),%r10d # HandlerData[0]
1591 lea (%rsi,%r10),%r10 # end of prologue label
1592 cmp %r10,%rbx # context->Rip<end of prologue label
1593 jb .Lcommon_seh_tail
1595 mov 152($context),%rax # pull context->Rsp
1597 mov 4(%r11),%r10d # HandlerData[1]
1598 lea (%rsi,%r10),%r10 # epilogue label
1599 cmp %r10,%rbx # context->Rip>=epilogue label
1600 jae .Lcommon_seh_tail
# Rip is inside the body: step rax over the frame using the same 128+24+48
# offset the entry points use before reloading their saved registers.
1602 lea 128+24+48(%rax),%rax
1610 mov %rbx,144($context) # restore context->Rbx
1611 mov %rbp,160($context) # restore context->Rbp
1612 mov %r12,216($context) # restore context->R12
1613 mov %r13,224($context) # restore context->R13
1614 mov %r14,232($context) # restore context->R14
1615 mov %r15,240($context) # restore context->R15
1620 mov %rax,152($context) # restore context->Rsp
1621 mov %rsi,168($context) # restore context->Rsi
1622 mov %rdi,176($context) # restore context->Rdi
# Copy the whole CONTEXT (154 quadwords) into disp->ContextRecord.
1624 mov 40($disp),%rdi # disp->ContextRecord
1625 mov $context,%rsi # context
1626 mov \$154,%ecx # sizeof(CONTEXT)
1627 .long 0xa548f3fc # cld; rep movsq
# Set up the eight arguments of RtlVirtualUnwind: four in registers, the
# remaining four in the stack slots at 32..56(%rsp).
1630 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1631 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1632 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1633 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1634 mov 40(%rsi),%r10 # disp->ContextRecord
1635 lea 56(%rsi),%r11 # &disp->HandlerData
1636 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1637 mov %r10,32(%rsp) # arg5
1638 mov %r11,40(%rsp) # arg6
1639 mov %r12,48(%rsp) # arg7
1640 mov %rcx,56(%rsp) # arg8, (NULL)
1641 call *__imp_RtlVirtualUnwind(%rip)
1643 mov \$1,%eax # ExceptionContinueSearch
# The size directive names se_handler, matching the .type directive above.
1655 .size se_handler,.-se_handler
# Table of (begin, end, info) RVA triples, one per public entry point
# (presumably the .pdata section -- the section directive is not shown).
1659 .rva .LSEH_begin_rsaz_512_sqr
1660 .rva .LSEH_end_rsaz_512_sqr
1661 .rva .LSEH_info_rsaz_512_sqr
1663 .rva .LSEH_begin_rsaz_512_mul
1664 .rva .LSEH_end_rsaz_512_mul
1665 .rva .LSEH_info_rsaz_512_mul
1667 .rva .LSEH_begin_rsaz_512_mul_gather4
1668 .rva .LSEH_end_rsaz_512_mul_gather4
1669 .rva .LSEH_info_rsaz_512_mul_gather4
1671 .rva .LSEH_begin_rsaz_512_mul_scatter4
1672 .rva .LSEH_end_rsaz_512_mul_scatter4
1673 .rva .LSEH_info_rsaz_512_mul_scatter4
1675 .rva .LSEH_begin_rsaz_512_mul_by_one
1676 .rva .LSEH_end_rsaz_512_mul_by_one
1677 .rva .LSEH_info_rsaz_512_mul_by_one
# Info records: each HandlerData[] pair gives the body label and the epilogue
# label of one function; se_handler reads them as HandlerData[0] and [1].
1681 .LSEH_info_rsaz_512_sqr:
1684 .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
1685 .LSEH_info_rsaz_512_mul:
1688 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1689 .LSEH_info_rsaz_512_mul_gather4:
1692 .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
1693 .LSEH_info_rsaz_512_mul_scatter4:
1696 .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
1697 .LSEH_info_rsaz_512_mul_by_one:
1700 .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
# Evaluate every back-quoted expression embedded in the generated code and
# splice the result of the evaluation in its place.
1704 $code =~ s{\`([^\`]*)\`}{eval $1}gem;