2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Montgomery multiplication routine for x86_64. While it gives modest
20 # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
21 # than twice, >2x, as fast. Most common rsa1024 sign is improved by
22 # respectable 50%. It remains to be seen if loop unrolling and
23 # dedicated squaring routine can provide further improvement...
27 # Add dedicated squaring procedure. Performance improvement varies
28 # from platform to platform, but in average it's ~5%/15%/25%/33%
29 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
33 # Unroll and modulo-schedule inner loops in such manner that they
34 # are "fallen through" for input lengths of 8, which is critical for
35 # 1024-bit RSA *sign*. Average performance improvement in comparison
36 # to *initial* version of this module from 2005 is ~0%/30%/40%/45%
37 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
41 # Optimize reduction in squaring procedure and improve 1024+-bit RSA
42 # sign performance by 10-16% on Intel Sandy Bridge and later
43 # (virtually same on non-Intel processors).
47 # Add MULX/ADOX/ADCX code path.
51 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
53 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
55 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
56 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
57 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
58 die "can't locate x86_64-xlate.pl";
60 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
63 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
68 if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
73 if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
74 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
78 if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
79 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
84 $rp="%rdi"; # BN_ULONG *rp,
85 $ap="%rsi"; # const BN_ULONG *ap,
86 $bp="%rdx"; # const BN_ULONG *bp,
87 $np="%rcx"; # const BN_ULONG *np,
88 $n0="%r8"; # const BN_ULONG *n0,
89 $num="%r9"; # int num);
101 .extern OPENSSL_ia32cap_P
104 .type bn_mul_mont,\@function,6
114 $code.=<<___ if ($addx);
115 mov OPENSSL_ia32cap_P+8(%rip),%r11d
135 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2))
136 neg $num # restore $num
137 and \$-1024,%r10 # minimize TLB usage
139 # An OS-agnostic version of __chkstk.
141 # Some OSes (Windows) insist on stack being "wired" to
142 # physical memory in strictly sequential manner, i.e. if stack
143 # allocation spans two pages, then a reference to the farther one
144 # can be punished with SEGV. But page walking can do good even on
145 # other OSes, because it guarantees that a villain thread hits
146 # the guard page before it can do damage to an innocent one...
153 jmp .Lmul_page_walk_done
161 .Lmul_page_walk_done:
163 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
165 mov $bp,%r12 # reassign $bp
169 mov ($n0),$n0 # pull n0[0] value
170 mov ($bp),$m0 # m0=bp[0]
177 mulq $m0 # ap[0]*bp[0]
181 imulq $lo0,$m1 # "tp[0]"*n0
185 add %rax,$lo0 # discarded
198 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
201 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
205 mulq $m0 # ap[j]*bp[0]
217 mov ($ap),%rax # ap[0]
219 add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
221 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
228 mov $hi1,-8(%rsp,$num,8)
229 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
235 mov ($bp,$i,8),$m0 # m0=bp[i]
239 mulq $m0 # ap[0]*bp[i]
240 add %rax,$lo0 # ap[0]*bp[i]+tp[0]
244 imulq $lo0,$m1 # tp[0]*n0
248 add %rax,$lo0 # discarded
251 mov 8(%rsp),$lo0 # tp[1]
262 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
265 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
269 mulq $m0 # ap[j]*bp[i]
273 add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
283 mov ($ap),%rax # ap[0]
285 add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
288 mov $hi1,-16(%rsp,$j,8) # tp[j-1]
294 add $lo0,$hi1 # pull upmost overflow bit
296 mov $hi1,-8(%rsp,$num,8)
297 mov %rdx,(%rsp,$num,8) # store upmost overflow bit
303 xor $i,$i # i=0 and clear CF!
304 mov (%rsp),%rax # tp[0]
305 lea (%rsp),$ap # borrow ap for tp
309 .Lsub: sbb ($np,$i,8),%rax
310 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
311 mov 8($ap,$i,8),%rax # tp[i+1]
313 dec $j # doesn't affect CF!
316 sbb \$0,%rax # handle upmost overflow bit
323 or $np,$ap # ap=borrow?tp:rp
325 .Lcopy: # copy or in-place refresh
327 mov $i,(%rsp,$i,8) # zap temporary vector
328 mov %rax,($rp,$i,8) # rp[i]=tp[i]
333 mov 8(%rsp,$num,8),%rsi # restore %rsp
344 .size bn_mul_mont,.-bn_mul_mont
347 my @A=("%r10","%r11");
348 my @N=("%r13","%rdi");
350 .type bn_mul4x_mont,\@function,6
357 $code.=<<___ if ($addx);
372 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
374 and \$-1024,%r10 # minimize TLB usage
382 jmp .Lmul4x_page_walk_done
389 .Lmul4x_page_walk_done:
391 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
393 mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
394 mov %rdx,%r12 # reassign $bp
398 mov ($n0),$n0 # pull n0[0] value
399 mov ($bp),$m0 # m0=bp[0]
406 mulq $m0 # ap[0]*bp[0]
410 imulq $A[0],$m1 # "tp[0]"*n0
414 add %rax,$A[0] # discarded
437 mulq $m0 # ap[j]*bp[0]
439 mov -16($np,$j,8),%rax
445 mov -8($ap,$j,8),%rax
447 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
449 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
452 mulq $m0 # ap[j]*bp[0]
454 mov -8($np,$j,8),%rax
462 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
464 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
467 mulq $m0 # ap[j]*bp[0]
477 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
479 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
482 mulq $m0 # ap[j]*bp[0]
491 mov -16($ap,$j,8),%rax
493 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
495 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
500 mulq $m0 # ap[j]*bp[0]
502 mov -16($np,$j,8),%rax
508 mov -8($ap,$j,8),%rax
510 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
512 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
515 mulq $m0 # ap[j]*bp[0]
517 mov -8($np,$j,8),%rax
523 mov ($ap),%rax # ap[0]
525 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
527 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
533 mov $N[0],-8(%rsp,$j,8)
534 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
539 mov ($bp,$i,8),$m0 # m0=bp[i]
543 mulq $m0 # ap[0]*bp[i]
544 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
548 imulq $A[0],$m1 # tp[0]*n0
552 add %rax,$A[0] # "$N[0]", discarded
557 mulq $m0 # ap[j]*bp[i]
561 add 8(%rsp),$A[1] # +tp[1]
569 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
572 mov $N[1],(%rsp) # tp[j-1]
577 mulq $m0 # ap[j]*bp[i]
579 mov -16($np,$j,8),%rax
581 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
587 mov -8($ap,$j,8),%rax
591 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
594 mulq $m0 # ap[j]*bp[i]
596 mov -8($np,$j,8),%rax
598 add -8(%rsp,$j,8),$A[1]
608 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
611 mulq $m0 # ap[j]*bp[i]
615 add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
625 mov $N[0],-8(%rsp,$j,8) # tp[j-1]
628 mulq $m0 # ap[j]*bp[i]
632 add 8(%rsp,$j,8),$A[1]
639 mov -16($ap,$j,8),%rax
643 mov $N[1],-32(%rsp,$j,8) # tp[j-1]
648 mulq $m0 # ap[j]*bp[i]
650 mov -16($np,$j,8),%rax
652 add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
658 mov -8($ap,$j,8),%rax
662 mov $N[0],-24(%rsp,$j,8) # tp[j-1]
665 mulq $m0 # ap[j]*bp[i]
667 mov -8($np,$j,8),%rax
669 add -8(%rsp,$j,8),$A[1]
676 mov ($ap),%rax # ap[0]
680 mov $N[1],-16(%rsp,$j,8) # tp[j-1]
686 add (%rsp,$num,8),$N[0] # pull upmost overflow bit
688 mov $N[0],-8(%rsp,$j,8)
689 mov $N[1],(%rsp,$j,8) # store upmost overflow bit
695 my @ri=("%rax","%rdx",$m0,$m1);
697 mov 16(%rsp,$num,8),$rp # restore $rp
698 mov 0(%rsp),@ri[0] # tp[0]
700 mov 8(%rsp),@ri[1] # tp[1]
701 shr \$2,$num # num/=4
702 lea (%rsp),$ap # borrow ap for tp
703 xor $i,$i # i=0 and clear CF!
706 mov 16($ap),@ri[2] # tp[2]
707 mov 24($ap),@ri[3] # tp[3]
709 lea -1($num),$j # j=num/4-1
713 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
714 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
715 sbb 16($np,$i,8),@ri[2]
716 mov 32($ap,$i,8),@ri[0] # tp[i+1]
717 mov 40($ap,$i,8),@ri[1]
718 sbb 24($np,$i,8),@ri[3]
719 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
720 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
721 sbb 32($np,$i,8),@ri[0]
722 mov 48($ap,$i,8),@ri[2]
723 mov 56($ap,$i,8),@ri[3]
724 sbb 40($np,$i,8),@ri[1]
726 dec $j # doesn't affect CF!
729 mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
730 mov 32($ap,$i,8),@ri[0] # load overflow bit
731 sbb 16($np,$i,8),@ri[2]
732 mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
733 sbb 24($np,$i,8),@ri[3]
734 mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
736 sbb \$0,@ri[0] # handle upmost overflow bit
737 mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
744 or $np,$ap # ap=borrow?tp:rp
751 .Lcopy4x: # copy or in-place refresh
752 movdqu 16($ap,$i),%xmm2
753 movdqu 32($ap,$i),%xmm1
754 movdqa %xmm0,16(%rsp,$i)
755 movdqu %xmm2,16($rp,$i)
756 movdqa %xmm0,32(%rsp,$i)
757 movdqu %xmm1,32($rp,$i)
763 movdqu 16($ap,$i),%xmm2
764 movdqa %xmm0,16(%rsp,$i)
765 movdqu %xmm2,16($rp,$i)
769 mov 8(%rsp,$num,8),%rsi # restore %rsp
780 .size bn_mul4x_mont,.-bn_mul4x_mont
784 ######################################################################
785 # void bn_sqr8x_mont(
786 my $rptr="%rdi"; # const BN_ULONG *rptr,
787 my $aptr="%rsi"; # const BN_ULONG *aptr,
788 my $bptr="%rdx"; # not used
789 my $nptr="%rcx"; # const BN_ULONG *nptr,
790 my $n0 ="%r8"; # const BN_ULONG *n0);
791 my $num ="%r9"; # int num, has to be divisible by 8
793 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
794 my @A0=("%r10","%r11");
795 my @A1=("%r12","%r13");
796 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
798 $code.=<<___ if ($addx);
799 .extern bn_sqrx8x_internal # see x86_64-mont5 module
802 .extern bn_sqr8x_internal # see x86_64-mont5 module
804 .type bn_sqr8x_mont,\@function,6
818 shl \$3,${num}d # convert $num to bytes
819 shl \$3+2,%r10 # 4*$num
822 ##############################################################
823 # ensure that stack frame doesn't alias with $aptr modulo
824 # 4096. This is done to allow memory disambiguation logic
827 lea -64(%rsp,$num,2),%r11
834 sub %r11,%rbp # align with $aptr
835 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
840 lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
841 lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num)
855 jmp .Lsqr8x_page_walk_done
863 .Lsqr8x_page_walk_done:
869 mov %rax, 40(%rsp) # save original %rsp
872 movq $nptr, %xmm2 # save pointer to modulus
874 movq $rptr,%xmm1 # save $rptr
875 movq %r10, %xmm3 # -$num
877 $code.=<<___ if ($addx);
878 mov OPENSSL_ia32cap_P+8(%rip),%eax
883 call bn_sqrx8x_internal # see x86_64-mont5 module
884 # %rax top-most carry
887 # %r8 end of tp[2*num]
892 sar \$3+2,%rcx # %cf=0
899 call bn_sqr8x_internal # see x86_64-mont5 module
900 # %rax top-most carry
903 # %rdi end of tp[2*num]
908 sar \$3+2,%rcx # %cf=0
928 inc %rcx # preserves %cf
931 sbb \$0,%rax # top-most carry
932 lea (%rbx,$num),%rbx # rewind
933 lea ($rptr,$num),$rptr # rewind
937 pshufd \$0,%xmm1,%xmm1
938 mov 40(%rsp),%rsi # restore %rsp
939 jmp .Lsqr8x_cond_copy
943 movdqa 16*0(%rbx),%xmm2
944 movdqa 16*1(%rbx),%xmm3
946 movdqu 16*0($rptr),%xmm4
947 movdqu 16*1($rptr),%xmm5
948 lea 16*2($rptr),$rptr
949 movdqa %xmm0,-16*2(%rbx) # zero tp
950 movdqa %xmm0,-16*1(%rbx)
951 movdqa %xmm0,-16*2(%rbx,%rdx)
952 movdqa %xmm0,-16*1(%rbx,%rdx)
961 movdqu %xmm4,-16*2($rptr)
962 movdqu %xmm5,-16*1($rptr)
964 jnz .Lsqr8x_cond_copy
976 .size bn_sqr8x_mont,.-bn_sqr8x_mont
981 my $bp="%rdx"; # original value
984 .type bn_mulx4x_mont,\@function,6
997 shl \$3,${num}d # convert $num to bytes
999 sub $num,%r10 # -$num
1001 lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8)
1006 lea (%rbp,%r11),%rsp
1009 ja .Lmulx4x_page_walk
1010 jmp .Lmulx4x_page_walk_done
1014 lea -4096(%rsp),%rsp
1017 ja .Lmulx4x_page_walk
1018 .Lmulx4x_page_walk_done:
1021 ##############################################################
1024 # +8 off-loaded &b[i]
1033 mov $num,0(%rsp) # save $num
1035 mov %r10,16(%rsp) # end of b[num]
1037 mov $n0, 24(%rsp) # save *n0
1038 mov $rp, 32(%rsp) # save $rp
1039 mov %rax,40(%rsp) # save original %rsp
1040 mov $num,48(%rsp) # inner counter
1046 my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
1047 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1051 mov ($bp),%rdx # b[0], $bp==%rdx actually
1052 lea 64+32(%rsp),$tptr
1055 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
1056 mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
1058 mov $bptr,8(%rsp) # off-load &b[i]
1059 mulx 2*8($aptr),%r12,%r13 # ...
1063 mov $mi,$bptr # borrow $bptr
1064 imulq 24(%rsp),$mi # "t[0]"*n0
1065 xor $zero,$zero # cf=0, of=0
1067 mulx 3*8($aptr),%rax,%r14
1069 lea 4*8($aptr),$aptr
1071 adcx $zero,%r14 # cf=0
1073 mulx 0*8($nptr),%rax,%r10
1074 adcx %rax,$bptr # discarded
1076 mulx 1*8($nptr),%rax,%r11
1079 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
1080 mov 48(%rsp),$bptr # counter value
1081 mov %r10,-4*8($tptr)
1084 mulx 3*8($nptr),%rax,%r15
1086 mov %r11,-3*8($tptr)
1088 adox $zero,%r15 # of=0
1089 lea 4*8($nptr),$nptr
1090 mov %r12,-2*8($tptr)
1096 adcx $zero,%r15 # cf=0, modulo-scheduled
1097 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
1099 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
1101 mulx 2*8($aptr),%r12,%rax # ...
1103 mulx 3*8($aptr),%r13,%r14
1107 adcx $zero,%r14 # cf=0
1108 lea 4*8($aptr),$aptr
1109 lea 4*8($tptr),$tptr
1112 mulx 0*8($nptr),%rax,%r15
1115 mulx 1*8($nptr),%rax,%r15
1118 mulx 2*8($nptr),%rax,%r15
1119 mov %r10,-5*8($tptr)
1121 mov %r11,-4*8($tptr)
1123 mulx 3*8($nptr),%rax,%r15
1125 mov %r12,-3*8($tptr)
1128 lea 4*8($nptr),$nptr
1129 mov %r13,-2*8($tptr)
1131 dec $bptr # of=0, pass cf
1134 mov 0(%rsp),$num # load num
1135 mov 8(%rsp),$bptr # re-load &b[i]
1136 adc $zero,%r15 # modulo-scheduled
1138 sbb %r15,%r15 # top-most carry
1139 mov %r14,-1*8($tptr)
1144 mov ($bptr),%rdx # b[i]
1145 lea 8($bptr),$bptr # b++
1146 sub $num,$aptr # rewind $aptr
1147 mov %r15,($tptr) # save top-most carry
1148 lea 64+4*8(%rsp),$tptr
1149 sub $num,$nptr # rewind $nptr
1151 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
1152 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1154 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
1155 adox -4*8($tptr),$mi
1157 mulx 2*8($aptr),%r15,%r13 # ...
1158 adox -3*8($tptr),%r11
1160 adox -2*8($tptr),%r12
1164 mov $bptr,8(%rsp) # off-load &b[i]
1166 imulq 24(%rsp),$mi # "t[0]"*n0
1167 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
1169 mulx 3*8($aptr),%rax,%r14
1172 adox -1*8($tptr),%r13
1174 lea 4*8($aptr),$aptr
1177 mulx 0*8($nptr),%rax,%r10
1178 adcx %rax,%r15 # discarded
1180 mulx 1*8($nptr),%rax,%r11
1183 mulx 2*8($nptr),%rax,%r12
1184 mov %r10,-4*8($tptr)
1187 mulx 3*8($nptr),%rax,%r15
1189 mov %r11,-3*8($tptr)
1190 lea 4*8($nptr),$nptr
1192 adox $zero,%r15 # of=0
1193 mov 48(%rsp),$bptr # counter value
1194 mov %r12,-2*8($tptr)
1200 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
1201 adcx $zero,%r15 # cf=0, modulo-scheduled
1203 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
1204 adcx 0*8($tptr),%r10
1206 mulx 2*8($aptr),%r12,%rax # ...
1207 adcx 1*8($tptr),%r11
1209 mulx 3*8($aptr),%r13,%r14
1211 adcx 2*8($tptr),%r12
1213 adcx 3*8($tptr),%r13
1214 adox $zero,%r14 # of=0
1215 lea 4*8($aptr),$aptr
1216 lea 4*8($tptr),$tptr
1217 adcx $zero,%r14 # cf=0
1220 mulx 0*8($nptr),%rax,%r15
1223 mulx 1*8($nptr),%rax,%r15
1226 mulx 2*8($nptr),%rax,%r15
1227 mov %r10,-5*8($tptr)
1230 mulx 3*8($nptr),%rax,%r15
1232 mov %r11,-4*8($tptr)
1233 mov %r12,-3*8($tptr)
1236 lea 4*8($nptr),$nptr
1237 mov %r13,-2*8($tptr)
1239 dec $bptr # of=0, pass cf
1242 mov 0(%rsp),$num # load num
1243 mov 8(%rsp),$bptr # re-load &b[i]
1244 adc $zero,%r15 # modulo-scheduled
1245 sub 0*8($tptr),$zero # pull top-most carry
1247 sbb %r15,%r15 # top-most carry
1248 mov %r14,-1*8($tptr)
1254 sub $num,$nptr # rewind $nptr
1257 shr \$3+2,$num # %cf=0
1258 mov 32(%rsp),$rptr # restore rp
1267 lea 8*4($tptr),$tptr
1272 lea 8*4($nptr),$nptr
1277 lea 8*4($rptr),$rptr
1278 dec $num # preserves %cf
1281 sbb \$0,%r15 # top-most carry
1283 sub %rdx,$rptr # rewind
1287 pshufd \$0,%xmm1,%xmm1
1288 mov 40(%rsp),%rsi # restore %rsp
1289 jmp .Lmulx4x_cond_copy
1293 movdqa 16*0($tptr),%xmm2
1294 movdqa 16*1($tptr),%xmm3
1295 lea 16*2($tptr),$tptr
1296 movdqu 16*0($rptr),%xmm4
1297 movdqu 16*1($rptr),%xmm5
1298 lea 16*2($rptr),$rptr
1299 movdqa %xmm0,-16*2($tptr) # zero tp
1300 movdqa %xmm0,-16*1($tptr)
1309 movdqu %xmm4,-16*2($rptr)
1310 movdqu %xmm5,-16*1($rptr)
1312 jnz .Lmulx4x_cond_copy
1326 .size bn_mulx4x_mont,.-bn_mulx4x_mont
1330 .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1334 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1335 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1343 .extern __imp_RtlVirtualUnwind
1344 .type mul_handler,\@abi-omnipotent
1358 mov 120($context),%rax # pull context->Rax
1359 mov 248($context),%rbx # pull context->Rip
1361 mov 8($disp),%rsi # disp->ImageBase
1362 mov 56($disp),%r11 # disp->HandlerData
1364 mov 0(%r11),%r10d # HandlerData[0]
1365 lea (%rsi,%r10),%r10 # end of prologue label
1366 cmp %r10,%rbx # context->Rip<end of prologue label
1367 jb .Lcommon_seh_tail
1369 mov 152($context),%rax # pull context->Rsp
1371 mov 4(%r11),%r10d # HandlerData[1]
1372 lea (%rsi,%r10),%r10 # epilogue label
1373 cmp %r10,%rbx # context->Rip>=epilogue label
1374 jae .Lcommon_seh_tail
1376 mov 192($context),%r10 # pull $num
1377 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
1379 jmp .Lcommon_pop_regs
1380 .size mul_handler,.-mul_handler
1382 .type sqr_handler,\@abi-omnipotent
1396 mov 120($context),%rax # pull context->Rax
1397 mov 248($context),%rbx # pull context->Rip
1399 mov 8($disp),%rsi # disp->ImageBase
1400 mov 56($disp),%r11 # disp->HandlerData
1402 mov 0(%r11),%r10d # HandlerData[0]
1403 lea (%rsi,%r10),%r10 # end of prologue label
1404 cmp %r10,%rbx # context->Rip<.Lsqr_body
1405 jb .Lcommon_seh_tail
1407 mov 4(%r11),%r10d # HandlerData[1]
1408 lea (%rsi,%r10),%r10 # body label
1409 cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
1410 jb .Lcommon_pop_regs
1412 mov 152($context),%rax # pull context->Rsp
1414 mov 8(%r11),%r10d # HandlerData[2]
1415 lea (%rsi,%r10),%r10 # epilogue label
1416 cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
1417 jae .Lcommon_seh_tail
1419 mov 40(%rax),%rax # pull saved stack pointer
1428 mov %rbx,144($context) # restore context->Rbx
1429 mov %rbp,160($context) # restore context->Rbp
1430 mov %r12,216($context) # restore context->R12
1431 mov %r13,224($context) # restore context->R13
1432 mov %r14,232($context) # restore context->R14
1433 mov %r15,240($context) # restore context->R15
1438 mov %rax,152($context) # restore context->Rsp
1439 mov %rsi,168($context) # restore context->Rsi
1440 mov %rdi,176($context) # restore context->Rdi
1442 mov 40($disp),%rdi # disp->ContextRecord
1443 mov $context,%rsi # context
1444 mov \$154,%ecx # sizeof(CONTEXT)
1445 .long 0xa548f3fc # cld; rep movsq
1448 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1449 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1450 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1451 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1452 mov 40(%rsi),%r10 # disp->ContextRecord
1453 lea 56(%rsi),%r11 # &disp->HandlerData
1454 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1455 mov %r10,32(%rsp) # arg5
1456 mov %r11,40(%rsp) # arg6
1457 mov %r12,48(%rsp) # arg7
1458 mov %rcx,56(%rsp) # arg8, (NULL)
1459 call *__imp_RtlVirtualUnwind(%rip)
1461 mov \$1,%eax # ExceptionContinueSearch
1473 .size sqr_handler,.-sqr_handler
1477 .rva .LSEH_begin_bn_mul_mont
1478 .rva .LSEH_end_bn_mul_mont
1479 .rva .LSEH_info_bn_mul_mont
1481 .rva .LSEH_begin_bn_mul4x_mont
1482 .rva .LSEH_end_bn_mul4x_mont
1483 .rva .LSEH_info_bn_mul4x_mont
1485 .rva .LSEH_begin_bn_sqr8x_mont
1486 .rva .LSEH_end_bn_sqr8x_mont
1487 .rva .LSEH_info_bn_sqr8x_mont
1489 $code.=<<___ if ($addx);
1490 .rva .LSEH_begin_bn_mulx4x_mont
1491 .rva .LSEH_end_bn_mulx4x_mont
1492 .rva .LSEH_info_bn_mulx4x_mont
1497 .LSEH_info_bn_mul_mont:
1500 .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
1501 .LSEH_info_bn_mul4x_mont:
1504 .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
1505 .LSEH_info_bn_sqr8x_mont:
1508 .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
1511 $code.=<<___ if ($addx);
1512 .LSEH_info_bn_mulx4x_mont:
1515 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]