3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
11 # Montgomery multiplication routine for x86_64. While it gives a modest
12 # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
13 # than twice as fast (>2x). The most common case, rsa1024 sign, is
14 # improved by a respectable 50%. It remains to be seen if loop unrolling
15 # and a dedicated squaring routine can provide further improvement...
# Redirect STDOUT into a pipe: everything printed from here on is fed to the
# x86_64-xlate.pl translator, which is run with the current Perl interpreter
# ($^X) and receives $output as its argument.
18 open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
# Symbolic names for the registers carrying the bn_mul_mont arguments. Read
# top to bottom, the trailing comments spell out the C parameter list.
21 $rp="%rdi"; # BN_ULONG *rp,       -- vector the assembly stores the result words into
22 $ap="%rsi"; # const BN_ULONG *ap, -- first operand; words are multiplied by bp[i]
23 $bp="%rdx"; # const BN_ULONG *bp, -- second operand as it arrives (rebound below)
24 $np="%rcx"; # const BN_ULONG *np, -- vector whose words are multiplied by m1 and compared against tp
25 $n0="%r8"; # BN_ULONG n0,        -- factor imulq-ed with the low word to form m1
26 $num="%r9"; # int num);           -- word count used to size and index the vectors
# From here on $bp names %r12: the assembly copies the incoming %rdx into it
# with "mov %rdx,$bp" -- presumably because mulq overwrites %rdx with the high
# half of every product.
29 $bp="%r12"; # reassign $bp
# bn_mul_mont -- Montgomery multiplication routine (see the file header).
# Operands are referred to through the Perl register names bound earlier in
# the file (rp, ap, bp, np, n0, num). The routine works in a temporary vector
# tp carved out of the stack, num+2 words of 8 bytes each:
#   tp[0..num-1]  running result words
#   tp[num]       topmost overflow word
#   tp[num+1]     value reloaded into %rsp on the way out
# NOTE(review): the trailing 6 on the .type line is presumably the argument
# count consumed by the perlasm translator -- confirm against x86_64-xlate.pl.
40 .type bn_mul_mont,\@function,6
# --- carve out and anchor the temporary vector ------------------------------
53 lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)); assumes %rax holds -(num+2) here
54 and \$-1024,%rsp # round tp down to a 1024-byte boundary to minimize TLB usage
55 mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rbp, the value later reloaded into %rsp (presumably the entry %rsp)
56 mov %rdx,$bp # move bp out of %rdx into its reassigned register
# --- first pass: products with bp[0] ----------------------------------------
61 mov ($bp),$m0 # m0=bp[0]
63 mulq $m0 # %rdx:%rax = ap[0]*bp[0]
67 imulq $n0,%rax # "tp[0]"*n0 (low 64 bits kept; m1 in the comments below)
71 add $lo0,%rax # sum itself is discarded; presumably only its carry is consumed next
78 mulq $m0 # %rdx:%rax = ap[j]*bp[0]
88 add $lo0,%rax # np[j]*m1+ap[j]*bp[0]
90 mov %rax,-8(%rsp,$j,8) # tp[j-1] = low word of the sum
100 mov $hi1,-8(%rsp,$j,8) # tp[j-1] = hi1
# --- subsequent passes: products with bp[i], accumulated onto tp ------------
108 mov ($bp,$i,8),$m0 # m0=bp[i]
109 mov ($ap),%rax # ap[0]
110 mulq $m0 # %rdx:%rax = ap[0]*bp[i]
111 add (%rsp),%rax # ap[0]*bp[i]+tp[0]
116 imulq $n0,%rax # tp[0]*n0 (low 64 bits kept; m1 for this pass)
119 mulq ($np,$j,8) # np[j]*m1; the original comment says np[0], i.e. j is expected to be 0 here
120 add $lo0,%rax # sum itself is discarded; presumably only its carry is consumed next
128 mulq $m0 # %rdx:%rax = ap[j]*bp[i]
131 add (%rsp,$j,8),%rax # ap[j]*bp[i]+tp[j]
140 add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j]
142 mov %rax,-8(%rsp,$j,8) # tp[j-1] = low word of the sum
149 xor %rdx,%rdx # %rdx=0, ready to collect the top carry; j equals num here...
152 add (%rsp,$j,8),$hi1 # hi1 += tp[num]: pull upmost overflow bit
154 mov $hi1,-8(%rsp,$j,8) # tp[num-1] = hi1
155 mov %rdx,(%rsp,$j,8) # tp[num] = %rdx: store upmost overflow bit
# --- final step: choose between tp and tp-np, write rp, wipe tp -------------
161 sub $i,$i # i=0 and CF=0 in one instruction
162 cmp \$0,%rdx # %rdx still holds upmost overflow bit
163 jnz .Lsub # overflow word non-zero: go subtract ... and j still equals num
164 mov -8(%rsp,$num,8),%rax # %rax = tp[num-1]
165 cmp -8($np,$num,8),%rax # tp[num-1]-np[num-1]: compare the top words, flags only
168 lea -1($num),$j # j=num-1
172 mov %rax,($rp,$j,8) # rp[j]=%rax (the original comment's rp[i]=tp[i] is really indexed by j)
173 mov $i,(%rsp,$j,8) # zap temporary vector: overwrite tp[j] with i
178 mov 8(%rsp,$num,8),%rsp # restore %rsp from the tp[num+1] slot
189 .Lsub: mov (%rsp,$i,8),%rax # %rax = tp[i]
191 mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j] (per the original comment; the subtract is not among the lines shown)
193 dec $j # j--; dec leaves CF untouched, so a pending borrow survives
195 lea -1($num),$j # j=num-1; lea does not modify flags
197 jc .Lcopy # borrow set: tp was less than np
199 .Lzap: mov $i,(%rsp,$j,8) # zap temporary vector: overwrite tp[j] with i
203 .size bn_mul_mont,.-bn_mul_mont