3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaken effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has a shared FPU and concurrent FPU-intensive paths,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, reasons like desire to
21 # experiment with dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
25 # (*) Engine accessing the driver in question is on my TODO list.
26 # For reference, the accelerator is estimated to give 6 to 10 times
27 # improvement on single-threaded RSA sign. It should be noted
28 # that 6-10x improvement coefficient does not actually mean
29 # something extraordinary in terms of absolute [single-threaded]
30 # performance, as SPARCv9 instruction set is by all means least
31 # suitable for high performance crypto among other 64 bit
32 # platforms. 6-10x factor simply places T1 in same performance
33 # domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34 # appear impressive at all, but it's the sign operation which is
35 # far more critical/interesting.
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still has hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
46 open STDOUT,">$output";
49 $rp="%i0"; # BN_ULONG *rp,
50 $ap="%i1"; # const BN_ULONG *ap,
51 $bp="%i2"; # const BN_ULONG *bp,
52 $np="%i3"; # const BN_ULONG *np,
53 $n0="%i4"; # const BN_ULONG *n0,
54 $num="%i5"; # int num);
64 $mask="%g1"; # 32 bits, what a waste...
77 $fname="bn_mul_mont_int";
80 #include "sparc_arch.h"
82 .section ".text",#alloc,#execinstr
87 cmp %o5,4 ! 128 bits minimum
89 sethi %hi(0xffffffff),$mask
95 sll $num,2,$num ! num*=4
96 or $mask,%lo(0xffffffff),$mask
100 ld [$bp],$mul0 ! bp[0]
103 add %sp,$bias,%o7 ! real top of stack
104 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
106 ld [$ap+4],$apj ! ap[1]
108 ld [$np],$car1 ! np[0]
109 sub %o7,$bias,%sp ! alloca
110 ld [$np+4],$npj ! np[1]
111 be,pt SIZE_T_CC,.Lbn_sqr_mont
114 mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
115 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
116 and $car0,$mask,$acc0
117 add %sp,$bias+$frame,$tp
118 ld [$ap+8],$apj !prologue!
120 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
121 and $mul1,$mask,$mul1
123 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
124 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
126 add $acc0,$car1,$car1
127 ld [$np+8],$npj !prologue!
129 mov $tmp0,$acc0 !prologue!
132 mulx $apj,$mul0,$tmp0
133 mulx $npj,$mul1,$tmp1
134 add $acc0,$car0,$car0
135 ld [$ap+$j],$apj ! ap[j]
136 and $car0,$mask,$acc0
137 add $acc1,$car1,$car1
138 ld [$np+$j],$npj ! np[j]
140 add $acc0,$car1,$car1
151 mulx $apj,$mul0,$tmp0 !epilogue!
152 mulx $npj,$mul1,$tmp1
153 add $acc0,$car0,$car0
154 and $car0,$mask,$acc0
155 add $acc1,$car1,$car1
157 add $acc0,$car1,$car1
161 add $tmp0,$car0,$car0
162 and $car0,$mask,$acc0
163 add $tmp1,$car1,$car1
165 add $acc0,$car1,$car1
169 add $car0,$car1,$car1
174 ld [$bp+4],$mul0 ! bp[1]
176 add %sp,$bias+$frame,$tp
177 ld [$ap],$car0 ! ap[0]
178 ld [$ap+4],$apj ! ap[1]
179 ld [$np],$car1 ! np[0]
180 ld [$np+4],$npj ! np[1]
181 ld [$tp],$tmp1 ! tp[0]
182 ld [$tp+4],$tpj ! tp[1]
185 mulx $car0,$mul0,$car0
186 mulx $apj,$mul0,$tmp0 !prologue!
187 add $tmp1,$car0,$car0
188 ld [$ap+8],$apj !prologue!
189 and $car0,$mask,$acc0
192 and $mul1,$mask,$mul1
194 mulx $car1,$mul1,$car1
195 mulx $npj,$mul1,$acc1 !prologue!
197 add $acc0,$car1,$car1
198 ld [$np+8],$npj !prologue!
200 mov $tmp0,$acc0 !prologue!
203 mulx $apj,$mul0,$tmp0
204 mulx $npj,$mul1,$tmp1
206 ld [$ap+$j],$apj ! ap[j]
207 add $acc0,$car0,$car0
208 add $acc1,$car1,$car1
209 ld [$np+$j],$npj ! np[j]
210 and $car0,$mask,$acc0
211 ld [$tp+8],$tpj ! tp[j]
213 add $acc0,$car1,$car1
216 st $car1,[$tp] ! tp[j-1]
224 mulx $apj,$mul0,$tmp0 !epilogue!
225 mulx $npj,$mul1,$tmp1
227 add $acc0,$car0,$car0
228 ld [$tp+8],$tpj ! tp[j]
229 and $car0,$mask,$acc0
230 add $acc1,$car1,$car1
232 add $acc0,$car1,$car1
233 st $car1,[$tp] ! tp[j-1]
237 add $tmp0,$car0,$car0
238 and $car0,$mask,$acc0
239 add $tmp1,$car1,$car1
240 add $acc0,$car1,$car1
241 st $car1,[$tp+4] ! tp[j-1]
246 add $car0,$car1,$car1
248 add $car2,$car1,$car1
253 ld [$bp+$i],$mul0 ! bp[i]
262 sub %g0,$num,%o7 ! k=-num
264 subcc %g0,%g0,%g0 ! clear %icc.c
269 subccc %o0,%o1,%o1 ! tp[j]-np[j]
274 subc $car2,0,$car2 ! handle upmost overflow bit
281 ld [$ap+%o7],%o0 ! copy or in-place refresh
282 st %g0,[$tp+%o7] ! zap tp
293 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
294 ######## code without following dedicated squaring procedure.
296 $sbit="%i2"; # re-use $bp!
301 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
302 mulx $apj,$mul0,$tmp0 !prologue!
303 and $car0,$mask,$acc0
304 add %sp,$bias+$frame,$tp
305 ld [$ap+8],$apj !prologue!
307 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
309 and $mul1,$mask,$mul1
311 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
312 mulx $npj,$mul1,$acc1 !prologue!
314 ld [$np+8],$npj !prologue!
316 add $acc0,$car1,$car1
318 mov $tmp0,$acc0 !prologue!
321 mulx $apj,$mul0,$tmp0
322 mulx $npj,$mul1,$tmp1
323 add $acc0,$car0,$car0 ! ap[j]*a0+c0
324 add $acc1,$car1,$car1
325 ld [$ap+$j],$apj ! ap[j]
326 and $car0,$mask,$acc0
327 ld [$np+$j],$npj ! np[j]
329 add $acc0,$acc0,$acc0
334 and $acc0,$mask,$acc0
336 add $acc0,$car1,$car1
344 mulx $apj,$mul0,$tmp0 ! epilogue
345 mulx $npj,$mul1,$tmp1
346 add $acc0,$car0,$car0 ! ap[j]*a0+c0
347 add $acc1,$car1,$car1
348 and $car0,$mask,$acc0
350 add $acc0,$acc0,$acc0
353 and $acc0,$mask,$acc0
354 add $acc0,$car1,$car1
358 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
359 add $tmp1,$car1,$car1
360 and $car0,$mask,$acc0
362 add $acc0,$acc0,$acc0
365 and $acc0,$mask,$acc0
366 add $acc0,$car1,$car1
370 add $car0,$car0,$car0
372 add $car0,$car1,$car1
376 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
377 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
378 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
379 ld [$ap+4],$mul0 ! ap[1]
380 ld [$ap+8],$apj ! ap[2]
381 ld [$np],$car1 ! np[0]
382 ld [$np+4],$npj ! np[1]
385 mulx $mul0,$mul0,$car0
386 and $mul1,$mask,$mul1
388 mulx $car1,$mul1,$car1
389 mulx $npj,$mul1,$acc1
390 add $tmp0,$car1,$car1
391 and $car0,$mask,$acc0
392 ld [$np+8],$npj ! np[2]
394 add $tmp1,$car1,$car1
396 add $acc0,$car1,$car1
398 add $acc1,$car1,$car1
401 st $car1,[%sp+$bias+$frame] ! tp[0]=
403 add %sp,$bias+$frame+4,$tp
406 mulx $apj,$mul0,$acc0
407 mulx $npj,$mul1,$acc1
408 add $acc0,$car0,$car0
410 ld [$ap+$j],$apj ! ap[j]
411 and $car0,$mask,$acc0
412 ld [$np+$j],$npj ! np[j]
414 add $acc1,$car1,$car1
415 ld [$tp+8],$tpj ! tp[j]
416 add $acc0,$acc0,$acc0
420 and $acc0,$mask,$acc0
422 add $acc0,$car1,$car1
423 st $car1,[$tp] ! tp[j-1]
429 mulx $apj,$mul0,$acc0
430 mulx $npj,$mul1,$acc1
431 add $acc0,$car0,$car0
433 and $car0,$mask,$acc0
435 add $acc1,$car1,$car1
436 add $acc0,$acc0,$acc0
439 and $acc0,$mask,$acc0
440 add $acc0,$car1,$car1
441 st $car1,[$tp] ! tp[j-1]
444 add $car0,$car0,$car0
446 add $car0,$car1,$car1
447 add $car2,$car1,$car1
451 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
452 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
453 ld [$ap+8],$mul0 ! ap[2]
454 ld [$np],$car1 ! np[0]
455 ld [$np+4],$npj ! np[1]
457 and $mul1,$mask,$mul1
460 mulx $mul0,$mul0,$car0
461 mulx $car1,$mul1,$car1
462 and $car0,$mask,$acc0
463 add $tmp1,$car1,$car1
465 add %sp,$bias+$frame,$tp
473 mulx $npj,$mul1,$acc1
478 add $acc1,$car1,$car1
487 ld [$ap+$j],$apj ! ap[j]
488 mulx $npj,$mul1,$acc1
490 ld [$np+$j],$npj ! np[j]
491 add $acc0,$car1,$car1
492 ld [$tp+8],$tpj ! tp[j]
493 add $acc1,$car1,$car1
499 be,pn %icc,.Lsqr_no_inner2
503 mulx $apj,$mul0,$acc0
504 mulx $npj,$mul1,$acc1
506 add $acc0,$car0,$car0
507 ld [$ap+$j],$apj ! ap[j]
508 and $car0,$mask,$acc0
509 ld [$np+$j],$npj ! np[j]
511 add $acc0,$acc0,$acc0
512 ld [$tp+8],$tpj ! tp[j]
516 and $acc0,$mask,$acc0
518 add $acc0,$car1,$car1
519 add $acc1,$car1,$car1
520 st $car1,[$tp] ! tp[j-1]
526 mulx $apj,$mul0,$acc0
527 mulx $npj,$mul1,$acc1
529 add $acc0,$car0,$car0
530 and $car0,$mask,$acc0
532 add $acc0,$acc0,$acc0
535 and $acc0,$mask,$acc0
536 add $acc0,$car1,$car1
537 add $acc1,$car1,$car1
538 st $car1,[$tp] ! tp[j-1]
541 add $car0,$car0,$car0
543 add $car0,$car1,$car1
544 add $car2,$car1,$car1
549 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
550 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
551 ld [$ap+$i],$mul0 ! ap[j]
552 ld [$np],$car1 ! np[0]
553 ld [$np+4],$npj ! np[1]
555 and $mul1,$mask,$mul1
558 mulx $mul0,$mul0,$car0
559 mulx $car1,$mul1,$car1
560 and $car0,$mask,$acc0
561 add $tmp1,$car1,$car1
563 add %sp,$bias+$frame,$tp
568 cmp $tmp0,$num ! i<num-1
573 mulx $npj,$mul1,$acc1
578 add $acc1,$car1,$car1
586 mulx $npj,$mul1,$acc1
588 add $acc0,$car1,$car1
589 add $acc1,$car1,$car1
593 add $car0,$car0,$car0 ! recover $car0
595 add $car0,$car1,$car1
596 add $car2,$car1,$car1
602 .type $fname,#function
603 .size $fname,(.-$fname)
604 .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
607 $code =~ s/\`([^\`]*)\`/eval($1)/gem;