3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
13 # for undertaken effort are multiple. First of all, UltraSPARC is not
14 # the whole SPARCv9 universe and other VIS-free implementations deserve
15 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
16 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
17 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
18 # several integrated RSA/DSA accelerator circuits accessible through
19 # kernel driver [only(*)], but having decent user-land software
20 # implementation is important too. Finally, reasons like desire to
21 # experiment with dedicated squaring procedure. Yes, this module
22 # implements one, because it was easiest to draft it in SPARCv9
25 # (*) Engine accessing the driver in question is on my TODO list.
26 #	For reference, accelerator is estimated to give 6 to 10 times
27 # improvement on single-threaded RSA sign. It should be noted
28 # that 6-10x improvement coefficient does not actually mean
29 # something extraordinary in terms of absolute [single-threaded]
30 # performance, as SPARCv9 instruction set is by all means least
31 # suitable for high performance crypto among other 64 bit
32 # platforms. 6-10x factor simply places T1 in same performance
33 #	domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
34 # appear impressive at all, but it's the sign operation which is
35 # far more critical/interesting.
37 # You might notice that inner loops are modulo-scheduled:-) This has
38 # essentially negligible impact on UltraSPARC performance, it's
39 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
40 # the advantage... Currently this module surpasses sparcv9a-mont.pl
41 # by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
42 # module still has hidden potential [see TODO list there], which is
43 # estimated to be larger than 20%...
# Register aliases for the six bn_mul_mont arguments, mapped onto the
# SPARC register-window input registers %i0..%i5.
46 $rp="%i0";	# BN_ULONG *rp,
47 $ap="%i1";	# const BN_ULONG *ap,
48 $bp="%i2";	# const BN_ULONG *bp,
49 $np="%i3";	# const BN_ULONG *np,
50 $n0="%i4";	# const BN_ULONG *n0,
51 $num="%i5";	# int num);

# Select 64- vs 32-bit ABI from the compiler flags on the command line:
# -m64 (gcc) or -xarch=v9 (Sun cc) imply the 64-bit model.
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# 2047 is the SPARC V9 64-bit ABI stack bias; frame sizes differ per ABI.
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=128; }
# $mask holds 0xffffffff, used to split 64-bit mulx results into 32-bit limbs.
63 $mask="%g1";	# 32 bits, what a waste...
# Internal (integer-only) entry point name emitted into the assembly.
76 $fname="bn_mul_mont_int";
! Entry and i=0 outer pass of Montgomery multiplication:
! tp[] = ap[]*bp[0] + m*np[], with m = ("t[0]"*n0) mod 2^32.
! The inner loop is modulo-scheduled: "!prologue!" lines prime the first
! iteration, "!epilogue!" lines drain the pipeline (see header notes).
79 .section	".text",#alloc,#execinstr
84 cmp	%o5,4			! 128 bits minimum
86 sethi	%hi(0xffffffff),$mask
92 sll	$num,2,$num		! num*=4
! Complete the 32-bit limb mask (0xffffffff).
93 or	$mask,%lo(0xffffffff),$mask
97 ld	[$bp],$mul0		! bp[0]
100 add	%sp,$bias,%o7		! real top of stack
101 ld	[$ap],$car0		! ap[0] ! redundant in squaring context
103 ld	[$ap+4],$apj		! ap[1]
105 ld	[$np],$car1		! np[0]
106 sub	%o7,$bias,%sp		! alloca
107 ld	[$np+4],$npj		! np[1]
! Branch to the dedicated squaring procedure; the compared values are set
! outside this excerpt (presumably ap==bp) -- TODO confirm in full source.
108 be,pt	`$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
111 mulx	$car0,$mul0,$car0	! ap[0]*bp[0]
112 mulx	$apj,$mul0,$tmp0	!prologue! ap[1]*bp[0]
113 and	$car0,$mask,$acc0
! $tp points at the temporary result vector in the stack scratch area.
114 add	%sp,$bias+$frame,$tp
115 ld	[$ap+8],$apj		!prologue!
117 mulx	$n0,$acc0,$mul1		! "t[0]"*n0
118 and	$mul1,$mask,$mul1	! m = low 32 bits of "t[0]"*n0
120 mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
121 mulx	$npj,$mul1,$acc1	!prologue! np[1]*"t[0]"*n0
123 add	$acc0,$car1,$car1
124 ld	[$np+8],$npj		!prologue!
126 mov	$tmp0,$acc0		!prologue!
! Inner-loop body: two independent carry chains, $car0 accumulates
! ap[j]*bp[0] and $car1 accumulates np[j]*m plus the t[] limb.
129 mulx	$apj,$mul0,$tmp0
130 mulx	$npj,$mul1,$tmp1
131 add	$acc0,$car0,$car0
132 ld	[$ap+$j],$apj		! ap[j]
133 and	$car0,$mask,$acc0
134 add	$acc1,$car1,$car1
135 ld	[$np+$j],$npj		! np[j]
137 add	$acc0,$car1,$car1
! Software-pipeline epilogue: final scheduled iterations, no more loads.
148 mulx	$apj,$mul0,$tmp0	!epilogue!
149 mulx	$npj,$mul1,$tmp1
150 add	$acc0,$car0,$car0
151 and	$car0,$mask,$acc0
152 add	$acc1,$car1,$car1
154 add	$acc0,$car1,$car1
158 add	$tmp0,$car0,$car0
159 and	$car0,$mask,$acc0
160 add	$tmp1,$car1,$car1
162 add	$acc0,$car1,$car1
! Fold the two carry chains for the top limb of this pass.
166 add	$car0,$car1,$car1
! Outer pass for i>=1: tp[] = tp[] + ap[]*bp[i] + m*np[].
! Same modulo-scheduled shape as the i=0 pass, but previously accumulated
! tp[] limbs are now loaded and folded in as well.
171 ld	[$bp+4],$mul0		! bp[1]
173 add	%sp,$bias+$frame,$tp
174 ld	[$ap],$car0		! ap[0]
175 ld	[$ap+4],$apj		! ap[1]
176 ld	[$np],$car1		! np[0]
177 ld	[$np+4],$npj		! np[1]
178 ld	[$tp],$tmp1		! tp[0]
179 ld	[$tp+4],$tpj		! tp[1]
182 mulx	$car0,$mul0,$car0	! ap[0]*bp[i]
183 mulx	$apj,$mul0,$tmp0	!prologue!
184 add	$tmp1,$car0,$car0	! + tp[0]
185 ld	[$ap+8],$apj		!prologue!
186 and	$car0,$mask,$acc0
189 and	$mul1,$mask,$mul1	! m for this outer iteration
191 mulx	$car1,$mul1,$car1
192 mulx	$npj,$mul1,$acc1	!prologue!
194 add	$acc0,$car1,$car1
195 ld	[$np+8],$npj		!prologue!
197 mov	$tmp0,$acc0		!prologue!
! Inner-loop body: results are stored back into tp[] one limb behind.
200 mulx	$apj,$mul0,$tmp0
201 mulx	$npj,$mul1,$tmp1
203 ld	[$ap+$j],$apj		! ap[j]
204 add	$acc0,$car0,$car0
205 add	$acc1,$car1,$car1
206 ld	[$np+$j],$npj		! np[j]
207 and	$car0,$mask,$acc0
208 ld	[$tp+8],$tpj		! tp[j]
210 add	$acc0,$car1,$car1
213 st	$car1,[$tp]		! tp[j-1]
! Software-pipeline epilogue.
221 mulx	$apj,$mul0,$tmp0	!epilogue!
222 mulx	$npj,$mul1,$tmp1
224 add	$acc0,$car0,$car0
225 ld	[$tp+8],$tpj		! tp[j]
226 and	$car0,$mask,$acc0
227 add	$acc1,$car1,$car1
229 add	$acc0,$car1,$car1
230 st	$car1,[$tp]		! tp[j-1]
234 add	$tmp0,$car0,$car0
235 and	$car0,$mask,$acc0
236 add	$tmp1,$car1,$car1
237 add	$acc0,$car1,$car1
238 st	$car1,[$tp+4]		! tp[j-1]
243 add	$car0,$car1,$car1
! $car2 carries the "upmost overflow bit" between outer iterations.
245 add	$car2,$car1,$car1
250 ld	[$bp+$i],$mul0		! bp[i]
! Finalization: conditional subtraction of the modulus from tp[], then
! result copy-out; tp[j]-np[j] chain selects result vs. result-np.
259 sub	%g0,$num,%o7		! k=-num
261 subcc	%g0,%g0,%g0		! clear %icc.c
266 subccc	%o0,%o1,%o1		! tp[j]-np[j]
271 subc	$car2,0,$car2		! handle upmost overflow bit
278 ld	[$ap+%o7],%o0		! copy or in-place refresh
279 st	%g0,[$tp+%o7]		! zap tp
290 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
291 ######## code without following dedicated squaring procedure.
! $sbit appears to carry the bit shifted out when cross-products are
! doubled 32 bits at a time -- TODO confirm against full source.
293 $sbit="%i2";		# re-use $bp!
! Squaring, i=0 pass: like the multiplication i=0 pass, but the operand
! is squared and cross-product limbs ap[j]*ap[0] (j>0) are doubled.
298 mulx	$mul0,$mul0,$car0	! ap[0]*ap[0]
299 mulx	$apj,$mul0,$tmp0	!prologue!
300 and	$car0,$mask,$acc0
301 add	%sp,$bias+$frame,$tp
302 ld	[$ap+8],$apj		!prologue!
304 mulx	$n0,$acc0,$mul1		! "t[0]"*n0
306 and	$mul1,$mask,$mul1
308 mulx	$car1,$mul1,$car1	! np[0]*"t[0]"*n0
309 mulx	$npj,$mul1,$acc1	!prologue!
311 ld	[$np+8],$npj		!prologue!
313 add	$acc0,$car1,$car1
315 mov	$tmp0,$acc0		!prologue!
! Inner-loop body.
318 mulx	$apj,$mul0,$tmp0
319 mulx	$npj,$mul1,$tmp1
320 add	$acc0,$car0,$car0	! ap[j]*a0+c0
321 add	$acc1,$car1,$car1
322 ld	[$ap+$j],$apj		! ap[j]
323 and	$car0,$mask,$acc0
324 ld	[$np+$j],$npj		! np[j]
! Double the cross-product limb (2*ap[j]*ap[0]); the shifted-out bit is
! handled on lines not present in this excerpt.
326 add	$acc0,$acc0,$acc0
331 and	$acc0,$mask,$acc0
333 add	$acc0,$car1,$car1
! Software-pipeline epilogue of the i=0 squaring loop.
341 mulx	$apj,$mul0,$tmp0	! epilogue
342 mulx	$npj,$mul1,$tmp1
343 add	$acc0,$car0,$car0	! ap[j]*a0+c0
344 add	$acc1,$car1,$car1
345 and	$car0,$mask,$acc0
347 add	$acc0,$acc0,$acc0
350 and	$acc0,$mask,$acc0
351 add	$acc0,$car1,$car1
355 add	$tmp0,$car0,$car0	! ap[j]*a0+c0
356 add	$tmp1,$car1,$car1
357 and	$car0,$mask,$acc0
359 add	$acc0,$acc0,$acc0
362 and	$acc0,$mask,$acc0
363 add	$acc0,$car1,$car1
! Double the top carry as well, then fold both chains.
367 add	$car0,$car0,$car0
369 add	$car0,$car1,$car1
! Squaring, i=1 pass: square ap[1], fold in previously accumulated tp[]
! limbs, and continue the split/double/accumulate pattern.
373 ld	[%sp+$bias+$frame],$tmp0	! tp[0]
374 ld	[%sp+$bias+$frame+4],$tmp1	! tp[1]
375 ld	[%sp+$bias+$frame+8],$tpj	! tp[2]
376 ld	[$ap+4],$mul0		! ap[1]
377 ld	[$ap+8],$apj		! ap[2]
378 ld	[$np],$car1		! np[0]
379 ld	[$np+4],$npj		! np[1]
382 mulx	$mul0,$mul0,$car0	! ap[1]*ap[1]
383 and	$mul1,$mask,$mul1	! m for this pass
385 mulx	$car1,$mul1,$car1
386 mulx	$npj,$mul1,$acc1
387 add	$tmp0,$car1,$car1
388 and	$car0,$mask,$acc0
389 ld	[$np+8],$npj		! np[2]
391 add	$tmp1,$car1,$car1
393 add	$acc0,$car1,$car1
395 add	$acc1,$car1,$car1
398 st	$car1,[%sp+$bias+$frame]	! tp[0]=
400 add	%sp,$bias+$frame+4,$tp	! $tp -> tp[1]
! Inner loop of the i=1 pass.
403 mulx	$apj,$mul0,$acc0
404 mulx	$npj,$mul1,$acc1
405 add	$acc0,$car0,$car0
407 ld	[$ap+$j],$apj		! ap[j]
408 and	$car0,$mask,$acc0
409 ld	[$np+$j],$npj		! np[j]
411 add	$acc1,$car1,$car1
412 ld	[$tp+8],$tpj		! tp[j]
413 add	$acc0,$acc0,$acc0	! double the cross-product limb
417 and	$acc0,$mask,$acc0
419 add	$acc0,$car1,$car1
420 st	$car1,[$tp]		! tp[j-1]
! Software-pipeline epilogue of the i=1 pass.
426 mulx	$apj,$mul0,$acc0
427 mulx	$npj,$mul1,$acc1
428 add	$acc0,$car0,$car0
430 and	$car0,$mask,$acc0
432 add	$acc1,$car1,$car1
433 add	$acc0,$acc0,$acc0
436 and	$acc0,$mask,$acc0
437 add	$acc0,$car1,$car1
438 st	$car1,[$tp]		! tp[j-1]
441 add	$car0,$car0,$car0	! double the top carry
443 add	$car0,$car1,$car1
! Fold in $car2, the overflow word carried from the previous pass.
444 add	$car2,$car1,$car1
! Squaring, i=2 pass: square ap[2]; includes a size-dependent branch that
! skips the second inner loop for small operands.
448 ld	[%sp+$bias+$frame],$tmp1	! tp[0]
449 ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
450 ld	[$ap+8],$mul0		! ap[2]
451 ld	[$np],$car1		! np[0]
452 ld	[$np+4],$npj		! np[1]
454 and	$mul1,$mask,$mul1
457 mulx	$mul0,$mul0,$car0	! ap[2]*ap[2]
458 mulx	$car1,$mul1,$car1
459 and	$car0,$mask,$acc0
460 add	$tmp1,$car1,$car1
462 add	%sp,$bias+$frame,$tp
470 mulx	$npj,$mul1,$acc1
475 add	$acc1,$car1,$car1
! First inner-loop fragment.
484 ld	[$ap+$j],$apj		! ap[j]
485 mulx	$npj,$mul1,$acc1
487 ld	[$np+$j],$npj		! np[j]
488 add	$acc0,$car1,$car1
489 ld	[$tp+8],$tpj		! tp[j]
490 add	$acc1,$car1,$car1
! Branch past the second inner loop; the condition feeding %icc is set
! on lines outside this excerpt.
496 be,pn	%icc,.Lsqr_no_inner2
! Second inner loop.
500 mulx	$apj,$mul0,$acc0
501 mulx	$npj,$mul1,$acc1
503 add	$acc0,$car0,$car0
504 ld	[$ap+$j],$apj		! ap[j]
505 and	$car0,$mask,$acc0
506 ld	[$np+$j],$npj		! np[j]
508 add	$acc0,$acc0,$acc0	! double the cross-product limb
509 ld	[$tp+8],$tpj		! tp[j]
513 and	$acc0,$mask,$acc0
515 add	$acc0,$car1,$car1
516 add	$acc1,$car1,$car1
517 st	$car1,[$tp]		! tp[j-1]
! Software-pipeline epilogue.
523 mulx	$apj,$mul0,$acc0
524 mulx	$npj,$mul1,$acc1
526 add	$acc0,$car0,$car0
527 and	$car0,$mask,$acc0
529 add	$acc0,$acc0,$acc0
532 and	$acc0,$mask,$acc0
533 add	$acc0,$car1,$car1
534 add	$acc1,$car1,$car1
535 st	$car1,[$tp]		! tp[j-1]
538 add	$car0,$car0,$car0	! double the top carry
540 add	$car0,$car1,$car1
541 add	$car2,$car1,$car1	! + overflow word from previous pass
! Squaring, generic outer iteration for i>=3 (indexed by $i): same pattern
! as the i=2 pass, looping while i<num-1.
546 ld	[%sp+$bias+$frame],$tmp1	! tp[0]
547 ld	[%sp+$bias+$frame+4],$tpj	! tp[1]
548 ld	[$ap+$i],$mul0		! ap[j]
549 ld	[$np],$car1		! np[0]
550 ld	[$np+4],$npj		! np[1]
552 and	$mul1,$mask,$mul1
555 mulx	$mul0,$mul0,$car0	! ap[i]*ap[i]
556 mulx	$car1,$mul1,$car1
557 and	$car0,$mask,$acc0
558 add	$tmp1,$car1,$car1
560 add	%sp,$bias+$frame,$tp
565 cmp	$tmp0,$num		! i<num-1
570 mulx	$npj,$mul1,$acc1
575 add	$acc1,$car1,$car1
583 mulx	$npj,$mul1,$acc1
585 add	$acc0,$car1,$car1
586 add	$acc1,$car1,$car1
590 add	$car0,$car0,$car0	! recover $car0
592 add	$car0,$car1,$car1
593 add	$car2,$car1,$car1	! + overflow word from previous pass
! Symbol-table bookkeeping and module identification string.
599 .type		$fname,#function
600 .size		$fname,(.-$fname)
! NOTE(review): "Multipltication" typo below is part of the emitted id
! string; deliberately left untouched to keep the output byte-identical.
601 .asciz	"Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the assembly template: evaluate every `...`-delimited Perl
# expression (e.g. the 32-/64-bit conditional at the be,pt above) and
# splice the result into the emitted code.
604 $code =~ s/\`([^\`]*)\`/eval($1)/gem;