2 # Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
20 # for undertaking this effort are multiple. First of all, UltraSPARC is not
21 # the whole SPARCv9 universe and other VIS-free implementations deserve
22 # optimized code as much. Secondly, newly introduced UltraSPARC T1,
23 # a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
24 # such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
25 # several integrated RSA/DSA accelerator circuits accessible through
26 # kernel driver [only(*)], but having decent user-land software
27 # implementation is important too. Finally, there was the desire to
28 # experiment with a dedicated squaring procedure. Yes, this module
29 # implements one, because it was easiest to draft it in SPARCv9
32 # (*) Engine accessing the driver in question is on my TODO list.
33 # For reference, accelerator is estimated to give 6 to 10 times
34 # improvement on single-threaded RSA sign. It should be noted
35 # that 6-10x improvement coefficient does not actually mean
36 # something extraordinary in terms of absolute [single-threaded]
37 # performance, as the SPARCv9 instruction set is by all means the least
38 # suitable for high performance crypto among other 64 bit
39 # platforms. 6-10x factor simply places T1 in the same performance
40 # domain as, say, AMD64 and IA-64. Improvement of RSA verify doesn't
41 # appear impressive at all, but it's the sign operation which is
42 # far more critical/interesting.
44 # You might notice that inner loops are modulo-scheduled:-) This has
45 # essentially negligible impact on UltraSPARC performance, it's
46 # Fujitsu SPARC64 V users who should notice and hopefully appreciate
47 # the advantage... Currently this module surpasses sparcv9a-mont.pl
48 # by ~20% on UltraSPARC-III and later cores, but recall that the sparcv9a
49 # module still has hidden potential [see TODO list there], which is
50 # estimated to be larger than 20%...
52 $output = pop and open STDOUT,">$output"; # last command-line argument, if present, names the file STDOUT is redirected to
55 $rp="%i0"; # BN_ULONG *rp, (the six arguments map to %i0-%i5 in prototype order)
56 $ap="%i1"; # const BN_ULONG *ap,
57 $bp="%i2"; # const BN_ULONG *bp,
58 $np="%i3"; # const BN_ULONG *np,
59 $n0="%i4"; # const BN_ULONG *n0,
60 $num="%i5"; # int num);
70 $mask="%g1"; # 32 bits, what a waste... (set to 0xffffffff and used to extract low 32-bit words)
83 $fname="bn_mul_mont_int"; # symbol name, referenced by the .type and .size directives
86 #include "sparc_arch.h"
88 .section ".text",#alloc,#execinstr
93 cmp %o5,4 ! 128 bits minimum, i.e. the count in %o5 is checked against four 32-bit words
95 sethi %hi(0xffffffff),$mask ! high part of the mask constant
101 sll $num,2,$num ! num*=4, word count becomes a byte count
102 or $mask,%lo(0xffffffff),$mask ! mask=0xffffffff, selects the low 32 bits of a 64-bit value
106 ld [$bp],$mul0 ! bp[0]
109 add %sp,$bias,%o7 ! real top of stack
110 ld [$ap],$car0 ! ap[0] ! redundant in squaring context
112 ld [$ap+4],$apj ! ap[1]
114 ld [$np],$car1 ! np[0]
115 sub %o7,$bias,%sp ! alloca, the bias is subtracted again before writing %sp
116 ld [$np+4],$npj ! np[1]
117 be,pt SIZE_T_CC,.Lbn_sqr_mont ! on equal, take the dedicated squaring path
120 mulx $car0,$mul0,$car0 ! ap[0]*bp[0], start of the pass that uses bp[0]
121 mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
122 and $car0,$mask,$acc0 ! "t[0]", low 32 bits of ap[0]*bp[0]
123 add %sp,$bias+$frame,$tp ! tp addresses the temporary vector on the stack
124 ld [$ap+8],$apj !prologue! ap[2]
126 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
127 and $mul1,$mask,$mul1 ! keep only the low 32 bits of the multiplier
129 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
130 mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
132 add $acc0,$car1,$car1 ! np[0]*"t[0]"*n0 + "t[0]"
133 ld [$np+8],$npj !prologue! np[2]
135 mov $tmp0,$acc0 !prologue! ap[1]*bp[0] is consumed by the add two lines below
138 mulx $apj,$mul0,$tmp0 ! modulo-scheduled, this product is added in a later step
139 mulx $npj,$mul1,$tmp1
140 add $acc0,$car0,$car0 ! earlier ap product plus running carry
141 ld [$ap+$j],$apj ! ap[j]
142 and $car0,$mask,$acc0 ! low 32 bits of the ap sum
143 add $acc1,$car1,$car1 ! earlier np product plus running carry
144 ld [$np+$j],$npj ! np[j]
146 add $acc0,$car1,$car1 ! combine the ap and np contributions
157 mulx $apj,$mul0,$tmp0 !epilogue!
158 mulx $npj,$mul1,$tmp1
159 add $acc0,$car0,$car0
160 and $car0,$mask,$acc0
161 add $acc1,$car1,$car1
163 add $acc0,$car1,$car1
167 add $tmp0,$car0,$car0 ! drain the last ap product
168 and $car0,$mask,$acc0
169 add $tmp1,$car1,$car1 ! drain the last np product
171 add $acc0,$car1,$car1
175 add $car0,$car1,$car1 ! merge the two carry values
180 ld [$bp+4],$mul0 ! bp[1]
182 add %sp,$bias+$frame,$tp ! tp is set back to the base of the temporary vector
183 ld [$ap],$car0 ! ap[0]
184 ld [$ap+4],$apj ! ap[1]
185 ld [$np],$car1 ! np[0]
186 ld [$np+4],$npj ! np[1]
187 ld [$tp],$tmp1 ! tp[0]
188 ld [$tp+4],$tpj ! tp[1]
191 mulx $car0,$mul0,$car0 ! ap[0] times the current bp word
192 mulx $apj,$mul0,$tmp0 !prologue!
193 add $tmp1,$car0,$car0 ! plus tp[0] from the previous pass
194 ld [$ap+8],$apj !prologue! ap[2]
195 and $car0,$mask,$acc0 ! low 32 bits
198 and $mul1,$mask,$mul1 ! multiplier for the np words, truncated to 32 bits
200 mulx $car1,$mul1,$car1 ! np[0] times that multiplier
201 mulx $npj,$mul1,$acc1 !prologue!
203 add $acc0,$car1,$car1
204 ld [$np+8],$npj !prologue! np[2]
206 mov $tmp0,$acc0 !prologue!
209 mulx $apj,$mul0,$tmp0 ! modulo-scheduled, this product is added in a later step
210 mulx $npj,$mul1,$tmp1
212 ld [$ap+$j],$apj ! ap[j]
213 add $acc0,$car0,$car0
214 add $acc1,$car1,$car1
215 ld [$np+$j],$npj ! np[j]
216 and $car0,$mask,$acc0
217 ld [$tp+8],$tpj ! tp[j]
219 add $acc0,$car1,$car1
222 st $car1,[$tp] ! tp[j-1]
230 mulx $apj,$mul0,$tmp0 !epilogue!
231 mulx $npj,$mul1,$tmp1
233 add $acc0,$car0,$car0
234 ld [$tp+8],$tpj ! tp[j]
235 and $car0,$mask,$acc0
236 add $acc1,$car1,$car1
238 add $acc0,$car1,$car1
239 st $car1,[$tp] ! tp[j-1]
243 add $tmp0,$car0,$car0 ! drain the last ap product
244 and $car0,$mask,$acc0
245 add $tmp1,$car1,$car1 ! drain the last np product
246 add $acc0,$car1,$car1
247 st $car1,[$tp+4] ! tp[j-1]
252 add $car0,$car1,$car1 ! merge the two carry values
254 add $car2,$car1,$car1 ! car2 is added to the top sum as well
259 ld [$bp+$i],$mul0 ! bp[i]
267 sub %g0,$num,%o7 ! k=-num
269 subcc %g0,%g0,%g0 ! clear %icc.c so the first subccc starts without a borrow
274 subccc %o0,%o1,%o1 ! tp[j]-np[j], with borrow from the previous word
279 subccc $car2,0,$car2 ! handle upmost overflow bit, the final borrow is taken out of car2
283 ld [$tp+%o7],%o1 ! conditional copy
285 st %g0,[$tp+%o7] ! zap tp, the temporary word is overwritten with zero
297 ######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
298 ######## code without the following dedicated squaring procedure.
305 mulx $mul0,$mul0,$car0 ! ap[0]*ap[0], mul0 was loaded from bp[0] at entry
306 mulx $apj,$mul0,$tmp0 !prologue!
307 and $car0,$mask,$acc0 ! "t[0]", low 32 bits of the square
308 add %sp,$bias+$frame,$tp ! tp addresses the temporary vector on the stack
309 ld [$ap+8],$apj !prologue! ap[2]
311 mulx $n0,$acc0,$mul1 ! "t[0]"*n0
313 and $mul1,$mask,$mul1 ! keep only the low 32 bits of the multiplier
315 mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
316 mulx $npj,$mul1,$acc1 !prologue!
318 ld [$np+8],$npj !prologue! np[2]
320 add $acc0,$car1,$car1 ! np[0]*"t[0]"*n0 + "t[0]"
322 mov $tmp0,$acc0 !prologue!
325 mulx $apj,$mul0,$tmp0 ! modulo-scheduled, this product is added in a later step
326 mulx $npj,$mul1,$tmp1
327 add $acc0,$car0,$car0 ! ap[j]*a0+c0
328 add $acc1,$car1,$car1
329 ld [$ap+$j],$apj ! ap[j]
330 and $car0,$mask,$acc0 ! low 32 bits of ap[j]*a0+c0
331 ld [$np+$j],$npj ! np[j]
333 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
338 and $acc0,$mask,$acc0 ! back to 32 bits after the doubling
340 add $acc0,$car1,$car1
348 mulx $apj,$mul0,$tmp0 ! epilogue
349 mulx $npj,$mul1,$tmp1
350 add $acc0,$car0,$car0 ! ap[j]*a0+c0
351 add $acc1,$car1,$car1
352 and $car0,$mask,$acc0
354 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
357 and $acc0,$mask,$acc0
358 add $acc0,$car1,$car1
362 add $tmp0,$car0,$car0 ! ap[j]*a0+c0
363 add $tmp1,$car1,$car1
364 and $car0,$mask,$acc0
366 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
369 and $acc0,$mask,$acc0
370 add $acc0,$car1,$car1
374 add $car0,$car0,$car0 ! the remaining carry is doubled too
376 add $car0,$car1,$car1 ! merge the two carry values
380 ld [%sp+$bias+$frame],$tmp0 ! tp[0]
381 ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
382 ld [%sp+$bias+$frame+8],$tpj ! tp[2]
383 ld [$ap+4],$mul0 ! ap[1]
384 ld [$ap+8],$apj ! ap[2]
385 ld [$np],$car1 ! np[0]
386 ld [$np+4],$npj ! np[1]
389 mulx $mul0,$mul0,$car0 ! ap[1]*ap[1]
390 and $mul1,$mask,$mul1 ! multiplier for the np words, truncated to 32 bits
392 mulx $car1,$mul1,$car1 ! np[0] times that multiplier
393 mulx $npj,$mul1,$acc1 ! np[1] times that multiplier
394 add $tmp0,$car1,$car1 ! plus tp[0]
395 and $car0,$mask,$acc0 ! low 32 bits of ap[1]*ap[1]
396 ld [$np+8],$npj ! np[2]
398 add $tmp1,$car1,$car1 ! plus tp[1]
400 add $acc0,$car1,$car1
402 add $acc1,$car1,$car1
405 st $car1,[%sp+$bias+$frame] ! tp[0]=
407 add %sp,$bias+$frame+4,$tp ! tp now addresses tp[1]
410 mulx $apj,$mul0,$acc0 ! ap[j] times ap[1]
411 mulx $npj,$mul1,$acc1
412 add $acc0,$car0,$car0
414 ld [$ap+$j],$apj ! ap[j]
415 and $car0,$mask,$acc0
416 ld [$np+$j],$npj ! np[j]
418 add $acc1,$car1,$car1
419 ld [$tp+8],$tpj ! tp[j]
420 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
422 add $sbit,$acc0,$acc0 ! plus sbit, presumably the bit shifted out of the previous doubling (TODO confirm)
424 and $acc0,$mask,$acc0 ! back to 32 bits
426 add $acc0,$car1,$car1
427 st $car1,[$tp] ! tp[j-1]
433 mulx $apj,$mul0,$acc0
434 mulx $npj,$mul1,$acc1
435 add $acc0,$car0,$car0
437 and $car0,$mask,$acc0
439 add $acc1,$car1,$car1
440 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
441 add $sbit,$acc0,$acc0
443 and $acc0,$mask,$acc0
444 add $acc0,$car1,$car1
445 st $car1,[$tp] ! tp[j-1]
448 add $car0,$car0,$car0 ! the remaining carry is doubled too
449 add $sbit,$car0,$car0
450 add $car0,$car1,$car1 ! merge the two carry values
451 add $car2,$car1,$car1 ! car2 is added to the top sum as well
455 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
456 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
457 ld [$ap+8],$mul0 ! ap[2]
458 ld [$np],$car1 ! np[0]
459 ld [$np+4],$npj ! np[1]
461 and $mul1,$mask,$mul1 ! multiplier for the np words, truncated to 32 bits
464 mulx $mul0,$mul0,$car0 ! ap[2]*ap[2]
465 mulx $car1,$mul1,$car1 ! np[0] times that multiplier
466 and $car0,$mask,$acc0 ! low 32 bits of the square
467 add $tmp1,$car1,$car1 ! plus tp[0]
469 add %sp,$bias+$frame,$tp ! tp addresses the base of the temporary vector
477 mulx $npj,$mul1,$acc1
482 add $acc1,$car1,$car1
491 ld [$ap+$j],$apj ! ap[j]
492 mulx $npj,$mul1,$acc1
494 ld [$np+$j],$npj ! np[j]
496 and $car1,$mask,$car1 ! low 32 bits of the running sum
497 add $tmp0,$sbit,$sbit
498 add $acc0,$car1,$car1
499 ld [$tp+8],$tpj ! tp[j]
500 add $acc1,$car1,$car1
506 be,pn %icc,.Lsqr_no_inner2 ! on equal, branch to .Lsqr_no_inner2
510 mulx $apj,$mul0,$acc0
511 mulx $npj,$mul1,$acc1
513 add $acc0,$car0,$car0
514 ld [$ap+$j],$apj ! ap[j]
515 and $car0,$mask,$acc0
516 ld [$np+$j],$npj ! np[j]
518 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
519 ld [$tp+8],$tpj ! tp[j]
520 add $sbit,$acc0,$acc0 ! plus sbit, presumably the bit shifted out of the previous doubling (TODO confirm)
523 and $acc0,$mask,$acc0 ! back to 32 bits
525 add $acc0,$car1,$car1
526 add $acc1,$car1,$car1
527 st $car1,[$tp] ! tp[j-1]
533 mulx $apj,$mul0,$acc0
534 mulx $npj,$mul1,$acc1
536 add $acc0,$car0,$car0
537 and $car0,$mask,$acc0
539 add $acc0,$acc0,$acc0 ! doubled, acc0 = 2*acc0
540 add $sbit,$acc0,$acc0
542 and $acc0,$mask,$acc0
543 add $acc0,$car1,$car1
544 add $acc1,$car1,$car1
545 st $car1,[$tp] ! tp[j-1]
548 add $car0,$car0,$car0 ! the remaining carry is doubled too
549 add $sbit,$car0,$car0
550 add $car0,$car1,$car1 ! merge the two carry values
551 add $car2,$car1,$car1 ! car2 is added to the top sum as well
556 ld [%sp+$bias+$frame],$tmp1 ! tp[0]
557 ld [%sp+$bias+$frame+4],$tpj ! tp[1]
558 ld [$ap+$i],$mul0 ! ap[i]
559 ld [$np],$car1 ! np[0]
560 ld [$np+4],$npj ! np[1]
562 and $mul1,$mask,$mul1 ! multiplier for the np words, truncated to 32 bits
565 mulx $mul0,$mul0,$car0 ! square of the ap word just loaded
566 mulx $car1,$mul1,$car1 ! np[0] times that multiplier
567 and $car0,$mask,$acc0 ! low 32 bits of the square
568 add $tmp1,$car1,$car1 ! plus tp[0]
570 add %sp,$bias+$frame,$tp ! tp addresses the base of the temporary vector
575 cmp $tmp0,$num ! i<num-1
580 mulx $npj,$mul1,$acc1
585 add $acc1,$car1,$car1
593 mulx $npj,$mul1,$acc1
596 and $acc0,$mask,$acc0
597 add $tmp0,$sbit,$sbit
598 add $acc0,$car1,$car1
599 add $acc1,$car1,$car1
603 add $car0,$car0,$car0 ! recover $car0
604 add $sbit,$car0,$car0
605 add $car0,$car1,$car1 ! merge the two carry values
606 add $car2,$car1,$car1 ! car2 is added to the top sum as well
612 .type $fname,#function ! declare the symbol as a function
613 .size $fname,(.-$fname) ! size is the distance from the entry label to this point
614 .asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>" ! identification string emitted after the code
617 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # every backtick-quoted expression in the generated text is replaced by the result of evaluating it as Perl
619 close STDOUT or die "error closing STDOUT"; # abort with an error if closing the output fails