# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================
######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths
#    (see the sketch below);
# 3) fall-back and helper VIS3 subroutines.
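#
# The multi-op flavour corresponds to one step of fixed 5-bit-window
# modular exponentiation: five Montgomery squarings followed by one
# Montgomery multiplication by a pre-computed power of the base. A
# minimal reference sketch in Perl, assuming a hypothetical mont_mul
# helper that computes a*b*R^-1 mod n on Math::BigInt values (it is
# not part of this module):
#
#	use Math::BigInt;
#	sub mont_mul { my ($a,$b,$n,$r_inv) = @_;
#	    return ($a * $b * $r_inv) % $n;			# a*b*R^-1 mod n
#	}
#	sub pwr5_step { my ($tp,$power,$n,$r_inv) = @_;
#	    $tp = mont_mul($tp,$tp,$n,$r_inv) for (1..5);	# 5 squarings
#	    return mont_mul($tp,$power,$n,$r_inv);		# 1 multiplication
#	}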
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA are dominated by the single-op ones. A special note about the
# 4096-bit RSA verify result: the operands are too long for the
# dedicated hardware, so they are handled by the VIS3 code, which is
# why you don't see any improvement there. It's surely possible to
# improve it [by deploying the 'mpmul' instruction], maybe in the
# future...
#
# Performance improvement.
# 64-bit process, VIS3:
#                   sign      verify    sign/s   verify/s
# rsa 1024 bits  0.000628s  0.000028s   1592.4    35434.4
# rsa 2048 bits  0.003282s  0.000106s    304.7     9438.3
# rsa 4096 bits  0.025866s  0.000340s     38.7     2940.9
# dsa 1024 bits  0.000301s  0.000332s   3323.7     3013.9
# dsa 2048 bits  0.001056s  0.001233s    946.9      810.8
#
# 64-bit process, this module:
#                   sign      verify    sign/s   verify/s
# rsa 1024 bits  0.000256s  0.000016s   3904.4    61411.9
# rsa 2048 bits  0.000946s  0.000029s   1056.8    34292.7
# rsa 4096 bits  0.005061s  0.000340s    197.6     2940.5
# dsa 1024 bits  0.000176s  0.000195s   5674.7     5130.5
# dsa 2048 bits  0.000296s  0.000354s   3383.2     2827.6
######################################################################
# 32-bit process, VIS3:
#                   sign      verify    sign/s   verify/s
# rsa 1024 bits  0.000665s  0.000028s   1504.8    35233.3
# rsa 2048 bits  0.003349s  0.000106s    298.6     9433.4
# rsa 4096 bits  0.025959s  0.000341s     38.5     2934.8
# dsa 1024 bits  0.000320s  0.000341s   3123.3     2929.6
# dsa 2048 bits  0.001101s  0.001260s    908.2      793.4
#
# 32-bit process, this module:
#                   sign      verify    sign/s   verify/s
# rsa 1024 bits  0.000301s  0.000017s   3317.1    60240.0
# rsa 2048 bits  0.001034s  0.000030s    966.9    33812.7
# rsa 4096 bits  0.005244s  0.000341s    190.7     2935.4
# dsa 1024 bits  0.000201s  0.000205s   4976.1     4879.2
# dsa 2048 bits  0.000328s  0.000360s   3051.1     2774.2
# 32-bit code is prone to performance degradation as the interrupt rate
# dispatched to the CPU executing the code grows. This is because in
# the standard process of handling an interrupt in 32-bit process
# context the upper halves of most integer registers used as input or
# output are zeroed. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" with timer interrupts only,
# the penalty is hardly measurable. But in order to mitigate this
# problem for higher interrupt rates the contemporary Linux kernel
# recognizes the biased stack even in 32-bit process context and
# preserves full register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
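#
# The T4 subroutines below detect the clobbering on their own: a
# sentinel pattern is kept in the upper halves of the frame pointers,
# and if an interrupt has zeroed it the routine returns 0 ("return
# failure") instead of 1, so the caller can simply re-run the
# operation or fall back to the VIS3 code. A caller-side sketch of
# that retry pattern, in illustrative Perl pseudo-code (the generated
# T4 entry points are of course not callable from Perl):
#
#	# $t4_mul and $vis3_mul stand for the generated routines
#	my $ok = $t4_mul->($rp,$ap,$bp,$np,$n0);
#	$ok = $vis3_mul->($rp,$ap,$bp,$np,$n0,$num) unless $ok;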
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
#include "sparc_arch.h"
.register %g2,#scratch
.register %g3,#scratch
.section ".text",#alloc,#execinstr
########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
my @R=map("%f".2*$_,(0..11,30,31,12..29));	# f.p. registers; result limbs are staged here before store
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);	# np, the modulus
my @A=(@N[0..13],@R[14..31]);	# ap, the multiplicand
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));	# bp, the multiplier
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
# const u64 *np,const BN_ULONG *n0);
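#
# Functionally each such routine computes rp[] = ap[]*bp[]*R^-1 mod np[],
# where R = 2^(64*NUM) and *n0 holds -np^-1 mod 2^64, and returns 1 on
# success or 0 if an interrupt invalidated the register contents (see
# the note above). A hedged Math::BigInt reference model of the result,
# not of the register-window mechanics:
#
#	use Math::BigInt;
#	sub mont_ref { my ($ap,$bp,$np,$num) = @_;	# bigint operands
#	    my $R = Math::BigInt->new(1)->blsft(64*$num);
#	    return ($ap * $bp * $R->copy->bmodinv($np)) % $np;
#	}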
sub generate_bn_mul_mont_t4() {
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));
.globl bn_mul_mont_t4_$NUM
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i4+0],%f1 ! load *n0
# load ap[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
ld [$ap+$i*8+4],@A[$i]
sllx @A[$i],32,@A[$i]
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
be SIZE_T_CC,.Lmsquare_$NUM
# load bp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
# magic ################################################################
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Lmabort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Lmdone_$NUM
mov 0,%i0 ! return failure
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
st @R[$i],[$rp+$i*8+4]
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
save %sp,-128,%sp; or $sentinel,%fp,%fp
save %sp,-128,%sp; or $sentinel,%fp,%fp
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) {
&generate_bn_mul_mont_t4($i);
########################################################################
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
sll %o5, 3, %o5 ! offset within first cache line
add %o5, $ptbl, $ptbl ! of the pwrtbl
$code.=<<___ if (!$skip_wr);
my ($pwrtbl,$B0,$B1)=@_;
ldx [$pwrtbl+0*32], $B0
ldx [$pwrtbl+8*32], $B1
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+9*32], %o5
ldx [$pwrtbl+2*32], %o4
ldx [$pwrtbl+10*32],%o5
ldx [$pwrtbl+3*32], %o4
ldx [$pwrtbl+11*32],%o5
movneg %icc, %o4, $B0
ldx [$pwrtbl+4*32], %o4
movneg %icc, %o5, $B1
ldx [$pwrtbl+12*32],%o5
ldx [$pwrtbl+5*32],%o4
ldx [$pwrtbl+13*32],%o5
ldx [$pwrtbl+6*32], %o4
ldx [$pwrtbl+14*32],%o5
ldx [$pwrtbl+7*32], %o4
ldx [$pwrtbl+15*32],%o5
movneg %xcc, %o4, $B0
add $pwrtbl,16*32, $pwrtbl
movneg %xcc, %o5, $B1
ldx [$pwrtbl+0*32], $Bi
ldx [$pwrtbl+1*32], %o4
ldx [$pwrtbl+2*32], %o5
ldx [$pwrtbl+3*32], %o4
ldx [$pwrtbl+4*32], %o5
movneg %icc, %o4, $Bi
ldx [$pwrtbl+5*32], %o4
ldx [$pwrtbl+6*32], %o5
ldx [$pwrtbl+7*32], %o4
add $pwrtbl,8*32, $pwrtbl
movneg %xcc, %o4, $Bi
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
# const u64 *pwrtbl,int pwr,int stride);
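#
# In terms of the reference model above, one call performs a whole
# 5-bit window step in place: five Montgomery squarings of tp[]
# followed by one Montgomery multiplication by the power-table entry
# selected by pwr (gathered in constant time from pwrtbl). A hedged
# sketch reusing the hypothetical mont_mul helper from the top of the
# file:
#
#	sub pwr5_ref { my ($tp,$pwrtbl,$pwr,$np,$r_inv) = @_;
#	    $tp = mont_mul($tp,$tp,$np,$r_inv) for (1..5);	# montsqr x5
#	    return mont_mul($tp,$pwrtbl->[$pwr],$np,$r_inv);	# montmul x1
#	}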
sub generate_bn_pwr5_mont_t4() {
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));
.globl bn_pwr5_mont_t4_$NUM
bn_pwr5_mont_t4_$NUM:
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i2+0],%f1 ! load *n0
srl %i4,%g0,%i4 ! pack last arguments
# load tp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
ldx [$tp+$i*8],@A[$i]
for(; $i<$NUM; $i++) {
ldd [$tp+$i*8],@A[$i]
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
ldx [$np+$i*8],@N[$i]
# load pwrtbl[pwr] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
srlx $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%o7","%o5","%o4");
for($i=0; $i<14 && $i<$NUM; $i+=2) {
&load_b_pair("%o7",@B[$i],@B[$i+1]);
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i+=2) {
&load_b_pair("%i7",@B[$i],@B[$i+1]);
srax $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%i7","%o5","%o4",1);
# magic ################################################################
for($i=0; $i<5; $i++) {
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
brgez %o4,.Lstride_$NUM
brgez %o4,.Lstride_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Ldone_$NUM
mov 0,%i0 ! return failure
for($i=0; $i<$NUM; $i++) {
std @R[$i],[$tp+$i*8]
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) {
&generate_bn_pwr5_mont_t4($i);
########################################################################
# Fall-back subroutines
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *bp,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is >=3
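#
# The fall-back implements the classic word-by-word Montgomery
# multiplication that the register names above refer to: for every
# word bp[i] it accumulates ap[]*bp[i], derives m1 = "tp[0]"*n0 mod
# 2^64, adds np[]*m1 so that the lowest word cancels, and shifts the
# accumulator down by one word; a final conditional subtraction of np
# completes the reduction. A hedged big-integer sketch of that flow
# (per-iteration, not per-instruction):
#
#	use Math::BigInt;
#	sub mul_mont_ref { my ($ap,$bp,$np,$n0,$num) = @_;	# bigint inputs
#	    my $W = Math::BigInt->new(1)->blsft(64);		# word modulus 2^64
#	    my $t = Math::BigInt->bzero;
#	    for my $i (0..$num-1) {
#	        $t += $ap * (($bp->copy->brsft(64*$i)) % $W);	# += ap*bp[i]
#	        my $m1 = (($t % $W) * $n0) % $W;		# "tp[0]"*n0
#	        $t = ($t + $np * $m1)->brsft(64);		# low word is now 0
#	    }
#	    $t -= $np if $t >= $np;		# final conditional subtraction
#	    return $t;				# == ap*bp*2^(-64*num) mod np
#	}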
.globl bn_mul_mont_t4
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
# +-------------------------------+<----- %sp
# +-------------------------------+<----- aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<----- aligned at 64 bytes
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, STACK_BIAS+STACK_FRAME, $tp
ldx [$bp+0], $m0 ! m0=bp[0]
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
sub $cnt, 8, $cnt ! j--
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
sub $num, 16, $i ! i=num-2
ldx [$bp+0], $m0 ! m0=bp[i]
sub $ap, $num, $ap ! rewind
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
brnz,pt $cnt, .Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
sub $ap, $num, $ap ! rewind
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
subccc $tj, $nj, $t2 ! tp[j]-np[j]
st $t2, [$rp-4] ! reverse order
sub $np, $num, $np ! rewind
subc $ovf, %g0, $ovf ! handle upmost overflow bit
or $np, $ap, $ap ! ap=borrow?tp:rp
.Lcopy: ! copy or in-place refresh
.type bn_mul_mont_t4, #function
.size bn_mul_mont_t4, .-bn_mul_mont_t4
# int bn_mul_mont_gather5(
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *pwrtbl,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num, # caller ensures that num is >=3
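#
# Unlike bn_mul_mont_t4 above, $bp here points at the pre-scattered
# power table and the 7th argument selects which of the 32 powers to
# multiply by. The selection (load_ccr/load_b above) reads every
# table slot and keeps the wanted one via conditional moves, so the
# access pattern does not depend on the secret index. An illustrative
# bitwise sketch of such a gather (Perl itself is not constant-time,
# this only shows the idea):
#
#	sub gather_ref { my ($tbl,$pwr) = @_;	# $tbl: ref to 32 words
#	    my $b = 0;
#	    for my $k (0..31) {
#	        my $mask = ($k == $pwr) ? ~0 : 0;	# all-ones iff k == pwr
#	        $b |= $tbl->[$k] & $mask;
#	    }
#	    return $b;
#	}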
.globl bn_mul_mont_gather5_t4
bn_mul_mont_gather5_t4:
add %sp, STACK_BIAS, %g4 ! real top of stack
sll $num, 3, $num ! size in bytes
andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, STACK_FRAME, %g1 ! new top of stack
LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
# +-------------------------------+<----- %sp
# +-------------------------------+<----- aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<----- aligned at 64 bytes
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
&load_ccr($bp,"%g4",$ccr);
&load_b($bp,$m0,"%o7"); # m0=bp[0]
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, STACK_BIAS+STACK_FRAME, $tp
ldx [$ap+0], $aj ! ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
ldx [$np+0], $nj ! np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
mulx $aj, $m0, $alo ! ap[j]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st_g5
sub $cnt, 8, $cnt ! j--
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stxa $lo1, [$tp]0xe2 ! tp[j-1]
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stxa $hi1, [$tp]0xe2
sub $num, 16, $i ! i=num-2
&load_b($bp,$m0); # m0=bp[i]
sub $ap, $num, $ap ! rewind
ldx [$ap+0], $aj ! ap[0]
ldx [$np+0], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$ap+8], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$np+8], $nj ! np[1]
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
sub $num, 24, $cnt ! cnt=num-3
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$ap+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$np+0], $nj ! np[j]
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
brnz,pt $cnt, .Linner_g5
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
brnz,pt $i, .Louter_g5
sub $ap, $num, $ap ! rewind
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
subccc $tj, $nj, $t2 ! tp[j]-np[j]
subccc $tj, $nj, $t3
st $t2, [$rp-4] ! reverse order
brnz,pt $cnt, .Lsub_g5
sub $np, $num, $np ! rewind
subc $ovf, %g0, $ovf ! handle upmost overflow bit
or $np, $ap, $ap ! ap=borrow?tp:rp
.Lcopy_g5: ! copy or in-place refresh
stx %g0, [$tp] ! zap
brnz $cnt, .Lcopy_g5
.type bn_mul_mont_gather5_t4, #function
.size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
brnz %o2, .Loop_flip
.type bn_flip_t4, #function
.size bn_flip_t4, .-bn_flip_t4
.globl bn_flip_n_scatter5_t4
bn_flip_n_scatter5_t4:
add %o3, %o2, %o2 ! &pwrtbl[pwr]
.Loop_flip_n_scatter5:
ld [%o0+0], %o4 ! inp[i]
brnz %o1, .Loop_flip_n_scatter5
.type bn_flip_n_scatter5_t4, #function
.size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
.globl bn_gather5_t4
&load_ccr("%o2","%o3","%g1");
&load_b("%o2","%g1");
brnz %o1, .Loop_gather5
.type bn_gather5_t4, #function
.size bn_gather5_t4, .-bn_gather5_t4
.asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"