# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. November 2012. All rights reserved.
# ====================================================================
######################################################################
# Montgomery squaring-n-multiplication module for SPARC T4.
#
# The module consists of three parts:
#
# 1) collection of "single-op" subroutines that perform a single
#    operation, Montgomery squaring or multiplication, on 512-,
#    1024-, 1536- and 2048-bit operands;
# 2) collection of "multi-op" subroutines that perform 5 squaring and
#    1 multiplication operations on operands of the above lengths;
# 3) fall-back and helper VIS3 subroutines.
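#
# All of these compute the Montgomery product a*b*R^-1 mod n with
# R=2^(64*NUM). For reference, here is a minimal big-number sketch of
# that operation (a hypothetical mont_mul_ref, illustration only and
# not used by this module; Math::BigInt stands in for the word-level
# arithmetic the assembly below performs):
#
#	use Math::BigInt;
#	sub mont_mul_ref {	# a*b*R^-1 mod n, R=2^(64*$num), n odd
#	    my ($a, $b, $n, $num) = @_;
#	    my $R    = Math::BigInt->bone()->blsft(64*$num);
#	    my $Rinv = $R->copy()->bmodinv($n);	# exists since n is odd
#	    return $a->copy()->bmul($b)->bmul($Rinv)->bmod($n);
#	}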
#
# RSA sign is dominated by the multi-op subroutines, while RSA verify
# and DSA - by the single-op ones. A special note about the 4096-bit
# RSA verify result: the operands are too long for the dedicated
# hardware, so they are handled by the VIS3 code, which is why you
# don't see any improvement there. It's surely possible to improve it
# [by deploying the 'mpmul' instruction], maybe in the future...
#
# Performance improvement.
#
# 64-bit process, VIS3:
#                   sign     verify    sign/s  verify/s
# rsa 1024 bits 0.000628s 0.000028s   1592.4   35434.4
# rsa 2048 bits 0.003282s 0.000106s    304.7    9438.3
# rsa 4096 bits 0.025866s 0.000340s     38.7    2940.9
# dsa 1024 bits 0.000301s 0.000332s   3323.7    3013.9
# dsa 2048 bits 0.001056s 0.001233s    946.9     810.8
#
# 64-bit process, this module:
#                   sign     verify    sign/s  verify/s
# rsa 1024 bits 0.000256s 0.000016s   3904.4   61411.9
# rsa 2048 bits 0.000946s 0.000029s   1056.8   34292.7
# rsa 4096 bits 0.005061s 0.000340s    197.6    2940.5
# dsa 1024 bits 0.000176s 0.000195s   5674.7    5130.5
# dsa 2048 bits 0.000296s 0.000354s   3383.2    2827.6
#
######################################################################
# 32-bit process, VIS3:
#                   sign     verify    sign/s  verify/s
# rsa 1024 bits 0.000665s 0.000028s   1504.8   35233.3
# rsa 2048 bits 0.003349s 0.000106s    298.6    9433.4
# rsa 4096 bits 0.025959s 0.000341s     38.5    2934.8
# dsa 1024 bits 0.000320s 0.000341s   3123.3    2929.6
# dsa 2048 bits 0.001101s 0.001260s    908.2     793.4
#
# 32-bit process, this module:
#                   sign     verify    sign/s  verify/s
# rsa 1024 bits 0.000301s 0.000017s   3317.1   60240.0
# rsa 2048 bits 0.001034s 0.000030s    966.9   33812.7
# rsa 4096 bits 0.005244s 0.000341s    190.7    2935.4
# dsa 1024 bits 0.000201s 0.000205s   4976.1    4879.2
# dsa 2048 bits 0.000328s 0.000360s   3051.1    2774.2
#
# 32-bit code is prone to performance degradation as the interrupt
# rate dispatched to the CPU executing the code grows. This is because
# the standard procedure for handling an interrupt in a 32-bit process
# context zeroes the upper halves of most integer registers used as
# input or output. This renders the result invalid, and the operation
# has to be re-run. If the CPU is "bothered" only by timer interrupts,
# the penalty is hardly measurable. But to mitigate this problem at
# higher interrupt rates, contemporary Linux kernels recognize the
# biased stack even in a 32-bit process context and preserve full
# register contents. See
# http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
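#
# To detect such clobbering, the code below plants a 32-bit sentinel
# in the upper half of the frame pointer of every register window it
# allocates (or $sentinel,%fp,%fp). If an interrupt zeroes the upper
# register halves, the sentinel vanishes; this is caught on the way
# out (and %fp,$sentinel,$sentinel followed by brz), the subroutine
# returns 0, and the caller can redo the operation, e.g. with the
# integer-only VIS3 fall-back at the end of this file.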
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

open STDOUT,">$output";
#include "sparc_arch.h"

.register	%g2,#scratch
.register	%g3,#scratch

.section	".text",#alloc,#execinstr
########################################################################
# Register layout for mont[mul|sqr] instructions.
# For details see "Oracle SPARC Architecture 2011" manual at
# http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
#
my @R=map("%f".2*$_,(0..11,30,31,12..29));
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5)));	@N=(@N,@N,@N[0..3]);
my @A=(@N[0..13],@R[14..31]);
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7)));	@B=(@B,@B,map("%o$_",(0..3)));
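# @R receives the result in even-numbered floating-point registers,
# while @A, @B and @N map the a and b operands and the modulus onto
# the integer registers the mont[mul|sqr] instructions expect. The
# same %l/%o names repeat within @N and @B because each "save" below
# opens a fresh register window, so identical names denote different
# physical registers; @A spills its upper words into @R's FP registers.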
########################################################################
# int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
#			  const u64 *np,const BN_ULONG *n0);
#
sub generate_bn_mul_mont_t4() {
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5));

.globl	bn_mul_mont_t4_$NUM
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	and	%g1,SPARCV9_64BIT_STACK,%g1
	sllx	$sentinel,32,$sentinel
	save	%sp,-128,%sp		! warm it up
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	ld	[%i4+0],%f1	! load *n0
# load ap[$NUM] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@A[$i+1]:"%o7";
	ld	[$ap+$i*8+4],@A[$i]
	sllx	@A[$i],32,@A[$i]
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));

# load np[$NUM] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@N[$i+1]:"%o7";
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
my $lo=$i<27?@N[$i+1]:"%o7";
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
	ld	[$np+$i*8+4],@N[$i]
	sllx	@N[$i],32,@N[$i]
	be	SIZE_T_CC,.Lmsquare_$NUM

# load bp[$NUM] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
my $lo=$i<13?@B[$i+1]:"%o7";
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
	ld	[$bp+$i*8+4],@B[$i]
	sllx	@B[$i],32,@B[$i]
# magic ################################################################
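# (The montmul/montsqr instructions are emitted as raw .word values,
# opcode base 0x81b02920/0x81b02940 plus the operand count, because
# assemblers without T4 support may not recognize the mnemonics.)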
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Lmabort_$NUM
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort_$NUM
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Lmabort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
	movxtod	@A[$i],@R[$i]
	and	%fp,$sentinel,$sentinel
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Lmdone_$NUM
	mov	0,%i0		! return failure
for($i=0; $i<12 && $i<$NUM; $i++) {
@R[$i] =~ /%f([0-9]+)/;
my $lo = "%f".($1+1);
	st	@R[$i],[$rp+$i*8+4]
for(; $i<$NUM; $i++) {
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
	mov	1,%i0		! return success
	mov	0,%i0		! return failure
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
.type	bn_mul_mont_t4_$NUM, #function
.size	bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) {
	&generate_bn_mul_mont_t4($i);
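	# i.e. bn_mul_mont_t4_8, _16, _24 and _32, covering the 512-,
	# 1024-, 1536- and 2048-bit operand sizes listed above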
########################################################################
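# Helpers for constant-time access to the power table. Rather than
# indexing pwrtbl with the secret power value, load_b and load_b_pair
# below read every candidate entry and select the requested one with
# conditional moves keyed off condition codes prepared by load_ccr, so
# the memory access pattern does not depend on the power index (a
# cache-timing countermeasure).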
my ($ptbl,$pwr,$ccr,$skip_wr)=@_;

	sll	%o5, 3, %o5		! offset within first cache line
	add	%o5, $ptbl, $ptbl	! of the pwrtbl

$code.=<<___	if (!$skip_wr);

my ($pwrtbl,$B0,$B1)=@_;

	ldx	[$pwrtbl+0*32],	$B0
	ldx	[$pwrtbl+8*32],	$B1
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+9*32],	%o5
	ldx	[$pwrtbl+2*32],	%o4
	ldx	[$pwrtbl+10*32],%o5
	ldx	[$pwrtbl+3*32],	%o4
	ldx	[$pwrtbl+11*32],%o5
	movneg	%icc, %o4, $B0
	ldx	[$pwrtbl+4*32],	%o4
	movneg	%icc, %o5, $B1
	ldx	[$pwrtbl+12*32],%o5
	ldx	[$pwrtbl+5*32],	%o4
	ldx	[$pwrtbl+13*32],%o5
	ldx	[$pwrtbl+6*32],	%o4
	ldx	[$pwrtbl+14*32],%o5
	ldx	[$pwrtbl+7*32],	%o4
	ldx	[$pwrtbl+15*32],%o5
	movneg	%xcc, %o4, $B0
	add	$pwrtbl,16*32, $pwrtbl
	movneg	%xcc, %o5, $B1

	ldx	[$pwrtbl+0*32],	$Bi
	ldx	[$pwrtbl+1*32],	%o4
	ldx	[$pwrtbl+2*32],	%o5
	ldx	[$pwrtbl+3*32],	%o4
	ldx	[$pwrtbl+4*32],	%o5
	movneg	%icc, %o4, $Bi
	ldx	[$pwrtbl+5*32],	%o4
	ldx	[$pwrtbl+6*32],	%o5
	ldx	[$pwrtbl+7*32],	%o4
	add	$pwrtbl,8*32, $pwrtbl
	movneg	%xcc, %o4, $Bi
########################################################################
# int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
#			   const u64 *pwrtbl,int pwr,int stride);
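#
# Performs the multi-op sequence from part 2) above: 5 Montgomery
# squarings followed by 1 Montgomery multiplication by the power-table
# entry selected by pwr, leaving the result in tp.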
sub generate_bn_pwr5_mont_t4() {
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5));

.globl	bn_pwr5_mont_t4_$NUM
bn_pwr5_mont_t4_$NUM:
#elif defined(SPARCV9_64BIT_STACK)
	SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
	ld	[%g1+0],%g1	! OPENSSL_sparcv9cap_P[0]
	and	%g1,SPARCV9_64BIT_STACK,%g1
	sllx	$sentinel,32,$sentinel
	save	%sp,-128,%sp		! warm it up
	or	%g4,$sentinel,$sentinel

	! copy arguments to global registers
	ld	[%i2+0],%f1	! load *n0
	srl	%i4,%g0,%i4	! pack last arguments
# load tp[$NUM] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
	ldx	[$tp+$i*8],@A[$i]
for(; $i<$NUM; $i++) {
	ldd	[$tp+$i*8],@A[$i]

# load np[$NUM] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) {
	ldx	[$np+$i*8],@N[$i]
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) {
	ldx	[$np+$i*8],@N[$i]
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<$NUM; $i++) {
	ldx	[$np+$i*8],@N[$i]

# load pwrtbl[pwr] ########################################################
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
	srlx	$pwr,	32,	%o4	! unpack $pwr
	sllx	%o4,	32,	$pwr	! re-pack $pwr
&load_ccr("%o7","%o5","%o4");
for($i=0; $i<14 && $i<$NUM; $i+=2) {
	&load_b_pair("%o7",@B[$i],@B[$i+1]);
	save	%sp,-128,%sp;		or	$sentinel,%fp,%fp
for(; $i<$NUM; $i+=2) {
	&load_b_pair("%i7",@B[$i],@B[$i+1]);
	srax	$pwr,	32,	%o4	! unpack $pwr
	sllx	%o4,	32,	$pwr	! re-pack $pwr
&load_ccr("%i7","%o5","%o4",1);
# magic ################################################################
for($i=0; $i<5; $i++) {
	.word	0x81b02940+$NUM-1	! montsqr	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
	.word	0x81b02920+$NUM-1	! montmul	$NUM-1
	fbu,pn	%fcc3,.Labort_$NUM
	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort_$NUM
	brgez	%o4,.Lstride_$NUM
	brgez	%o4,.Lstride_$NUM
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	restore;	and	%fp,$sentinel,$sentinel
	brz,pn	$sentinel,.Labort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) {
	movxtod	@A[$i],@R[$i]
	and	%fp,$sentinel,$sentinel
	and	%fp,$sentinel,$sentinel
	srl	%fp,0,%fp		! just in case?
	or	%o7,$sentinel,$sentinel
	brz,a,pn $sentinel,.Ldone_$NUM
	mov	0,%i0		! return failure
for($i=0; $i<$NUM; $i++) {
	std	@R[$i],[$tp+$i*8]
	mov	1,%i0		! return success
	mov	0,%i0		! return failure
.type	bn_pwr5_mont_t4_$NUM, #function
.size	bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) {
	&generate_bn_pwr5_mont_t4($i);
########################################################################
# Fall-back subroutines
#
# copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
#
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
	(map("%g$_",(1..5)),map("%o$_",(0..5,7)));

$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *bp,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num);	# caller ensures that num is >=3
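#
# The fall-back implements textbook word-by-word Montgomery
# multiplication: .L1st computes tp=ap*bp[0]+np*m1, where
# m1=tp[0]*n0 mod 2^64 is chosen so that the lowest word of the sum
# vanishes; .Louter/.Linner fold ap*bp[i] for the remaining b-words
# into tp; and .Lsub/.Lcopy perform the final conditional subtraction
# of np.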
.globl	bn_mul_mont_t4
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	andn	%g1,	63,	%g1		! align at 64 bytes
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack

# +-------------------------------+<-----	%sp
# +-------------------------------+<-----	aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<-----	aligned at 64 bytes

($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));
	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	add	%sp, STACK_BIAS+STACK_FRAME, $tp
	ldx	[$bp+0],	$m0	! m0=bp[0]

	ldx	[$ap+0],	$aj	! ap[0]
	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	ldx	[$np+0],	$nj	! np[0]
	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	addcc	$lo0,	$lo1,	$lo1
	addxc	%g0,	$hi1,	$hi1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	sub	$num,	24,	$cnt	! cnt=num-3

	addcc	$alo,	$hi0,	$lo0
	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++
	sub	$cnt,	8,	$cnt	! j--

	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$nlo,	$hi1,	$lo1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	sub	$num,	16,	$i	! i=num-2
	ldx	[$bp+0],	$m0	! m0=bp[i]
	sub	$ap,	$num,	$ap	! rewind

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]
	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	sub	$num,	24,	$cnt	! cnt=num-3

	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	brnz,pt	$cnt,	.Linner

	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0
	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	sub	$ap,	$num,	$ap	! rewind

	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	st	$t2,	[$rp-4]		! reverse order

	sub	$np,	$num,	$np	! rewind
	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp

.Lcopy:					! copy or in-place refresh
.type	bn_mul_mont_t4, #function
.size	bn_mul_mont_t4, .-bn_mul_mont_t4
# int bn_mul_mont_gather5(
$rp="%o0";	# u64 *rp,
$ap="%o1";	# const u64 *ap,
$bp="%o2";	# const u64 *pwrtbl,
$np="%o3";	# const u64 *np,
$n0p="%o4";	# const BN_ULONG *n0,
$num="%o5";	# int num,	# caller ensures that num is >=3

.globl	bn_mul_mont_gather5_t4
bn_mul_mont_gather5_t4:
	add	%sp,	STACK_BIAS,	%g4	! real top of stack
	sll	$num,	3,	$num		! size in bytes
	andn	%g1,	63,	%g1		! buffer size rounded up to 64 bytes
	andn	%g1,	63,	%g1		! align at 64 bytes
	sub	%g1,	STACK_FRAME,	%g1	! new top of stack
	LDPTR	[%sp+STACK_7thARG],	%g4	! load power, 7th argument

# +-------------------------------+<-----	%sp
# +-------------------------------+<-----	aligned at 64 bytes
# +-------------------------------+
# +-------------------------------+<-----	aligned at 64 bytes

($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));
&load_ccr($bp,"%g4",$ccr);
&load_b($bp,$m0,"%o7");		# m0=bp[0]

	ld	[$n0p+0],	$t0	! pull n0[0..1] value
	add	%sp, STACK_BIAS+STACK_FRAME, $tp

	ldx	[$ap+0],	$aj	! ap[0]
	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	ldx	[$np+0],	$nj	! np[0]
	mulx	$lo0,	$n0,	$m1	! "tp[0]"*n0
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	addcc	$lo0,	$lo1,	$lo1
	addxc	%g0,	$hi1,	$hi1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	sub	$num,	24,	$cnt	! cnt=num-3

	addcc	$alo,	$hi0,	$lo0
	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[0]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]
	add	$tp,	8,	$tp	! tp++
	brnz,pt	$cnt,	.L1st_g5
	sub	$cnt,	8,	$cnt	! j--

	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1
	addcc	$lo0,	$lo1,	$lo1	! np[j]*m1+ap[j]*bp[0]
	addxc	%g0,	$hi1,	$hi1
	stxa	$lo1,	[$tp]0xe2	! tp[j-1]

	addcc	$hi0,	$hi1,	$hi1
	addxc	%g0,	%g0,	$ovf	! upmost overflow bit
	stxa	$hi1,	[$tp]0xe2
	sub	$num,	16,	$i	! i=num-2

&load_b($bp,$m0);		# m0=bp[i]

	sub	$ap,	$num,	$ap	! rewind

	ldx	[$ap+0],	$aj	! ap[0]
	ldx	[$np+0],	$nj	! np[0]
	mulx	$aj,	$m0,	$lo0	! ap[0]*bp[i]
	ldx	[$tp],	$tj		! tp[0]
	umulxhi	$aj,	$m0,	$hi0
	ldx	[$ap+8],	$aj	! ap[1]
	addcc	$lo0,	$tj,	$lo0	! ap[0]*bp[i]+tp[0]
	mulx	$aj,	$m0,	$alo	! ap[1]*bp[i]
	addxc	%g0,	$hi0,	$hi0
	mulx	$lo0,	$n0,	$m1	! tp[0]*n0
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	mulx	$nj,	$m1,	$lo1	! np[0]*m1
	umulxhi	$nj,	$m1,	$hi1
	ldx	[$np+8],	$nj	! np[1]
	addcc	$lo1,	$lo0,	$lo1
	mulx	$nj,	$m1,	$nlo	! np[1]*m1
	addxc	%g0,	$hi1,	$hi1
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	sub	$num,	24,	$cnt	! cnt=num-3

	addcc	$alo,	$hi0,	$lo0
	ldx	[$tp+8],	$tj	! tp[j]
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	ldx	[$ap+0],	$aj	! ap[j]
	addcc	$nlo,	$hi1,	$lo1
	mulx	$aj,	$m0,	$alo	! ap[j]*bp[i]
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	ldx	[$np+0],	$nj	! np[j]
	umulxhi	$aj,	$m0,	$aj	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	mulx	$nj,	$m1,	$nlo	! np[j]*m1
	addxc	%g0,	$hi0,	$hi0
	umulxhi	$nj,	$m1,	$nj	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]
	brnz,pt	$cnt,	.Linner_g5

	ldx	[$tp+8],	$tj	! tp[j]
	addcc	$alo,	$hi0,	$lo0
	addxc	$aj,	%g0,	$hi0	! ahi=aj
	addcc	$lo0,	$tj,	$lo0	! ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi0,	$hi0
	addcc	$nlo,	$hi1,	$lo1
	addxc	$nj,	%g0,	$hi1	! nhi=nj
	addcc	$lo1,	$lo0,	$lo1	! np[j]*m1+ap[j]*bp[i]+tp[j]
	addxc	%g0,	$hi1,	$hi1
	stx	$lo1,	[$tp]		! tp[j-1]

	subcc	%g0,	$ovf,	%g0	! move upmost overflow to CCR.xcc
	addxccc	$hi1,	$hi0,	$hi1
	addxc	%g0,	%g0,	$ovf
	brnz,pt	$i,	.Louter_g5

	sub	$ap,	$num,	$ap	! rewind

	subcc	$num,	8,	$cnt	! cnt=num-1 and clear CCR.xcc

	subccc	$tj,	$nj,	$t2	! tp[j]-np[j]
	subccc	$tj,	$nj,	$t3
	st	$t2,	[$rp-4]		! reverse order
	brnz,pt	$cnt,	.Lsub_g5

	sub	$np,	$num,	$np	! rewind
	subc	$ovf,	%g0,	$ovf	! handle upmost overflow bit
	or	$np,	$ap,	$ap	! ap=borrow?tp:rp

.Lcopy_g5:				! copy or in-place refresh
	stx	%g0,	[$tp]		! zap
	brnz	$cnt,	.Lcopy_g5

.type	bn_mul_mont_gather5_t4, #function
.size	bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
	brnz	%o2,	.Loop_flip
.type	bn_flip_t4, #function
.size	bn_flip_t4, .-bn_flip_t4

.globl	bn_flip_n_scatter5_t4
bn_flip_n_scatter5_t4:
	add	%o3,	%o2,	%o2	! &pwrtbl[pwr]
.Loop_flip_n_scatter5:
	ld	[%o0+0],	%o4	! inp[i]
	brnz	%o1,	.Loop_flip_n_scatter5
.type	bn_flip_n_scatter5_t4, #function
.size	bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
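
! Note: bn_flip_n_scatter5_t4 above writes the table in the layout that
! load_ccr/load_b (used by bn_gather5_t4 below) expect, so the gather
! can select a power with conditional moves instead of a
! secret-dependent index.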
.globl	bn_gather5_t4
&load_ccr("%o2","%o3","%g1");
&load_b("%o2","%g1");
	brnz	%o1,	.Loop_gather5
.type	bn_gather5_t4, #function
.size	bn_gather5_t4, .-bn_gather5_t4

.asciz	"Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"