2 # Copyright 2012-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by David S. Miller and Andy Polyakov
12 # The module is licensed under 2-clause BSD license.
13 # November 2012. All rights reserved.
14 # ====================================================================
16 ######################################################################
17 # Montgomery squaring-n-multiplication module for SPARC T4.
19 # The module consists of three parts:
21 # 1) collection of "single-op" subroutines that perform single
22 # operation, Montgomery squaring or multiplication, on 512-,
23 # 1024-, 1536- and 2048-bit operands;
24 # 2) collection of "multi-op" subroutines that perform 5 squaring and
25 # 1 multiplication operations on operands of above lengths;
26 # 3) fall-back and helper VIS3 subroutines.
28 # RSA sign is dominated by multi-op subroutine, while RSA verify and
29 # DSA - by single-op. Special note about 4096-bit RSA verify result.
30 # Operands are too long for dedicated hardware and it's handled by
31 # VIS3 code, which is why you don't see any improvement. It's surely
# possible to improve it [by deploying 'mpmul' instruction], maybe in
# the future.
35 # Performance improvement.
37 # 64-bit process, VIS3:
38 # sign verify sign/s verify/s
39 # rsa 1024 bits 0.000628s 0.000028s 1592.4 35434.4
40 # rsa 2048 bits 0.003282s 0.000106s 304.7 9438.3
41 # rsa 4096 bits 0.025866s 0.000340s 38.7 2940.9
42 # dsa 1024 bits 0.000301s 0.000332s 3323.7 3013.9
43 # dsa 2048 bits 0.001056s 0.001233s 946.9 810.8
45 # 64-bit process, this module:
46 # sign verify sign/s verify/s
47 # rsa 1024 bits 0.000256s 0.000016s 3904.4 61411.9
48 # rsa 2048 bits 0.000946s 0.000029s 1056.8 34292.7
49 # rsa 4096 bits 0.005061s 0.000340s 197.6 2940.5
50 # dsa 1024 bits 0.000176s 0.000195s 5674.7 5130.5
51 # dsa 2048 bits 0.000296s 0.000354s 3383.2 2827.6
53 ######################################################################
54 # 32-bit process, VIS3:
55 # sign verify sign/s verify/s
56 # rsa 1024 bits 0.000665s 0.000028s 1504.8 35233.3
57 # rsa 2048 bits 0.003349s 0.000106s 298.6 9433.4
58 # rsa 4096 bits 0.025959s 0.000341s 38.5 2934.8
59 # dsa 1024 bits 0.000320s 0.000341s 3123.3 2929.6
60 # dsa 2048 bits 0.001101s 0.001260s 908.2 793.4
62 # 32-bit process, this module:
63 # sign verify sign/s verify/s
64 # rsa 1024 bits 0.000301s 0.000017s 3317.1 60240.0
65 # rsa 2048 bits 0.001034s 0.000030s 966.9 33812.7
66 # rsa 4096 bits 0.005244s 0.000341s 190.7 2935.4
67 # dsa 1024 bits 0.000201s 0.000205s 4976.1 4879.2
68 # dsa 2048 bits 0.000328s 0.000360s 3051.1 2774.2
70 # 32-bit code is prone to performance degradation as interrupt rate
71 # dispatched to CPU executing the code grows. This is because in
72 # standard process of handling interrupt in 32-bit process context
73 # upper halves of most integer registers used as input or output are
74 # zeroed. This renders result invalid, and operation has to be re-run.
75 # If CPU is "bothered" with timer interrupts only, the penalty is
76 # hardly measurable. But in order to mitigate this problem for higher
77 # interrupt rates contemporary Linux kernel recognizes biased stack
78 # even in 32-bit process context and preserves full register contents.
79 # See http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=517ffce4e1a03aea979fe3a18a3dd1761a24fafb
# Locate this script's directory so the shared perlasm helpers can be
# found, then pull in the SPARCv9 code-generation helpers.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

# The last command-line argument, if present, names the output file.
# Use checked three-argument open instead of the original unchecked
# two-arg `open STDOUT,">$output"` (mode-injection prone, write errors
# silently lost).  Behavior when no output argument is given (or it is
# false) is unchanged: STDOUT is left alone.
$output = pop;
open STDOUT, '>', $output or die "Can't open $output: $!" if $output;
89 #include "sparc_arch.h"
92 .register %g2,#scratch
93 .register %g3,#scratch
96 .section ".text",#alloc,#execinstr
103 ########################################################################
104 # Register layout for mont[mul|sqr] instructions.
105 # For details see "Oracle SPARC Architecture 2011" manual at
106 # http://www.oracle.com/technetwork/server-storage/sun-sparc-enterprise/documentation/.
# Register layout used by the montmul/montsqr code below: @A holds the
# multiplicand (ap) words, @N the modulus (np) words, @B the multiplier
# (bp) words, and @R the even-numbered FP registers results land in.
# The lists are repeated/concatenated to reach 32 entries each.
my @R=map("%f".2*$_,(0..11,30,31,12..29));	# 32 even FP regs, permuted order
my @N=(map("%l$_",(0..7)),map("%o$_",(0..5))); @N=(@N,@N,@N[0..3]);	# %l0-%l7,%o0-%o5 cycled to 32 entries
my @A=(@N[0..13],@R[14..31]);	# first 14 words in integer regs, rest in FP regs
my @B=(map("%i$_",(0..5)),map("%l$_",(0..7))); @B=(@B,@B,map("%o$_",(0..3)));	# %i0-%i5,%l0-%l7 cycled, padded with %o0-%o3
113 ########################################################################
114 # int bn_mul_mont_t4_$NUM(u64 *rp,const u64 *ap,const u64 *bp,
115 # const u64 *np,const BN_ULONG *n0);
# Emit bn_mul_mont_t4_$NUM: one Montgomery multiplication (montsqr when
# bp==ap, see the SIZE_T_CC branch to .Lmsquare) on $NUM 64-bit-word
# operands using the T4 montmul/montsqr instructions.  Returns 1 in %i0
# on success, 0 on failure ("return failure" paths below): per the file
# header, an interrupt in 32-bit context may zero upper register halves,
# which the $sentinel checks detect so the caller can re-run/fall back.
sub generate_bn_mul_mont_t4() {
my ($rp,$ap,$bp,$np,$sentinel)=map("%g$_",(1..5)); # arguments + sentinel live in %g1-%g5
.globl bn_mul_mont_t4_$NUM
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i4+0],%f1 ! load *n0
# load ap[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) { # first 14 ap words go to integer regs @A[0..13]
my $lo=$i<13?@A[$i+1]:"%o7"; # scratch for the other half; last iteration borrows %o7
ld [$ap+$i*8+4],@A[$i]
sllx @A[$i],32,@A[$i]
for(; $i<$NUM; $i++) { # remaining ap words go to FP regs @A[14..]
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1)); # rotate through the %f0/%f2/%f4/%f6 pairs
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) { # np words 0..13
my $lo=$i<13?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) { # np words 14..27, next register window
my $lo=$i<27?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) { # np words 28.., last register window
my $lo=($i<$NUM-1)?@N[$i+1]:"%o7";
ld [$np+$i*8+4],@N[$i]
sllx @N[$i],32,@N[$i]
be SIZE_T_CC,.Lmsquare_$NUM
# load bp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) { # first 14 bp words -> @B
my $lo=$i<13?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) { # remaining bp words
my $lo=($i<$NUM-1)?@B[$i+1]:"%o7";
ld [$bp+$i*8+4],@B[$i]
sllx @B[$i],32,@B[$i]
# magic ################################################################
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Lmabort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Lmabort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) { # move integer-reg results over to FP @R
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Lmdone_$NUM
mov 0,%i0 ! return failure
for($i=0; $i<12 && $i<$NUM; $i++) { # store result words held in @R
@R[$i] =~ /%f([0-9]+)/; # extract FP register number from the name
my $lo = "%f".($1+1); # companion odd-numbered register
st @R[$i],[$rp+$i*8+4]
for(; $i<$NUM; $i++) { # store remaining words via rotating %f pairs
my ($hi,$lo)=("%f".2*($i%4),"%f".(2*($i%4)+1));
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
save %sp,-128,%sp; or $sentinel,%fp,%fp
save %sp,-128,%sp; or $sentinel,%fp,%fp
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
.type bn_mul_mont_t4_$NUM, #function
.size bn_mul_mont_t4_$NUM, .-bn_mul_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) { # 8,16,24,32 words = 512..2048-bit operands (see header)
&generate_bn_mul_mont_t4($i); # emits bn_mul_mont_t4_{8,16,24,32}
356 ########################################################################
359 my ($ptbl,$pwr,$ccr,$skip_wr)=@_;
364 sll %o5, 3, %o5 ! offset within first cache line
365 add %o5, $ptbl, $ptbl ! of the pwrtbl
369 $code.=<<___ if (!$skip_wr);
374 my ($pwrtbl,$B0,$B1)=@_;
377 ldx [$pwrtbl+0*32], $B0
378 ldx [$pwrtbl+8*32], $B1
379 ldx [$pwrtbl+1*32], %o4
380 ldx [$pwrtbl+9*32], %o5
382 ldx [$pwrtbl+2*32], %o4
384 ldx [$pwrtbl+10*32],%o5
386 ldx [$pwrtbl+3*32], %o4
388 ldx [$pwrtbl+11*32],%o5
389 movneg %icc, %o4, $B0
390 ldx [$pwrtbl+4*32], %o4
391 movneg %icc, %o5, $B1
392 ldx [$pwrtbl+12*32],%o5
394 ldx [$pwrtbl+5*32],%o4
396 ldx [$pwrtbl+13*32],%o5
398 ldx [$pwrtbl+6*32], %o4
400 ldx [$pwrtbl+14*32],%o5
402 ldx [$pwrtbl+7*32], %o4
404 ldx [$pwrtbl+15*32],%o5
405 movneg %xcc, %o4, $B0
406 add $pwrtbl,16*32, $pwrtbl
407 movneg %xcc, %o5, $B1
414 ldx [$pwrtbl+0*32], $Bi
415 ldx [$pwrtbl+1*32], %o4
416 ldx [$pwrtbl+2*32], %o5
418 ldx [$pwrtbl+3*32], %o4
420 ldx [$pwrtbl+4*32], %o5
421 movneg %icc, %o4, $Bi
422 ldx [$pwrtbl+5*32], %o4
424 ldx [$pwrtbl+6*32], %o5
426 ldx [$pwrtbl+7*32], %o4
428 add $pwrtbl,8*32, $pwrtbl
429 movneg %xcc, %o4, $Bi
433 ########################################################################
434 # int bn_pwr5_mont_t4_$NUM(u64 *tp,const u64 *np,const BN_ULONG *n0,
435 # const u64 *pwrtbl,int pwr,int stride);
# Emit bn_pwr5_mont_t4_$NUM: the multi-op subroutine -- five Montgomery
# squarings followed by one multiplication (per the file header) on
# $NUM-word operands.  Multiplier words are gathered from a scattered
# power table via load_ccr/load_b_pair (conditional-move selection).
# Returns 1 on success, 0 on failure in %i0, as bn_mul_mont_t4_$NUM does.
sub generate_bn_pwr5_mont_t4() {
my ($tp,$np,$pwrtbl,$pwr,$sentinel)=map("%g$_",(1..5)); # arguments + sentinel in %g1-%g5
.globl bn_pwr5_mont_t4_$NUM
bn_pwr5_mont_t4_$NUM:
#elif defined(SPARCV9_64BIT_STACK)
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9_P[0]
and %g1,SPARCV9_64BIT_STACK,%g1
sllx $sentinel,32,$sentinel
save %sp,-128,%sp ! warm it up
or %g4,$sentinel,$sentinel
! copy arguments to global registers
ld [%i2+0],%f1 ! load *n0
srl %i4,%g0,%i4 ! pack last arguments
# load tp[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) { # first 14 tp words -> integer regs @A[0..13]
ldx [$tp+$i*8],@A[$i]
for(; $i<$NUM; $i++) { # remaining tp words -> FP regs
ldd [$tp+$i*8],@A[$i]
# load np[$NUM] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
for($i=0; $i<14 && $i<$NUM; $i++) { # np words 0..13
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<28 && $i<$NUM; $i++) { # np words 14..27, next register window
ldx [$np+$i*8],@N[$i]
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i++) { # np words 28.., last register window
ldx [$np+$i*8],@N[$i]
# load pwrtbl[pwr] ########################################################
save %sp,-128,%sp; or $sentinel,%fp,%fp
srlx $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%o7","%o5","%o4"); # prime CCR selection for the table gather
for($i=0; $i<14 && $i<$NUM; $i+=2) { # gather first 14 multiplier words, two at a time
&load_b_pair("%o7",@B[$i],@B[$i+1]);
save %sp,-128,%sp; or $sentinel,%fp,%fp
for(; $i<$NUM; $i+=2) { # gather the rest in the next register window
&load_b_pair("%i7",@B[$i],@B[$i+1]);
srax $pwr, 32, %o4 ! unpack $pwr
sllx %o4, 32, $pwr ! re-pack $pwr
&load_ccr("%i7","%o5","%o4",1); # trailing 1 = $skip_wr, omit the wr step
# magic ################################################################
for($i=0; $i<5; $i++) { # five Montgomery squarings
.word 0x81b02940+$NUM-1 ! montsqr $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
.word 0x81b02920+$NUM-1 ! montmul $NUM-1
fbu,pn %fcc3,.Labort_$NUM
and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort_$NUM
brgez %o4,.Lstride_$NUM
brgez %o4,.Lstride_$NUM
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
restore; and %fp,$sentinel,$sentinel
brz,pn $sentinel,.Labort1_$NUM
# save tp[$NUM] ########################################################
for($i=0; $i<14 && $i<$NUM; $i++) { # move integer-reg results over to FP @R
movxtod @A[$i],@R[$i]
and %fp,$sentinel,$sentinel
and %fp,$sentinel,$sentinel
srl %fp,0,%fp ! just in case?
or %o7,$sentinel,$sentinel
brz,a,pn $sentinel,.Ldone_$NUM
mov 0,%i0 ! return failure
for($i=0; $i<$NUM; $i++) { # write the result back to tp[] in place
std @R[$i],[$tp+$i*8]
mov 1,%i0 ! return success
mov 0,%i0 ! return failure
.type bn_pwr5_mont_t4_$NUM, #function
.size bn_pwr5_mont_t4_$NUM, .-bn_pwr5_mont_t4_$NUM
for ($i=8;$i<=32;$i+=8) { # same operand sizes as the single-op subroutines
&generate_bn_pwr5_mont_t4($i); # emits bn_pwr5_mont_t4_{8,16,24,32}
663 ########################################################################
664 # Fall-back subroutines
666 # copy of bn_mul_mont_vis3 adjusted for vectors of 64-bit values
# Twelve scratch registers for the VIS3 fall-back multiply, drawn from
# %g1-%g5 and %o0-%o5,%o7; the %o0-%o5 names below double as the
# incoming argument registers before the register-window save.
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *bp,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is >=3
679 .globl bn_mul_mont_t4
682 add %sp, STACK_BIAS, %g4 ! real top of stack
683 sll $num, 3, $num ! size in bytes
685 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
687 andn %g1, 63, %g1 ! align at 64 byte
688 sub %g1, STACK_FRAME, %g1 ! new top of stack
693 # +-------------------------------+<----- %sp
695 # +-------------------------------+<----- aligned at 64 bytes
697 # +-------------------------------+
700 # +-------------------------------+<----- aligned at 64 bytes
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));	# after save: arguments now live in %i0-%i5
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz)=map("%l$_",(0..7));	# local temporaries in %l0-%l6 (8th list value unused)
706 ld [$n0p+0], $t0 ! pull n0[0..1] value
708 add %sp, STACK_BIAS+STACK_FRAME, $tp
709 ldx [$bp+0], $m0 ! m0=bp[0]
714 ldx [$ap+0], $aj ! ap[0]
716 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
717 umulxhi $aj, $m0, $hi0
719 ldx [$ap+8], $aj ! ap[1]
721 ldx [$np+0], $nj ! np[0]
723 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
725 mulx $aj, $m0, $alo ! ap[1]*bp[0]
726 umulxhi $aj, $m0, $aj ! ahi=aj
728 mulx $nj, $m1, $lo1 ! np[0]*m1
729 umulxhi $nj, $m1, $hi1
731 ldx [$np+8], $nj ! np[1]
733 addcc $lo0, $lo1, $lo1
735 addxc %g0, $hi1, $hi1
737 mulx $nj, $m1, $nlo ! np[1]*m1
738 umulxhi $nj, $m1, $nj ! nhi=nj
741 sub $num, 24, $cnt ! cnt=num-3
745 addcc $alo, $hi0, $lo0
748 ldx [$ap+0], $aj ! ap[j]
749 addcc $nlo, $hi1, $lo1
751 addxc $nj, %g0, $hi1 ! nhi=nj
753 ldx [$np+0], $nj ! np[j]
754 mulx $aj, $m0, $alo ! ap[j]*bp[0]
756 umulxhi $aj, $m0, $aj ! ahi=aj
758 mulx $nj, $m1, $nlo ! np[j]*m1
759 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
760 umulxhi $nj, $m1, $nj ! nhi=nj
761 addxc %g0, $hi1, $hi1
762 stxa $lo1, [$tp]0xe2 ! tp[j-1]
763 add $tp, 8, $tp ! tp++
766 sub $cnt, 8, $cnt ! j--
768 addcc $alo, $hi0, $lo0
769 addxc $aj, %g0, $hi0 ! ahi=aj
771 addcc $nlo, $hi1, $lo1
773 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
774 addxc %g0, $hi1, $hi1
775 stxa $lo1, [$tp]0xe2 ! tp[j-1]
778 addcc $hi0, $hi1, $hi1
779 addxc %g0, %g0, $ovf ! upmost overflow bit
784 sub $num, 16, $i ! i=num-2
788 ldx [$bp+0], $m0 ! m0=bp[i]
791 sub $ap, $num, $ap ! rewind
795 ldx [$ap+0], $aj ! ap[0]
796 ldx [$np+0], $nj ! np[0]
798 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
799 ldx [$tp], $tj ! tp[0]
800 umulxhi $aj, $m0, $hi0
801 ldx [$ap+8], $aj ! ap[1]
802 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
803 mulx $aj, $m0, $alo ! ap[1]*bp[i]
804 addxc %g0, $hi0, $hi0
805 mulx $lo0, $n0, $m1 ! tp[0]*n0
806 umulxhi $aj, $m0, $aj ! ahi=aj
807 mulx $nj, $m1, $lo1 ! np[0]*m1
809 umulxhi $nj, $m1, $hi1
810 ldx [$np+8], $nj ! np[1]
812 addcc $lo1, $lo0, $lo1
813 mulx $nj, $m1, $nlo ! np[1]*m1
814 addxc %g0, $hi1, $hi1
815 umulxhi $nj, $m1, $nj ! nhi=nj
818 sub $num, 24, $cnt ! cnt=num-3
821 addcc $alo, $hi0, $lo0
822 ldx [$tp+8], $tj ! tp[j]
823 addxc $aj, %g0, $hi0 ! ahi=aj
824 ldx [$ap+0], $aj ! ap[j]
826 addcc $nlo, $hi1, $lo1
827 mulx $aj, $m0, $alo ! ap[j]*bp[i]
828 addxc $nj, %g0, $hi1 ! nhi=nj
829 ldx [$np+0], $nj ! np[j]
831 umulxhi $aj, $m0, $aj ! ahi=aj
832 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
833 mulx $nj, $m1, $nlo ! np[j]*m1
834 addxc %g0, $hi0, $hi0
835 umulxhi $nj, $m1, $nj ! nhi=nj
836 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
837 addxc %g0, $hi1, $hi1
838 stx $lo1, [$tp] ! tp[j-1]
840 brnz,pt $cnt, .Linner
843 ldx [$tp+8], $tj ! tp[j]
844 addcc $alo, $hi0, $lo0
845 addxc $aj, %g0, $hi0 ! ahi=aj
846 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
847 addxc %g0, $hi0, $hi0
849 addcc $nlo, $hi1, $lo1
850 addxc $nj, %g0, $hi1 ! nhi=nj
851 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
852 addxc %g0, $hi1, $hi1
853 stx $lo1, [$tp] ! tp[j-1]
855 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
856 addxccc $hi1, $hi0, $hi1
864 sub $ap, $num, $ap ! rewind
868 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
876 subccc $tj, $nj, $t2 ! tp[j]-np[j]
881 st $t2, [$rp-4] ! reverse order
886 sub $np, $num, $np ! rewind
890 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
895 .Lcopy: ! conditional copy
909 .type bn_mul_mont_t4, #function
910 .size bn_mul_mont_t4, .-bn_mul_mont_t4
# int bn_mul_mont_gather5(
# Incoming argument registers (first six args in %o0-%o5):
$rp="%o0"; # u64 *rp,
$ap="%o1"; # const u64 *ap,
$bp="%o2"; # const u64 *pwrtbl,
$np="%o3"; # const u64 *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num, # caller ensures that num is >=3
922 .globl bn_mul_mont_gather5_t4
924 bn_mul_mont_gather5_t4:
925 add %sp, STACK_BIAS, %g4 ! real top of stack
926 sll $num, 3, $num ! size in bytes
928 andn %g1, 63, %g1 ! buffer size rounded up to 64 bytes
930 andn %g1, 63, %g1 ! align at 64 byte
931 sub %g1, STACK_FRAME, %g1 ! new top of stack
933 LDPTR [%sp+STACK_7thARG], %g4 ! load power, 7th argument
937 # +-------------------------------+<----- %sp
939 # +-------------------------------+<----- aligned at 64 bytes
941 # +-------------------------------+
944 # +-------------------------------+<----- aligned at 64 bytes
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));	# after save: arguments now live in %i0-%i5
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$ccr)=map("%l$_",(0..7));	# local temporaries in %l0-%l7
&load_ccr($bp,"%g4",$ccr);	# prime CCR selection from power (7th argument, loaded into %g4)
&load_b($bp,$m0,"%o7"); # m0=bp[0]
953 ld [$n0p+0], $t0 ! pull n0[0..1] value
955 add %sp, STACK_BIAS+STACK_FRAME, $tp
959 ldx [$ap+0], $aj ! ap[0]
961 mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
962 umulxhi $aj, $m0, $hi0
964 ldx [$ap+8], $aj ! ap[1]
966 ldx [$np+0], $nj ! np[0]
968 mulx $lo0, $n0, $m1 ! "tp[0]"*n0
970 mulx $aj, $m0, $alo ! ap[1]*bp[0]
971 umulxhi $aj, $m0, $aj ! ahi=aj
973 mulx $nj, $m1, $lo1 ! np[0]*m1
974 umulxhi $nj, $m1, $hi1
976 ldx [$np+8], $nj ! np[1]
978 addcc $lo0, $lo1, $lo1
980 addxc %g0, $hi1, $hi1
982 mulx $nj, $m1, $nlo ! np[1]*m1
983 umulxhi $nj, $m1, $nj ! nhi=nj
986 sub $num, 24, $cnt ! cnt=num-3
990 addcc $alo, $hi0, $lo0
993 ldx [$ap+0], $aj ! ap[j]
994 addcc $nlo, $hi1, $lo1
996 addxc $nj, %g0, $hi1 ! nhi=nj
998 ldx [$np+0], $nj ! np[j]
999 mulx $aj, $m0, $alo ! ap[j]*bp[0]
1001 umulxhi $aj, $m0, $aj ! ahi=aj
1003 mulx $nj, $m1, $nlo ! np[j]*m1
1004 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1005 umulxhi $nj, $m1, $nj ! nhi=nj
1006 addxc %g0, $hi1, $hi1
1007 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1008 add $tp, 8, $tp ! tp++
1010 brnz,pt $cnt, .L1st_g5
1011 sub $cnt, 8, $cnt ! j--
1013 addcc $alo, $hi0, $lo0
1014 addxc $aj, %g0, $hi0 ! ahi=aj
1016 addcc $nlo, $hi1, $lo1
1017 addxc $nj, %g0, $hi1
1018 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
1019 addxc %g0, $hi1, $hi1
1020 stxa $lo1, [$tp]0xe2 ! tp[j-1]
1023 addcc $hi0, $hi1, $hi1
1024 addxc %g0, %g0, $ovf ! upmost overflow bit
1025 stxa $hi1, [$tp]0xe2
1029 sub $num, 16, $i ! i=num-2
1035 &load_b($bp,$m0); # m0=bp[i]
1037 sub $ap, $num, $ap ! rewind
1041 ldx [$ap+0], $aj ! ap[0]
1042 ldx [$np+0], $nj ! np[0]
1044 mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
1045 ldx [$tp], $tj ! tp[0]
1046 umulxhi $aj, $m0, $hi0
1047 ldx [$ap+8], $aj ! ap[1]
1048 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
1049 mulx $aj, $m0, $alo ! ap[1]*bp[i]
1050 addxc %g0, $hi0, $hi0
1051 mulx $lo0, $n0, $m1 ! tp[0]*n0
1052 umulxhi $aj, $m0, $aj ! ahi=aj
1053 mulx $nj, $m1, $lo1 ! np[0]*m1
1055 umulxhi $nj, $m1, $hi1
1056 ldx [$np+8], $nj ! np[1]
1058 addcc $lo1, $lo0, $lo1
1059 mulx $nj, $m1, $nlo ! np[1]*m1
1060 addxc %g0, $hi1, $hi1
1061 umulxhi $nj, $m1, $nj ! nhi=nj
1064 sub $num, 24, $cnt ! cnt=num-3
1067 addcc $alo, $hi0, $lo0
1068 ldx [$tp+8], $tj ! tp[j]
1069 addxc $aj, %g0, $hi0 ! ahi=aj
1070 ldx [$ap+0], $aj ! ap[j]
1072 addcc $nlo, $hi1, $lo1
1073 mulx $aj, $m0, $alo ! ap[j]*bp[i]
1074 addxc $nj, %g0, $hi1 ! nhi=nj
1075 ldx [$np+0], $nj ! np[j]
1077 umulxhi $aj, $m0, $aj ! ahi=aj
1078 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1079 mulx $nj, $m1, $nlo ! np[j]*m1
1080 addxc %g0, $hi0, $hi0
1081 umulxhi $nj, $m1, $nj ! nhi=nj
1082 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1083 addxc %g0, $hi1, $hi1
1084 stx $lo1, [$tp] ! tp[j-1]
1086 brnz,pt $cnt, .Linner_g5
1089 ldx [$tp+8], $tj ! tp[j]
1090 addcc $alo, $hi0, $lo0
1091 addxc $aj, %g0, $hi0 ! ahi=aj
1092 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
1093 addxc %g0, $hi0, $hi0
1095 addcc $nlo, $hi1, $lo1
1096 addxc $nj, %g0, $hi1 ! nhi=nj
1097 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
1098 addxc %g0, $hi1, $hi1
1099 stx $lo1, [$tp] ! tp[j-1]
1101 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
1102 addxccc $hi1, $hi0, $hi1
1103 addxc %g0, %g0, $ovf
1107 brnz,pt $i, .Louter_g5
1110 sub $ap, $num, $ap ! rewind
1114 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
1122 subccc $tj, $nj, $t2 ! tp[j]-np[j]
1125 subccc $tj, $nj, $t3
1127 st $t2, [$rp-4] ! reverse order
1129 brnz,pt $cnt, .Lsub_g5
1132 sub $np, $num, $np ! rewind
1136 subccc $ovf, %g0, $ovf ! handle upmost overflow bit
1141 .Lcopy_g5: ! conditional copy
1144 stx %g0, [$tp] ! zap
1146 movcs %icc, $tj, $t2
1149 brnz $cnt, .Lcopy_g5
1155 .type bn_mul_mont_gather5_t4, #function
1156 .size bn_mul_mont_gather5_t4, .-bn_mul_mont_gather5_t4
1171 brnz %o2, .Loop_flip
1175 .type bn_flip_t4, #function
1176 .size bn_flip_t4, .-bn_flip_t4
1178 .globl bn_flip_n_scatter5_t4
1180 bn_flip_n_scatter5_t4:
1183 add %o3, %o2, %o2 ! &pwrtbl[pwr]
1185 .Loop_flip_n_scatter5:
1186 ld [%o0+0], %o4 ! inp[i]
1193 brnz %o1, .Loop_flip_n_scatter5
1197 .type bn_flip_n_scatter5_t4, #function
1198 .size bn_flip_n_scatter5_t4, .-bn_flip_n_scatter5_t4
1200 .globl bn_gather5_t4
&load_ccr("%o2","%o3","%g1");	# prime CCR selection; NOTE(review): %o2/%o3 presumed pwrtbl/power per bn_gather5_t4 signature -- confirm
&load_b("%o2","%g1");	# gather one word: load_b scans all table lines, selecting via conditional moves
1213 brnz %o1, .Loop_gather5
1218 .type bn_gather5_t4, #function
1219 .size bn_gather5_t4, .-bn_gather5_t4
1221 .asciz "Montgomery Multiplication for SPARC T4, David S. Miller, Andy Polyakov"
close STDOUT or die "error closing STDOUT";	# surface buffered-write failures on the emitted assembly