3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
13 # work. While it does improve RSA sign performance by 20-30% (less for
14 # longer keys) on most processors, for some reason RSA2048 is not
15 # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
16 # instruction issue rate is limited on processor in question, meaning
17 # that dedicated squaring procedure is a must. Well, actually all
18 # contemporary AArch64 processors seem to have limited multiplication
19 # issue rate, i.e. they can't issue multiplication every cycle, which
20 # explains moderate improvement coefficients in comparison to
21 # compiler-generated code. Recall that compiler is instructed to use
22 # umulh and therefore uses same amount of multiplication instructions
23 # to do the job. Assembly's edge is to minimize number of "collateral"
24 # instructions and of course instruction scheduling.
28 # Squaring procedure that handles lengths divisible by 8 improves
29 # RSA/DSA performance by 25-40-60% depending on processor and key
30 # length. Overall improvement coefficients are always positive in
31 # comparison to compiler-generated code. On Cortex-A57 improvement
32 # is still modest on longest key lengths, while others exhibit e.g.
33 # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
34 # on Cortex-A57 and ~60-100% faster on others.
# Locate the arm-xlate.pl "perlasm" translator next to this script (or in
# the shared perlasm directory) and pipe all generated assembly through it.
39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42 die "can't locate arm-xlate.pl";
# Fail loudly if the translator pipeline cannot be started instead of
# silently emitting nothing; quote $output in case the path has spaces.
44 open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
# Scratch/loop registers.  Note the range skips x18, which is the reserved
# platform register under the AArch64 procedure-call standard; x19-x24 are
# callee-saved and are restored in bn_mul_mont's epilogue.
47 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
48 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
49 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
# Argument registers, named after the bn_mul_mont() C prototype below.
52 $rp="x0";	# BN_ULONG *rp,
53 $ap="x1";	# const BN_ULONG *ap,
54 $bp="x2";	# const BN_ULONG *bp,
55 $np="x3";	# const BN_ULONG *np,
56 $n0="x4";	# const BN_ULONG *n0,
57 $num="x5";	# int num);
63 .type bn_mul_mont,%function
// Generic word-by-word Montgomery multiplication.  $num is kept as a byte
// count (limbs*8) — note the comparisons against #8/#16 below.
// NOTE(review): this listing is elided; only selected instructions are
// visible, so comments describe the visible skeleton only.
// Prologue: 64-byte frame; fp/lr saved here, callee-saved x19-x24 saved in
// elided lines (the epilogue below restores them).
71 stp x29,x30,[sp,#-64]!
// First outer iteration: ap[]*bp[0], interleaved with reduction by np[]*m1.
77 ldr $m0,[$bp],#8 // bp[0]
79 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
82 and $tp,$tp,#-16 // ABI says so
83 ldp $hi1,$nj,[$np],#16 // np[0..1]
85 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
86 sub $j,$num,#16 // j=num-2
88 mul $alo,$aj,$m0 // ap[1]*bp[0]
91 mul $m1,$lo0,$n0 // "tp[0]"*n0
94 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
96 mul $nlo,$nj,$m1 // np[1]*m1
97 // (*) adds $lo1,$lo1,$lo0 // discarded
98 // (*) As for removal of first multiplication and addition
99 // instructions. The outcome of first addition is
100 // guaranteed to be zero, which leaves two computationally
101 // significant outcomes: it either carries or not. Then
102 // question is when does it carry? Is there alternative
103 // way to deduce it? If you follow operations, you can
104 // observe that condition for carry is quite simple:
105 // $lo0 being non-zero. So that carry can be calculated
106 // by adding -1 to $lo0. That's what next instruction does.
107 subs xzr,$lo0,#1 // (*)
// Inner loop of the first iteration over remaining limbs.
120 mul $alo,$aj,$m0 // ap[j]*bp[0]
125 mul $nlo,$nj,$m1 // np[j]*m1
128 str $lo1,[$tp],#8 // tp[j-1]
133 sub $ap,$ap,$num // rewind $ap
137 sub $np,$np,$num // rewind $np
141 sub $i,$num,#8 // i=num-1
144 adc $ovf,xzr,xzr // upmost overflow bit
// Outer loop: one pass per remaining bp[i], accumulating into tp[].
148 ldr $m0,[$bp],#8 // bp[i]
149 ldp $hi0,$aj,[$ap],#16
150 ldr $tj,[sp] // tp[0]
153 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
154 sub $j,$num,#16 // j=num-2
156 ldp $hi1,$nj,[$np],#16
157 mul $alo,$aj,$m0 // ap[1]*bp[i]
// Same (*) trick as above: the np[0]*m1 multiply-add is replaced by a
// carry deduced from $lo0 being non-zero.
165 // (*) mul $lo1,$hi1,$m1 // np[0]*m1
167 mul $nlo,$nj,$m1 // np[1]*m1
168 // (*) adds $lo1,$lo1,$lo0
169 subs xzr,$lo0,#1 // (*)
176 ldr $tj,[$tp],#8 // tp[j]
185 mul $alo,$aj,$m0 // ap[j]*bp[i]
190 mul $nlo,$nj,$m1 // np[j]*m1
193 str $lo1,[$tp,#-16] // tp[j-1]
197 ldr $tj,[$tp],#8 // tp[j]
200 sub $ap,$ap,$num // rewind $ap
204 sub $np,$np,$num // rewind $np
213 adc $ovf,$ovf,xzr // upmost overflow bit
214 stp $lo1,$hi1,[$tp,#-16]
218 // Final step. We see if result is larger than modulus, and
219 // if it is, subtract the modulus. But comparison implies
220 // subtraction. So we subtract modulus, see if it borrowed,
221 // and conditionally copy original value.
222 ldr $tj,[sp] // tp[0]
224 ldr $nj,[$np],#8 // np[0]
225 subs $j,$num,#8 // j=num-1 and clear borrow
228 sbcs $aj,$tj,$nj // tp[j]-np[j]
232 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
236 sbcs $ovf,$ovf,xzr // did it borrow?
237 str $aj,[$ap],#8 // rp[num-1]
// Conditional copy: select tp[] when the subtraction borrowed (lo),
// otherwise keep the tp[]-np[] values already written to rp[]; the tp[]
// scratch area is wiped along the way.
239 ldr $tj,[sp] // tp[0]
241 ldr $aj,[$rp],#8 // rp[0]
242 sub $num,$num,#8 // num--
245 sub $num,$num,#8 // num--
246 csel $nj,$tj,$aj,lo // did it borrow?
249 str xzr,[$tp,#-16] // wipe tp
251 cbnz $num,.Lcond_copy
254 str xzr,[$tp,#-8] // wipe tp
// Epilogue: restore callee-saved registers and return.
257 ldp x19,x20,[x29,#16]
259 ldp x21,x22,[x29,#32]
261 ldp x23,x24,[x29,#48]
264 .size bn_mul_mont,.-bn_mul_mont
267 ########################################################################
268 # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
# Register layout for the squaring path: an a[0..7] window, four
# temporaries, eight accumulators in callee-saved x19-x26, plus loop
# counter, carry and top-most-carry registers.
270 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
271 my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
272 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
273 my ($cnt,$carry,$topmost)=("x27","x28","x30");
# $tp/$ap_end/$na0 alias registers that are free by the time they are used.
274 my ($tp,$ap_end,$na0)=($bp,$np,$carry);
277 .type __bn_sqr8x_mont,%function
// Dedicated squaring procedure for lengths divisible by 8 (see file
// header).  NOTE(review): this listing is elided; only selected
// instructions are visible, comments describe the visible skeleton only.
// Prologue: 128-byte frame (fp/lr here; x19-x28 in elided lines, restored
// in the epilogue below); rp/np/n0 are offloaded to the frame for later.
283 stp x29,x30,[sp,#-128]!
290 stp $rp,$np,[sp,#96] // offload rp and np
292 ldp $a0,$a1,[$ap,#8*0]
293 ldp $a2,$a3,[$ap,#8*2]
294 ldp $a4,$a5,[$ap,#8*4]
295 ldp $a6,$a7,[$ap,#8*6]
// Allocate and zero a 2*num-limb t[] vector below sp ($num,lsl#4 bytes);
// the .Lsqr8x_zero loop below clears 16 limbs per pass.
297 sub $tp,sp,$num,lsl#4
306 stp xzr,xzr,[$tp,#8*0]
307 stp xzr,xzr,[$tp,#8*2]
308 stp xzr,xzr,[$tp,#8*4]
309 stp xzr,xzr,[$tp,#8*6]
311 stp xzr,xzr,[$tp,#8*8]
312 stp xzr,xzr,[$tp,#8*10]
313 stp xzr,xzr,[$tp,#8*12]
314 stp xzr,xzr,[$tp,#8*14]
316 cbnz $cnt,.Lsqr8x_zero
329 str $n0,[x29,#112] // offload n0
331 // Multiply everything but a[i]*a[i]
363 mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
367 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
374 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
381 stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
382 adc $acc0,xzr,xzr // t[8]
383 adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
390 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
403 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
410 stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
411 adc $acc1,xzr,xzr // t[9]
417 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
428 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
435 stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
436 adc $acc2,xzr,xzr // t[10]
440 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
449 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
456 stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
457 adc $acc3,xzr,xzr // t[11]
459 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
466 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
472 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
473 adc $acc4,xzr,xzr // t[12]
477 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
482 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
484 umulh $t3,$a7,$a6 // hi(a[7]*a[6])
485 adc $acc5,xzr,xzr // t[13]
487 sub $cnt,$ap_end,$ap // done yet?
491 sub $t0,$ap_end,$num // rewinded ap
492 adc $acc6,xzr,xzr // t[14]
495 cbz $cnt,.Lsqr8x_outer_break
// Cross-multiplication of the current 8-limb a[] window against the
// remaining limbs, accumulating into the t[] vector.
498 ldp $a0,$a1,[$tp,#8*0]
499 ldp $a2,$a3,[$tp,#8*2]
500 ldp $a4,$a5,[$tp,#8*4]
501 ldp $a6,$a7,[$tp,#8*6]
504 ldp $a0,$a1,[$ap,#8*0]
507 ldp $a2,$a3,[$ap,#8*2]
510 ldp $a4,$a5,[$ap,#8*4]
514 ldp $a6,$a7,[$ap,#8*6]
516 //adc $carry,xzr,xzr // moved below
528 // a[f]a[1]........................
530 // a[f]a[2]........................
532 // a[f]a[3]........................
534 // a[f]a[4]........................
536 // a[f]a[5]........................
538 // a[f]a[6]........................
540 // a[f]a[7]........................
543 adc $carry,xzr,xzr // carry bit, modulo-scheduled
564 adc $carry,$carry,xzr
578 adcs $acc7,$carry,$t3
579 //adc $carry,xzr,xzr // moved above
580 cbnz $cnt,.Lsqr8x_mul
581 // note that carry flag is guaranteed
582 // to be zero at this point
583 cmp $ap,$ap_end // done yet?
586 ldp $a0,$a1,[$tp,#8*0]
587 ldp $a2,$a3,[$tp,#8*2]
588 ldp $a4,$a5,[$tp,#8*4]
589 ldp $a6,$a7,[$tp,#8*6]
593 ldp $a0,$a1,[$ap,#8*0]
596 ldp $a2,$a3,[$ap,#8*2]
599 ldp $a4,$a5,[$ap,#8*4]
603 ldp $a6,$a7,[$ap,#8*6]
605 //adc $carry,xzr,xzr // moved above
610 ldp $a0,$a1,[$rp,#8*0]
612 ldp $a2,$a3,[$rp,#8*2]
613 sub $t0,$ap_end,$ap // is it last iteration?
614 ldp $a4,$a5,[$rp,#8*4]
616 ldp $a6,$a7,[$rp,#8*6]
617 cbz $t0,.Lsqr8x_outer_loop
619 stp $acc0,$acc1,[$tp,#8*0]
620 ldp $acc0,$acc1,[$t1,#8*0]
621 stp $acc2,$acc3,[$tp,#8*2]
622 ldp $acc2,$acc3,[$t1,#8*2]
623 stp $acc4,$acc5,[$tp,#8*4]
624 ldp $acc4,$acc5,[$t1,#8*4]
625 stp $acc6,$acc7,[$tp,#8*6]
627 ldp $acc6,$acc7,[$t1,#8*6]
632 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
633 ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
634 ldp $t1,$t2,[sp,#8*1]
635 ldp $a5,$a7,[$t0,#8*2]
637 ldp $t3,$t0,[sp,#8*3]
639 stp $acc0,$acc1,[$tp,#8*0]
641 stp $acc2,$acc3,[$tp,#8*2]
643 stp $acc4,$acc5,[$tp,#8*4]
645 stp $acc6,$acc7,[$tp,#8*6]
// Shift-and-add loop: double the cross products (lsl#1 below) and fold in
// the squares a[i]*a[i].
648 adds $acc1,$a1,$t1,lsl#1
657 ldp $t1,$t2,[$tp,#8*5]
659 ldp $a1,$a3,[$ap],#8*2
664 stp $acc0,$acc1,[$tp,#8*0]
667 stp $acc2,$acc3,[$tp,#8*2]
669 ldp $t3,$t0,[$tp,#8*7]
674 ldp $t1,$t2,[$tp,#8*9]
676 ldp $a5,$a7,[$ap],#8*2
680 stp $acc4,$acc5,[$tp,#8*4]
682 stp $acc6,$acc7,[$tp,#8*6]
687 ldp $t3,$t0,[$tp,#8*3]
689 cbnz $cnt,.Lsqr4x_shift_n_add
# From here on $np/$np_end reuse the registers of $ap/$ap_end.
691 my ($np,$np_end)=($ap,$ap_end);
693 ldp $np,$n0,[x29,#104] // pull np and n0
698 ldp $t1,$t2,[$tp,#8*5]
701 stp $acc0,$acc1,[$tp,#8*0]
704 stp $acc2,$acc3,[$tp,#8*2]
708 ldp $acc0,$acc1,[sp,#8*0]
711 ldp $a0,$a1,[$np,#8*0]
714 ldp $a2,$a3,[$np,#8*2]
716 ldp $a4,$a5,[$np,#8*4]
718 // Reduce by 512 bits per iteration
719 mul $na0,$n0,$acc0 // t[0]*n0
720 ldp $a6,$a7,[$np,#8*6]
722 ldp $acc2,$acc3,[sp,#8*2]
723 stp $acc4,$acc5,[$tp,#8*4]
724 ldp $acc4,$acc5,[sp,#8*4]
725 stp $acc6,$acc7,[$tp,#8*6]
726 ldp $acc6,$acc7,[sp,#8*6]
728 mov $topmost,xzr // initial top-most carry
// Same (*) trick as in bn_mul_mont: the lo(n[0])*na0 multiply-add is
// replaced by a carry deduced from $acc0 being non-zero.
733 // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
737 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
739 // (*) adds xzr,$acc0,$t0
740 subs xzr,$acc0,#1 // (*)
749 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
765 mul $na0,$n0,$acc0 // next t[0]*n0
770 cbnz $cnt,.Lsqr8x_reduction
772 ldp $t0,$t1,[$tp,#8*0]
773 ldp $t2,$t3,[$tp,#8*2]
775 sub $cnt,$np_end,$np // done yet?
778 ldp $t0,$t1,[$tp,#8*4]
781 ldp $t2,$t3,[$tp,#8*6]
786 //adc $carry,xzr,xzr // moved below
787 cbz $cnt,.Lsqr8x8_post_condition
// Tail: propagate the reduction through the remaining t[] limbs.
790 ldp $a0,$a1,[$np,#8*0]
791 ldp $a2,$a3,[$np,#8*2]
792 ldp $a4,$a5,[$np,#8*4]
794 ldp $a6,$a7,[$np,#8*6]
799 adc $carry,xzr,xzr // carry bit, modulo-scheduled
820 adc $carry,$carry,xzr
834 adcs $acc7,$carry,$t3
835 //adc $carry,xzr,xzr // moved above
836 cbnz $cnt,.Lsqr8x_tail
837 // note that carry flag is guaranteed
838 // to be zero at this point
839 ldp $a0,$a1,[$tp,#8*0]
840 sub $cnt,$np_end,$np // done yet?
841 sub $t2,$np_end,$num // rewinded np
842 ldp $a2,$a3,[$tp,#8*2]
843 ldp $a4,$a5,[$tp,#8*4]
844 ldp $a6,$a7,[$tp,#8*6]
845 cbz $cnt,.Lsqr8x_tail_break
850 ldp $a0,$a1,[$np,#8*0]
853 ldp $a2,$a3,[$np,#8*2]
856 ldp $a4,$a5,[$np,#8*4]
860 ldp $a6,$a7,[$np,#8*6]
862 //adc $carry,xzr,xzr // moved above
867 ldr $n0,[x29,#112] // pull n0
868 add $cnt,$tp,#8*8 // end of current t[num] window
870 subs xzr,$topmost,#1 // "move" top-most carry to carry bit
873 ldp $acc0,$acc1,[$rp,#8*0]
875 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
877 ldp $a2,$a3,[$t2,#8*2]
880 ldp $a4,$a5,[$t2,#8*4]
883 ldp $a6,$a7,[$t2,#8*6]
885 adc $topmost,xzr,xzr // top-most carry
887 stp $t0,$t1,[$tp,#8*0]
888 stp $acc2,$acc3,[$tp,#8*2]
889 ldp $acc2,$acc3,[$rp,#8*2]
890 stp $acc4,$acc5,[$tp,#8*4]
891 ldp $acc4,$acc5,[$rp,#8*4]
892 cmp $cnt,x29 // did we hit the bottom?
893 stp $acc6,$acc7,[$tp,#8*6]
894 mov $tp,$rp // slide the window
895 ldp $acc6,$acc7,[$rp,#8*6]
897 b.ne .Lsqr8x_reduction
899 // Final step. We see if result is larger than modulus, and
900 // if it is, subtract the modulus. But comparison implies
901 // subtraction. So we subtract modulus, see if it borrowed,
902 // and conditionally copy original value.
903 ldr $rp,[x29,#96] // pull rp
908 mov $ap_end,$rp // $rp copy
912 ldp $a0,$a1,[$np,#8*0]
914 stp $t0,$t1,[$rp,#8*0]
916 ldp $a2,$a3,[$np,#8*2]
918 stp $t2,$t3,[$rp,#8*2]
920 ldp $a4,$a5,[$np,#8*4]
922 ldp $a6,$a7,[$np,#8*6]
924 ldp $acc0,$acc1,[$tp,#8*0]
926 ldp $acc2,$acc3,[$tp,#8*2]
927 ldp $acc4,$acc5,[$tp,#8*4]
928 ldp $acc6,$acc7,[$tp,#8*6]
930 stp $t0,$t1,[$rp,#8*4]
932 stp $t2,$t3,[$rp,#8*6]
935 cbnz $cnt,.Lsqr8x_sub
// Conditional copy: pick the stack t[] values when the subtraction
// borrowed (lo), else keep the subtracted values already in rp[]; the
// stack scratch is wiped with xzr stores as we go.
940 ldp $a0,$a1,[$ap_end,#8*0]
942 stp $t0,$t1,[$rp,#8*0]
944 ldp $a2,$a3,[$ap_end,#8*2]
946 stp $t2,$t3,[$rp,#8*2]
948 ldp $acc0,$acc1,[$ap,#8*0]
950 ldp $acc2,$acc3,[$ap,#8*2]
951 sbcs xzr,$topmost,xzr // did it borrow?
952 ldr x30,[x29,#8] // pull return address
953 stp $t0,$t1,[$rp,#8*4]
954 stp $t2,$t3,[$rp,#8*6]
959 csel $t0,$acc0,$a0,lo
960 stp xzr,xzr,[$tp,#8*0]
961 csel $t1,$acc1,$a1,lo
962 ldp $a0,$a1,[$ap_end,#8*4]
963 ldp $acc0,$acc1,[$ap,#8*4]
964 csel $t2,$acc2,$a2,lo
965 stp xzr,xzr,[$tp,#8*2]
967 csel $t3,$acc3,$a3,lo
968 ldp $a2,$a3,[$ap_end,#8*6]
969 ldp $acc2,$acc3,[$ap,#8*6]
971 stp $t0,$t1,[$ap_end,#8*0]
972 stp $t2,$t3,[$ap_end,#8*2]
973 add $ap_end,$ap_end,#8*4
974 stp xzr,xzr,[$ap,#8*0]
975 stp xzr,xzr,[$ap,#8*2]
976 cbnz $cnt,.Lsqr4x_cond_copy
978 csel $t0,$acc0,$a0,lo
979 stp xzr,xzr,[$tp,#8*0]
980 csel $t1,$acc1,$a1,lo
981 stp xzr,xzr,[$tp,#8*2]
982 csel $t2,$acc2,$a2,lo
983 csel $t3,$acc3,$a3,lo
984 stp $t0,$t1,[$ap_end,#8*0]
985 stp $t2,$t3,[$ap_end,#8*2]
// Short path for num==8: result is entirely in registers, so subtract the
// modulus, conditionally select, and store straight to rp[].
990 .Lsqr8x8_post_condition:
992 ldr x30,[x29,#8] // pull return address
993 // $acc0-7,$carry hold result, $a0-7 hold modulus
995 ldr $ap,[x29,#96] // pull rp
997 stp xzr,xzr,[sp,#8*0]
999 stp xzr,xzr,[sp,#8*2]
1001 stp xzr,xzr,[sp,#8*4]
1003 stp xzr,xzr,[sp,#8*6]
1005 stp xzr,xzr,[sp,#8*8]
1007 stp xzr,xzr,[sp,#8*10]
1009 stp xzr,xzr,[sp,#8*12]
1010 sbcs $carry,$carry,xzr // did it borrow?
1011 stp xzr,xzr,[sp,#8*14]
1013 // $a0-7 hold result-modulus
1014 csel $a0,$acc0,$a0,lo
1015 csel $a1,$acc1,$a1,lo
1016 csel $a2,$acc2,$a2,lo
1017 csel $a3,$acc3,$a3,lo
1018 stp $a0,$a1,[$ap,#8*0]
1019 csel $a4,$acc4,$a4,lo
1020 csel $a5,$acc5,$a5,lo
1021 stp $a2,$a3,[$ap,#8*2]
1022 csel $a6,$acc6,$a6,lo
1023 csel $a7,$acc7,$a7,lo
1024 stp $a4,$a5,[$ap,#8*4]
1025 stp $a6,$a7,[$ap,#8*6]
// Epilogue: restore callee-saved x19-x28 and return.
1028 ldp x19,x20,[x29,#16]
1030 ldp x21,x22,[x29,#32]
1032 ldp x23,x24,[x29,#48]
1033 ldp x25,x26,[x29,#64]
1034 ldp x27,x28,[x29,#80]
1037 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1042 ########################################################################
1043 # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1044 # x86_64-mont5 module, it's different in sense that it performs
1045 # reduction 256 bits at a time.
# Register layout for the 4x path: a[0..3]/n[0..3] windows (middle of the
# declaration is elided here), temporaries, five accumulators and loop
# state in callee-saved x19-x28.
1047 my ($a0,$a1,$a2,$a3,
1050 $acc0,$acc1,$acc2,$acc3,$acc4,
1051 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
# $carry reuses $rp's register (x0); rp itself is offloaded to the frame
# at [x29,#96] and pulled back when results are written out.
1053 my ($carry,$topmost) = ($rp,"x30");
1056 .type __bn_mul4x_mont,%function
// 4x Montgomery multiplication with reduction 256 bits at a time.
// NOTE(review): this listing is elided; only selected instructions are
// visible, comments describe the visible skeleton only.
// Prologue: 128-byte frame, save fp/lr and callee-saved x19-x28, then
// alloca num+4 limbs of scratch below sp.
1059 stp x29,x30,[sp,#-128]!
1061 stp x19,x20,[sp,#16]
1062 stp x21,x22,[sp,#32]
1063 stp x23,x24,[sp,#48]
1064 stp x25,x26,[sp,#64]
1065 stp x27,x28,[sp,#80]
1067 sub $tp,sp,$num,lsl#3
1069 ldr $n0,[$n0] // *n0
1070 sub sp,$tp,#8*4 // alloca
1073 add $ap_end,$ap,$num
1074 stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1076 ldr $bi,[$bp,#8*0] // b[0]
1077 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1078 ldp $a2,$a3,[$ap,#8*2]
1084 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1085 ldp $m2,$m3,[$np,#8*2]
1086 adds $np,$np,#8*4 // clear carry bit
// First pass: a[0..3]*b[i] interleaved with Montgomery reduction by
// n[0..3]*mi, where mi=t[0]*n0 is put aside for the tail loop.
1091 .Loop_mul4x_1st_reduction:
1092 mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1093 adc $carry,$carry,xzr // modulo-scheduled
1099 adds $acc0,$acc0,$t0
1100 umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1101 adcs $acc1,$acc1,$t1
1102 mul $mi,$acc0,$n0 // t[0]*n0
1103 adcs $acc2,$acc2,$t2
1105 adcs $acc3,$acc3,$t3
1109 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1110 adds $acc1,$acc1,$t0
1111 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1112 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1113 adcs $acc2,$acc2,$t1
1115 adcs $acc3,$acc3,$t2
1117 adc $acc4,$acc4,$t3 // can't overflow
// Same (*) trick as elsewhere in this module: replace the discarded
// lo(n[0])*mi multiply-add with a carry deduced from $acc0 != 0.
1119 // (*) adds xzr,$acc0,$t0
1120 subs xzr,$acc0,#1 // (*)
1121 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1122 adcs $acc0,$acc1,$t1
1124 adcs $acc1,$acc2,$t2
1126 adcs $acc2,$acc3,$t3
1128 adcs $acc3,$acc4,$carry
1130 adds $acc0,$acc0,$t0
1132 adcs $acc1,$acc1,$t1
1133 adcs $acc2,$acc2,$t2
1134 adcs $acc3,$acc3,$t3
1135 //adc $carry,$carry,xzr
1136 cbnz $cnt,.Loop_mul4x_1st_reduction
1138 cbz $t0,.Lmul4x4_post_condition
// First-pass tail: propagate through the remaining a[]/n[] limbs using
// the saved t[0]*n0 values.
1140 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1141 ldp $a2,$a3,[$ap,#8*2]
1143 ldr $mi,[sp] // a[0]*n0
1144 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1145 ldp $m2,$m3,[$np,#8*2]
1148 .Loop_mul4x_1st_tail:
1149 mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1150 adc $carry,$carry,xzr // modulo-scheduled
1156 adds $acc0,$acc0,$t0
1157 umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1158 adcs $acc1,$acc1,$t1
1160 adcs $acc2,$acc2,$t2
1162 adcs $acc3,$acc3,$t3
1165 ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1166 adds $acc1,$acc1,$t0
1167 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1168 adcs $acc2,$acc2,$t1
1170 adcs $acc3,$acc3,$t2
1172 adc $acc4,$acc4,$t3 // can't overflow
1174 adds $acc0,$acc0,$t0
1175 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1176 adcs $acc1,$acc1,$t1
1178 adcs $acc2,$acc2,$t2
1180 adcs $acc3,$acc3,$t3
1181 adcs $acc4,$acc4,$carry
1184 ldr $mi,[sp,$cnt] // next t[0]*n0
1185 str $acc0,[$tp],#8 // result!!!
1186 adds $acc0,$acc1,$t0
1187 sub $t0,$ap_end,$ap // done yet?
1188 adcs $acc1,$acc2,$t1
1189 adcs $acc2,$acc3,$t2
1190 adcs $acc3,$acc4,$t3
1191 //adc $carry,$carry,xzr
1192 cbnz $cnt,.Loop_mul4x_1st_tail
1194 sub $t1,$ap_end,$num // rewinded $ap
1195 cbz $t0,.Lmul4x_proceed
1197 ldp $a0,$a1,[$ap,#8*0]
1198 ldp $a2,$a3,[$ap,#8*2]
1200 ldp $m0,$m1,[$np,#8*0]
1201 ldp $m2,$m3,[$np,#8*2]
1203 b .Loop_mul4x_1st_tail
// Outer advance: step to the next group of four b[] words, rewind a[]
// and n[], and restart the reduction loop.
1207 ldr $bi,[$bp,#8*4]! // *++b
1208 adc $topmost,$carry,xzr
1209 ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1210 sub $np,$np,$num // rewind np
1211 ldp $a2,$a3,[$t1,#8*2]
1214 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1215 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1216 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1217 ldp $acc2,$acc3,[sp,#8*6]
1219 ldp $m0,$m1,[$np,#8*0] // n[0..3]
1221 ldp $m2,$m3,[$np,#8*2]
1222 adds $np,$np,#8*4 // clear carry bit
1226 .Loop_mul4x_reduction:
1227 mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1228 adc $carry,$carry,xzr // modulo-scheduled
1234 adds $acc0,$acc0,$t0
1235 umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1236 adcs $acc1,$acc1,$t1
1237 mul $mi,$acc0,$n0 // t[0]*n0
1238 adcs $acc2,$acc2,$t2
1240 adcs $acc3,$acc3,$t3
1244 ldr $bi,[$bp,$cnt] // next b[i]
1245 adds $acc1,$acc1,$t0
1246 // (*) mul $t0,$m0,$mi
1247 str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1248 adcs $acc2,$acc2,$t1
1249 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
1250 adcs $acc3,$acc3,$t2
1252 adc $acc4,$acc4,$t3 // can't overflow
1254 // (*) adds xzr,$acc0,$t0
1255 subs xzr,$acc0,#1 // (*)
1256 umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
1257 adcs $acc0,$acc1,$t1
1259 adcs $acc1,$acc2,$t2
1261 adcs $acc2,$acc3,$t3
1263 adcs $acc3,$acc4,$carry
1265 adds $acc0,$acc0,$t0
1266 adcs $acc1,$acc1,$t1
1267 adcs $acc2,$acc2,$t2
1268 adcs $acc3,$acc3,$t3
1269 //adc $carry,$carry,xzr
1270 cbnz $cnt,.Loop_mul4x_reduction
1272 adc $carry,$carry,xzr
1273 ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1274 ldp $t2,$t3,[$tp,#8*6]
1275 ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1276 ldp $a2,$a3,[$ap,#8*2]
1278 adds $acc0,$acc0,$t0
1279 adcs $acc1,$acc1,$t1
1280 adcs $acc2,$acc2,$t2
1281 adcs $acc3,$acc3,$t3
1282 //adc $carry,$carry,xzr
1284 ldr $mi,[sp] // t[0]*n0
1285 ldp $m0,$m1,[$np,#8*0] // n[4..7]
1286 ldp $m2,$m3,[$np,#8*2]
// Tail loop for the current b[] group.
1291 mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1292 adc $carry,$carry,xzr // modulo-scheduled
1298 adds $acc0,$acc0,$t0
1299 umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1300 adcs $acc1,$acc1,$t1
1302 adcs $acc2,$acc2,$t2
1304 adcs $acc3,$acc3,$t3
1307 ldr $bi,[$bp,$cnt] // next b[i]
1308 adds $acc1,$acc1,$t0
1309 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1310 adcs $acc2,$acc2,$t1
1312 adcs $acc3,$acc3,$t2
1314 adc $acc4,$acc4,$t3 // can't overflow
1316 adds $acc0,$acc0,$t0
1317 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1318 adcs $acc1,$acc1,$t1
1320 adcs $acc2,$acc2,$t2
1322 adcs $acc3,$acc3,$t3
1324 adcs $acc4,$acc4,$carry
1325 ldr $mi,[sp,$cnt] // next a[0]*n0
1327 str $acc0,[$tp],#8 // result!!!
1328 adds $acc0,$acc1,$t0
1329 sub $t0,$ap_end,$ap // done yet?
1330 adcs $acc1,$acc2,$t1
1331 adcs $acc2,$acc3,$t2
1332 adcs $acc3,$acc4,$t3
1333 //adc $carry,$carry,xzr
1334 cbnz $cnt,.Loop_mul4x_tail
1336 sub $t1,$np,$num // rewinded np?
1337 adc $carry,$carry,xzr
1338 cbz $t0,.Loop_mul4x_break
1340 ldp $t0,$t1,[$tp,#8*4]
1341 ldp $t2,$t3,[$tp,#8*6]
1342 ldp $a0,$a1,[$ap,#8*0]
1343 ldp $a2,$a3,[$ap,#8*2]
1345 adds $acc0,$acc0,$t0
1346 adcs $acc1,$acc1,$t1
1347 adcs $acc2,$acc2,$t2
1348 adcs $acc3,$acc3,$t3
1349 //adc $carry,$carry,xzr
1350 ldp $m0,$m1,[$np,#8*0]
1351 ldp $m2,$m3,[$np,#8*2]
// End of one outer iteration: fold in the group's top-most carry, advance
// bp, rewind ap/np and either loop or fall through to the final step.
1357 ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1358 adds $acc0,$acc0,$topmost
1359 add $bp,$bp,#8*4 // bp++
1360 adcs $acc1,$acc1,xzr
1361 sub $ap,$ap,$num // rewind ap
1362 adcs $acc2,$acc2,xzr
1363 stp $acc0,$acc1,[$tp,#8*0] // result!!!
1364 adcs $acc3,$acc3,xzr
1365 ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1366 adc $topmost,$carry,xzr
1367 stp $acc2,$acc3,[$tp,#8*2] // result!!!
1368 cmp $bp,$t3 // done yet?
1369 ldp $acc2,$acc3,[sp,#8*6]
1370 ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1371 ldp $m2,$m3,[$t1,#8*2]
1376 ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1377 ldp $a2,$a3,[$ap,#8*2]
1378 adds $ap,$ap,#8*4 // clear carry bit
1381 b .Loop_mul4x_reduction
1385 // Final step. We see if result is larger than modulus, and
1386 // if it is, subtract the modulus. But comparison implies
1387 // subtraction. So we subtract modulus, see if it borrowed,
1388 // and conditionally copy original value.
1390 mov $ap_end,$t2 // $rp copy
1398 ldp $m0,$m1,[$np,#8*0]
1400 ldp $acc0,$acc1,[$tp,#8*0]
1402 ldp $m2,$m3,[$np,#8*2]
1404 ldp $acc2,$acc3,[$tp,#8*2]
1406 stp $t0,$t1,[$rp,#8*0]
1408 stp $t2,$t3,[$rp,#8*2]
1411 cbnz $cnt,.Lmul4x_sub
// Conditional copy: select the scratch t[] values when the subtraction
// borrowed (lo), else the subtracted values already in rp[]; the scratch
// area is wiped with xzr stores as we go.
1416 ldp $a0,$a1,[$ap_end,#8*0]
1418 stp $t0,$t1,[$rp,#8*0]
1419 ldp $a2,$a3,[$ap_end,#8*2]
1420 stp $t2,$t3,[$rp,#8*2]
1421 ldp $acc0,$acc1,[$ap,#8*0]
1422 ldp $acc2,$acc3,[$ap,#8*2]
1423 sbcs xzr,$topmost,xzr // did it borrow?
1424 ldr x30,[x29,#8] // pull return address
1429 csel $t0,$acc0,$a0,lo
1430 stp xzr,xzr,[$tp,#8*0]
1431 csel $t1,$acc1,$a1,lo
1432 ldp $a0,$a1,[$ap_end,#8*4]
1433 ldp $acc0,$acc1,[$ap,#8*4]
1434 csel $t2,$acc2,$a2,lo
1435 stp xzr,xzr,[$tp,#8*2]
1437 csel $t3,$acc3,$a3,lo
1438 ldp $a2,$a3,[$ap_end,#8*6]
1439 ldp $acc2,$acc3,[$ap,#8*6]
1441 stp $t0,$t1,[$ap_end,#8*0]
1442 stp $t2,$t3,[$ap_end,#8*2]
1443 add $ap_end,$ap_end,#8*4
1444 cbnz $cnt,.Lmul4x_cond_copy
1446 csel $t0,$acc0,$a0,lo
1447 stp xzr,xzr,[$tp,#8*0]
1448 csel $t1,$acc1,$a1,lo
1449 stp xzr,xzr,[$tp,#8*2]
1450 csel $t2,$acc2,$a2,lo
// NOTE(review): the wipe offsets #8*3 and #8*4 below overlap #8*2/#8*4
// ranges — looks like an intentional (harmless) over-wipe of the scratch
// tail, but differs from the #8*0/#8*2 pattern used above; confirm against
// upstream.
1451 stp xzr,xzr,[$tp,#8*3]
1452 csel $t3,$acc3,$a3,lo
1453 stp xzr,xzr,[$tp,#8*4]
1454 stp $t0,$t1,[$ap_end,#8*0]
1455 stp $t2,$t3,[$ap_end,#8*2]
// Short path for num==4: result is entirely in registers; subtract the
// modulus, conditionally select, and store straight to rp[].
1460 .Lmul4x4_post_condition:
1461 adc $carry,$carry,xzr
1462 ldr $ap,[x29,#96] // pull rp
1463 // $acc0-3,$carry hold result, $m0-7 hold modulus
1465 ldr x30,[x29,#8] // pull return address
1467 stp xzr,xzr,[sp,#8*0]
1469 stp xzr,xzr,[sp,#8*2]
1471 stp xzr,xzr,[sp,#8*4]
1472 sbcs xzr,$carry,xzr // did it borrow?
1473 stp xzr,xzr,[sp,#8*6]
1475 // $a0-3 hold result-modulus
1476 csel $a0,$acc0,$a0,lo
1477 csel $a1,$acc1,$a1,lo
1478 csel $a2,$acc2,$a2,lo
1479 csel $a3,$acc3,$a3,lo
1480 stp $a0,$a1,[$ap,#8*0]
1481 stp $a2,$a3,[$ap,#8*2]
// Epilogue: restore callee-saved x19-x28 and return.
1484 ldp x19,x20,[x29,#16]
1486 ldp x21,x22,[x29,#32]
1488 ldp x23,x24,[x29,#48]
1489 ldp x25,x26,[x29,#64]
1490 ldp x27,x28,[x29,#80]
1493 .size __bn_mul4x_mont,.-__bn_mul4x_mont
1497 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"