3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # "Teaser" Montgomery multiplication module for ARMv8. Needs more
13 # work. While it does improve RSA sign performance by 20-30% (less for
14 # longer keys) on most processors, for some reason RSA2048 is not
15 # faster and RSA4096 goes 15-20% slower on Cortex-A57. The multiplication
16 # instruction issue rate is limited on the processor in question, meaning
17 # that a dedicated squaring procedure is a must. Well, actually all
18 # contemporary AArch64 processors seem to have a limited multiplication
19 # issue rate, i.e. they can't issue a multiplication every cycle, which
20 # explains the moderate improvement coefficients in comparison to
21 # compiler-generated code. Recall that the compiler is instructed to use
22 # umulh and therefore uses the same number of multiplication instructions
23 # to do the job. Assembly's edge is minimizing the number of "collateral"
24 # instructions and, of course, instruction scheduling.
29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of this script's path, trailing separator included
30 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or	# first choice: translator sitting next to this script
31 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or	# fallback: ../../perlasm/ relative to this script
32 die "can't locate arm-xlate.pl";
# Pipe everything written to OUT through arm-xlate.pl, run under the same
# perl binary that is executing this script ($^X), passing $flavour and
# $output on its command line. The result of open() is checked so that a
# failure to set up the pipe aborts with the system error ($!) instead of
# letting the script carry on with an unopened handle.
34 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Scratch-register names used by the assembly template. map() produces the
# names x6..x17 followed by x19..x24 (x18 is not in the list), and the list
# assignment binds them in order:
#   $lo0=x6   $hi0=x7   $aj=x8   $m0=x9   $alo=x10  $ahi=x11
#   $lo1=x12  $hi1=x13  $nj=x14  $m1=x15  $nlo=x16  $nhi=x17
#   $ovf=x19  $i=x20    $j=x21   $tp=x22  $tj=x23
# The list yields 18 names for 17 variables, so the last name (x24) is not
# bound to any variable by this assignment.
37 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
38 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
39 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
# Registers x0..x5 for the routine's six arguments; each trailing comment
# gives the parameter declaration that the register corresponds to.
42 $rp="x0"; # BN_ULONG *rp,
43 $ap="x1"; # const BN_ULONG *ap,
44 $bp="x2"; # const BN_ULONG *bp,
45 $np="x3"; # const BN_ULONG *np,
# NOTE(review): $n0 is declared as a pointer here but is used directly as a
# multiplicand in 'mul $m1,$lo0,$n0'; presumably it is dereferenced into the
# same register in code not shown -- confirm.
46 $n0="x4"; # const BN_ULONG *n0,
# NOTE(review): the template's own comments pair '$num,#16' with "num-2" and
# '$num,#8' with "num-1", which suggests $num is converted to a byte count
# before those instructions -- confirm against the full prologue.
47 $num="x5"; # int num);
53 .type bn_mul_mont,%function
56 stp x29,x30,[sp,#-64]!
62 ldr $m0,[$bp],#8 // bp[0]
64 ldp $hi0,$aj,[$ap],#16 // ap[0..1]
67 and $tp,$tp,#-16 // ABI says so
68 ldp $hi1,$nj,[$np],#16 // np[0..1]
70 mul $lo0,$hi0,$m0 // ap[0]*bp[0]
71 sub $j,$num,#16 // j=num-2
73 mul $alo,$aj,$m0 // ap[1]*bp[0]
76 mul $m1,$lo0,$n0 // "tp[0]"*n0
79 mul $lo1,$hi1,$m1 // np[0]*m1
81 mul $nlo,$nj,$m1 // np[1]*m1
82 adds $lo1,$lo1,$lo0 // discarded
95 mul $alo,$aj,$m0 // ap[j]*bp[0]
100 mul $nlo,$nj,$m1 // np[j]*m1
103 str $lo1,[$tp],#8 // tp[j-1]
108 sub $ap,$ap,$num // rewind $ap
112 sub $np,$np,$num // rewind $np
116 sub $i,$num,#8 // i=num-1
119 adc $ovf,xzr,xzr // upmost overflow bit
123 ldr $m0,[$bp],#8 // bp[i]
124 ldp $hi0,$aj,[$ap],#16
125 ldr $tj,[sp] // tp[0]
128 mul $lo0,$hi0,$m0 // ap[0]*bp[i]
129 sub $j,$num,#16 // j=num-2
131 ldp $hi1,$nj,[$np],#16
132 mul $alo,$aj,$m0 // ap[1]*bp[i]
140 mul $lo1,$hi1,$m1 // np[0]*m1
142 mul $nlo,$nj,$m1 // np[1]*m1
150 ldr $tj,[$tp],#8 // tp[j]
159 mul $alo,$aj,$m0 // ap[j]*bp[i]
164 mul $nlo,$nj,$m1 // np[j]*m1
167 str $lo1,[$tp,#-16] // tp[j-1]
171 ldr $tj,[$tp],#8 // tp[j]
174 sub $ap,$ap,$num // rewind $ap
178 sub $np,$np,$num // rewind $np
186 adc $ovf,xzr,xzr // upmost overflow bit
187 stp $lo1,$hi1,[$tp,#-16]
191 // Final step. We see if result is larger than modulus, and
192 // if it is, subtract the modulus. But comparison implies
193 // subtraction. So we subtract modulus, see if it borrowed,
194 // and conditionally copy original value.
195 ldr $tj,[sp] // tp[0]
197 ldr $nj,[$np],#8 // np[0]
198 subs $j,$num,#8 // j=num-1 and clear borrow
201 sbcs $aj,$tj,$nj // tp[j]-np[j]
205 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
209 sbcs $ovf,$ovf,xzr // did it borrow?
210 str $aj,[$ap],#8 // rp[num-1]
212 ldr $tj,[sp] // tp[0]
214 ldr $aj,[$rp],#8 // rp[0]
215 sub $num,$num,#8 // num--
218 sub $num,$num,#8 // num--
219 csel $nj,$aj,$tj,cs // did it borrow?
222 str xzr,[$tp,#-16] // wipe tp
224 cbnz $num,.Lcond_copy
227 str xzr,[$tp,#-8] // wipe tp
230 ldp x19,x20,[x29,#16]
232 ldp x21,x22,[x29,#32]
233 ldp x23,x24,[x29,#48]
236 .size bn_mul_mont,.-bn_mul_mont
238 .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"