Add ARMv8 Montgomery multiplication module.
author		Andy Polyakov <appro@openssl.org>
		Sat, 21 Mar 2015 12:54:17 +0000 (13:54 +0100)
committer	Andy Polyakov <appro@openssl.org>
		Mon, 20 Apr 2015 12:39:34 +0000 (14:39 +0200)
Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/bn/asm/armv8-mont.pl [new file with mode: 0755]

diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
new file mode 100755 (executable)
index 0000000..0bf9bf3
--- /dev/null
@@ -0,0 +1,244 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2015
+#
+# "Teaser" Montgomery multiplication module for ARMv8. Needs more
+# work. While it does improve RSA sign performance by 20-30% (less for
+# longer keys) on most processors, for some reason RSA2048 is not
+# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
+# instruction issue rate is limited on processor in question, meaning
+# that dedicated squaring procedure is a must. Well, actually all
+# contemporary AArch64 processors seem to have limited multiplication
+# issue rate, i.e. they can't issue multiplication every cycle, which
+# explains moderate improvement coefficients in comparison to
+# compiler-generated code. Recall that compiler is instructed to use
+# umulh and therefore uses same amount of multiplication instructions
+# to do the job. Assembly's edge is to minimize number of "collateral"
+# instructions and of course instruction scheduling.
+
+$flavour = shift;
+$output  = shift;
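+
+# Typical invocation (the flavour name follows arm-xlate.pl
+# conventions; "linux64" below is an assumed example):
+#
+#	perl armv8-mont.pl linux64 armv8-mont.S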
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($lo0,$hi0,$aj,$m0,$alo,$ahi,
+ $lo1,$hi1,$nj,$m1,$nlo,$nhi,
+ $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
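+
+# Note: x18 is skipped as the AAPCS64 platform register, and x19-x24
+# are callee-saved, hence the stp/ldp pairs in the prologue/epilogue.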
+
+# int bn_mul_mont(
+$rp="x0";      # BN_ULONG *rp,
+$ap="x1";      # const BN_ULONG *ap,
+$bp="x2";      # const BN_ULONG *bp,
+$np="x3";      # const BN_ULONG *np,
+$n0="x4";      # const BN_ULONG *n0,
+$num="x5";     # int num);
+
+$code.=<<___;
+.text
+
+.globl bn_mul_mont
+.type  bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+       stp     x29,x30,[sp,#-64]!
+       add     x29,sp,#0
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+
+       ldr     $m0,[$bp],#8            // bp[0]
+       sub     $tp,sp,$num,lsl#3
+       ldp     $hi0,$aj,[$ap],#16      // ap[0..1]
+       lsl     $num,$num,#3
+       ldr     $n0,[$n0]               // *n0
+       and     $tp,$tp,#-16            // ABI says so
+       ldp     $hi1,$nj,[$np],#16      // np[0..1]
+
+       mul     $lo0,$hi0,$m0           // ap[0]*bp[0]
+       sub     $j,$num,#16             // j=num-2
+       umulh   $hi0,$hi0,$m0
+       mul     $alo,$aj,$m0            // ap[1]*bp[0]
+       umulh   $ahi,$aj,$m0
+
+       mul     $m1,$lo0,$n0            // "tp[0]"*n0
+       mov     sp,$tp                  // alloca
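+       // m1 is chosen so that tp[0]+np[0]*m1 == 0 (mod 2^64), which
+       // is why the low limb of the sum below is "discarded"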
+
+       mul     $lo1,$hi1,$m1           // np[0]*m1
+       umulh   $hi1,$hi1,$m1
+       mul     $nlo,$nj,$m1            // np[1]*m1
+       adds    $lo1,$lo1,$lo0          // discarded
+       umulh   $nhi,$nj,$m1
+       adc     $hi1,$hi1,xzr
+       cbz     $j,.L1st_skip
+
+.L1st:
+       ldr     $aj,[$ap],#8
+       adds    $lo0,$alo,$hi0
+       sub     $j,$j,#8                // j--
+       adc     $hi0,$ahi,xzr
+
+       ldr     $nj,[$np],#8
+       adds    $lo1,$nlo,$hi1
+       mul     $alo,$aj,$m0            // ap[j]*bp[0]
+       adc     $hi1,$nhi,xzr
+       umulh   $ahi,$aj,$m0
+
+       adds    $lo1,$lo1,$lo0
+       mul     $nlo,$nj,$m1            // np[j]*m1
+       adc     $hi1,$hi1,xzr
+       umulh   $nhi,$nj,$m1
+       str     $lo1,[$tp],#8           // tp[j-1]
+       cbnz    $j,.L1st
+
+.L1st_skip:
+       adds    $lo0,$alo,$hi0
+       sub     $ap,$ap,$num            // rewind $ap
+       adc     $hi0,$ahi,xzr
+
+       adds    $lo1,$nlo,$hi1
+       sub     $np,$np,$num            // rewind $np
+       adc     $hi1,$nhi,xzr
+
+       adds    $lo1,$lo1,$lo0
+       sub     $i,$num,#8              // i=num-1
+       adcs    $hi1,$hi1,$hi0
+
+       adc     $ovf,xzr,xzr            // upmost overflow bit
+       stp     $lo1,$hi1,[$tp]
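+       // tp[0..num-1] now holds (ap[]*bp[0]+np[]*m1)/2^64, with the
+       // carry out of the top limb kept in $ovf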
+
+.Louter:
+       ldr     $m0,[$bp],#8            // bp[i]
+       ldp     $hi0,$aj,[$ap],#16
+       ldr     $tj,[sp]                // tp[0]
+       add     $tp,sp,#8
+
+       mul     $lo0,$hi0,$m0           // ap[0]*bp[i]
+       sub     $j,$num,#16             // j=num-2
+       umulh   $hi0,$hi0,$m0
+       ldp     $hi1,$nj,[$np],#16
+       mul     $alo,$aj,$m0            // ap[1]*bp[i]
+       adds    $lo0,$lo0,$tj
+       umulh   $ahi,$aj,$m0
+       adc     $hi0,$hi0,xzr
+
+       mul     $m1,$lo0,$n0
+       sub     $i,$i,#8                // i--
+
+       mul     $lo1,$hi1,$m1           // np[0]*m1
+       umulh   $hi1,$hi1,$m1
+       mul     $nlo,$nj,$m1            // np[1]*m1
+       adds    $lo1,$lo1,$lo0
+       umulh   $nhi,$nj,$m1
+       cbz     $j,.Linner_skip
+
+.Linner:
+       ldr     $aj,[$ap],#8
+       adc     $hi1,$hi1,xzr
+       ldr     $tj,[$tp],#8            // tp[j]
+       adds    $lo0,$alo,$hi0
+       sub     $j,$j,#8                // j--
+       adc     $hi0,$ahi,xzr
+
+       adds    $lo1,$nlo,$hi1
+       ldr     $nj,[$np],#8
+       adc     $hi1,$nhi,xzr
+
+       mul     $alo,$aj,$m0            // ap[j]*bp[i]
+       adds    $lo0,$lo0,$tj
+       umulh   $ahi,$aj,$m0
+       adc     $hi0,$hi0,xzr
+
+       mul     $nlo,$nj,$m1            // np[j]*m1
+       adds    $lo1,$lo1,$lo0
+       umulh   $nhi,$nj,$m1
+       str     $lo1,[$tp,#-16]         // tp[j-1]
+       cbnz    $j,.Linner
+
+.Linner_skip:
+       ldr     $tj,[$tp],#8            // tp[j]
+       adc     $hi1,$hi1,xzr
+       adds    $lo0,$alo,$hi0
+       sub     $ap,$ap,$num            // rewind $ap
+       adc     $hi0,$ahi,xzr
+
+       adds    $lo1,$nlo,$hi1
+       sub     $np,$np,$num            // rewind $np
+       adc     $hi1,$nhi,$ovf
+
+       adds    $lo0,$lo0,$tj
+       adc     $hi0,$hi0,xzr
+
+       adds    $lo1,$lo1,$lo0
+       adcs    $hi1,$hi1,$hi0
+       adc     $ovf,xzr,xzr            // upmost overflow bit
+       stp     $lo1,$hi1,[$tp,#-16]
+
+       cbnz    $i,.Louter
+
+       // Final step. We see if the result is larger than the modulus,
+       // and if it is, subtract the modulus. But comparison implies
+       // subtraction, so we subtract the modulus, see if it borrowed,
+       // and conditionally copy the original value back.
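+       //
+       //      borrow = (tp[] - np[] < 0);         // sbcs chain + $ovf
+       //      rp[]   = borrow ? tp[] : tp[]-np[]; // csel ... cs below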
+       ldr     $tj,[sp]                // tp[0]
+       add     $tp,sp,#8
+       ldr     $nj,[$np],#8            // np[0]
+       subs    $j,$num,#8              // j=num-1 and clear borrow
+       mov     $ap,$rp
+.Lsub:
+       sbcs    $aj,$tj,$nj             // tp[j]-np[j]
+       ldr     $tj,[$tp],#8
+       sub     $j,$j,#8                // j--
+       ldr     $nj,[$np],#8
+       str     $aj,[$ap],#8            // rp[j]=tp[j]-np[j]
+       cbnz    $j,.Lsub
+
+       sbcs    $aj,$tj,$nj
+       sbcs    $ovf,$ovf,xzr           // did it borrow?
+       str     $aj,[$ap],#8            // rp[num-1]
+
+       ldr     $tj,[sp]                // tp[0]
+       add     $tp,sp,#8
+       ldr     $aj,[$rp],#8            // rp[0]
+       sub     $num,$num,#8            // num--
+       nop
+.Lcond_copy:
+       sub     $num,$num,#8            // num--
+       csel    $nj,$aj,$tj,cs          // did it borrow?
+       ldr     $tj,[$tp],#8
+       ldr     $aj,[$rp],#8
+       str     xzr,[$tp,#-16]          // wipe tp
+       str     $nj,[$rp,#-16]
+       cbnz    $num,.Lcond_copy
+
+       csel    $nj,$aj,$tj,cs
+       str     xzr,[$tp,#-8]           // wipe tp
+       str     $nj,[$rp,#-8]
+
+       ldp     x19,x20,[x29,#16]
+       mov     sp,x29
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldr     x29,[sp],#64
+       ret
+.size  bn_mul_mont,.-bn_mul_mont
+
+.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+___
+
+print $code;
+
+close STDOUT;