crypto/bn/asm/armv8-mont.pl

#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
# instruction issue rate is limited on processor in question, meaning
# that dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have limited multiplication
# issue rate, i.e. they can't issue multiplication every cycle, which
# explains moderate improvement coefficients in comparison to
# compiler-generated code. Recall that compiler is instructed to use
# umulh and therefore uses same amount of multiplication instructions
# to do the job. Assembly's edge is to minimize number of "collateral"
# instructions and of course instruction scheduling.

  26 $flavour = shift;
  27 $output  = shift;
  28
  29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  30 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  31 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  32 die "can't locate arm-xlate.pl";
  33
  34 open OUT,"| \"$^X\" $xlate $flavour $output";
  35 *STDOUT=*OUT;
  36
  37 ($lo0,$hi0,$aj,$m0,$alo,$ahi,
  38  $lo1,$hi1,$nj,$m1,$nlo,$nhi,
  39  $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
  40
  41 # int bn_mul_mont(
  42 $rp="x0";       # BN_ULONG *rp,
  43 $ap="x1";       # const BN_ULONG *ap,
  44 $bp="x2";       # const BN_ULONG *bp,
  45 $np="x3";       # const BN_ULONG *np,
  46 $n0="x4";       # const BN_ULONG *n0,
  47 $num="x5";      # int num);
  48
  49 $code.=<<___;
  50 .text
  51
  52 .globl  bn_mul_mont
  53 .type   bn_mul_mont,%function
  54 .align  5
  55 bn_mul_mont:
  56         stp     x29,x30,[sp,#-64]!
  57         add     x29,sp,#0
  58         stp     x19,x20,[sp,#16]
  59         stp     x21,x22,[sp,#32]
  60         stp     x23,x24,[sp,#48]
  61
  62         ldr     $m0,[$bp],#8            // bp[0]
  63         sub     $tp,sp,$num,lsl#3
  64         ldp     $hi0,$aj,[$ap],#16      // ap[0..1]
  65         lsl     $num,$num,#3
  66         ldr     $n0,[$n0]               // *n0
  67         and     $tp,$tp,#-16            // ABI says so
  68         ldp     $hi1,$nj,[$np],#16      // np[0..1]
  69
  70         mul     $lo0,$hi0,$m0           // ap[0]*bp[0]
  71         sub     $j,$num,#16             // j=num-2
  72         umulh   $hi0,$hi0,$m0
  73         mul     $alo,$aj,$m0            // ap[1]*bp[0]
  74         umulh   $ahi,$aj,$m0
  75
  76         mul     $m1,$lo0,$n0            // "tp[0]"*n0
  77         mov     sp,$tp                  // alloca
  78
  79         mul     $lo1,$hi1,$m1           // np[0]*m1
  80         umulh   $hi1,$hi1,$m1
  81         mul     $nlo,$nj,$m1            // np[1]*m1
  82         adds    $lo1,$lo1,$lo0          // discarded
  83         umulh   $nhi,$nj,$m1
  84         adc     $hi1,$hi1,xzr
  85         cbz     $j,.L1st_skip
  86
  87 .L1st:
  88         ldr     $aj,[$ap],#8
  89         adds    $lo0,$alo,$hi0
  90         sub     $j,$j,#8                // j--
  91         adc     $hi0,$ahi,xzr
  92
  93         ldr     $nj,[$np],#8
  94         adds    $lo1,$nlo,$hi1
  95         mul     $alo,$aj,$m0            // ap[j]*bp[0]
  96         adc     $hi1,$nhi,xzr
  97         umulh   $ahi,$aj,$m0
  98
  99         adds    $lo1,$lo1,$lo0
 100         mul     $nlo,$nj,$m1            // np[j]*m1
 101         adc     $hi1,$hi1,xzr
 102         umulh   $nhi,$nj,$m1
 103         str     $lo1,[$tp],#8           // tp[j-1]
 104         cbnz    $j,.L1st
 105
 106 .L1st_skip:
 107         adds    $lo0,$alo,$hi0
 108         sub     $ap,$ap,$num            // rewind $ap
 109         adc     $hi0,$ahi,xzr
 110
 111         adds    $lo1,$nlo,$hi1
 112         sub     $np,$np,$num            // rewind $np
 113         adc     $hi1,$nhi,xzr
 114
 115         adds    $lo1,$lo1,$lo0
 116         sub     $i,$num,#8              // i=num-1
 117         adcs    $hi1,$hi1,$hi0
 118
 119         adc     $ovf,xzr,xzr            // upmost overflow bit
 120         stp     $lo1,$hi1,[$tp]
 121
 122 .Louter:
 123         ldr     $m0,[$bp],#8            // bp[i]
 124         ldp     $hi0,$aj,[$ap],#16
 125         ldr     $tj,[sp]                // tp[0]
 126         add     $tp,sp,#8
 127
 128         mul     $lo0,$hi0,$m0           // ap[0]*bp[i]
 129         sub     $j,$num,#16             // j=num-2
 130         umulh   $hi0,$hi0,$m0
 131         ldp     $hi1,$nj,[$np],#16
 132         mul     $alo,$aj,$m0            // ap[1]*bp[i]
 133         adds    $lo0,$lo0,$tj
 134         umulh   $ahi,$aj,$m0
 135         adc     $hi0,$hi0,xzr
 136
 137         mul     $m1,$lo0,$n0
 138         sub     $i,$i,#8                // i--
 139
 140         mul     $lo1,$hi1,$m1           // np[0]*m1
 141         umulh   $hi1,$hi1,$m1
 142         mul     $nlo,$nj,$m1            // np[1]*m1
 143         adds    $lo1,$lo1,$lo0
 144         umulh   $nhi,$nj,$m1
 145         cbz     $j,.Linner_skip
 146
 147 .Linner:
 148         ldr     $aj,[$ap],#8
 149         adc     $hi1,$hi1,xzr
 150         ldr     $tj,[$tp],#8            // tp[j]
 151         adds    $lo0,$alo,$hi0
 152         sub     $j,$j,#8                // j--
 153         adc     $hi0,$ahi,xzr
 154
 155         adds    $lo1,$nlo,$hi1
 156         ldr     $nj,[$np],#8
 157         adc     $hi1,$nhi,xzr
 158
 159         mul     $alo,$aj,$m0            // ap[j]*bp[i]
 160         adds    $lo0,$lo0,$tj
 161         umulh   $ahi,$aj,$m0
 162         adc     $hi0,$hi0,xzr
 163
 164         mul     $nlo,$nj,$m1            // np[j]*m1
 165         adds    $lo1,$lo1,$lo0
 166         umulh   $nhi,$nj,$m1
 167         str     $lo1,[$tp,#-16]         // tp[j-1]
 168         cbnz    $j,.Linner
 169
 170 .Linner_skip:
 171         ldr     $tj,[$tp],#8            // tp[j]
 172         adc     $hi1,$hi1,xzr
 173         adds    $lo0,$alo,$hi0
 174         sub     $ap,$ap,$num            // rewind $ap
 175         adc     $hi0,$ahi,xzr
 176
 177         adds    $lo1,$nlo,$hi1
 178         sub     $np,$np,$num            // rewind $np
 179         adc     $hi1,$nhi,$ovf
 180
 181         adds    $lo0,$lo0,$tj
 182         adc     $hi0,$hi0,xzr
 183
 184         adds    $lo1,$lo1,$lo0
 185         adcs    $hi1,$hi1,$hi0
 186         adc     $ovf,xzr,xzr            // upmost overflow bit
 187         stp     $lo1,$hi1,[$tp,#-16]
 188
 189         cbnz    $i,.Louter
 190
 191         // Final step. We see if result is larger than modulus, and
 192         // if it is, subtract the modulus. But comparison implies
 193         // subtraction. So we subtract modulus, see if it borrowed,
 194         // and conditionally copy original value.
 195         ldr     $tj,[sp]                // tp[0]
 196         add     $tp,sp,#8
 197         ldr     $nj,[$np],#8            // np[0]
 198         subs    $j,$num,#8              // j=num-1 and clear borrow
 199         mov     $ap,$rp
 200 .Lsub:
 201         sbcs    $aj,$tj,$nj             // tp[j]-np[j]
 202         ldr     $tj,[$tp],#8
 203         sub     $j,$j,#8                // j--
 204         ldr     $nj,[$np],#8
 205         str     $aj,[$ap],#8            // rp[j]=tp[j]-np[j]
 206         cbnz    $j,.Lsub
 207
 208         sbcs    $aj,$tj,$nj
 209         sbcs    $ovf,$ovf,xzr           // did it borrow?
 210         str     $aj,[$ap],#8            // rp[num-1]
 211
 212         ldr     $tj,[sp]                // tp[0]
 213         add     $tp,sp,#8
 214         ldr     $aj,[$rp],#8            // rp[0]
 215         sub     $num,$num,#8            // num--
 216         nop
 217 .Lcond_copy:
 218         sub     $num,$num,#8            // num--
 219         csel    $nj,$aj,$tj,cs          // did it borrow?
 220         ldr     $tj,[$tp],#8
 221         ldr     $aj,[$rp],#8
 222         str     xzr,[$tp,#-16]          // wipe tp
 223         str     $nj,[$rp,#-16]
 224         cbnz    $num,.Lcond_copy
 225
 226         csel    $nj,$aj,$tj,cs
 227         str     xzr,[$tp,#-8]           // wipe tp
 228         str     $nj,[$rp,#-8]
 229
 230         ldp     x19,x20,[x29,#16]
 231         mov     sp,x29
 232         ldp     x21,x22,[x29,#32]
 233         ldp     x23,x24,[x29,#48]
 234         ldr     x29,[sp],#64
 235         ret
 236 .size   bn_mul_mont,.-bn_mul_mont
 237
 238 .asciz  "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 239 .align  4
 240 ___
 241
 242 print $code;
 243
 244 close STDOUT;