2 # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # On 21264 RSA sign performance improves by 70/35/20/15 percent for
18 # 512/1024/2048/4096 bit key lengths. This is relative to code with
19 # in-line assembler built by the vendor compiler with '-tune host'. Other
20 # benchmarks improve by 15-20%. To anchor it to something else, the
21 # code provides approximately the same performance per GHz as AMD64.
22 # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# Send everything subsequently printed to STDOUT into the file named by
# $output.
# NOTE(review): the result of open is not checked here, so a failure to
# open the file goes unnoticed -- confirm that this is intended.
26 open STDOUT,">$output";
# Each variable below holds the name of the register that carries the
# corresponding argument; the assembly template refers to the registers
# through these variables. The trailing comments spell out the C-level
# parameter list, one parameter per line.
29 $rp="a0"; # BN_ULONG *rp,
30 $ap="a1"; # const BN_ULONG *ap,
31 $bp="a2"; # const BN_ULONG *bp,
32 $np="a3"; # const BN_ULONG *np,
33 $n0="a4"; # const BN_ULONG *n0,
34 $num="a5"; # int num);
55 #include <asm/regdef.h>
88 ldq $hi0,0($ap) # hi0 = ap[0]
92 ldq $bi,0($bp) # bi = bp[0]
93 lda AT,-4096(zero) # mov -4096,AT; NOTE(review): looks like a 4096-byte alignment mask, its consumer is not shown here
98 ldq $hi1,0($np) # hi1 = np[0]
213 s8addq $j,$np,$nj #U0 nj = np + 8*j, i.e. &np[j]
217 addq $alo,$hi0,$lo0 #L1 lo0 = alo + hi0
220 mulq $aj,$bi,$alo #U1 alo = low 64 bits of aj*bi
221 cmpult $lo0,$hi0,AT #L0 AT = carry out of alo + hi0
222 addq $nlo,$hi1,$lo1 #L1 lo1 = nlo + hi1
225 mulq $nj,$m1,$nlo #U1 nlo = low 64 bits of nj*m1
226 addq $ahi,AT,$hi0 #L0 hi0 = ahi + carry
227 addq $lo0,$tj,$lo0 #L1 lo0 += tj
228 cmpult $lo1,$hi1,v0 #U0 v0 = carry out of nlo + hi1
230 umulh $aj,$bi,$ahi #U1 ahi = high 64 bits of aj*bi
231 cmpult $lo0,$tj,AT #L0 AT = carry out of lo0 + tj
232 addq $lo1,$lo0,$lo1 #L1 lo1 += lo0
233 addq $nhi,v0,$hi1 #U0 hi1 = nhi + carry
235 umulh $nj,$m1,$nhi #U1 nhi = high 64 bits of nj*m1
236 s8addq $j,$ap,$aj #L0 aj = ap + 8*j, i.e. &ap[j]
237 cmpult $lo1,$lo0,v0 #L1 v0 = carry out of lo1 + lo0
238 cmplt $j,$num,$tj #U0 # borrow $tj as a flag: 1 if j < num (signed), else 0
240 addq $hi0,AT,$hi0 #L0 hi0 += carry from lo0 + tj
241 addq $hi1,v0,$hi1 #U1 hi1 += carry from lo1 + lo0
265 cmpult $lo1,$hi0,$hi1 # hi1 = 1 if lo1 < hi0 (unsigned), else 0; NOTE(review): presumably the carry of an add that is not shown here
271 cmplt $i,$num,$tj # borrow $tj as a flag: 1 if i < num (signed), else 0
275 s8addq $num,sp,$tj # &tp[num], computed as sp + 8*num
276 mov $rp,$bp # put rp aside in the bp register, restored further down
279 mov 0,$hi0 # clear borrow bit
282 .Lsub: ldq $lo0,0($tp) # lo0 = word at tp
286 subq $lo0,$lo1,$lo1 # tp[i]-np[i], result left in lo1
289 cmpult $lo1,$lo0,$hi0 # hi0 = 1 if lo1 < lo0 (unsigned), else 0; NOTE(review): looks like the new borrow, the instruction producing lo0 is not shown here
296 subq $hi1,$hi0,$hi0 # handle upmost overflow bit: hi0 = hi1 - hi0
298 mov $bp,$rp # restore rp from the copy put aside above
302 bis $bp,$ap,$ap # ap=borrow?tp:rp, formed as a bitwise OR; NOTE(review): the masking that turns the OR into a select is not shown here
305 .Lcopy: ldq $aj,0($ap) # copy or in-place refresh: aj = word at ap
309 stq zero,-8($tp) # zap tp: store zero to the word at tp-8
326 .ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"