2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # ECP_NISTZ256 module for ARMv8.
21 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22 # http://eprint.iacr.org/2013/816.
24 # with/without -DECP_NISTZ256_ASM
26 # Cortex-A53 +190-400%
27 # Cortex-A57 +190-350%
30 # Ranges denote minimum and maximum improvement coefficients depending
31 # on benchmark. Lower coefficients are for ECDSA sign, server-side
32 # operation. Keep in mind that +400% means 5x improvement.
# Skip over leading non-file-name arguments (the flavour argument is
# consumed outside this excerpt), stopping once $output holds something
# that looks like an output file name: word characters plus an extension.
35 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Locate the arm-xlate.pl perlasm translator: first alongside this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe all generated code through the translator. Failure to spawn the
# pipe must be fatal; otherwise the output would be silently empty.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
46 my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
47 $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
48 map("x$_",(0..17,19,20));
50 my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont
57 ########################################################################
58 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
# Open ecp_nistz256_table.c, looking first in the current directory and
# then one level up from this script's directory; the first successful
# open short-circuits the "or" chain. Three-argument open keeps the file
# name from being interpreted as a mode string. The historic bareword
# handle TABLE is retained because the read loop (outside this excerpt)
# refers to it by that name.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<","ecp_nistz256_table.c" or
open TABLE,"<","${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
# Each TOBN(hi,lo) macro in the table encodes one 64-bit value as two
# 32-bit halves (32-bit element width is implied by the byte extraction
# at the @tbl repacking loop below); push low half first so @arr holds
# little-endian 32-bit words.
68 s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
72 # See ecp_nistz256_table.c for explanation for why it's 64*16*37.
73 # 64*16*37-1 is because $#arr returns last valid index of @arr, not
# the number of elements.
75 die "insane number of elements" if ($#arr != 64*16*37-1);
78 .globl ecp_nistz256_precomputed
79 .type ecp_nistz256_precomputed,%object
81 ecp_nistz256_precomputed:
83 ########################################################################
84 # this conversion smashes P256_POINT_AFFINE by individual bytes with
85 # 64 byte interval, similar to
89 @tbl = splice(@arr,0,64*16);
90 for($i=0;$i<64;$i++) {
92 for($j=0;$j<64;$j++) {
93 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
96 $code.=join(',',map { sprintf "0x%02x",$_} @line);
101 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
104 .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
105 .LRR: // 2^512 mod P precomputed for NIST P256 polynomial
106 .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
108 .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
112 .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
114 .quad 0xccd1c8aaee00bc4f
115 .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
117 // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
118 .globl ecp_nistz256_to_mont
119 .type ecp_nistz256_to_mont,%function
121 ecp_nistz256_to_mont:
122 stp x29,x30,[sp,#-32]!
126 ldr $bi,.LRR // bp[0]
128 ldp $a2,$a3,[$ap,#16]
131 adr $bp,.LRR // &bp[0]
133 bl __ecp_nistz256_mul_mont
138 .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
140 // void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
141 .globl ecp_nistz256_from_mont
142 .type ecp_nistz256_from_mont,%function
144 ecp_nistz256_from_mont:
145 stp x29,x30,[sp,#-32]!
151 ldp $a2,$a3,[$ap,#16]
154 adr $bp,.Lone // &bp[0]
156 bl __ecp_nistz256_mul_mont
161 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
163 // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
164 // const BN_ULONG x2[4]);
165 .globl ecp_nistz256_mul_mont
166 .type ecp_nistz256_mul_mont,%function
168 ecp_nistz256_mul_mont:
169 stp x29,x30,[sp,#-32]!
173 ldr $bi,[$bp] // bp[0]
175 ldp $a2,$a3,[$ap,#16]
179 bl __ecp_nistz256_mul_mont
184 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
186 // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
187 .globl ecp_nistz256_sqr_mont
188 .type ecp_nistz256_sqr_mont,%function
190 ecp_nistz256_sqr_mont:
191 stp x29,x30,[sp,#-32]!
196 ldp $a2,$a3,[$ap,#16]
200 bl __ecp_nistz256_sqr_mont
205 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
207 // void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
208 // const BN_ULONG x2[4]);
209 .globl ecp_nistz256_add
210 .type ecp_nistz256_add,%function
213 stp x29,x30,[sp,#-16]!
216 ldp $acc0,$acc1,[$ap]
218 ldp $acc2,$acc3,[$ap,#16]
219 ldp $t2,$t3,[$bp,#16]
223 bl __ecp_nistz256_add
227 .size ecp_nistz256_add,.-ecp_nistz256_add
229 // void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
230 .globl ecp_nistz256_div_by_2
231 .type ecp_nistz256_div_by_2,%function
233 ecp_nistz256_div_by_2:
234 stp x29,x30,[sp,#-16]!
237 ldp $acc0,$acc1,[$ap]
238 ldp $acc2,$acc3,[$ap,#16]
242 bl __ecp_nistz256_div_by_2
246 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
248 // void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
249 .globl ecp_nistz256_mul_by_2
250 .type ecp_nistz256_mul_by_2,%function
252 ecp_nistz256_mul_by_2:
253 stp x29,x30,[sp,#-16]!
256 ldp $acc0,$acc1,[$ap]
257 ldp $acc2,$acc3,[$ap,#16]
265 bl __ecp_nistz256_add // ret = a+a // 2*a
269 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
271 // void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
272 .globl ecp_nistz256_mul_by_3
273 .type ecp_nistz256_mul_by_3,%function
275 ecp_nistz256_mul_by_3:
276 stp x29,x30,[sp,#-16]!
279 ldp $acc0,$acc1,[$ap]
280 ldp $acc2,$acc3,[$ap,#16]
292 bl __ecp_nistz256_add // ret = a+a // 2*a
299 bl __ecp_nistz256_add // ret += a // 2*a+a=3*a
303 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
305 // void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
306 // const BN_ULONG x2[4]);
307 .globl ecp_nistz256_sub
308 .type ecp_nistz256_sub,%function
311 stp x29,x30,[sp,#-16]!
314 ldp $acc0,$acc1,[$ap]
315 ldp $acc2,$acc3,[$ap,#16]
319 bl __ecp_nistz256_sub_from
323 .size ecp_nistz256_sub,.-ecp_nistz256_sub
325 // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
326 .globl ecp_nistz256_neg
327 .type ecp_nistz256_neg,%function
330 stp x29,x30,[sp,#-16]!
334 mov $acc0,xzr // a = 0
341 bl __ecp_nistz256_sub_from
345 .size ecp_nistz256_neg,.-ecp_nistz256_neg
347 // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
348 // to $a0-$a3 and b[0] - to $bi
349 .type __ecp_nistz256_mul_mont,%function
351 __ecp_nistz256_mul_mont:
352 mul $acc0,$a0,$bi // a[0]*b[0]
355 mul $acc1,$a1,$bi // a[1]*b[0]
358 mul $acc2,$a2,$bi // a[2]*b[0]
361 mul $acc3,$a3,$bi // a[3]*b[0]
363 ldr $bi,[$bp,#8] // b[1]
365 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
373 for($i=1;$i<4;$i++) {
374 # Reduction iteration is normally performed by accumulating
375 # result of multiplication of modulus by "magic" digit [and
376 # omitting least significant word, which is guaranteed to
377 # be 0], but thanks to special form of modulus and "magic"
378 # digit being equal to least significant word, it can be
379 # performed with additions and subtractions alone. Indeed:
381 # ffff0001.00000000.0000ffff.ffffffff
383 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
385 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
388 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
389 # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
390 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
392 # or marking redundant operations:
394 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
395 # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
396 # - 0000abcd.efgh0000.--------.--------.--------
399 subs $t2,$acc0,$t0 // "*0xffff0001"
401 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
402 mul $t0,$a0,$bi // lo(a[0]*b[i])
404 mul $t1,$a1,$bi // lo(a[1]*b[i])
405 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
406 mul $t2,$a2,$bi // lo(a[2]*b[i])
408 mul $t3,$a3,$bi // lo(a[3]*b[i])
411 adds $acc0,$acc0,$t0 // accumulate low parts of multiplication
412 umulh $t0,$a0,$bi // hi(a[0]*b[i])
414 umulh $t1,$a1,$bi // hi(a[1]*b[i])
416 umulh $t2,$a2,$bi // hi(a[2]*b[i])
418 umulh $t3,$a3,$bi // hi(a[3]*b[i])
421 $code.=<<___ if ($i<3);
422 ldr $bi,[$bp,#8*($i+1)] // b[$i+1]
425 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
436 subs $t2,$acc0,$t0 // "*0xffff0001"
438 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
440 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
444 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
445 sbcs $t1,$acc1,$poly1
447 sbcs $t3,$acc3,$poly3
448 sbcs xzr,$acc4,xzr // did it borrow?
450 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
451 csel $acc1,$acc1,$t1,lo
452 csel $acc2,$acc2,$t2,lo
453 stp $acc0,$acc1,[$rp]
454 csel $acc3,$acc3,$t3,lo
455 stp $acc2,$acc3,[$rp,#16]
458 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
460 // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
462 .type __ecp_nistz256_sqr_mont,%function
464 __ecp_nistz256_sqr_mont:
465 // | | | | | |a1*a0| |
466 // | | | | |a2*a0| | |
467 // | |a3*a2|a3*a0| | | |
468 // | | | |a2*a1| | | |
469 // | | |a3*a1| | | | |
470 // *| | | | | | | | 2|
471 // +|a3*a3|a2*a2|a1*a1|a0*a0|
472 // |--+--+--+--+--+--+--+--|
473 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
475 // "can't overflow" below mark carrying into high part of
476 // multiplication result, which can't overflow, because it
477 // can never be all ones.
479 mul $acc1,$a1,$a0 // a[1]*a[0]
481 mul $acc2,$a2,$a0 // a[2]*a[0]
483 mul $acc3,$a3,$a0 // a[3]*a[0]
486 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
487 mul $t0,$a2,$a1 // a[2]*a[1]
490 mul $t2,$a3,$a1 // a[3]*a[1]
492 adc $acc4,$acc4,xzr // can't overflow
494 mul $acc5,$a3,$a2 // a[3]*a[2]
497 adds $t1,$t1,$t2 // accumulate high parts of multiplication
498 mul $acc0,$a0,$a0 // a[0]*a[0]
499 adc $t2,$t3,xzr // can't overflow
501 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
504 mul $t1,$a1,$a1 // a[1]*a[1]
507 adc $acc6,$acc6,xzr // can't overflow
509 adds $acc1,$acc1,$acc1 // acc[1-6]*=2
510 mul $t2,$a2,$a2 // a[2]*a[2]
511 adcs $acc2,$acc2,$acc2
513 adcs $acc3,$acc3,$acc3
514 mul $t3,$a3,$a3 // a[3]*a[3]
515 adcs $acc4,$acc4,$acc4
517 adcs $acc5,$acc5,$acc5
518 adcs $acc6,$acc6,$acc6
521 adds $acc1,$acc1,$a0 // +a[i]*a[i]
531 for($i=0;$i<3;$i++) { # reductions, see commentary in
532 # multiplication for details
534 subs $t2,$acc0,$t0 // "*0xffff0001"
536 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
539 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
541 adc $acc3,$t3,xzr // can't overflow
545 subs $t2,$acc0,$t0 // "*0xffff0001"
547 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0]
549 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001
550 adc $acc3,$t3,xzr // can't overflow
552 adds $acc0,$acc0,$acc4 // accumulate upper half
553 adcs $acc1,$acc1,$acc5
554 adcs $acc2,$acc2,$acc6
555 adcs $acc3,$acc3,$acc7
558 adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus
559 sbcs $t1,$acc1,$poly1
561 sbcs $t3,$acc3,$poly3
562 sbcs xzr,$acc4,xzr // did it borrow?
564 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
565 csel $acc1,$acc1,$t1,lo
566 csel $acc2,$acc2,$t2,lo
567 stp $acc0,$acc1,[$rp]
568 csel $acc3,$acc3,$t3,lo
569 stp $acc2,$acc3,[$rp,#16]
572 .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
574 // Note that __ecp_nistz256_add expects both input vectors pre-loaded to
575 // $a0-$a3 and $t0-$t3. This is done because it's used in multiple
576 // contexts, e.g. in multiplication by 2 and 3...
577 .type __ecp_nistz256_add,%function
580 adds $acc0,$acc0,$t0 // ret = a+b
584 adc $ap,xzr,xzr // zap $ap
586 adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus
587 sbcs $t1,$acc1,$poly1
589 sbcs $t3,$acc3,$poly3
590 sbcs xzr,$ap,xzr // did subtraction borrow?
592 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
593 csel $acc1,$acc1,$t1,lo
594 csel $acc2,$acc2,$t2,lo
595 stp $acc0,$acc1,[$rp]
596 csel $acc3,$acc3,$t3,lo
597 stp $acc2,$acc3,[$rp,#16]
600 .size __ecp_nistz256_add,.-__ecp_nistz256_add
602 .type __ecp_nistz256_sub_from,%function
604 __ecp_nistz256_sub_from:
606 ldp $t2,$t3,[$bp,#16]
607 subs $acc0,$acc0,$t0 // ret = a-b
611 sbc $ap,xzr,xzr // zap $ap
613 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
614 adcs $t1,$acc1,$poly1
617 cmp $ap,xzr // did subtraction borrow?
619 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
620 csel $acc1,$acc1,$t1,eq
621 csel $acc2,$acc2,$t2,eq
622 stp $acc0,$acc1,[$rp]
623 csel $acc3,$acc3,$t3,eq
624 stp $acc2,$acc3,[$rp,#16]
627 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
629 .type __ecp_nistz256_sub_morf,%function
631 __ecp_nistz256_sub_morf:
633 ldp $t2,$t3,[$bp,#16]
634 subs $acc0,$t0,$acc0 // ret = b-a
638 sbc $ap,xzr,xzr // zap $ap
640 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus
641 adcs $t1,$acc1,$poly1
644 cmp $ap,xzr // did subtraction borrow?
646 csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret
647 csel $acc1,$acc1,$t1,eq
648 csel $acc2,$acc2,$t2,eq
649 stp $acc0,$acc1,[$rp]
650 csel $acc3,$acc3,$t3,eq
651 stp $acc2,$acc3,[$rp,#16]
654 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
656 .type __ecp_nistz256_div_by_2,%function
658 __ecp_nistz256_div_by_2:
659 subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus
660 adcs $t1,$acc1,$poly1
662 adcs $t3,$acc3,$poly3
663 adc $ap,xzr,xzr // zap $ap
664 tst $acc0,#1 // is a even?
666 csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
667 csel $acc1,$acc1,$t1,eq
668 csel $acc2,$acc2,$t2,eq
669 csel $acc3,$acc3,$t3,eq
672 lsr $acc0,$acc0,#1 // ret >>= 1
673 orr $acc0,$acc0,$acc1,lsl#63
675 orr $acc1,$acc1,$acc2,lsl#63
677 orr $acc2,$acc2,$acc3,lsl#63
679 stp $acc0,$acc1,[$rp]
680 orr $acc3,$acc3,$ap,lsl#63
681 stp $acc2,$acc3,[$rp,#16]
684 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
686 ########################################################################
687 # following subroutines are "literal" implementation of those found in
690 ########################################################################
691 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
# Byte offsets (0,32,64,96) of four 256-bit scratch vectors in the
# ecp_nistz256_point_double stack frame.
694 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
695 # above map() describes stack layout with 4 temporary
696 # 256-bit vectors on top.
# Copies of the result/input pointers kept in callee-saved registers
# (x21,x22) so they survive the bl calls to the __ecp_nistz256_* helpers.
697 my ($rp_real,$ap_real) = map("x$_",(21,22));
700 .globl ecp_nistz256_point_double
701 .type ecp_nistz256_point_double,%function
703 ecp_nistz256_point_double:
704 stp x29,x30,[sp,#-80]!
711 ldp $acc0,$acc1,[$ap,#32]
713 ldp $acc2,$acc3,[$ap,#48]
719 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont
722 ldp $a2,$a3,[$ap_real,#64+16]
724 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y);
727 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);
729 ldp $t0,$t1,[$ap_real]
730 ldp $t2,$t3,[$ap_real,#16]
731 mov $a0,$acc0 // put Zsqr aside for p256_sub
736 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x);
739 mov $acc0,$a0 // restore Zsqr
741 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
744 ldp $a2,$a3,[sp,#$S+16]
746 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);
749 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);
751 ldr $bi,[$ap_real,#32]
752 ldp $a0,$a1,[$ap_real,#64]
753 ldp $a2,$a3,[$ap_real,#64+16]
756 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);
760 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont
763 ldp $a2,$a3,[sp,#$S+16]
765 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0);
768 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);
770 ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont
772 ldp $a2,$a3,[sp,#$M+16]
774 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);
778 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);
780 mov $t0,$acc0 // duplicate M
784 mov $a0,$acc0 // put M aside
789 bl __ecp_nistz256_add
790 mov $t0,$a0 // restore M
792 ldr $bi,[$ap_real] // forward load for p256_mul_mont
796 ldp $a2,$a3,[sp,#$S+16]
797 bl __ecp_nistz256_add // p256_mul_by_3(M, M);
801 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);
805 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont
808 ldp $a2,$a3,[sp,#$M+16]
810 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S);
813 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);
816 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);
820 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);
823 mov $a0,$acc0 // copy S
828 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);
832 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);
834 add sp,x29,#0 // destroy frame
835 ldp x19,x20,[x29,#16]
836 ldp x21,x22,[x29,#32]
839 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
843 ########################################################################
844 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
845 # const P256_POINT *in2);
# Byte offsets of twelve 256-bit scratch vectors in the
# ecp_nistz256_point_add stack frame.
847 my ($res_x,$res_y,$res_z,
848 $H,$Hsqr,$R,$Rsqr,$Hcub,
849 $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
# Z1sqr/Z2sqr are only live before Hsqr/Rsqr are computed, so they
# share the same stack slots.
850 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
851 # above map() describes stack layout with 12 temporary
852 # 256-bit vectors on top.
# x21-x26: callee-saved copies of the pointer arguments, the all-ones
# (!in1infty, !in2infty) masks produced by csetm, and a scratch register.
853 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
856 .globl ecp_nistz256_point_add
857 .type ecp_nistz256_point_add,%function
859 ecp_nistz256_point_add:
860 stp x29,x30,[sp,#-80]!
868 ldp $a0,$a1,[$bp,#64] // in2_z
869 ldp $a2,$a3,[$bp,#64+16]
877 orr $in2infty,$t0,$t2
879 csetm $in2infty,ne // !in2infty
881 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);
883 ldp $a0,$a1,[$ap_real,#64] // in1_z
884 ldp $a2,$a3,[$ap_real,#64+16]
887 orr $in1infty,$t0,$t2
889 csetm $in1infty,ne // !in1infty
891 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
893 ldr $bi,[$bp_real,#64]
894 ldp $a0,$a1,[sp,#$Z2sqr]
895 ldp $a2,$a3,[sp,#$Z2sqr+16]
898 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);
900 ldr $bi,[$ap_real,#64]
901 ldp $a0,$a1,[sp,#$Z1sqr]
902 ldp $a2,$a3,[sp,#$Z1sqr+16]
905 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
907 ldr $bi,[$ap_real,#32]
908 ldp $a0,$a1,[sp,#$S1]
909 ldp $a2,$a3,[sp,#$S1+16]
912 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);
914 ldr $bi,[$bp_real,#32]
915 ldp $a0,$a1,[sp,#$S2]
916 ldp $a2,$a3,[sp,#$S2+16]
919 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
922 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont
923 ldp $a0,$a1,[$ap_real]
924 ldp $a2,$a3,[$ap_real,#16]
926 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);
928 orr $acc0,$acc0,$acc1 // see if result is zero
929 orr $acc2,$acc2,$acc3
930 orr $temp,$acc0,$acc2
934 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);
937 ldp $a0,$a1,[$bp_real]
938 ldp $a2,$a3,[$bp_real,#16]
941 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);
944 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont
945 ldp $a2,$a3,[sp,#$R+16]
947 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);
949 orr $acc0,$acc0,$acc1 // see if result is zero
950 orr $acc2,$acc2,$acc3
951 orr $acc0,$acc0,$acc2
953 b.ne .Ladd_proceed // is_equal(U1,U2)?
955 tst $in1infty,$in2infty
956 b.eq .Ladd_proceed // (in1infty || in2infty)?
959 b.eq .Ladd_double // is_equal(S1,S2)?
963 stp $a0,$a1,[$rp_real]
964 stp $a0,$a1,[$rp_real,#16]
965 stp $a0,$a1,[$rp_real,#32]
966 stp $a0,$a1,[$rp_real,#48]
967 stp $a0,$a1,[$rp_real,#64]
968 stp $a0,$a1,[$rp_real,#80]
975 ldp x23,x24,[x29,#48]
976 ldp x25,x26,[x29,#64]
977 add sp,sp,#32*(12-4) // difference in stack frames
983 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
985 ldr $bi,[$ap_real,#64]
987 ldp $a2,$a3,[sp,#$H+16]
990 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
993 ldp $a2,$a3,[sp,#$H+16]
995 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
997 ldr $bi,[$bp_real,#64]
998 ldp $a0,$a1,[sp,#$res_z]
999 ldp $a2,$a3,[sp,#$res_z+16]
1000 add $bp,$bp_real,#64
1002 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);
1005 ldp $a0,$a1,[sp,#$Hsqr]
1006 ldp $a2,$a3,[sp,#$Hsqr+16]
1009 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1012 ldp $a0,$a1,[sp,#$U1]
1013 ldp $a2,$a3,[sp,#$U1+16]
1016 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);
1023 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1027 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1030 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1033 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont
1034 ldp $a0,$a1,[sp,#$S1]
1035 ldp $a2,$a3,[sp,#$S1+16]
1037 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1041 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);
1044 ldp $a0,$a1,[sp,#$res_y]
1045 ldp $a2,$a3,[sp,#$res_y+16]
1048 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1051 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1053 ldp $a0,$a1,[sp,#$res_x] // res
1054 ldp $a2,$a3,[sp,#$res_x+16]
1055 ldp $t0,$t1,[$bp_real] // in2
1056 ldp $t2,$t3,[$bp_real,#16]
1058 for($i=0;$i<64;$i+=32) { # conditional moves
1060 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1061 cmp $in1infty,#0 // !$in1intfy, remember?
1062 ldp $acc2,$acc3,[$ap_real,#$i+16]
1065 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1068 cmp $in2infty,#0 // !$in2intfy, remember?
1069 ldp $a2,$a3,[sp,#$res_x+$i+48]
1070 csel $acc0,$t0,$acc0,ne
1071 csel $acc1,$t1,$acc1,ne
1072 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1073 csel $acc2,$t2,$acc2,ne
1074 csel $acc3,$t3,$acc3,ne
1075 ldp $t2,$t3,[$bp_real,#$i+48]
1076 stp $acc0,$acc1,[$rp_real,#$i]
1077 stp $acc2,$acc3,[$rp_real,#$i+16]
1081 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1082 cmp $in1infty,#0 // !$in1intfy, remember?
1083 ldp $acc2,$acc3,[$ap_real,#$i+16]
1088 cmp $in2infty,#0 // !$in2intfy, remember?
1089 csel $acc0,$t0,$acc0,ne
1090 csel $acc1,$t1,$acc1,ne
1091 csel $acc2,$t2,$acc2,ne
1092 csel $acc3,$t3,$acc3,ne
1093 stp $acc0,$acc1,[$rp_real,#$i]
1094 stp $acc2,$acc3,[$rp_real,#$i+16]
1097 add sp,x29,#0 // destroy frame
1098 ldp x19,x20,[x29,#16]
1099 ldp x21,x22,[x29,#32]
1100 ldp x23,x24,[x29,#48]
1101 ldp x25,x26,[x29,#64]
1102 ldp x29,x30,[sp],#80
1104 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1108 ########################################################################
1109 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1110 # const P256_POINT_AFFINE *in2);
# Byte offsets of ten 256-bit scratch vectors in the
# ecp_nistz256_point_add_affine stack frame.
1112 my ($res_x,$res_y,$res_z,
1113 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1115 # above map() describes stack layout with 10 temporary
1116 # 256-bit vectors on top.
# x21-x26: callee-saved pointer copies, the (!in1infty, !in2infty)
# masks, and a scratch register — same assignment as in point_add.
1117 my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));
1120 .globl ecp_nistz256_point_add_affine
1121 .type ecp_nistz256_point_add_affine,%function
1123 ecp_nistz256_point_add_affine:
1124 stp x29,x30,[sp,#-80]!
1126 stp x19,x20,[sp,#16]
1127 stp x21,x22,[sp,#32]
1128 stp x23,x24,[sp,#48]
1129 stp x25,x26,[sp,#64]
1136 ldr $poly3,.Lpoly+24
1138 ldp $a0,$a1,[$ap,#64] // in1_z
1139 ldp $a2,$a3,[$ap,#64+16]
1142 orr $in1infty,$t0,$t2
1144 csetm $in1infty,ne // !in1infty
1146 ldp $acc0,$acc1,[$bp] // in2_x
1147 ldp $acc2,$acc3,[$bp,#16]
1148 ldp $t0,$t1,[$bp,#32] // in2_y
1149 ldp $t2,$t3,[$bp,#48]
1150 orr $acc0,$acc0,$acc1
1151 orr $acc2,$acc2,$acc3
1154 orr $acc0,$acc0,$acc2
1156 orr $in2infty,$acc0,$t0
1158 csetm $in2infty,ne // !in2infty
1161 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);
1170 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);
1173 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont
1174 ldp $a0,$a1,[sp,#$Z1sqr]
1175 ldp $a2,$a3,[sp,#$Z1sqr+16]
1177 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);
1179 add $bp,$ap_real,#64
1181 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);
1183 ldr $bi,[$ap_real,#64]
1184 ldp $a0,$a1,[sp,#$H]
1185 ldp $a2,$a3,[sp,#$H+16]
1186 add $bp,$ap_real,#64
1188 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);
1190 ldr $bi,[$bp_real,#32]
1191 ldp $a0,$a1,[sp,#$S2]
1192 ldp $a2,$a3,[sp,#$S2+16]
1193 add $bp,$bp_real,#32
1195 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);
1197 add $bp,$ap_real,#32
1198 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont
1199 ldp $a2,$a3,[sp,#$H+16]
1201 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);
1204 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);
1206 ldp $a0,$a1,[sp,#$R]
1207 ldp $a2,$a3,[sp,#$R+16]
1209 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);
1212 ldp $a0,$a1,[sp,#$Hsqr]
1213 ldp $a2,$a3,[sp,#$Hsqr+16]
1216 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);
1219 ldp $a0,$a1,[sp,#$Hsqr]
1220 ldp $a2,$a3,[sp,#$Hsqr+16]
1223 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);
1230 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2);
1234 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);
1237 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);
1240 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont
1241 ldp $a0,$a1,[sp,#$Hcub]
1242 ldp $a2,$a3,[sp,#$Hcub+16]
1244 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);
1246 add $bp,$ap_real,#32
1248 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);
1251 ldp $a0,$a1,[sp,#$res_y]
1252 ldp $a2,$a3,[sp,#$res_y+16]
1255 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);
1258 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);
1260 ldp $a0,$a1,[sp,#$res_x] // res
1261 ldp $a2,$a3,[sp,#$res_x+16]
1262 ldp $t0,$t1,[$bp_real] // in2
1263 ldp $t2,$t3,[$bp_real,#16]
1265 for($i=0;$i<64;$i+=32) { # conditional moves
1267 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1268 cmp $in1infty,#0 // !$in1intfy, remember?
1269 ldp $acc2,$acc3,[$ap_real,#$i+16]
1272 ldp $a0,$a1,[sp,#$res_x+$i+32] // res
1275 cmp $in2infty,#0 // !$in2intfy, remember?
1276 ldp $a2,$a3,[sp,#$res_x+$i+48]
1277 csel $acc0,$t0,$acc0,ne
1278 csel $acc1,$t1,$acc1,ne
1279 ldp $t0,$t1,[$bp_real,#$i+32] // in2
1280 csel $acc2,$t2,$acc2,ne
1281 csel $acc3,$t3,$acc3,ne
1282 ldp $t2,$t3,[$bp_real,#$i+48]
1283 stp $acc0,$acc1,[$rp_real,#$i]
1284 stp $acc2,$acc3,[$rp_real,#$i+16]
1286 $code.=<<___ if ($i == 0);
1287 adr $bp_real,.Lone_mont-64
1291 ldp $acc0,$acc1,[$ap_real,#$i] // in1
1292 cmp $in1infty,#0 // !$in1intfy, remember?
1293 ldp $acc2,$acc3,[$ap_real,#$i+16]
1298 cmp $in2infty,#0 // !$in2intfy, remember?
1299 csel $acc0,$t0,$acc0,ne
1300 csel $acc1,$t1,$acc1,ne
1301 csel $acc2,$t2,$acc2,ne
1302 csel $acc3,$t3,$acc3,ne
1303 stp $acc0,$acc1,[$rp_real,#$i]
1304 stp $acc2,$acc3,[$rp_real,#$i+16]
1306 add sp,x29,#0 // destroy frame
1307 ldp x19,x20,[x29,#16]
1308 ldp x21,x22,[x29,#32]
1309 ldp x23,x24,[x29,#48]
1310 ldp x25,x26,[x29,#64]
1311 ldp x29,x30,[sp],#80
1313 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
# Register assignments for the scalar-order (ord_*) routines: the group
# order is loaded into $ord0..$ord3; $ordk initially points at the order
# constants and is then overwritten with the word at offset 32
# (presumably the Montgomery n0 constant — confirm against the order
# constant table, which lies outside this excerpt).
1317 my ($ord0,$ord1) = ($poly1,$poly3);
1318 my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
1322 ////////////////////////////////////////////////////////////////////////
1323 // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1325 .globl ecp_nistz256_ord_mul_mont
1326 .type ecp_nistz256_ord_mul_mont,%function
1328 ecp_nistz256_ord_mul_mont:
1329 stp x29,x30,[sp,#-64]!
1331 stp x19,x20,[sp,#16]
1332 stp x21,x22,[sp,#32]
1333 stp x23,x24,[sp,#48]
1336 ldr $bi,[$bp] // bp[0]
1338 ldp $a2,$a3,[$ap,#16]
1340 ldp $ord0,$ord1,[$ordk,#0]
1341 ldp $ord2,$ord3,[$ordk,#16]
1342 ldr $ordk,[$ordk,#32]
1344 mul $acc0,$a0,$bi // a[0]*b[0]
1347 mul $acc1,$a1,$bi // a[1]*b[0]
1350 mul $acc2,$a2,$bi // a[2]*b[0]
1353 mul $acc3,$a3,$bi // a[3]*b[0]
1358 adds $acc1,$acc1,$t0 // accumulate high parts of multiplication
1359 adcs $acc2,$acc2,$t1
1360 adcs $acc3,$acc3,$t2
1364 for ($i=1;$i<4;$i++) {
1365 ################################################################
1366 # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
1368 # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1370 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1373 # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
1374 # - 0000abcd.efgh0000.abcdefgh.00000000.00000000
1375 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
1377 ldr $bi,[$bp,#8*$i] // b[i]
1380 subs $acc2,$acc2,$t4
1382 sbcs $acc3,$acc3,$t0
1383 sbcs $acc4,$acc4,$t1
1396 adds $acc0,$acc1,$t2
1398 adcs $acc1,$acc2,$t3
1400 adcs $acc2,$acc3,$t4
1401 adcs $acc3,$acc4,$t4
1404 adds $acc0,$acc0,$t0 // accumulate low parts
1406 adcs $acc1,$acc1,$t1
1408 adcs $acc2,$acc2,$t2
1410 adcs $acc3,$acc3,$t3
1414 adds $acc1,$acc1,$t0 // accumulate high parts
1415 adcs $acc2,$acc2,$t1
1416 adcs $acc3,$acc3,$t2
1417 adcs $acc4,$acc4,$t3
1422 lsl $t0,$t4,#32 // last reduction
1423 subs $acc2,$acc2,$t4
1425 sbcs $acc3,$acc3,$t0
1426 sbcs $acc4,$acc4,$t1
1437 adds $acc0,$acc1,$t2
1438 adcs $acc1,$acc2,$t3
1439 adcs $acc2,$acc3,$t4
1440 adcs $acc3,$acc4,$t4
1443 subs $t0,$acc0,$ord0 // ret -= modulus
1444 sbcs $t1,$acc1,$ord1
1445 sbcs $t2,$acc2,$ord2
1446 sbcs $t3,$acc3,$ord3
1449 csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
1450 csel $acc1,$acc1,$t1,lo
1451 csel $acc2,$acc2,$t2,lo
1452 stp $acc0,$acc1,[$rp]
1453 csel $acc3,$acc3,$t3,lo
1454 stp $acc2,$acc3,[$rp,#16]
1456 ldp x19,x20,[sp,#16]
1457 ldp x21,x22,[sp,#32]
1458 ldp x23,x24,[sp,#48]
1461 .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1463 ////////////////////////////////////////////////////////////////////////
1464 // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1466 .globl ecp_nistz256_ord_sqr_mont
1467 .type ecp_nistz256_ord_sqr_mont,%function
1469 ecp_nistz256_ord_sqr_mont:
1470 stp x29,x30,[sp,#-64]!
1472 stp x19,x20,[sp,#16]
1473 stp x21,x22,[sp,#32]
1474 stp x23,x24,[sp,#48]
1478 ldp $a2,$a3,[$ap,#16]
// NOTE(review): tail of ecp_nistz256_ord_sqr_mont -- Montgomery squaring
// modulo the P-256 group order.  The function label, prologue and the
// .Loop_ord_sqr entry point are above this fragment.
// Load the group order into $ord0..$ord3 and the Montgomery reduction
// constant (presumably -1/ord mod 2^64 -- TODO confirm) into $ordk.
1480 ldp $ord0,$ord1,[$ordk,#0]
1481 ldp $ord2,$ord3,[$ordk,#16]
1482 ldr $ordk,[$ordk,#32]
1488 ////////////////////////////////////////////////////////////////
1489 // | | | | | |a1*a0| |
1490 // | | | | |a2*a0| | |
1491 // | |a3*a2|a3*a0| | | |
1492 // | | | |a2*a1| | | |
1493 // | | |a3*a1| | | | |
1494 // *| | | | | | | | 2|
1495 // +|a3*a3|a2*a2|a1*a1|a0*a0|
1496 // |--+--+--+--+--+--+--+--|
1497 // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1499 // "can't overflow" below mark carrying into high part of
1500 // multiplication result, which can't overflow, because it
1501 // can never be all ones.
// Cross products a[i]*a[j] (i>j): low 64 bits go into $acc1..$acc5,
// the high halves (umulh results, some elided from this fragment) are
// carried in behind them via the adds/adcs chains.
1503 mul $acc1,$a1,$a0 // a[1]*a[0]
1505 mul $acc2,$a2,$a0 // a[2]*a[0]
1507 mul $acc3,$a3,$a0 // a[3]*a[0]
1510 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication
1511 mul $t0,$a2,$a1 // a[2]*a[1]
1513 adcs $acc3,$acc3,$t2
1514 mul $t2,$a3,$a1 // a[3]*a[1]
1516 adc $acc4,$acc4,xzr // can't overflow
1518 mul $acc5,$a3,$a2 // a[3]*a[2]
1521 adds $t1,$t1,$t2 // accumulate high parts of multiplication
1522 mul $acc0,$a0,$a0 // a[0]*a[0]
1523 adc $t2,$t3,xzr // can't overflow
1525 adds $acc3,$acc3,$t0 // accumulate low parts of multiplication
1527 adcs $acc4,$acc4,$t1
1528 mul $t1,$a1,$a1 // a[1]*a[1]
1529 adcs $acc5,$acc5,$t2
1531 adc $acc6,$acc6,xzr // can't overflow
// Double the cross-product half (the "*2" row in the diagram above).
1533 adds $acc1,$acc1,$acc1 // acc[1-6]*=2
1534 mul $t2,$a2,$a2 // a[2]*a[2]
1535 adcs $acc2,$acc2,$acc2
1537 adcs $acc3,$acc3,$acc3
1538 mul $t3,$a3,$a3 // a[3]*a[3]
1539 adcs $acc4,$acc4,$acc4
1541 adcs $acc5,$acc5,$acc5
1542 adcs $acc6,$acc6,$acc6
// Fold in the diagonal squares a[i]*a[i].  Here $a0..$a2 appear to hold
// halves of those squares, set up by lines not visible in this fragment
// -- TODO confirm against the full source.
1545 adds $acc1,$acc1,$a0 // +a[i]*a[i]
1547 adcs $acc2,$acc2,$t1
1548 adcs $acc3,$acc3,$a1
1549 adcs $acc4,$acc4,$t2
1550 adcs $acc5,$acc5,$a2
1551 adcs $acc6,$acc6,$t3
# Four word-by-word Montgomery reduction steps, one per low-half limb.
# The heredoc opener and the per-step multiply lines are elided here.
1554 for($i=0; $i<4; $i++) { # reductions
1564 adds $acc0,$acc1,$t2
1565 adcs $acc1,$acc2,$t3
1566 adcs $acc2,$acc3,$t4
1567 adc $acc3,xzr,$t4 // can't overflow
1569 $code.=<<___ if ($i<3);
1574 subs $acc1,$acc1,$t4
1576 sbcs $acc2,$acc2,$t0
1577 sbc $acc3,$acc3,$t1 // can't borrow
# Swap scratch registers for the next reduction round.
1579 ($t3,$t4) = ($t4,$t3);
// Add the upper half of the square to the Montgomery-reduced low half.
1582 adds $acc0,$acc0,$acc4 // accumulate upper half
1583 adcs $acc1,$acc1,$acc5
1584 adcs $acc2,$acc2,$acc6
1585 adcs $acc3,$acc3,$acc7
// Final conditional subtraction: compute ret-order, keep it unless the
// subtraction borrowed (lo = carry clear after subs/sbcs).
1588 subs $t0,$acc0,$ord0 // ret -= modulus
1589 sbcs $t1,$acc1,$ord1
1590 sbcs $t2,$acc2,$ord2
1591 sbcs $t3,$acc3,$ord3
1594 csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus
1595 csel $a1,$acc1,$t1,lo
1596 csel $a2,$acc2,$t2,lo
1597 csel $a3,$acc3,$t3,lo
// NOTE(review): $bp appears to carry the remaining repetition count of
// the squaring loop; its decrement is not visible in this fragment.
1599 cbnz $bp,.Loop_ord_sqr
1602 stp $a2,$a3,[$rp,#16]
// Epilogue: restore callee-saved registers (x29/x30 restore elided).
1604 ldp x19,x20,[sp,#16]
1605 ldp x21,x22,[sp,#32]
1606 ldp x23,x24,[sp,#48]
1609 .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1613 ########################################################################
1614 # scatter-gather subroutines
# Argument-register aliases (x0..x3) shared by the four w5/w7
# scatter/gather routines that follow.
1616 my ($out,$inp,$index,$mask)=map("x$_",(0..3));
1618 // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1,
// Scatter one P256_POINT (X, Y, Z: four 64-bit limbs each) into a
// constant-time lookup table at slot $index.  Each limb is stored as
// two 32-bit halves at a 64-byte stride; the lsr instructions that
// expose the high halves between the str pairs are elided from this
// fragment -- TODO confirm against the full source.
1620 .globl ecp_nistz256_scatter_w5
1621 .type ecp_nistz256_scatter_w5,%function
1623 ecp_nistz256_scatter_w5:
1624 stp x29,x30,[sp,#-16]!
// Advance $out to the selected table slot (4-byte units).
1627 add $out,$out,$index,lsl#2
1629 ldp x4,x5,[$inp] // X
1630 ldp x6,x7,[$inp,#16]
1631 str w4,[$out,#64*0-4]
1633 str w5,[$out,#64*1-4]
1635 str w6,[$out,#64*2-4]
1637 str w7,[$out,#64*3-4]
1639 str w4,[$out,#64*4-4]
1640 str w5,[$out,#64*5-4]
1641 str w6,[$out,#64*6-4]
1642 str w7,[$out,#64*7-4]
1645 ldp x4,x5,[$inp,#32] // Y
1646 ldp x6,x7,[$inp,#48]
1647 str w4,[$out,#64*0-4]
1649 str w5,[$out,#64*1-4]
1651 str w6,[$out,#64*2-4]
1653 str w7,[$out,#64*3-4]
1655 str w4,[$out,#64*4-4]
1656 str w5,[$out,#64*5-4]
1657 str w6,[$out,#64*6-4]
1658 str w7,[$out,#64*7-4]
1661 ldp x4,x5,[$inp,#64] // Z
1662 ldp x6,x7,[$inp,#80]
1663 str w4,[$out,#64*0-4]
1665 str w5,[$out,#64*1-4]
1667 str w6,[$out,#64*2-4]
1669 str w7,[$out,#64*3-4]
1671 str w4,[$out,#64*4-4]
1672 str w5,[$out,#64*5-4]
1673 str w6,[$out,#64*6-4]
1674 str w7,[$out,#64*7-4]
1678 .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1680 // void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1,
// Gather a P256_POINT from the w=5 table slot $index, reassembling each
// 64-bit limb from two 32-bit halves stored 64*4 bytes apart.  The
// ldr w4..w9 loads and the constant-time masking (index==0 handling)
// are elided from this fragment -- TODO confirm against the full source.
1682 .globl ecp_nistz256_gather_w5
1683 .type ecp_nistz256_gather_w5,%function
1685 ecp_nistz256_gather_w5:
1686 stp x29,x30,[sp,#-16]!
// NOTE(review): x3 here looks like a mask derived from $index (set by
// elided instructions) used to bias the slot address; verify upstream.
1691 add $index,$index,x3
1692 add $inp,$inp,$index,lsl#2
1700 ldr w10,[$inp,#64*6]
1701 ldr w11,[$inp,#64*7]
// Merge high 32-bit halves into the low halves loaded earlier (elided).
1705 orr x6,x6,x10,lsl#32
1706 orr x7,x7,x11,lsl#32
1711 stp x4,x5,[$out] // X
1712 stp x6,x7,[$out,#16]
1720 ldr w10,[$inp,#64*6]
1721 ldr w11,[$inp,#64*7]
1725 orr x6,x6,x10,lsl#32
1726 orr x7,x7,x11,lsl#32
1731 stp x4,x5,[$out,#32] // Y
1732 stp x6,x7,[$out,#48]
1740 ldr w10,[$inp,#64*6]
1741 ldr w11,[$inp,#64*7]
1744 orr x6,x6,x10,lsl#32
1745 orr x7,x7,x11,lsl#32
1750 stp x4,x5,[$out,#64] // Z
1751 stp x6,x7,[$out,#80]
1755 .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1757 // void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1,
// Scatter a P256_POINT_AFFINE into the w=7 table one byte at a time at
// a 64-byte stride.  The loop label, the ldr that fetches each 64-bit
// word, and the lsr steps that shift successive bytes into w3 between
// the strb instructions are elided from this fragment -- TODO confirm.
1759 .globl ecp_nistz256_scatter_w7
1760 .type ecp_nistz256_scatter_w7,%function
1762 ecp_nistz256_scatter_w7:
1763 stp x29,x30,[sp,#-16]!
// Advance $out to the selected byte column of the table.
1766 add $out,$out,$index
1770 subs $index,$index,#1
// Prefetch the destination lines for streaming stores.
1771 prfm pstl1strm,[$out,#4096+64*0]
1772 prfm pstl1strm,[$out,#4096+64*1]
1773 prfm pstl1strm,[$out,#4096+64*2]
1774 prfm pstl1strm,[$out,#4096+64*3]
1775 prfm pstl1strm,[$out,#4096+64*4]
1776 prfm pstl1strm,[$out,#4096+64*5]
1777 prfm pstl1strm,[$out,#4096+64*6]
1778 prfm pstl1strm,[$out,#4096+64*7]
1779 strb w3,[$out,#64*0-1]
1781 strb w3,[$out,#64*1-1]
1783 strb w3,[$out,#64*2-1]
1785 strb w3,[$out,#64*3-1]
1787 strb w3,[$out,#64*4-1]
1789 strb w3,[$out,#64*5-1]
1791 strb w3,[$out,#64*6-1]
1793 strb w3,[$out,#64*7-1]
// Loop until the byte counter (decremented by the subs above) hits 0.
1795 b.ne .Loop_scatter_w7
1799 .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1801 // void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1,
// Gather a P256_POINT_AFFINE from the w=7 table: each 64-bit limb is
// rebuilt from eight bytes stored 64 bytes apart.  The constant-time
// masking of $index and the final masked store are elided from this
// fragment -- TODO confirm against the full source.
1803 .globl ecp_nistz256_gather_w7
1804 .type ecp_nistz256_gather_w7,%function
1806 ecp_nistz256_gather_w7:
1807 stp x29,x30,[sp,#-16]!
// NOTE(review): x3 appears to be an index-derived mask (set by elided
// instructions) biasing the source address; verify upstream.
1812 add $index,$index,x3
1813 add $inp,$inp,$index
// Load the eight scattered bytes of one limb, prefetching ahead.
1817 ldrb w4,[$inp,#64*0]
1818 prfm pldl1strm,[$inp,#4096+64*0]
1819 subs $index,$index,#1
1820 ldrb w5,[$inp,#64*1]
1821 prfm pldl1strm,[$inp,#4096+64*1]
1822 ldrb w6,[$inp,#64*2]
1823 prfm pldl1strm,[$inp,#4096+64*2]
1824 ldrb w7,[$inp,#64*3]
1825 prfm pldl1strm,[$inp,#4096+64*3]
1826 ldrb w8,[$inp,#64*4]
1827 prfm pldl1strm,[$inp,#4096+64*4]
1828 ldrb w9,[$inp,#64*5]
1829 prfm pldl1strm,[$inp,#4096+64*5]
1830 ldrb w10,[$inp,#64*6]
1831 prfm pldl1strm,[$inp,#4096+64*6]
1832 ldrb w11,[$inp,#64*7]
1833 prfm pldl1strm,[$inp,#4096+64*7]
// Merge the bytes into a single 64-bit value in x4 (some of the
// intermediate orr steps are elided here).
1839 orr x10,x10,x11,lsl#8
1841 orr x4,x4,x10,lsl#48
// Loop over the remaining limbs (counter decremented by subs above).
1844 b.ne .Loop_gather_w7
1848 .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
# Post-process the accumulated $code: evaluate backtick-quoted Perl
# expressions embedded in the assembly text, then emit each line.  The
# print statement and closing brace run past this fragment.
1852 foreach (split("\n",$code)) {
1853 s/\`([^\`]*)\`/eval $1/ge;
1857 close STDOUT; # enforce flush