2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
21 # Performance in cycles per byte out of large buffer.
23 # IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
25 # Apple A7 5.50/+49% 3.33 1.70
26 # Cortex-A53 8.40/+80% 4.72 4.72(*)
27 # Cortex-A57 8.06/+43% 4.90 4.43(**)
28 # Denver 4.50/+82% 2.63 2.67(*)
29 # X-Gene 9.50/+46% 8.82 8.89(*)
30 # Mongoose 8.00/+44% 3.64 3.25
31 # Kryo 8.17/+50% 4.83 4.65
33 # (*) it's expected that doubling interleave factor doesn't help
34 # all processors, only those with higher NEON latency and
35 # higher instruction issue rate;
36 # (**) expected improvement was actually higher;
# Locate the arm-xlate.pl translator: first in the directory this script was
# started from, then in ../../perlasm/ relative to it; die if neither exists.
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
44 die "can't locate arm-xlate.pl";
# Open a pipe to the translator, run with the same perl binary ($^X).  The
# result is checked so that a failure to start the child is reported here
# instead of the script carrying on with a dead output handle.
46 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Catch-all for calls to undefined subs: a call such as add_32(...) or
# rev32_16(...) lands here and is appended to $code as one line of assembly
# text of the form "\t<mnemonic>\t<operands>\n".
# NOTE(review): $arg is assigned on a line not shown in this view --
# presumably the last call argument; confirm against the full file.
49 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
50 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; # drop package qualifier; first '_' becomes '.' (add_32 -> add.32)
52 $arg = "#$arg" if ($arg*1 eq $arg); # a purely numeric operand is written as an immediate (#N)
53 $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; # operands are comma-joined, $arg last
# Names of the general-purpose registers used by the generated code.
# x0..x4 are referred to as $out, $inp, $len, $key and $ctr.
56 my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));
# @x: sixteen registers (x5..x17, x19..x21), one per state word; ROUND
# indexes this array with 0..15.  x18 is skipped -- presumably because it is
# the platform-reserved register; confirm against the ABI.
58 my @x=map("x$_",(5..17,19..21));
# @d: eight registers (x22..x28, x30) holding the key block packed two 32-bit
# words per register (the code adds @d[n] and @d[n],lsr#32 to consecutive @x
# words).  x29 is skipped; the function bodies address the saved-register
# area through it.
59 my @d=map("x$_",(22..28,30));
# Scalar round generator.  Takes four indices into @x (a, b, c, d) and builds
# the instruction strings for four quarter-rounds at once, interleaved
# step by step.  The strings are call expressions ("&add_32 (...)") that the
# callers eval; @x[...] is interpolated here, so each string already carries
# concrete register names.  The "_32" suffix becomes ".32" in the thunk and is
# later rewritten to the 32-bit w-register form by the post-processing loop.
62 my ($a0,$b0,$c0,$d0)=@_;
# Derive the other three index quadruples: keep the upper bits of each index
# and advance its low two bits by one (mod 4).  (0,4,8,12) therefore expands
# to (1,5,9,13), (2,6,10,14), (3,7,11,15); (0,5,10,15) expands to
# (1,6,11,12), (2,7,8,13), (3,4,9,14).
63 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
64 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
65 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Step 1: a += b; d ^= a; d = ror32(d,16)
68 "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
69 "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
70 "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
71 "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
72 "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
73 "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
74 "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
75 "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
76 "&ror_32 (@x[$d0],@x[$d0],16)",
77 "&ror_32 (@x[$d1],@x[$d1],16)",
78 "&ror_32 (@x[$d2],@x[$d2],16)",
79 "&ror_32 (@x[$d3],@x[$d3],16)",
# Step 2: c += d; b ^= c; b = ror32(b,20), i.e. rotate left by 12
81 "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
82 "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
83 "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
84 "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
85 "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
86 "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
87 "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
88 "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
89 "&ror_32 (@x[$b0],@x[$b0],20)",
90 "&ror_32 (@x[$b1],@x[$b1],20)",
91 "&ror_32 (@x[$b2],@x[$b2],20)",
92 "&ror_32 (@x[$b3],@x[$b3],20)",
# Step 3: a += b; d ^= a; d = ror32(d,24), i.e. rotate left by 8
94 "&add_32 (@x[$a0],@x[$a0],@x[$b0])",
95 "&add_32 (@x[$a1],@x[$a1],@x[$b1])",
96 "&add_32 (@x[$a2],@x[$a2],@x[$b2])",
97 "&add_32 (@x[$a3],@x[$a3],@x[$b3])",
98 "&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
99 "&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
100 "&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
101 "&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
102 "&ror_32 (@x[$d0],@x[$d0],24)",
103 "&ror_32 (@x[$d1],@x[$d1],24)",
104 "&ror_32 (@x[$d2],@x[$d2],24)",
105 "&ror_32 (@x[$d3],@x[$d3],24)",
# Step 4: c += d; b ^= c; b = ror32(b,25), i.e. rotate left by 7
107 "&add_32 (@x[$c0],@x[$c0],@x[$d0])",
108 "&add_32 (@x[$c1],@x[$c1],@x[$d1])",
109 "&add_32 (@x[$c2],@x[$c2],@x[$d2])",
110 "&add_32 (@x[$c3],@x[$c3],@x[$d3])",
111 "&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
112 "&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
113 "&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
114 "&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
115 "&ror_32 (@x[$b0],@x[$b0],25)",
116 "&ror_32 (@x[$b1],@x[$b1],25)",
117 "&ror_32 (@x[$b2],@x[$b2],25)",
118 "&ror_32 (@x[$b3],@x[$b3],25)"
123 #include "arm_arch.h"
127 .extern OPENSSL_armcap_P
131 .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
136 .long OPENSSL_armcap_P-.
138 .quad OPENSSL_armcap_P-.
140 .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
142 .globl ChaCha20_ctr32
143 .type ChaCha20_ctr32,%function
147 adr @x[0],.LOPENSSL_armcap_P
155 ldr w17,[@x[1],@x[0]]
160 .inst 0xd503233f // paciasp
161 stp x29,x30,[sp,#-96]!
172 ldp @d[0],@d[1],[@x[0]] // load sigma
173 ldp @d[2],@d[3],[$key] // load key
174 ldp @d[4],@d[5],[$key,#16]
175 ldp @d[6],@d[7],[$ctr] // load counter
186 mov.32 @x[0],@d[0] // unpack key block
# Emit the scalar round pair: eval'ing each string returned by ROUND calls the
# AUTOLOAD thunk, which appends the instruction to $code.  The (0,4,8,12) call
# covers the four columns of the state, the (0,5,10,15) call the four
# diagonals.
208 foreach (&ROUND(0, 4, 8,12)) { eval; }
209 foreach (&ROUND(0, 5,10,15)) { eval; }
213 add.32 @x[0],@x[0],@d[0] // accumulate key block
214 add @x[1],@x[1],@d[0],lsr#32
215 add.32 @x[2],@x[2],@d[1]
216 add @x[3],@x[3],@d[1],lsr#32
217 add.32 @x[4],@x[4],@d[2]
218 add @x[5],@x[5],@d[2],lsr#32
219 add.32 @x[6],@x[6],@d[3]
220 add @x[7],@x[7],@d[3],lsr#32
221 add.32 @x[8],@x[8],@d[4]
222 add @x[9],@x[9],@d[4],lsr#32
223 add.32 @x[10],@x[10],@d[5]
224 add @x[11],@x[11],@d[5],lsr#32
225 add.32 @x[12],@x[12],@d[6]
226 add @x[13],@x[13],@d[6],lsr#32
227 add.32 @x[14],@x[14],@d[7]
228 add @x[15],@x[15],@d[7],lsr#32
232 add @x[0],@x[0],@x[1],lsl#32 // pack
233 add @x[2],@x[2],@x[3],lsl#32
234 ldp @x[1],@x[3],[$inp,#0] // load input
235 add @x[4],@x[4],@x[5],lsl#32
236 add @x[6],@x[6],@x[7],lsl#32
237 ldp @x[5],@x[7],[$inp,#16]
238 add @x[8],@x[8],@x[9],lsl#32
239 add @x[10],@x[10],@x[11],lsl#32
240 ldp @x[9],@x[11],[$inp,#32]
241 add @x[12],@x[12],@x[13],lsl#32
242 add @x[14],@x[14],@x[15],lsl#32
243 ldp @x[13],@x[15],[$inp,#48]
255 eor @x[0],@x[0],@x[1]
256 eor @x[2],@x[2],@x[3]
257 eor @x[4],@x[4],@x[5]
258 eor @x[6],@x[6],@x[7]
259 eor @x[8],@x[8],@x[9]
260 eor @x[10],@x[10],@x[11]
261 eor @x[12],@x[12],@x[13]
262 eor @x[14],@x[14],@x[15]
264 stp @x[0],@x[2],[$out,#0] // store output
265 add @d[6],@d[6],#1 // increment counter
266 stp @x[4],@x[6],[$out,#16]
267 stp @x[8],@x[10],[$out,#32]
268 stp @x[12],@x[14],[$out,#48]
273 ldp x19,x20,[x29,#16]
275 ldp x21,x22,[x29,#32]
276 ldp x23,x24,[x29,#48]
277 ldp x25,x26,[x29,#64]
278 ldp x27,x28,[x29,#80]
280 .inst 0xd50323bf // autiasp
294 add @x[0],@x[0],@x[1],lsl#32 // pack
295 add @x[2],@x[2],@x[3],lsl#32
296 add @x[4],@x[4],@x[5],lsl#32
297 add @x[6],@x[6],@x[7],lsl#32
298 add @x[8],@x[8],@x[9],lsl#32
299 add @x[10],@x[10],@x[11],lsl#32
300 add @x[12],@x[12],@x[13],lsl#32
301 add @x[14],@x[14],@x[15],lsl#32
312 stp @x[0],@x[2],[sp,#0]
313 stp @x[4],@x[6],[sp,#16]
314 stp @x[8],@x[10],[sp,#32]
315 stp @x[12],@x[14],[sp,#48]
330 ldp x19,x20,[x29,#16]
332 ldp x21,x22,[x29,#32]
333 ldp x23,x24,[x29,#48]
334 ldp x25,x26,[x29,#64]
335 ldp x27,x28,[x29,#80]
337 .inst 0xd50323bf // autiasp
339 .size ChaCha20_ctr32,.-ChaCha20_ctr32
# NEON register names for the 3-lane path.  Three (A,B,C,D) row sets plus four
# temporaries are mapped onto v0..v7 and v16..v23, all as .4s vectors;
# v8..v15 are not used here (the 512-byte path, which does use them, saves
# d8..d15 first "to meet ABI requirements").
343 my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
344 map("v$_.4s",(0..7,16..23));
# @K: seven vectors v24..v30.  The code loads sigma into @K[0] and the key
# into @K[1],@K[2], and steps @K[3] with $ONE.
345 my (@K)=map("v$_.4s",(24..30));
# Vector round generator.  Takes four row registers ($a,$b,$c,$d) and one
# temporary ($t) and builds the instruction strings for one round on a whole
# block.  The register names are embedded single-quoted, so the callers'
# eval passes them to the AUTOLOAD thunk as plain strings.
# NOTE(review): $odd is set on a line not shown in this view -- callers pass
# 0 or 1 as a sixth argument, so it is presumably popped from @_; confirm.
350 my ($a,$b,$c,$d,$t)=@_;
# a += b; d ^= a; rotate each 32-bit lane of d by 16 (rev32 on halfwords)
353 "&add ('$a','$a','$b')",
354 "&eor ('$d','$d','$a')",
355 "&rev32_16 ('$d','$d')", # vrot ($d,16)
# c += d; t = b ^ c; b = t rotated left by 12 (ushr supplies the top 12 bits
# shifted down, sli inserts t<<12 above them)
357 "&add ('$c','$c','$d')",
358 "&eor ('$t','$b','$c')",
359 "&ushr ('$b','$t',20)",
360 "&sli ('$b','$t',12)",
# a += b; t = d ^ a; d = t rotated left by 8
362 "&add ('$a','$a','$b')",
363 "&eor ('$t','$d','$a')",
364 "&ushr ('$d','$t',24)",
365 "&sli ('$d','$t',8)",
# c += d; t = b ^ c; b = t rotated left by 7
367 "&add ('$c','$c','$d')",
368 "&eor ('$t','$b','$c')",
369 "&ushr ('$b','$t',25)",
370 "&sli ('$b','$t',7)",
# Rotate the rows by whole lanes so the next round pairs up different words:
# c by 8 bytes always; d and b by 4/12 bytes, with the two amounts swapped
# according to $odd.
372 "&ext ('$c','$c','$c',8)",
373 "&ext ('$d','$d','$d',$odd?4:12)",
374 "&ext ('$b','$b','$b',$odd?12:4)"
380 .type ChaCha20_neon,%function
383 .inst 0xd503233f // paciasp
384 stp x29,x30,[sp,#-96]!
394 b.hs .L512_or_more_neon
398 ldp @d[0],@d[1],[@x[0]] // load sigma
399 ld1 {@K[0]},[@x[0]],#16
400 ldp @d[2],@d[3],[$key] // load key
401 ldp @d[4],@d[5],[$key,#16]
402 ld1 {@K[1],@K[2]},[$key]
403 ldp @d[6],@d[7],[$ctr] // load counter
415 add @K[3],@K[3],$ONE // += 1
418 shl $ONE,$ONE,#2 // 1 -> 4
421 mov.32 @x[0],@d[0] // unpack key block
# 3xNEON+1xIALU interleave.  Build one instruction list per NEON block
# (thread0..2) and one for the scalar block (thread3), then emit them woven
# together so every NEON instruction is followed by a scalar one.
# First half: NEONROUND with $odd=0 alongside the scalar column pass.
455 my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
456 my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
457 my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
458 my @thread3=&ROUND(0,4,8,12);
# NOTE(review): the bare "eval;" evaluates $_ -- presumably the current
# element of a loop over @thread0 whose header is not shown in this view.
461 eval; eval(shift(@thread3));
462 eval(shift(@thread1)); eval(shift(@thread3));
463 eval(shift(@thread2)); eval(shift(@thread3));
# Second half: NEONROUND with $odd=1 alongside the scalar diagonal pass.
466 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
467 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
468 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
469 @thread3=&ROUND(0,5,10,15);
472 eval; eval(shift(@thread3));
473 eval(shift(@thread1)); eval(shift(@thread3));
474 eval(shift(@thread2)); eval(shift(@thread3));
479 add.32 @x[0],@x[0],@d[0] // accumulate key block
481 add @x[1],@x[1],@d[0],lsr#32
483 add.32 @x[2],@x[2],@d[1]
485 add @x[3],@x[3],@d[1],lsr#32
487 add.32 @x[4],@x[4],@d[2]
489 add @x[5],@x[5],@d[2],lsr#32
491 add.32 @x[6],@x[6],@d[3]
493 add @x[7],@x[7],@d[3],lsr#32
494 add.32 @x[8],@x[8],@d[4]
496 add @x[9],@x[9],@d[4],lsr#32
497 add.32 @x[10],@x[10],@d[5]
499 add @x[11],@x[11],@d[5],lsr#32
500 add.32 @x[12],@x[12],@d[6]
502 add @x[13],@x[13],@d[6],lsr#32
503 add.32 @x[14],@x[14],@d[7]
505 add @x[15],@x[15],@d[7],lsr#32
510 add @x[0],@x[0],@x[1],lsl#32 // pack
511 add @x[2],@x[2],@x[3],lsl#32
512 ldp @x[1],@x[3],[$inp,#0] // load input
513 add @x[4],@x[4],@x[5],lsl#32
514 add @x[6],@x[6],@x[7],lsl#32
515 ldp @x[5],@x[7],[$inp,#16]
516 add @x[8],@x[8],@x[9],lsl#32
517 add @x[10],@x[10],@x[11],lsl#32
518 ldp @x[9],@x[11],[$inp,#32]
519 add @x[12],@x[12],@x[13],lsl#32
520 add @x[14],@x[14],@x[15],lsl#32
521 ldp @x[13],@x[15],[$inp,#48]
533 ld1.8 {$T0-$T3},[$inp],#64
534 eor @x[0],@x[0],@x[1]
535 eor @x[2],@x[2],@x[3]
536 eor @x[4],@x[4],@x[5]
537 eor @x[6],@x[6],@x[7]
538 eor @x[8],@x[8],@x[9]
540 eor @x[10],@x[10],@x[11]
542 eor @x[12],@x[12],@x[13]
544 eor @x[14],@x[14],@x[15]
546 ld1.8 {$T0-$T3},[$inp],#64
548 stp @x[0],@x[2],[$out,#0] // store output
549 add @d[6],@d[6],#4 // increment counter
550 stp @x[4],@x[6],[$out,#16]
551 add @K[3],@K[3],$ONE // += 4
552 stp @x[8],@x[10],[$out,#32]
554 stp @x[12],@x[14],[$out,#48]
558 st1.8 {$A0-$D0},[$out],#64
559 ld1.8 {$A0-$D0},[$inp],#64
565 st1.8 {$A1-$D1},[$out],#64
571 st1.8 {$A2-$D2},[$out],#64
573 b.hi .Loop_outer_neon
575 ldp x19,x20,[x29,#16]
577 ldp x21,x22,[x29,#32]
578 ldp x23,x24,[x29,#48]
579 ldp x25,x26,[x29,#64]
580 ldp x27,x28,[x29,#80]
582 .inst 0xd50323bf // autiasp
590 add @x[0],@x[0],@x[1],lsl#32 // pack
591 add @x[2],@x[2],@x[3],lsl#32
592 ldp @x[1],@x[3],[$inp,#0] // load input
593 add @x[4],@x[4],@x[5],lsl#32
594 add @x[6],@x[6],@x[7],lsl#32
595 ldp @x[5],@x[7],[$inp,#16]
596 add @x[8],@x[8],@x[9],lsl#32
597 add @x[10],@x[10],@x[11],lsl#32
598 ldp @x[9],@x[11],[$inp,#32]
599 add @x[12],@x[12],@x[13],lsl#32
600 add @x[14],@x[14],@x[15],lsl#32
601 ldp @x[13],@x[15],[$inp,#48]
613 eor @x[0],@x[0],@x[1]
614 eor @x[2],@x[2],@x[3]
615 eor @x[4],@x[4],@x[5]
616 eor @x[6],@x[6],@x[7]
617 eor @x[8],@x[8],@x[9]
618 eor @x[10],@x[10],@x[11]
619 eor @x[12],@x[12],@x[13]
620 eor @x[14],@x[14],@x[15]
622 stp @x[0],@x[2],[$out,#0] // store output
623 add @d[6],@d[6],#4 // increment counter
624 stp @x[4],@x[6],[$out,#16]
625 stp @x[8],@x[10],[$out,#32]
626 stp @x[12],@x[14],[$out,#48]
633 ld1.8 {$T0-$T3},[$inp],#64
638 st1.8 {$A0-$D0},[$out],#64
644 ld1.8 {$T0-$T3},[$inp],#64
649 st1.8 {$A1-$D1},[$out],#64
677 cbnz $len,.Loop_tail_neon
685 ldp x19,x20,[x29,#16]
687 ldp x21,x22,[x29,#32]
688 ldp x23,x24,[x29,#48]
689 ldp x25,x26,[x29,#64]
690 ldp x27,x28,[x29,#80]
692 .inst 0xd50323bf // autiasp
694 .size ChaCha20_neon,.-ChaCha20_neon
# Register names for the 512-byte path.  The six temporaries reuse the first
# six @K registers, which is why that path off-loads the key block to the
# stack and reloads @K from there.
697 my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
# Six (A,B,C,D) row sets occupy v0..v23, including v8..v15.
698 my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
699 $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
702 .type ChaCha20_512_neon,%function
705 .inst 0xd503233f // paciasp
706 stp x29,x30,[sp,#-96]!
719 ldp @d[0],@d[1],[@x[0]] // load sigma
720 ld1 {@K[0]},[@x[0]],#16
721 ldp @d[2],@d[3],[$key] // load key
722 ldp @d[4],@d[5],[$key,#16]
723 ld1 {@K[1],@K[2]},[$key]
724 ldp @d[6],@d[7],[$ctr] // load counter
736 add @K[3],@K[3],$ONE // += 1
737 stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
738 add @K[3],@K[3],$ONE // not typo
743 shl $ONE,$ONE,#2 // 1 -> 4
745 stp d8,d9,[sp,#128+0] // meet ABI requirements
746 stp d10,d11,[sp,#128+16]
747 stp d12,d13,[sp,#128+32]
748 stp d14,d15,[sp,#128+48]
750 sub $len,$len,#512 // not typo
752 .Loop_outer_512_neon:
760 mov.32 @x[0],@d[0] // unpack key block
783 add $D4,$D0,$ONE // +4
785 add $D5,$D1,$ONE // +4
792 stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
# 6xNEON+2xIALU interleave, upper half.  Six NEON instruction lists
# (thread0..5) are woven with a scalar list (thread67) that holds BOTH the
# column and the diagonal pass, so the scalar unit completes a full pass pair
# for every single NEON round.
# First NEON round ($odd=0):
801 my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
802 my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
803 my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
804 my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
805 my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
806 my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
807 my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
# Number of NEON instruction strings (six lists) minus the number of scalar
# ones.  NOTE(review): $diff is not referenced in the lines visible here.
808 my $diff = ($#thread0+1)*6 - $#thread67 - 1;
# NOTE(review): the bare "eval;" evaluates $_ -- presumably the current
# element of a loop over @thread0 whose header is not shown in this view.
812 eval; eval(shift(@thread67));
813 eval(shift(@thread1)); eval(shift(@thread67));
814 eval(shift(@thread2)); eval(shift(@thread67));
815 eval(shift(@thread3)); eval(shift(@thread67));
816 eval(shift(@thread4)); eval(shift(@thread67));
817 eval(shift(@thread5)); eval(shift(@thread67));
# Second NEON round ($odd=1), again against a full scalar pass pair:
820 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
821 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
822 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
823 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
824 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
825 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
826 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
829 eval; eval(shift(@thread67));
830 eval(shift(@thread1)); eval(shift(@thread67));
831 eval(shift(@thread2)); eval(shift(@thread67));
832 eval(shift(@thread3)); eval(shift(@thread67));
833 eval(shift(@thread4)); eval(shift(@thread67));
834 eval(shift(@thread5)); eval(shift(@thread67));
837 cbnz $ctr,.Loop_upper_neon
839 add.32 @x[0],@x[0],@d[0] // accumulate key block
840 add @x[1],@x[1],@d[0],lsr#32
841 add.32 @x[2],@x[2],@d[1]
842 add @x[3],@x[3],@d[1],lsr#32
843 add.32 @x[4],@x[4],@d[2]
844 add @x[5],@x[5],@d[2],lsr#32
845 add.32 @x[6],@x[6],@d[3]
846 add @x[7],@x[7],@d[3],lsr#32
847 add.32 @x[8],@x[8],@d[4]
848 add @x[9],@x[9],@d[4],lsr#32
849 add.32 @x[10],@x[10],@d[5]
850 add @x[11],@x[11],@d[5],lsr#32
851 add.32 @x[12],@x[12],@d[6]
852 add @x[13],@x[13],@d[6],lsr#32
853 add.32 @x[14],@x[14],@d[7]
854 add @x[15],@x[15],@d[7],lsr#32
856 add @x[0],@x[0],@x[1],lsl#32 // pack
857 add @x[2],@x[2],@x[3],lsl#32
858 ldp @x[1],@x[3],[$inp,#0] // load input
859 add @x[4],@x[4],@x[5],lsl#32
860 add @x[6],@x[6],@x[7],lsl#32
861 ldp @x[5],@x[7],[$inp,#16]
862 add @x[8],@x[8],@x[9],lsl#32
863 add @x[10],@x[10],@x[11],lsl#32
864 ldp @x[9],@x[11],[$inp,#32]
865 add @x[12],@x[12],@x[13],lsl#32
866 add @x[14],@x[14],@x[15],lsl#32
867 ldp @x[13],@x[15],[$inp,#48]
879 eor @x[0],@x[0],@x[1]
880 eor @x[2],@x[2],@x[3]
881 eor @x[4],@x[4],@x[5]
882 eor @x[6],@x[6],@x[7]
883 eor @x[8],@x[8],@x[9]
884 eor @x[10],@x[10],@x[11]
885 eor @x[12],@x[12],@x[13]
886 eor @x[14],@x[14],@x[15]
888 stp @x[0],@x[2],[$out,#0] // store output
889 add @d[6],@d[6],#1 // increment counter
890 mov.32 @x[0],@d[0] // unpack key block
892 stp @x[4],@x[6],[$out,#16]
895 stp @x[8],@x[10],[$out,#32]
898 stp @x[12],@x[14],[$out,#48]
# 6xNEON+2xIALU interleave, lower half.  Same weaving as the upper half: the
# thread lists are regenerated (no "my" -- the variables already exist) and
# emitted again, with the scalar list carrying both the column and the
# diagonal pass for each NEON round.
# First NEON round ($odd=0):
915 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
916 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
917 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
918 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
919 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
920 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
921 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
# NOTE(review): the bare "eval;" evaluates $_ -- presumably the current
# element of a loop over @thread0 whose header is not shown in this view.
924 eval; eval(shift(@thread67));
925 eval(shift(@thread1)); eval(shift(@thread67));
926 eval(shift(@thread2)); eval(shift(@thread67));
927 eval(shift(@thread3)); eval(shift(@thread67));
928 eval(shift(@thread4)); eval(shift(@thread67));
929 eval(shift(@thread5)); eval(shift(@thread67));
# Second NEON round ($odd=1):
932 @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
933 @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
934 @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
935 @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
936 @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
937 @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
938 @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
941 eval; eval(shift(@thread67));
942 eval(shift(@thread1)); eval(shift(@thread67));
943 eval(shift(@thread2)); eval(shift(@thread67));
944 eval(shift(@thread3)); eval(shift(@thread67));
945 eval(shift(@thread4)); eval(shift(@thread67));
946 eval(shift(@thread5)); eval(shift(@thread67));
949 cbnz $ctr,.Loop_lower_neon
951 add.32 @x[0],@x[0],@d[0] // accumulate key block
952 ldp @K[0],@K[1],[sp,#0]
953 add @x[1],@x[1],@d[0],lsr#32
954 ldp @K[2],@K[3],[sp,#32]
955 add.32 @x[2],@x[2],@d[1]
956 ldp @K[4],@K[5],[sp,#64]
957 add @x[3],@x[3],@d[1],lsr#32
959 add.32 @x[4],@x[4],@d[2]
961 add @x[5],@x[5],@d[2],lsr#32
963 add.32 @x[6],@x[6],@d[3]
965 add @x[7],@x[7],@d[3],lsr#32
967 add.32 @x[8],@x[8],@d[4]
969 add @x[9],@x[9],@d[4],lsr#32
971 add.32 @x[10],@x[10],@d[5]
973 add @x[11],@x[11],@d[5],lsr#32
975 add.32 @x[12],@x[12],@d[6]
977 add @x[13],@x[13],@d[6],lsr#32
979 add.32 @x[14],@x[14],@d[7]
981 add @x[15],@x[15],@d[7],lsr#32
982 add $D4,$D4,$ONE // +4
983 add @x[0],@x[0],@x[1],lsl#32 // pack
984 add $D5,$D5,$ONE // +4
985 add @x[2],@x[2],@x[3],lsl#32
987 ldp @x[1],@x[3],[$inp,#0] // load input
989 add @x[4],@x[4],@x[5],lsl#32
991 add @x[6],@x[6],@x[7],lsl#32
993 ldp @x[5],@x[7],[$inp,#16]
995 add @x[8],@x[8],@x[9],lsl#32
997 add @x[10],@x[10],@x[11],lsl#32
999 ldp @x[9],@x[11],[$inp,#32]
1001 add @x[12],@x[12],@x[13],lsl#32
1003 add @x[14],@x[14],@x[15],lsl#32
1005 ldp @x[13],@x[15],[$inp,#48]
1020 ld1.8 {$T0-$T3},[$inp],#64
1021 eor @x[0],@x[0],@x[1]
1022 eor @x[2],@x[2],@x[3]
1023 eor @x[4],@x[4],@x[5]
1024 eor @x[6],@x[6],@x[7]
1025 eor @x[8],@x[8],@x[9]
1027 eor @x[10],@x[10],@x[11]
1029 eor @x[12],@x[12],@x[13]
1031 eor @x[14],@x[14],@x[15]
1033 ld1.8 {$T0-$T3},[$inp],#64
1035 stp @x[0],@x[2],[$out,#0] // store output
1036 add @d[6],@d[6],#7 // increment counter
1037 stp @x[4],@x[6],[$out,#16]
1038 stp @x[8],@x[10],[$out,#32]
1039 stp @x[12],@x[14],[$out,#48]
1041 st1.8 {$A0-$D0},[$out],#64
1043 ld1.8 {$A0-$D0},[$inp],#64
1048 st1.8 {$A1-$D1},[$out],#64
1050 ld1.8 {$A1-$D1},[$inp],#64
1052 ldp @K[0],@K[1],[sp,#0]
1054 ldp @K[2],@K[3],[sp,#32]
1057 st1.8 {$A2-$D2},[$out],#64
1059 ld1.8 {$A2-$D2},[$inp],#64
1064 st1.8 {$A3-$D3},[$out],#64
1066 ld1.8 {$A3-$D3},[$inp],#64
1071 st1.8 {$A4-$D4},[$out],#64
1073 shl $A0,$ONE,#1 // 4 -> 8
1078 st1.8 {$A5-$D5},[$out],#64
1080 add @K[3],@K[3],$A0 // += 8
1085 b.hs .Loop_outer_512_neon
1088 ushr $A0,$ONE,#2 // 4 -> 1
1090 ldp d8,d9,[sp,#128+0] // meet ABI requirements
1091 ldp d10,d11,[sp,#128+16]
1092 ldp d12,d13,[sp,#128+32]
1093 ldp d14,d15,[sp,#128+48]
1095 stp @K[0],$ONE,[sp,#0] // wipe off-load area
1096 stp @K[0],$ONE,[sp,#32]
1097 stp @K[0],$ONE,[sp,#64]
1099 b.eq .Ldone_512_neon
1102 sub @K[3],@K[3],$A0 // -= 1
1106 b.hs .Loop_outer_neon
1108 eor @K[1],@K[1],@K[1]
1109 eor @K[2],@K[2],@K[2]
1110 eor @K[3],@K[3],@K[3]
1111 eor @K[4],@K[4],@K[4]
1112 eor @K[5],@K[5],@K[5]
1113 eor @K[6],@K[6],@K[6]
1117 ldp x19,x20,[x29,#16]
1119 ldp x21,x22,[x29,#32]
1120 ldp x23,x24,[x29,#48]
1121 ldp x25,x26,[x29,#64]
1122 ldp x27,x28,[x29,#80]
1123 ldp x29,x30,[sp],#96
1124 .inst 0xd50323bf // autiasp
1126 .size ChaCha20_512_neon,.-ChaCha20_512_neon
# Post-process the accumulated $code one line at a time, turning the
# generator's pseudo-syntax into plain assembler syntax.
# NOTE(review): the statement that prints each line and the loop's closing
# brace are not shown in this view.
1131 foreach (split("\n",$code)) {
# Replace every `...` span with the result of evaluating it as Perl.
1132 s/\`([^\`]*)\`/eval $1/geo;
# The rules below form one "or" chain, so only the first rule that applies to
# a line is used; the "or 1" keeps a rule true even when its inner
# substitution finds nothing to change.
#  - "<op>.32": drop the suffix and rename every xN register to wN.
#  - eor/ext/mov: view vectors as .16b instead of .4s.
#  - ld1.8/st1.8: drop the suffix and view vectors as .16b.
#  - ldr/str/ldp/stp: name vector registers as qN instead of vN.4s.
#  - rev32.16: drop the suffix and view vectors as .8h.
1134 (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
1135 (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
1136 (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
1137 (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
1138 (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
# Disabled rule: would have mapped qN#lo / qN#hi to the matching d register.
1140 #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
# Closing flushes the buffered output, so this is where a write error finally
# surfaces; check the result instead of exiting successfully with truncated
# output.  NOTE(review): STDOUT is presumably aliased to the arm-xlate.pl pipe
# opened at the top of the script (the aliasing line is not visible here); if
# so, close also waits for the translator and reports its exit status in $?.
1144 close STDOUT or die "error closing STDOUT: $! (child status $?)"; # flush