3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance in cycles per byte out of large buffer.
16 # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
18 # Cortex-A5 19.3(*)/+95% 21.8 14.1
19 # Cortex-A8 10.5(*)/+160% 13.9 6.35
20 # Cortex-A9 12.9(**)/+110% 14.3 6.50
21 # Cortex-A15 11.0/+40% 16.0 5.00
22 # Snapdragon S4 11.5/+125% 13.6 4.90
24 # (*) most "favourable" result for aligned data on little-endian
25 # processor, result for misaligned data is 10-15% lower;
26 # (**) this result is a trade-off: it can be improved by 20%,
27 # but then Snapdragon S4 and Cortex-A8 results get
# Work out where the generated assembly goes. An argument counts as the
# output file name when it looks like "name.ext": a word character, then
# word characters or hyphens, a dot, and a word-character extension.
# If $flavour itself has that shape it is really the output name, so it
# is moved into $output and $flavour is cleared.
31 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
# Otherwise keep shifting arguments until one has the file-name shape or
# the argument list runs out (the loop body is intentionally empty).
32 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
# With a real flavour (anything but "void") the output is post-processed
# by arm-xlate.pl, which is searched for relative to this script.
34 if ($flavour && $flavour ne "void") {
# $dir becomes everything in $0 up to and including the last / or \.
35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Try the script's own directory first, then ../../perlasm/ below it;
# die if neither candidate is a plain file.
36 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
37 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
38 die "can't locate arm-xlate.pl";
# Re-open STDOUT as a pipe into the translator, run with the same perl
# binary ($^X), so everything printed afterwards passes through it.
# NOTE(review): the result of this open is not checked, and $xlate,
# $flavour and $output are interpolated unquoted into the command line.
40 open STDOUT,"| \"$^X\" $xlate $flavour $output";
# Plain redirection of STDOUT into $output. The lines separating this
# statement from the pipe above are not part of this excerpt; presumably
# this is the branch taken when no translator is used -- confirm.
# NOTE(review): this open is unchecked as well.
42 open STDOUT,">$output";
# Catch-all for calls to subs that are not defined, such as &add(...) or
# &vadd_i32(...): the name of the called sub is used as the instruction
# mnemonic and one line of assembly text is appended to $code.
45 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
# Strip the package qualifier from $AUTOLOAD, then turn the first
# underscore into a dot (there is no /g), so vadd_i32 becomes vadd.i32.
46 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
# A value that compares equal to its own numeric form gets a '#' prefix,
# i.e. it is written as an immediate. $arg is assigned on a line that is
# not part of this excerpt; presumably it is the last operand -- confirm.
48 $arg = "#$arg" if ($arg*1 eq $arg);
# Emitted line: tab, mnemonic, tab, then the operands left in @_ followed
# by $arg, all joined with commas.
49 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# Register names for the sixteen state words used by the integer code:
# @x[0..7] are r0-r7, @x[12] is r12 and @x[14] is r14. Every other entry
# is the placeholder string "rx", so those indices name no real register.
52 my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
# Four further register names, r8-r11, used as temporaries.
53 my @t=map("r$_",(8..11));
# Body of the integer round generator (the enclosing sub header and the
# lines that collect the strings are not part of this excerpt). It takes
# the four state indices of one column or diagonal and produces a list of
# instruction strings; the caller evals each string in turn.
56 my ($a0,$b0,$c0,$d0)=@_;
# Derive the other three index sets: ($_&~3) keeps the row (multiple of
# four) and (($_+1)&3) advances the position within the row by one, with
# wrap-around. (0,4,8,12) yields (1,5,9,13), (2,6,10,14), (3,7,11,15);
# (0,5,10,15) yields (1,6,11,12), (2,7,8,13), (3,4,9,14).
57 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
58 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
59 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# The two 'c' words being worked on live in the first two temporaries.
61 my ($xc,$xc_) = (@t[0..1]);
# One 'd' word uses the third temporary and the other its own @x
# register; which is which depends on $odd. $odd is set on a line that is
# not part of this excerpt -- presumably it marks the odd round; confirm.
62 my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
65 # Consider order in which variables are addressed by their
70 # 0 4 8 12 < even round
74 # 0 5 10 15 < odd round
79 # 'a', 'b' are permanently allocated in registers, @x[0..7],
80 # while 'c's and pair of 'd's are maintained in memory. If
81 # you observe 'c' column, you'll notice that pair of 'c's is
82 # invariant between rounds. This means that we have to reload
83 # them once per round, in the middle. This is why you'll see
84 # bunch of 'c' stores and loads in the middle, but none in
85 # the beginning or end. If you observe 'd' column, you'll
86 # notice that 15 and 13 are reused in next pair of rounds.
87 # This is why these two are chosen for offloading to memory,
88 # to make loads count more.
# First pair of index sets (suffixes 0 and 1), with the two instruction
# streams interleaved line by line. Rotations are expressed through the
# 'ror#N' shifter operand, with N = 16, 20, 24 and 25 in turn.
90 "&add (@x[$a0],@x[$a0],@x[$b0])",
91 "&mov ($xd,$xd,'ror#16')",
92 "&add (@x[$a1],@x[$a1],@x[$b1])",
93 "&mov ($xd_,$xd_,'ror#16')",
94 "&eor ($xd,$xd,@x[$a0],'ror#16')",
95 "&eor ($xd_,$xd_,@x[$a1],'ror#16')",
98 "&mov (@x[$b0],@x[$b0],'ror#20')",
99 "&add ($xc_,$xc_,$xd_)",
100 "&mov (@x[$b1],@x[$b1],'ror#20')",
101 "&eor (@x[$b0],@x[$b0],$xc,'ror#20')",
102 "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')",
104 "&add (@x[$a0],@x[$a0],@x[$b0])",
105 "&mov ($xd,$xd,'ror#24')",
106 "&add (@x[$a1],@x[$a1],@x[$b1])",
107 "&mov ($xd_,$xd_,'ror#24')",
108 "&eor ($xd,$xd,@x[$a0],'ror#24')",
109 "&eor ($xd_,$xd_,@x[$a1],'ror#24')",
111 "&add ($xc,$xc,$xd)",
112 "&mov (@x[$b0],@x[$b0],'ror#25')" );
# Only when $odd is true: write $xd to the stack slot of word $d0 and
# load word $d2 into the same register.
114 "&str ($xd,'[sp,#4*(16+$d0)]')",
115 "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
117 "&add ($xc_,$xc_,$xd_)",
118 "&mov (@x[$b1],@x[$b1],'ror#25')" );
# Only when $odd is false: the same hand-over for $xd_ (store word $d1,
# load word $d3).
120 "&str ($xd_,'[sp,#4*(16+$d1)]')",
121 "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
123 "&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
124 "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" );
# For the second pair of index sets the 'd' word that was not just
# loaded into a temporary is addressed through its own @x register.
126 $xd=@x[$d2] if (!$odd);
127 $xd_=@x[$d3] if ($odd);
# Second pair (suffixes 2 and 3). The 'c' words are exchanged first:
# $c0/$c1 are stored to the stack and $c2/$c3 loaded into the same two
# temporaries, interleaved with the first adds and rotates.
129 "&str ($xc,'[sp,#4*(16+$c0)]')",
130 "&ldr ($xc,'[sp,#4*(16+$c2)]')",
131 "&add (@x[$a2],@x[$a2],@x[$b2])",
132 "&mov ($xd,$xd,'ror#16')",
133 "&str ($xc_,'[sp,#4*(16+$c1)]')",
134 "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
135 "&add (@x[$a3],@x[$a3],@x[$b3])",
136 "&mov ($xd_,$xd_,'ror#16')",
137 "&eor ($xd,$xd,@x[$a2],'ror#16')",
138 "&eor ($xd_,$xd_,@x[$a3],'ror#16')",
140 "&add ($xc,$xc,$xd)",
141 "&mov (@x[$b2],@x[$b2],'ror#20')",
142 "&add ($xc_,$xc_,$xd_)",
143 "&mov (@x[$b3],@x[$b3],'ror#20')",
144 "&eor (@x[$b2],@x[$b2],$xc,'ror#20')",
145 "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')",
147 "&add (@x[$a2],@x[$a2],@x[$b2])",
148 "&mov ($xd,$xd,'ror#24')",
149 "&add (@x[$a3],@x[$a3],@x[$b3])",
150 "&mov ($xd_,$xd_,'ror#24')",
151 "&eor ($xd,$xd,@x[$a2],'ror#24')",
152 "&eor ($xd_,$xd_,@x[$a3],'ror#24')",
154 "&add ($xc,$xc,$xd)",
155 "&mov (@x[$b2],@x[$b2],'ror#25')",
156 "&add ($xc_,$xc_,$xd_)",
157 "&mov (@x[$b3],@x[$b3],'ror#25')",
158 "&eor (@x[$b2],@x[$b2],$xc,'ror#25')",
159 "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" );
165 #include "arm_arch.h"
168 #if defined(__thumb2__)
175 #if defined(__thumb2__) || defined(__clang__)
176 #define ldrhsb ldrbhs
181 .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
184 #if __ARM_MAX_ARCH__>=7
186 .word OPENSSL_armcap_P-.LChaCha20_ctr32
191 .globl ChaCha20_ctr32
192 .type ChaCha20_ctr32,%function
196 ldr r12,[sp,#0] @ pull pointer to counter and nonce
197 stmdb sp!,{r0-r2,r4-r11,lr}
198 #if __ARM_ARCH__<7 && !defined(__thumb2__)
199 sub r14,pc,#16 @ ChaCha20_ctr32
201 adr r14,.LChaCha20_ctr32
203 #if __ARM_MAX_ARCH__>=7
204 cmp r2,#192 @ test len
215 ldmia r12,{r4-r7} @ load counter and nonce
216 sub sp,sp,#4*(16) @ off-load area
217 sub r14,r14,#64 @ .Lsigma
218 stmdb sp!,{r4-r7} @ copy counter and nonce
219 ldmia r3,{r4-r11} @ load key
220 ldmia r14,{r0-r3} @ load sigma
221 stmdb sp!,{r4-r11} @ copy key
222 stmdb sp!,{r0-r3} @ copy sigma
223 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
224 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
229 ldmia sp,{r0-r9} @ load key material
230 str @t[3],[sp,#4*(32+2)] @ save len
231 str r12, [sp,#4*(32+1)] @ save inp
232 str r14, [sp,#4*(32+0)] @ save out
234 ldr @t[3], [sp,#4*(15)]
235 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
236 ldr @t[2], [sp,#4*(13)]
237 ldr @x[14],[sp,#4*(14)]
238 str @t[3], [sp,#4*(16+15)]
# Generate the integer-only round pair: every string returned by ROUND is
# evaluated in order, first for indices (0,4,8,12) -- labelled the even
# round in ROUND's comments -- then for (0,5,10,15), the odd round.
246 foreach (&ROUND(0, 4, 8,12)) { eval; }
247 foreach (&ROUND(0, 5,10,15)) { eval; }
251 ldr @t[3],[sp,#4*(32+2)] @ load len
253 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
254 str @t[1], [sp,#4*(16+9)]
255 str @x[12],[sp,#4*(16+12)]
256 str @t[2], [sp,#4*(16+13)]
257 str @x[14],[sp,#4*(16+14)]
259 @ at this point we have first half of 512-bit result in
260 @ @x[0-7] and second half at sp+4*(16+8)
262 cmp @t[3],#64 @ done yet?
266 addlo r12,sp,#4*(0) @ shortcut or ...
267 ldrhs r12,[sp,#4*(32+1)] @ ... load inp
268 addlo r14,sp,#4*(0) @ shortcut or ...
269 ldrhs r14,[sp,#4*(32+0)] @ ... load out
271 ldr @t[0],[sp,#4*(0)] @ load key material
272 ldr @t[1],[sp,#4*(1)]
274 #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
277 tst @t[2],#3 @ are input and output aligned?
278 ldr @t[2],[sp,#4*(2)]
280 cmp @t[3],#64 @ restore flags
282 ldr @t[2],[sp,#4*(2)]
284 ldr @t[3],[sp,#4*(3)]
286 add @x[0],@x[0],@t[0] @ accumulate key material
287 add @x[1],@x[1],@t[1]
291 ldrhs @t[0],[r12],#16 @ load input
292 ldrhs @t[1],[r12,#-12]
294 add @x[2],@x[2],@t[2]
295 add @x[3],@x[3],@t[3]
299 ldrhs @t[2],[r12,#-8]
300 ldrhs @t[3],[r12,#-4]
301 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
310 eorhs @x[0],@x[0],@t[0] @ xor with input
311 eorhs @x[1],@x[1],@t[1]
313 str @x[0],[r14],#16 @ store output
317 eorhs @x[2],@x[2],@t[2]
318 eorhs @x[3],@x[3],@t[3]
319 ldmia @t[0],{@t[0]-@t[3]} @ load key material
324 add @x[4],@x[4],@t[0] @ accumulate key material
325 add @x[5],@x[5],@t[1]
329 ldrhs @t[0],[r12],#16 @ load input
330 ldrhs @t[1],[r12,#-12]
331 add @x[6],@x[6],@t[2]
332 add @x[7],@x[7],@t[3]
336 ldrhs @t[2],[r12,#-8]
337 ldrhs @t[3],[r12,#-4]
338 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
347 eorhs @x[4],@x[4],@t[0]
348 eorhs @x[5],@x[5],@t[1]
350 str @x[4],[r14],#16 @ store output
354 eorhs @x[6],@x[6],@t[2]
355 eorhs @x[7],@x[7],@t[3]
357 ldmia @t[0],{@t[0]-@t[3]} @ load key material
359 add @x[0],sp,#4*(16+8)
362 ldmia @x[0],{@x[0]-@x[7]} @ load second half
364 add @x[0],@x[0],@t[0] @ accumulate key material
365 add @x[1],@x[1],@t[1]
369 ldrhs @t[0],[r12],#16 @ load input
370 ldrhs @t[1],[r12,#-12]
374 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
375 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
376 add @x[2],@x[2],@t[2]
377 add @x[3],@x[3],@t[3]
381 ldrhs @t[2],[r12,#-8]
382 ldrhs @t[3],[r12,#-4]
383 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
392 eorhs @x[0],@x[0],@t[0]
393 eorhs @x[1],@x[1],@t[1]
395 str @x[0],[r14],#16 @ store output
399 eorhs @x[2],@x[2],@t[2]
400 eorhs @x[3],@x[3],@t[3]
402 ldmia @t[0],{@t[0]-@t[3]} @ load key material
406 add @x[4],@x[4],@t[0] @ accumulate key material
407 add @x[5],@x[5],@t[1]
411 addhi @t[0],@t[0],#1 @ next counter value
412 strhi @t[0],[sp,#4*(12)] @ save next counter value
416 ldrhs @t[0],[r12],#16 @ load input
417 ldrhs @t[1],[r12,#-12]
418 add @x[6],@x[6],@t[2]
419 add @x[7],@x[7],@t[3]
423 ldrhs @t[2],[r12,#-8]
424 ldrhs @t[3],[r12,#-4]
425 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
434 eorhs @x[4],@x[4],@t[0]
435 eorhs @x[5],@x[5],@t[1]
439 ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
443 eorhs @x[6],@x[6],@t[2]
444 eorhs @x[7],@x[7],@t[3]
445 str @x[4],[r14],#16 @ store output
450 subhs @t[3],@t[0],#64 @ len-=64
460 .Lunaligned: @ unaligned endian-neutral path
461 cmp @t[3],#64 @ restore flags
465 ldr @t[3],[sp,#4*(3)]
467 for ($i=0;$i<16;$i+=4) {
470 $code.=<<___ if ($i==4);
471 add @x[0],sp,#4*(16+8)
473 $code.=<<___ if ($i==8);
474 ldmia @x[0],{@x[0]-@x[7]} @ load second half
478 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
479 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
482 add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
484 $code.=<<___ if ($i==12);
488 addhi @t[0],@t[0],#1 @ next counter value
489 strhi @t[0],[sp,#4*(12)] @ save next counter value
492 add @x[$j+1],@x[$j+1],@t[1]
493 add @x[$j+2],@x[$j+2],@t[2]
497 eorlo @t[0],@t[0],@t[0] @ zero or ...
498 ldrhsb @t[0],[r12],#16 @ ... load input
499 eorlo @t[1],@t[1],@t[1]
500 ldrhsb @t[1],[r12,#-12]
502 add @x[$j+3],@x[$j+3],@t[3]
506 eorlo @t[2],@t[2],@t[2]
507 ldrhsb @t[2],[r12,#-8]
508 eorlo @t[3],@t[3],@t[3]
509 ldrhsb @t[3],[r12,#-4]
511 eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
512 eor @x[$j+1],@t[1],@x[$j+1]
516 ldrhsb @t[0],[r12,#-15] @ load more input
517 ldrhsb @t[1],[r12,#-11]
518 eor @x[$j+2],@t[2],@x[$j+2]
519 strb @x[$j+0],[r14],#16 @ store output
520 eor @x[$j+3],@t[3],@x[$j+3]
524 ldrhsb @t[2],[r12,#-7]
525 ldrhsb @t[3],[r12,#-3]
526 strb @x[$j+1],[r14,#-12]
527 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
528 strb @x[$j+2],[r14,#-8]
529 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
533 ldrhsb @t[0],[r12,#-14] @ load more input
534 ldrhsb @t[1],[r12,#-10]
535 strb @x[$j+3],[r14,#-4]
536 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
537 strb @x[$j+0],[r14,#-15]
538 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
542 ldrhsb @t[2],[r12,#-6]
543 ldrhsb @t[3],[r12,#-2]
544 strb @x[$j+1],[r14,#-11]
545 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
546 strb @x[$j+2],[r14,#-7]
547 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
551 ldrhsb @t[0],[r12,#-13] @ load more input
552 ldrhsb @t[1],[r12,#-9]
553 strb @x[$j+3],[r14,#-3]
554 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
555 strb @x[$j+0],[r14,#-14]
556 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
560 ldrhsb @t[2],[r12,#-5]
561 ldrhsb @t[3],[r12,#-1]
562 strb @x[$j+1],[r14,#-10]
563 strb @x[$j+2],[r14,#-6]
564 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
565 strb @x[$j+3],[r14,#-2]
566 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
567 strb @x[$j+0],[r14,#-13]
568 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
569 strb @x[$j+1],[r14,#-9]
570 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
571 strb @x[$j+2],[r14,#-5]
572 strb @x[$j+3],[r14,#-1]
574 $code.=<<___ if ($i<12);
575 add @t[0],sp,#4*(4+$i)
576 ldmia @t[0],{@t[0]-@t[3]} @ load key material
583 ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
587 subhs @t[3],@t[0],#64 @ len-=64
594 ldr r12,[sp,#4*(32+1)] @ load inp
596 ldr r14,[sp,#4*(32+0)] @ load out
599 ldrb @t[0],[@t[2]],#1 @ read buffer on stack
600 ldrb @t[1],[r12],#1 @ read input
602 eor @t[0],@t[0],@t[1]
603 strb @t[0],[r14],#1 @ store output
608 ldmia sp!,{r4-r11,pc}
609 .size ChaCha20_ctr32,.-ChaCha20_ctr32
# Sixteen operand names for the NEON code: three sets of a/b/c/d plus
# four temporaries. The right-hand side of this assignment is on a line
# that is not part of this excerpt.
613 my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
# Body of the NEON round generator (sub header not part of this excerpt).
# Takes four operand names and a temporary and lists instruction strings
# for the caller to eval. $odd is used below but assigned on a line that
# is not shown; the callers pass 0 or 1 as a sixth argument.
618 my ($a,$b,$c,$d,$t)=@_;
621 "&vadd_i32 ($a,$a,$b)",
623 "&vrev32_16 ($d,$d)", # vrot ($d,16)
625 "&vadd_i32 ($c,$c,$d)",
# Each vshr_u32/vsli_32 pair below uses shift counts that add up to 32
# (20+12, 24+8, 25+7): a right shift of $t is written to the destination
# and the left-shifted $t is then inserted into it, i.e. a rotation of
# $t. $t itself is presumably produced on lines not shown -- confirm.
627 "&vshr_u32 ($b,$t,20)",
628 "&vsli_32 ($b,$t,12)",
630 "&vadd_i32 ($a,$a,$b)",
632 "&vshr_u32 ($d,$t,24)",
633 "&vsli_32 ($d,$t,8)",
635 "&vadd_i32 ($c,$c,$d)",
637 "&vshr_u32 ($b,$t,25)",
638 "&vsli_32 ($b,$t,7)",
# vext_8 of a register with itself rotates it by the given byte count:
# $c always by 8, $b and $d by 4 or 12 in opposite directions depending
# on $odd. $odd is interpolated when the string is built; the ?: itself
# is only resolved when the string is evaluated.
640 "&vext_8 ($c,$c,$c,8)",
641 "&vext_8 ($b,$b,$b,$odd?12:4)",
642 "&vext_8 ($d,$d,$d,$odd?4:12)"
647 #if __ARM_MAX_ARCH__>=7
651 .type ChaCha20_neon,%function
654 ldr r12,[sp,#0] @ pull pointer to counter and nonce
655 stmdb sp!,{r0-r2,r4-r11,lr}
658 vstmdb sp!,{d8-d15} @ ABI spec says so
661 vld1.32 {$b0-$c0},[r3] @ load key
662 ldmia r3,{r4-r11} @ load key
665 vld1.32 {$d0},[r12] @ load counter and nonce
667 ldmia r14,{r0-r3} @ load sigma
668 vld1.32 {$a0},[r14]! @ load sigma
669 vld1.32 {$t0},[r14] @ one
670 vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
671 vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
673 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
674 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
675 vshl.i32 $t1#lo,$t0#lo,#1 @ two
676 vstr $t0#lo,[sp,#4*(16+0)]
677 vshl.i32 $t2#lo,$t0#lo,#2 @ four
678 vstr $t1#lo,[sp,#4*(16+2)]
680 vstr $t2#lo,[sp,#4*(16+4)]
688 ldmia sp,{r0-r9} @ load key material
689 cmp @t[3],#64*2 @ if len<=64*2
690 bls .Lbreak_neon @ switch to integer-only
692 str @t[3],[sp,#4*(32+2)] @ save len
694 str r12, [sp,#4*(32+1)] @ save inp
696 str r14, [sp,#4*(32+0)] @ save out
699 ldr @t[3], [sp,#4*(15)]
700 vadd.i32 $d1,$d0,$t0 @ counter+1
701 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
703 ldr @t[2], [sp,#4*(13)]
705 ldr @x[14],[sp,#4*(14)]
706 vadd.i32 $d2,$d1,$t0 @ counter+2
707 str @t[3], [sp,#4*(16+15)]
709 add @x[12],@x[12],#3 @ counter+3
# Build four instruction streams for one round: three NEON streams, one
# per a/b/c/d operand set, and one integer stream from ROUND. The final
# NEONROUND argument is 0 here and 1 in the second group below.
716 my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
717 my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
718 my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
719 my @thread3=&ROUND(0,4,8,12);
# Interleave: each NEON instruction is followed by one instruction taken
# from the integer stream. The bare eval works on $_; the loop header is
# not part of this excerpt, presumably it iterates @thread0 -- confirm.
722 eval; eval(shift(@thread3));
723 eval(shift(@thread1)); eval(shift(@thread3));
724 eval(shift(@thread2)); eval(shift(@thread3));
# Same again for the other round: NEONROUND with 1, ROUND with the
# (0,5,10,15) indices.
727 @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
728 @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
729 @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
730 @thread3=&ROUND(0,5,10,15);
733 eval; eval(shift(@thread3));
734 eval(shift(@thread1)); eval(shift(@thread3));
735 eval(shift(@thread2)); eval(shift(@thread3));
741 vld1.32 {$t0-$t1},[sp] @ load key material
742 vld1.32 {$t2-$t3},[@t[3]]
744 ldr @t[3],[sp,#4*(32+2)] @ load len
746 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
747 str @t[1], [sp,#4*(16+9)]
748 str @x[12],[sp,#4*(16+12)]
749 str @t[2], [sp,#4*(16+13)]
750 str @x[14],[sp,#4*(16+14)]
752 @ at this point we have first half of 512-bit result in
753 @ @x[0-7] and second half at sp+4*(16+8)
755 ldr r12,[sp,#4*(32+1)] @ load inp
756 ldr r14,[sp,#4*(32+0)] @ load out
758 vadd.i32 $a0,$a0,$t0 @ accumulate key material
761 vldr $t0#lo,[sp,#4*(16+0)] @ one
766 vldr $t1#lo,[sp,#4*(16+2)] @ two
771 vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
772 vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
781 vld1.8 {$t0-$t1},[r12]! @ load input
783 vld1.8 {$t2-$t3},[r12]!
784 veor $a0,$a0,$t0 @ xor with input
786 vld1.8 {$t0-$t1},[r12]!
789 vld1.8 {$t2-$t3},[r12]!
792 vst1.8 {$a0-$b0},[r14]! @ store output
794 vld1.8 {$t0-$t1},[r12]!
796 vst1.8 {$c0-$d0},[r14]!
798 vld1.8 {$t2-$t3},[r12]!
801 vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
802 veor $t0#hi,$t0#hi,$t0#hi
803 vldr $t0#lo,[sp,#4*(16+4)] @ four
805 vld1.32 {$c0-$d0},[@t[3]]
807 vst1.8 {$a1-$b1},[r14]!
809 vst1.8 {$c1-$d1},[r14]!
811 vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
812 vldr $t0#lo,[sp,#4*(16+0)] @ one
814 ldmia sp,{@t[0]-@t[3]} @ load key material
815 add @x[0],@x[0],@t[0] @ accumulate key material
816 ldr @t[0],[r12],#16 @ load input
817 vst1.8 {$a2-$b2},[r14]!
818 add @x[1],@x[1],@t[1]
820 vst1.8 {$c2-$d2},[r14]!
821 add @x[2],@x[2],@t[2]
823 add @x[3],@x[3],@t[3]
831 eor @x[0],@x[0],@t[0] @ xor with input
833 eor @x[1],@x[1],@t[1]
834 str @x[0],[r14],#16 @ store output
835 eor @x[2],@x[2],@t[2]
837 eor @x[3],@x[3],@t[3]
838 ldmia @t[0],{@t[0]-@t[3]} @ load key material
842 add @x[4],@x[4],@t[0] @ accumulate key material
843 ldr @t[0],[r12],#16 @ load input
844 add @x[5],@x[5],@t[1]
846 add @x[6],@x[6],@t[2]
848 add @x[7],@x[7],@t[3]
856 eor @x[4],@x[4],@t[0]
858 eor @x[5],@x[5],@t[1]
859 str @x[4],[r14],#16 @ store output
860 eor @x[6],@x[6],@t[2]
862 eor @x[7],@x[7],@t[3]
863 ldmia @t[0],{@t[0]-@t[3]} @ load key material
865 add @x[0],sp,#4*(16+8)
868 ldmia @x[0],{@x[0]-@x[7]} @ load second half
870 add @x[0],@x[0],@t[0] @ accumulate key material
871 ldr @t[0],[r12],#16 @ load input
872 add @x[1],@x[1],@t[1]
877 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
878 add @x[2],@x[2],@t[2]
883 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
884 add @x[3],@x[3],@t[3]
892 eor @x[0],@x[0],@t[0]
894 eor @x[1],@x[1],@t[1]
895 str @x[0],[r14],#16 @ store output
896 eor @x[2],@x[2],@t[2]
898 eor @x[3],@x[3],@t[3]
899 ldmia @t[0],{@t[0]-@t[3]} @ load key material
903 add @x[4],@x[4],@t[0] @ accumulate key material
904 add @t[0],@t[0],#4 @ next counter value
905 add @x[5],@x[5],@t[1]
906 str @t[0],[sp,#4*(12)] @ save next counter value
907 ldr @t[0],[r12],#16 @ load input
908 add @x[6],@x[6],@t[2]
909 add @x[4],@x[4],#3 @ counter+3
911 add @x[7],@x[7],@t[3]
920 eor @x[4],@x[4],@t[0]
924 ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
925 eor @x[5],@x[5],@t[1]
926 eor @x[6],@x[6],@t[2]
927 str @x[4],[r14],#16 @ store output
928 eor @x[7],@x[7],@t[3]
930 sub @t[3],@t[0],#64*4 @ len-=64*4
939 @ harmonize NEON and integer-only stack frames: load data
940 @ from NEON frame, but save to integer-only one; distance
941 @ between the two is 4*(32+4+16-32)=4*(20).
943 str @t[3], [sp,#4*(20+32+2)] @ save len
944 add @t[3],sp,#4*(32+4)
945 str r12, [sp,#4*(20+32+1)] @ save inp
946 str r14, [sp,#4*(20+32+0)] @ save out
948 ldr @x[12],[sp,#4*(16+10)]
949 ldr @x[14],[sp,#4*(16+11)]
950 vldmia @t[3],{d8-d15} @ fulfill ABI requirement
951 str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
952 str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
954 ldr @t[3], [sp,#4*(15)]
955 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
956 ldr @t[2], [sp,#4*(13)]
957 ldr @x[14],[sp,#4*(14)]
958 str @t[3], [sp,#4*(20+16+15)]
960 vst1.32 {$a0-$b0},[@t[3]]! @ copy key
961 add sp,sp,#4*(20) @ switch frame
962 vst1.32 {$c0-$d0},[@t[3]]
964 b .Loop @ go integer-only
969 bhs .L192_or_more_neon
971 bhs .L128_or_more_neon
973 bhs .L64_or_more_neon
976 vst1.8 {$a0-$b0},[sp]
978 vst1.8 {$c0-$d0},[@t[0]]
983 vld1.8 {$t0-$t1},[r12]!
984 vld1.8 {$t2-$t3},[r12]!
989 vst1.8 {$a0-$b0},[r14]!
990 vst1.8 {$c0-$d0},[r14]!
995 vst1.8 {$a1-$b1},[sp]
997 vst1.8 {$c1-$d1},[@t[0]]
998 sub @t[3],@t[3],#64*1 @ len-=64*1
1003 vld1.8 {$t0-$t1},[r12]!
1004 vld1.8 {$t2-$t3},[r12]!
1007 vld1.8 {$t0-$t1},[r12]!
1010 vld1.8 {$t2-$t3},[r12]!
1014 vst1.8 {$a0-$b0},[r14]!
1016 vst1.8 {$c0-$d0},[r14]!
1018 vst1.8 {$a1-$b1},[r14]!
1019 vst1.8 {$c1-$d1},[r14]!
1024 vst1.8 {$a2-$b2},[sp]
1026 vst1.8 {$c2-$d2},[@t[0]]
1027 sub @t[3],@t[3],#64*2 @ len-=64*2
1032 vld1.8 {$t0-$t1},[r12]!
1033 vld1.8 {$t2-$t3},[r12]!
1036 vld1.8 {$t0-$t1},[r12]!
1039 vld1.8 {$t2-$t3},[r12]!
1043 vld1.8 {$t0-$t1},[r12]!
1045 vst1.8 {$a0-$b0},[r14]!
1047 vld1.8 {$t2-$t3},[r12]!
1050 vst1.8 {$c0-$d0},[r14]!
1052 vst1.8 {$a1-$b1},[r14]!
1054 vst1.8 {$c1-$d1},[r14]!
1056 vst1.8 {$a2-$b2},[r14]!
1057 vst1.8 {$c2-$d2},[r14]!
1061 ldmia sp,{@t[0]-@t[3]} @ load key material
1062 add @x[0],@x[0],@t[0] @ accumulate key material
1064 add @x[1],@x[1],@t[1]
1065 add @x[2],@x[2],@t[2]
1066 add @x[3],@x[3],@t[3]
1067 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1069 add @x[4],@x[4],@t[0] @ accumulate key material
1071 add @x[5],@x[5],@t[1]
1072 add @x[6],@x[6],@t[2]
1073 add @x[7],@x[7],@t[3]
1074 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1085 stmia sp,{@x[0]-@x[7]}
1086 add @x[0],sp,#4*(16+8)
1088 ldmia @x[0],{@x[0]-@x[7]} @ load second half
1090 add @x[0],@x[0],@t[0] @ accumulate key material
1091 add @t[0],sp,#4*(12)
1092 add @x[1],@x[1],@t[1]
1093 add @x[2],@x[2],@t[2]
1094 add @x[3],@x[3],@t[3]
1095 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1097 add @x[4],@x[4],@t[0] @ accumulate key material
1099 add @x[5],@x[5],@t[1]
1100 add @x[4],@x[4],#3 @ counter+3
1101 add @x[6],@x[6],@t[2]
1102 add @x[7],@x[7],@t[3]
1103 ldr @t[3],[sp,#4*(32+2)] @ re-load len
1114 stmia @t[0],{@x[0]-@x[7]}
1116 sub @t[3],@t[0],#64*3 @ len-=64*3
1119 ldrb @t[0],[@t[2]],#1 @ read buffer on stack
1120 ldrb @t[1],[r12],#1 @ read input
1122 eor @t[0],@t[0],@t[1]
1123 strb @t[0],[r14],#1 @ store output
1130 ldmia sp!,{r4-r11,pc}
1131 .size ChaCha20_neon,.-ChaCha20_neon
1132 .comm OPENSSL_armcap_P,4,4
# Post-process the accumulated $code one line at a time (the remainder of
# the loop body is not part of this excerpt).
1137 foreach (split("\n",$code)) {
# Replace every `...` span with the result of evaluating its contents
# as Perl.
1138 s/\`([^\`]*)\`/eval $1/geo;
# Rewrite qN#lo and qN#hi into d-register names: d(2N) for "lo" and
# d(2N+1) for "hi".
1140 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;