3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
14 # Performance in cycles per byte out of large buffer.
16 # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU
18 # Cortex-A5 19.3(*)/+95% 21.8 14.1
19 # Cortex-A8 10.5(*)/+160% 13.9 6.35
20 # Cortex-A9 12.9(**)/+110% 14.3 6.50
21 # Cortex-A15 11.0/+40% 16.0 5.00
22 # Snapdragon S4 11.5/+125% 13.6 4.90
24 # (*) most "favourable" result for aligned data on little-endian
25 # processor, result for misaligned data is 10-15% lower;
26 # (**) this result is a trade-off: it can be improved by 20%,
27 # but then Snapdragon S4 and Cortex-A8 results get 20-25% worse;
# Decide where the generated assembly goes.  $flavour is set before this point
# (not visible in this excerpt).  If it already looks like a file name (word
# characters, a dot, an extension) it is taken as the output file and $flavour
# is cleared; otherwise arguments are shifted until one looks like a file name.
31 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
32 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
# With a real flavour (anything but "void") the output is piped through the
# arm-xlate.pl translator.
34 if ($flavour && $flavour ne "void") {
# $dir becomes the directory part of $0, trailing path separator included.
35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the translator next to this script first, then in ../../perlasm/;
# give up if neither file exists.
36 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
37 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
38 die "can't locate arm-xlate.pl";
# Re-open STDOUT as a pipe into "<this perl> <xlate> <flavour> <output>".
# NOTE(review): the return value of open is not checked, so a failure to
# start the translator goes unnoticed here.
40 open STDOUT,"| \"$^X\" $xlate $flavour $output";
# Re-open STDOUT straight onto the output file.
# NOTE(review): presumably the no-translator branch; the branch keyword is not
# visible in this excerpt.  The result of open is not checked either.
42 open STDOUT,">$output";
# AUTOLOAD is invoked for calls to subs that are not defined, e.g. &add(...)
# or &vadd_i32(...), and turns each such call into one line of assembly text
# appended to $code.
45 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
# The called name, stripped of its package qualifier, is the mnemonic; the
# first underscore becomes a dot (vadd_i32 -> vadd.i32).
46 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
# $arg is assigned on a line not visible in this excerpt; it is emitted as the
# final operand below.  A value that is unchanged by a numeric round-trip is
# a plain number and gets the '#' immediate prefix.
48 $arg = "#$arg" if ($arg*1 eq $arg);
# Emit "<tab>mnemonic<tab>op1,op2,...": the remaining arguments followed by $arg.
49 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
# @x maps a state-word index (0..15) to a register name.  Indices 0-7, 12 and
# 14 get r0-r7, r12 and r14; the "x" entries produce the placeholder name "rx"
# for indices 8-11, 13 and 15, which have no register of their own.
52 my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
# @t is r8..r11, used as temporaries by ROUND and by the assembly text.
53 my @t=map("r$_",(8..11));
# Body of the integer round generator (called as &ROUND(a0,b0,c0,d0); the sub
# header is not visible in this excerpt).  The arguments are the state-word
# indices of the first quarter-round; the strings listed below are Perl
# statements that callers eval, which dispatches them through AUTOLOAD.
56 my ($a0,$b0,$c0,$d0)=@_;
# Derive the indices of the other three quarter-rounds: each index keeps its
# row (index & ~3) and advances by one position within the row, modulo 4.
57 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
58 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
59 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# The two 'c' words being worked on live in temporaries @t[0] and @t[1].
61 my ($xc,$xc_) = (@t[0..1]);
# Of the two 'd' words being worked on, one sits in its own register and the
# other in temporary @t[2]; which is which depends on $odd ($odd is set on a
# line not visible in this excerpt).
62 my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
65 # Consider the order in which variables are addressed by their index
# (rows follow from the index maps above):
#
#	a   b   c   d
#
70 # 0 4 8 12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
74 # 0 5 10 15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
#
79 # 'a', 'b' are permanently allocated in registers, @x[0..7],
80 # while 'c's and pair of 'd's are maintained in memory. If
81 # you observe 'c' column, you'll notice that pair of 'c's is
82 # invariant between rounds. This means that we have to reload
83 # them once per round, in the middle. This is why you'll see
84 # bunch of 'c' stores and loads in the middle, but none in
85 # the beginning or end. If you observe 'd' column, you'll
86 # notice that 15 and 13 are reused in next pair of rounds.
87 # This is why these two are chosen for offloading to memory,
88 # to make loads count more.
# First pair of quarter-rounds (index sets 0 and 1), interleaved.
# a += b, then d is rotated and xor'ed with the equally rotated a; rotating
# both operands by the same amount equals rotating their xor, so this leaves
# (d ^ a) ror 16 in d.
90 "&add (@x[$a0],@x[$a0],@x[$b0])",
91 "&mov ($xd,$xd,'ror#16')",
92 "&add (@x[$a1],@x[$a1],@x[$b1])",
93 "&mov ($xd_,$xd_,'ror#16')",
94 "&eor ($xd,$xd,@x[$a0],'ror#16')",
95 "&eor ($xd_,$xd_,@x[$a1],'ror#16')",
# b = (b ^ c) ror 20, i.e. rotate left by 12 (the matching c += d for the
# first quarter-round is on a line not visible in this excerpt).
98 "&mov (@x[$b0],@x[$b0],'ror#20')",
99 "&add ($xc_,$xc_,$xd_)",
100 "&mov (@x[$b1],@x[$b1],'ror#20')",
101 "&eor (@x[$b0],@x[$b0],$xc,'ror#20')",
102 "&eor (@x[$b1],@x[$b1],$xc_,'ror#20')",
# a += b; d = (d ^ a) ror 24, i.e. rotate left by 8.
104 "&add (@x[$a0],@x[$a0],@x[$b0])",
105 "&mov ($xd,$xd,'ror#24')",
106 "&add (@x[$a1],@x[$a1],@x[$b1])",
107 "&mov ($xd_,$xd_,'ror#24')",
108 "&eor ($xd,$xd,@x[$a0],'ror#24')",
109 "&eor ($xd_,$xd_,@x[$a1],'ror#24')",
# c += d; b = (b ^ c) ror 25, i.e. rotate left by 7.  The final eor pair is
# emitted further down so that the 'd' spill/reload can be slotted in between.
111 "&add ($xc,$xc,$xd)",
112 "&mov (@x[$b0],@x[$b0],'ror#25')" );
# Odd round only: $xd is the temporary here, so write word d0 back to the
# off-load area at sp+4*16 and fetch word d2 for the second pair.
114 "&str ($xd,'[sp,#4*(16+$d0)]')",
115 "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd);
117 "&add ($xc_,$xc_,$xd_)",
118 "&mov (@x[$b1],@x[$b1],'ror#25')" );
# Even round only: $xd_ is the temporary here, so write word d1 back and
# fetch word d3.
120 "&str ($xd_,'[sp,#4*(16+$d1)]')",
121 "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd);
123 "&eor (@x[$b0],@x[$b0],$xc,'ror#25')",
124 "&eor (@x[$b1],@x[$b1],$xc_,'ror#25')" );
# Re-point the 'd' name that referred to a dedicated register at the register
# of the word used by the second pair; the other name keeps @t[2], which was
# reloaded above.
126 $xd=@x[$d2] if (!$odd);
127 $xd_=@x[$d3] if ($odd);
# Second pair of quarter-rounds (index sets 2 and 3).  The 'c' words of the
# first pair are stored to the off-load area and the next two are loaded,
# interleaved with the first a += b / rotate step.
129 "&str ($xc,'[sp,#4*(16+$c0)]')",
130 "&ldr ($xc,'[sp,#4*(16+$c2)]')",
131 "&add (@x[$a2],@x[$a2],@x[$b2])",
132 "&mov ($xd,$xd,'ror#16')",
133 "&str ($xc_,'[sp,#4*(16+$c1)]')",
134 "&ldr ($xc_,'[sp,#4*(16+$c3)]')",
135 "&add (@x[$a3],@x[$a3],@x[$b3])",
136 "&mov ($xd_,$xd_,'ror#16')",
137 "&eor ($xd,$xd,@x[$a2],'ror#16')",
138 "&eor ($xd_,$xd_,@x[$a3],'ror#16')",
# c += d; b = (b ^ c) ror 20.
140 "&add ($xc,$xc,$xd)",
141 "&mov (@x[$b2],@x[$b2],'ror#20')",
142 "&add ($xc_,$xc_,$xd_)",
143 "&mov (@x[$b3],@x[$b3],'ror#20')",
144 "&eor (@x[$b2],@x[$b2],$xc,'ror#20')",
145 "&eor (@x[$b3],@x[$b3],$xc_,'ror#20')",
# a += b; d = (d ^ a) ror 24.
147 "&add (@x[$a2],@x[$a2],@x[$b2])",
148 "&mov ($xd,$xd,'ror#24')",
149 "&add (@x[$a3],@x[$a3],@x[$b3])",
150 "&mov ($xd_,$xd_,'ror#24')",
151 "&eor ($xd,$xd,@x[$a2],'ror#24')",
152 "&eor ($xd_,$xd_,@x[$a3],'ror#24')",
# c += d; b = (b ^ c) ror 25.
154 "&add ($xc,$xc,$xd)",
155 "&mov (@x[$b2],@x[$b2],'ror#25')",
156 "&add ($xc_,$xc_,$xd_)",
157 "&mov (@x[$b3],@x[$b3],'ror#25')",
158 "&eor (@x[$b2],@x[$b2],$xc,'ror#25')",
159 "&eor (@x[$b3],@x[$b3],$xc_,'ror#25')" );
165 #include "arm_arch.h"
168 #if defined(__thumb2__)
175 #if defined(__thumb2__) || defined(__clang__)
176 #define ldrhsb ldrbhs
181 .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
184 #if __ARM_MAX_ARCH__>=7
186 .word OPENSSL_armcap_P-.LChaCha20_ctr32
191 .globl ChaCha20_ctr32
192 .type ChaCha20_ctr32,%function
196 ldr r12,[sp,#0] @ pull pointer to counter and nonce
197 stmdb sp!,{r0-r2,r4-r11,lr}
198 #if __ARM_ARCH__<7 && !defined(__thumb2__)
199 sub r14,pc,#16 @ ChaCha20_ctr32
201 adr r14,.LChaCha20_ctr32
209 #if __ARM_MAX_ARCH__>=7
210 cmp r2,#192 @ test len
221 ldmia r12,{r4-r7} @ load counter and nonce
222 sub sp,sp,#4*(16) @ off-load area
223 sub r14,r14,#64 @ .Lsigma
224 stmdb sp!,{r4-r7} @ copy counter and nonce
225 ldmia r3,{r4-r11} @ load key
226 ldmia r14,{r0-r3} @ load sigma
227 stmdb sp!,{r4-r11} @ copy key
228 stmdb sp!,{r0-r3} @ copy sigma
229 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
230 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
235 ldmia sp,{r0-r9} @ load key material
236 str @t[3],[sp,#4*(32+2)] @ save len
237 str r12, [sp,#4*(32+1)] @ save inp
238 str r14, [sp,#4*(32+0)] @ save out
240 ldr @t[3], [sp,#4*(15)]
241 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
242 ldr @t[2], [sp,#4*(13)]
243 ldr @x[14],[sp,#4*(14)]
244 str @t[3], [sp,#4*(16+15)]
# Emit one even round (first quarter-round on words 0,4,8,12) followed by one
# odd round (0,5,10,15).  ROUND returns Perl statements as strings; each is
# eval'ed in turn, which appends the instruction to $code via AUTOLOAD.
252 foreach (&ROUND(0, 4, 8,12)) { eval; }
253 foreach (&ROUND(0, 5,10,15)) { eval; }
257 ldr @t[3],[sp,#4*(32+2)] @ load len
259 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
260 str @t[1], [sp,#4*(16+9)]
261 str @x[12],[sp,#4*(16+12)]
262 str @t[2], [sp,#4*(16+13)]
263 str @x[14],[sp,#4*(16+14)]
265 @ at this point we have first half of 512-bit result in
266 @ @x[0-7] and second half at sp+4*(16+8)
268 cmp @t[3],#64 @ done yet?
272 addlo r12,sp,#4*(0) @ shortcut or ...
273 ldrhs r12,[sp,#4*(32+1)] @ ... load inp
274 addlo r14,sp,#4*(0) @ shortcut or ...
275 ldrhs r14,[sp,#4*(32+0)] @ ... load out
277 ldr @t[0],[sp,#4*(0)] @ load key material
278 ldr @t[1],[sp,#4*(1)]
280 #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
283 tst @t[2],#3 @ are input and output aligned?
284 ldr @t[2],[sp,#4*(2)]
286 cmp @t[3],#64 @ restore flags
288 ldr @t[2],[sp,#4*(2)]
290 ldr @t[3],[sp,#4*(3)]
292 add @x[0],@x[0],@t[0] @ accumulate key material
293 add @x[1],@x[1],@t[1]
297 ldrhs @t[0],[r12],#16 @ load input
298 ldrhs @t[1],[r12,#-12]
300 add @x[2],@x[2],@t[2]
301 add @x[3],@x[3],@t[3]
305 ldrhs @t[2],[r12,#-8]
306 ldrhs @t[3],[r12,#-4]
307 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
316 eorhs @x[0],@x[0],@t[0] @ xor with input
317 eorhs @x[1],@x[1],@t[1]
319 str @x[0],[r14],#16 @ store output
323 eorhs @x[2],@x[2],@t[2]
324 eorhs @x[3],@x[3],@t[3]
325 ldmia @t[0],{@t[0]-@t[3]} @ load key material
330 add @x[4],@x[4],@t[0] @ accumulate key material
331 add @x[5],@x[5],@t[1]
335 ldrhs @t[0],[r12],#16 @ load input
336 ldrhs @t[1],[r12,#-12]
337 add @x[6],@x[6],@t[2]
338 add @x[7],@x[7],@t[3]
342 ldrhs @t[2],[r12,#-8]
343 ldrhs @t[3],[r12,#-4]
344 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
353 eorhs @x[4],@x[4],@t[0]
354 eorhs @x[5],@x[5],@t[1]
356 str @x[4],[r14],#16 @ store output
360 eorhs @x[6],@x[6],@t[2]
361 eorhs @x[7],@x[7],@t[3]
363 ldmia @t[0],{@t[0]-@t[3]} @ load key material
365 add @x[0],sp,#4*(16+8)
368 ldmia @x[0],{@x[0]-@x[7]} @ load second half
370 add @x[0],@x[0],@t[0] @ accumulate key material
371 add @x[1],@x[1],@t[1]
375 ldrhs @t[0],[r12],#16 @ load input
376 ldrhs @t[1],[r12,#-12]
380 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
381 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
382 add @x[2],@x[2],@t[2]
383 add @x[3],@x[3],@t[3]
387 ldrhs @t[2],[r12,#-8]
388 ldrhs @t[3],[r12,#-4]
389 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
398 eorhs @x[0],@x[0],@t[0]
399 eorhs @x[1],@x[1],@t[1]
401 str @x[0],[r14],#16 @ store output
405 eorhs @x[2],@x[2],@t[2]
406 eorhs @x[3],@x[3],@t[3]
408 ldmia @t[0],{@t[0]-@t[3]} @ load key material
412 add @x[4],@x[4],@t[0] @ accumulate key material
413 add @x[5],@x[5],@t[1]
417 addhi @t[0],@t[0],#1 @ next counter value
418 strhi @t[0],[sp,#4*(12)] @ save next counter value
422 ldrhs @t[0],[r12],#16 @ load input
423 ldrhs @t[1],[r12,#-12]
424 add @x[6],@x[6],@t[2]
425 add @x[7],@x[7],@t[3]
429 ldrhs @t[2],[r12,#-8]
430 ldrhs @t[3],[r12,#-4]
431 # if __ARM_ARCH__>=6 && defined(__ARMEB__)
440 eorhs @x[4],@x[4],@t[0]
441 eorhs @x[5],@x[5],@t[1]
445 ldrne @t[0],[sp,#4*(32+2)] @ re-load len
449 eorhs @x[6],@x[6],@t[2]
450 eorhs @x[7],@x[7],@t[3]
451 str @x[4],[r14],#16 @ store output
456 subhs @t[3],@t[0],#64 @ len-=64
466 .Lunaligned: @ unaligned endian-neutral path
467 cmp @t[3],#64 @ restore flags
471 ldr @t[3],[sp,#4*(3)]
473 for ($i=0;$i<16;$i+=4) {
476 $code.=<<___ if ($i==4);
477 add @x[0],sp,#4*(16+8)
479 $code.=<<___ if ($i==8);
480 ldmia @x[0],{@x[0]-@x[7]} @ load second half
484 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]"
485 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]"
488 add @x[$j+0],@x[$j+0],@t[0] @ accumulate key material
490 $code.=<<___ if ($i==12);
494 addhi @t[0],@t[0],#1 @ next counter value
495 strhi @t[0],[sp,#4*(12)] @ save next counter value
498 add @x[$j+1],@x[$j+1],@t[1]
499 add @x[$j+2],@x[$j+2],@t[2]
503 eorlo @t[0],@t[0],@t[0] @ zero or ...
504 ldrhsb @t[0],[r12],#16 @ ... load input
505 eorlo @t[1],@t[1],@t[1]
506 ldrhsb @t[1],[r12,#-12]
508 add @x[$j+3],@x[$j+3],@t[3]
512 eorlo @t[2],@t[2],@t[2]
513 ldrhsb @t[2],[r12,#-8]
514 eorlo @t[3],@t[3],@t[3]
515 ldrhsb @t[3],[r12,#-4]
517 eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero)
518 eor @x[$j+1],@t[1],@x[$j+1]
522 ldrhsb @t[0],[r12,#-15] @ load more input
523 ldrhsb @t[1],[r12,#-11]
524 eor @x[$j+2],@t[2],@x[$j+2]
525 strb @x[$j+0],[r14],#16 @ store output
526 eor @x[$j+3],@t[3],@x[$j+3]
530 ldrhsb @t[2],[r12,#-7]
531 ldrhsb @t[3],[r12,#-3]
532 strb @x[$j+1],[r14,#-12]
533 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
534 strb @x[$j+2],[r14,#-8]
535 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
539 ldrhsb @t[0],[r12,#-14] @ load more input
540 ldrhsb @t[1],[r12,#-10]
541 strb @x[$j+3],[r14,#-4]
542 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
543 strb @x[$j+0],[r14,#-15]
544 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
548 ldrhsb @t[2],[r12,#-6]
549 ldrhsb @t[3],[r12,#-2]
550 strb @x[$j+1],[r14,#-11]
551 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
552 strb @x[$j+2],[r14,#-7]
553 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
557 ldrhsb @t[0],[r12,#-13] @ load more input
558 ldrhsb @t[1],[r12,#-9]
559 strb @x[$j+3],[r14,#-3]
560 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
561 strb @x[$j+0],[r14,#-14]
562 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
566 ldrhsb @t[2],[r12,#-5]
567 ldrhsb @t[3],[r12,#-1]
568 strb @x[$j+1],[r14,#-10]
569 strb @x[$j+2],[r14,#-6]
570 eor @x[$j+0],@t[0],@x[$j+0],lsr#8
571 strb @x[$j+3],[r14,#-2]
572 eor @x[$j+1],@t[1],@x[$j+1],lsr#8
573 strb @x[$j+0],[r14,#-13]
574 eor @x[$j+2],@t[2],@x[$j+2],lsr#8
575 strb @x[$j+1],[r14,#-9]
576 eor @x[$j+3],@t[3],@x[$j+3],lsr#8
577 strb @x[$j+2],[r14,#-5]
578 strb @x[$j+3],[r14,#-1]
580 $code.=<<___ if ($i<12);
581 add @t[0],sp,#4*(4+$i)
582 ldmia @t[0],{@t[0]-@t[3]} @ load key material
589 ldrne @t[0],[sp,#4*(32+2)] @ re-load len
593 subhs @t[3],@t[0],#64 @ len-=64
600 ldr r12,[sp,#4*(32+1)] @ load inp
602 ldr r14,[sp,#4*(32+0)] @ load out
605 ldrb @t[2],[@t[1]],#1 @ read buffer on stack
606 ldrb @t[3],[r12],#1 @ read input
608 eor @t[3],@t[3],@t[2]
609 strb @t[3],[r14],#1 @ store output
615 ldmia sp!,{r4-r11,pc}
616 .size ChaCha20_ctr32,.-ChaCha20_ctr32
# Names for the sixteen vector operands used by the NEON path: three a/b/c/d
# state sets plus four temporaries.  The right-hand side of this assignment is
# not visible in this excerpt.
620 my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
# Body of the NEON round generator (called as &NEONROUND(a,b,c,d,t,odd); the
# sub header and the $odd assignment are not visible here).  $a..$d are the
# four row vectors, $t is a temporary.  The strings are Perl statements that
# the caller evals.  NOTE(review): the xor steps feeding $d and $t are on
# lines missing from this excerpt.
625 my ($a,$b,$c,$d,$t)=@_;
# a += b
628 "&vadd_i32 ($a,$a,$b)",
# Swapping the 16-bit halves of every 32-bit lane is a rotate by 16.
630 "&vrev32_16 ($d,$d)", # vrot ($d,16)
# c += d
632 "&vadd_i32 ($c,$c,$d)",
# b = (t >> 20) | (t << 12): shift right, then shift-left-and-insert,
# which together rotate t left by 12.
634 "&vshr_u32 ($b,$t,20)",
635 "&vsli_32 ($b,$t,12)",
# a += b
637 "&vadd_i32 ($a,$a,$b)",
# d = t rotated left by 8.
639 "&vshr_u32 ($d,$t,24)",
640 "&vsli_32 ($d,$t,8)",
# c += d
642 "&vadd_i32 ($c,$c,$d)",
# b = t rotated left by 7.
644 "&vshr_u32 ($b,$t,25)",
645 "&vsli_32 ($b,$t,7)",
# Rotate the rows by whole 32-bit words (vext.8 of a register with itself,
# count in bytes): c by 8 bytes, b and d by 4/12 bytes with the two counts
# swapped between even and odd rounds, so the odd call reverses the shuffle
# done by the even call.
647 "&vext_8 ($c,$c,$c,8)",
648 "&vext_8 ($b,$b,$b,$odd?12:4)",
649 "&vext_8 ($d,$d,$d,$odd?4:12)"
654 #if __ARM_MAX_ARCH__>=7
658 .type ChaCha20_neon,%function
661 ldr r12,[sp,#0] @ pull pointer to counter and nonce
662 stmdb sp!,{r0-r2,r4-r11,lr}
665 vstmdb sp!,{d8-d15} @ ABI spec says so
668 vld1.32 {$b0-$c0},[r3] @ load key
669 ldmia r3,{r4-r11} @ load key
672 vld1.32 {$d0},[r12] @ load counter and nonce
674 ldmia r14,{r0-r3} @ load sigma
675 vld1.32 {$a0},[r14]! @ load sigma
676 vld1.32 {$t0},[r14] @ one
677 vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce
678 vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key
680 str r10,[sp,#4*(16+10)] @ off-load "@x[10]"
681 str r11,[sp,#4*(16+11)] @ off-load "@x[11]"
682 vshl.i32 $t1#lo,$t0#lo,#1 @ two
683 vstr $t0#lo,[sp,#4*(16+0)]
684 vshl.i32 $t2#lo,$t0#lo,#2 @ four
685 vstr $t1#lo,[sp,#4*(16+2)]
687 vstr $t2#lo,[sp,#4*(16+4)]
695 ldmia sp,{r0-r9} @ load key material
696 cmp @t[3],#64*2 @ if len<=64*2
697 bls .Lbreak_neon @ switch to integer-only
699 str @t[3],[sp,#4*(32+2)] @ save len
701 str r12, [sp,#4*(32+1)] @ save inp
703 str r14, [sp,#4*(32+0)] @ save out
706 ldr @t[3], [sp,#4*(15)]
707 vadd.i32 $d1,$d0,$t0 @ counter+1
708 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
710 ldr @t[2], [sp,#4*(13)]
712 ldr @x[14],[sp,#4*(14)]
713 vadd.i32 $d2,$d1,$t0 @ counter+2
714 str @t[3], [sp,#4*(16+15)]
716 add @x[12],@x[12],#3 @ counter+3
# Even round: build three independent NEON instruction streams (one per
# a/b/c/d register set, last argument 0 = even) and one integer stream.
723 my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
724 my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
725 my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
726 my @thread3=&ROUND(0,4,8,12);
# Interleave the streams: each NEON statement is followed by one integer
# statement taken from @thread3.  The bare eval works on $_, presumably
# supplied by a loop over @thread0 whose header is not visible in this excerpt.
729 eval; eval(shift(@thread3));
730 eval(shift(@thread1)); eval(shift(@thread3));
731 eval(shift(@thread2)); eval(shift(@thread3));
# Odd round: same scheme with the odd variants (last argument 1, and the
# integer round starting from words 0,5,10,15).
734 @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
735 @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
736 @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
737 @thread3=&ROUND(0,5,10,15);
740 eval; eval(shift(@thread3));
741 eval(shift(@thread1)); eval(shift(@thread3));
742 eval(shift(@thread2)); eval(shift(@thread3));
748 vld1.32 {$t0-$t1},[sp] @ load key material
749 vld1.32 {$t2-$t3},[@t[3]]
751 ldr @t[3],[sp,#4*(32+2)] @ load len
753 str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store
754 str @t[1], [sp,#4*(16+9)]
755 str @x[12],[sp,#4*(16+12)]
756 str @t[2], [sp,#4*(16+13)]
757 str @x[14],[sp,#4*(16+14)]
759 @ at this point we have first half of 512-bit result in
760 @ @x[0-7] and second half at sp+4*(16+8)
762 ldr r12,[sp,#4*(32+1)] @ load inp
763 ldr r14,[sp,#4*(32+0)] @ load out
765 vadd.i32 $a0,$a0,$t0 @ accumulate key material
768 vldr $t0#lo,[sp,#4*(16+0)] @ one
773 vldr $t1#lo,[sp,#4*(16+2)] @ two
778 vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1
779 vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2
788 vld1.8 {$t0-$t1},[r12]! @ load input
790 vld1.8 {$t2-$t3},[r12]!
791 veor $a0,$a0,$t0 @ xor with input
793 vld1.8 {$t0-$t1},[r12]!
796 vld1.8 {$t2-$t3},[r12]!
799 vst1.8 {$a0-$b0},[r14]! @ store output
801 vld1.8 {$t0-$t1},[r12]!
803 vst1.8 {$c0-$d0},[r14]!
805 vld1.8 {$t2-$t3},[r12]!
808 vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration
809 veor $t0#hi,$t0#hi,$t0#hi
810 vldr $t0#lo,[sp,#4*(16+4)] @ four
812 vld1.32 {$c0-$d0},[@t[3]]
814 vst1.8 {$a1-$b1},[r14]!
816 vst1.8 {$c1-$d1},[r14]!
818 vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value
819 vldr $t0#lo,[sp,#4*(16+0)] @ one
821 ldmia sp,{@t[0]-@t[3]} @ load key material
822 add @x[0],@x[0],@t[0] @ accumulate key material
823 ldr @t[0],[r12],#16 @ load input
824 vst1.8 {$a2-$b2},[r14]!
825 add @x[1],@x[1],@t[1]
827 vst1.8 {$c2-$d2},[r14]!
828 add @x[2],@x[2],@t[2]
830 add @x[3],@x[3],@t[3]
838 eor @x[0],@x[0],@t[0] @ xor with input
840 eor @x[1],@x[1],@t[1]
841 str @x[0],[r14],#16 @ store output
842 eor @x[2],@x[2],@t[2]
844 eor @x[3],@x[3],@t[3]
845 ldmia @t[0],{@t[0]-@t[3]} @ load key material
849 add @x[4],@x[4],@t[0] @ accumulate key material
850 ldr @t[0],[r12],#16 @ load input
851 add @x[5],@x[5],@t[1]
853 add @x[6],@x[6],@t[2]
855 add @x[7],@x[7],@t[3]
863 eor @x[4],@x[4],@t[0]
865 eor @x[5],@x[5],@t[1]
866 str @x[4],[r14],#16 @ store output
867 eor @x[6],@x[6],@t[2]
869 eor @x[7],@x[7],@t[3]
870 ldmia @t[0],{@t[0]-@t[3]} @ load key material
872 add @x[0],sp,#4*(16+8)
875 ldmia @x[0],{@x[0]-@x[7]} @ load second half
877 add @x[0],@x[0],@t[0] @ accumulate key material
878 ldr @t[0],[r12],#16 @ load input
879 add @x[1],@x[1],@t[1]
884 strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it
885 add @x[2],@x[2],@t[2]
890 strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it
891 add @x[3],@x[3],@t[3]
899 eor @x[0],@x[0],@t[0]
901 eor @x[1],@x[1],@t[1]
902 str @x[0],[r14],#16 @ store output
903 eor @x[2],@x[2],@t[2]
905 eor @x[3],@x[3],@t[3]
906 ldmia @t[0],{@t[0]-@t[3]} @ load key material
910 add @x[4],@x[4],@t[0] @ accumulate key material
911 add @t[0],@t[0],#4 @ next counter value
912 add @x[5],@x[5],@t[1]
913 str @t[0],[sp,#4*(12)] @ save next counter value
914 ldr @t[0],[r12],#16 @ load input
915 add @x[6],@x[6],@t[2]
916 add @x[4],@x[4],#3 @ counter+3
918 add @x[7],@x[7],@t[3]
927 eor @x[4],@x[4],@t[0]
931 ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
932 eor @x[5],@x[5],@t[1]
933 eor @x[6],@x[6],@t[2]
934 str @x[4],[r14],#16 @ store output
935 eor @x[7],@x[7],@t[3]
937 sub @t[3],@t[0],#64*4 @ len-=64*4
946 @ harmonize NEON and integer-only stack frames: load data
947 @ from NEON frame, but save to integer-only one; distance
948 @ between the two is 4*(32+4+16-32)=4*(20).
950 str @t[3], [sp,#4*(20+32+2)] @ save len
951 add @t[3],sp,#4*(32+4)
952 str r12, [sp,#4*(20+32+1)] @ save inp
953 str r14, [sp,#4*(20+32+0)] @ save out
955 ldr @x[12],[sp,#4*(16+10)]
956 ldr @x[14],[sp,#4*(16+11)]
957 vldmia @t[3],{d8-d15} @ fulfill ABI requirement
958 str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]"
959 str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]"
961 ldr @t[3], [sp,#4*(15)]
962 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load
963 ldr @t[2], [sp,#4*(13)]
964 ldr @x[14],[sp,#4*(14)]
965 str @t[3], [sp,#4*(20+16+15)]
967 vst1.32 {$a0-$b0},[@t[3]]! @ copy key
968 add sp,sp,#4*(20) @ switch frame
969 vst1.32 {$c0-$d0},[@t[3]]
971 b .Loop @ go integer-only
976 bhs .L192_or_more_neon
978 bhs .L128_or_more_neon
980 bhs .L64_or_more_neon
983 vst1.8 {$a0-$b0},[sp]
985 vst1.8 {$c0-$d0},[@t[0]]
990 vld1.8 {$t0-$t1},[r12]!
991 vld1.8 {$t2-$t3},[r12]!
996 vst1.8 {$a0-$b0},[r14]!
997 vst1.8 {$c0-$d0},[r14]!
1002 vst1.8 {$a1-$b1},[sp]
1004 vst1.8 {$c1-$d1},[@t[0]]
1005 sub @t[3],@t[3],#64*1 @ len-=64*1
1010 vld1.8 {$t0-$t1},[r12]!
1011 vld1.8 {$t2-$t3},[r12]!
1014 vld1.8 {$t0-$t1},[r12]!
1017 vld1.8 {$t2-$t3},[r12]!
1021 vst1.8 {$a0-$b0},[r14]!
1023 vst1.8 {$c0-$d0},[r14]!
1025 vst1.8 {$a1-$b1},[r14]!
1026 vst1.8 {$c1-$d1},[r14]!
1031 vst1.8 {$a2-$b2},[sp]
1033 vst1.8 {$c2-$d2},[@t[0]]
1034 sub @t[3],@t[3],#64*2 @ len-=64*2
1039 vld1.8 {$t0-$t1},[r12]!
1040 vld1.8 {$t2-$t3},[r12]!
1043 vld1.8 {$t0-$t1},[r12]!
1046 vld1.8 {$t2-$t3},[r12]!
1050 vld1.8 {$t0-$t1},[r12]!
1052 vst1.8 {$a0-$b0},[r14]!
1054 vld1.8 {$t2-$t3},[r12]!
1057 vst1.8 {$c0-$d0},[r14]!
1059 vst1.8 {$a1-$b1},[r14]!
1061 vst1.8 {$c1-$d1},[r14]!
1063 vst1.8 {$a2-$b2},[r14]!
1064 vst1.8 {$c2-$d2},[r14]!
1068 ldmia sp,{@t[0]-@t[3]} @ load key material
1069 add @x[0],@x[0],@t[0] @ accumulate key material
1071 add @x[1],@x[1],@t[1]
1072 add @x[2],@x[2],@t[2]
1073 add @x[3],@x[3],@t[3]
1074 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1076 add @x[4],@x[4],@t[0] @ accumulate key material
1078 add @x[5],@x[5],@t[1]
1079 add @x[6],@x[6],@t[2]
1080 add @x[7],@x[7],@t[3]
1081 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1092 stmia sp,{@x[0]-@x[7]}
1093 add @x[0],sp,#4*(16+8)
1095 ldmia @x[0],{@x[0]-@x[7]} @ load second half
1097 add @x[0],@x[0],@t[0] @ accumulate key material
1098 add @t[0],sp,#4*(12)
1099 add @x[1],@x[1],@t[1]
1100 add @x[2],@x[2],@t[2]
1101 add @x[3],@x[3],@t[3]
1102 ldmia @t[0],{@t[0]-@t[3]} @ load key material
1104 add @x[4],@x[4],@t[0] @ accumulate key material
1106 add @x[5],@x[5],@t[1]
1107 add @x[4],@x[4],#3 @ counter+3
1108 add @x[6],@x[6],@t[2]
1109 add @x[7],@x[7],@t[3]
1110 ldr @t[3],[sp,#4*(32+2)] @ re-load len
1121 stmia @t[0],{@x[0]-@x[7]}
1123 sub @t[3],@t[3],#64*3 @ len-=64*3
1126 ldrb @t[0],[@t[2]],#1 @ read buffer on stack
1127 ldrb @t[1],[r12],#1 @ read input
1129 eor @t[0],@t[0],@t[1]
1130 strb @t[0],[r14],#1 @ store output
1137 ldmia sp!,{r4-r11,pc}
1138 .size ChaCha20_neon,.-ChaCha20_neon
1139 .comm OPENSSL_armcap_P,4,4
1144 foreach (split("\n",$code)) {
1145 s/\`([^\`]*)\`/eval $1/geo;
1147 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;