# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ChaCha20 for x86_64.
#
# Add AVX512F code path.
#
# Add AVX512VL code path.
#
# Performance in cycles per byte out of large buffer.
#
#		IALU/gcc 4.8(i)	1x/2xSSSE3(ii)	4xSSSE3	    NxAVX(v)
#
# Core2		7.83/+55%	7.90/5.76	4.35
# Westmere	7.19/+50%	5.60/4.50	3.00
# Sandy Bridge	8.31/+42%	5.45/4.00	2.72
# Ivy Bridge	6.71/+46%	5.40/?		2.41
# Haswell	5.92/+43%	5.20/3.45	2.42	    1.23
# Skylake[-X]	5.87/+39%	4.70/3.22	2.31	    1.19[0.80(vi)]
# Silvermont	12.0/+33%	7.75/6.90	7.03(iii)
# Knights L	11.7/-		?		9.60(iii)   0.80
# Goldmont	10.6/+17%	5.10/3.52	3.28
# Sledgehammer	7.28/+52%	-		-
# Bulldozer	9.66/+28%	9.85/5.35(iv)	3.06(iv)
# Ryzen		5.96/+50%	5.19/3.00	2.40	    2.09
# VIA Nano	10.5/+46%	6.72/6.88	6.05
#
# (i)	compared to older gcc 3.x one can observe >2x improvement on
# (ii)	2xSSSE3 is a code path optimized specifically for the 128 bytes
#	used by chacha20_poly1305_tls_cipher; results are EVP-free;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations; SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
# (iv)	Bulldozer actually executes the 4xXOP code path, which delivers
#	2.20 and 4.85 for 128-byte inputs;
# (v)	8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applies;
# (vi)	even though Skylake-X can execute AVX512F code and deliver 0.57
#	cpb in a single thread, the corresponding capability is suppressed;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
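# A typical invocation looks like this (illustrative only; the flavour
# and output file names are examples, not fixed):
#
#	perl chacha-x86_64.pl elf chacha-x86_64.s	# Linux, ELF
#	perl chacha-x86_64.pl nasm chacha-x86_64.asm	# Windows, NASM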
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
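# The resulting $avx level gates the code paths below: 1 enables the
# AVX/XOP paths, 2 enables AVX2, 3 enables AVX512F/AVX512VL; 0 limits
# output to the integer and SSSE3 paths.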
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";

# input parameter block
($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");

.extern	OPENSSL_ia32cap_P

.long	0,2,4,6,1,3,5,7
.long	8,8,8,8,8,8,8,8
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
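# The two .byte tables above are pshufb masks: within each 32-bit lane,
# (0x2,0x3,0x0,0x1) moves byte 2 into position 0 and so on, i.e. a
# 16-bit rotation, while (0x3,0x0,0x1,0x2) places byte 3 lowest, i.e. a
# left-rotation by 8. For example the lane 0x44332211 becomes
# 0x22114433 under the first mask and 0x33221144 under the second.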
.long	2,0,0,0, 2,0,0,0
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
  $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");

sub ROUND {			# critical path is 24 cycles per round
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	#	0  4  8 12 < even round
	#	0  5 10 15 < odd round
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them once per round, in the middle. This is why
	# you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
	#
	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order execution
	# cores are a dying breed and old Atom is the only one around,
	# instructions are left uninterleaved. Besides, Atom is better
	# off executing 1xSSSE3 code anyway...
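	# As a concrete example of the index arithmetic below: starting
	# from the even-round call ROUND(0,4,8,12), the map
	# ($_&~3)+(($_+1)&3) yields (1,5,9,13), then (2,6,10,14) and
	# (3,7,11,15), i.e. the four columns; the odd-round call
	# ROUND(0,5,10,15) yields (1,6,11,12), (2,7,8,13) and
	# (3,4,9,14), i.e. the four diagonals.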
187 "&add (@x[$a0],@x[$b0])", # Q1
188 "&xor (@x[$d0],@x[$a0])",
190 "&add (@x[$a1],@x[$b1])", # Q2
191 "&xor (@x[$d1],@x[$a1])",
194 "&add ($xc,@x[$d0])",
195 "&xor (@x[$b0],$xc)",
197 "&add ($xc_,@x[$d1])",
198 "&xor (@x[$b1],$xc_)",
201 "&add (@x[$a0],@x[$b0])",
202 "&xor (@x[$d0],@x[$a0])",
204 "&add (@x[$a1],@x[$b1])",
205 "&xor (@x[$d1],@x[$a1])",
208 "&add ($xc,@x[$d0])",
209 "&xor (@x[$b0],$xc)",
211 "&add ($xc_,@x[$d1])",
212 "&xor (@x[$b1],$xc_)",
215 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
216 "&mov (\"4*$c1(%rsp)\",$xc_)",
217 "&mov ($xc,\"4*$c2(%rsp)\")",
218 "&mov ($xc_,\"4*$c3(%rsp)\")",
220 "&add (@x[$a2],@x[$b2])", # Q3
221 "&xor (@x[$d2],@x[$a2])",
223 "&add (@x[$a3],@x[$b3])", # Q4
224 "&xor (@x[$d3],@x[$a3])",
227 "&add ($xc,@x[$d2])",
228 "&xor (@x[$b2],$xc)",
230 "&add ($xc_,@x[$d3])",
231 "&xor (@x[$b3],$xc_)",
234 "&add (@x[$a2],@x[$b2])",
235 "&xor (@x[$d2],@x[$a2])",
237 "&add (@x[$a3],@x[$b3])",
238 "&xor (@x[$d3],@x[$a3])",
241 "&add ($xc,@x[$d2])",
242 "&xor (@x[$b2],$xc)",
244 "&add ($xc_,@x[$d3])",
245 "&xor (@x[$b3],$xc_)",
########################################################################
# Generic code path that handles all lengths on pre-SSSE3 processors.
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,\@function,5
	mov	OPENSSL_ia32cap_P+4(%rip),%r10
$code.=<<___ if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	test	%r10,%r10		# check for AVX512VL
	js	.LChaCha20_avx512vl
	test	\$`1<<(41-32)`,%r10d
.cfi_adjust_cfa_offset	64+24
#movdqa	.Lsigma(%rip),%xmm0
	movdqu	16($key),%xmm2
	movdqu	($counter),%xmm3
	movdqa	.Lone(%rip),%xmm4
#movdqa	%xmm0,4*0(%rsp)			# key[0]
	movdqa	%xmm1,4*4(%rsp)		# key[1]
	movdqa	%xmm2,4*8(%rsp)		# key[2]
	movdqa	%xmm3,4*12(%rsp)	# key[3]
	mov	$len,%rbp		# reassign $len
	mov	\$0x61707865,@x[0]	# 'expa'
	mov	\$0x3320646e,@x[1]	# 'nd 3'
	mov	\$0x79622d32,@x[2]	# '2-by'
	mov	\$0x6b206574,@x[3]	# 'te k'
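	# Each constant above is a 4-byte chunk of "expand 32-byte k"
	# read as a little-endian dword, e.g. "expa" =
	# {0x65,0x78,0x70,0x61} gives 0x61707865.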
	mov	4*13(%rsp),@x[13]
	mov	4*14(%rsp),@x[14]
	mov	4*15(%rsp),@x[15]
	mov	%rbp,64+0(%rsp)		# save len
	mov	$inp,64+8(%rsp)		# save inp
	movq	%xmm2,%rsi		# "@x[8]"
	mov	$out,64+16(%rsp)	# save out
	shr	\$32,%rdi		# "@x[9]"

	foreach (&ROUND (0, 4, 8,12)) { eval; }
	foreach (&ROUND	(0, 5,10,15)) { eval; }

	mov	@t[1],4*9(%rsp)		# modulo-scheduled
	mov	64(%rsp),%rbp		# load len
	mov	64+8(%rsp),$inp		# load inp
	paddd	%xmm4,%xmm3		# increment counter
	mov	64+16(%rsp),$out	# load out
	add	\$0x61707865,@x[0]	# 'expa'
	add	\$0x3320646e,@x[1]	# 'nd 3'
	add	\$0x79622d32,@x[2]	# '2-by'
	add	\$0x6b206574,@x[3]	# 'te k'
	add	4*12(%rsp),@x[12]
	add	4*13(%rsp),@x[13]
	add	4*14(%rsp),@x[14]
	add	4*15(%rsp),@x[15]
	paddd	4*8(%rsp),%xmm1
	xor	4*0($inp),@x[0]		# xor with input
	movdqu	4*8($inp),%xmm0
	xor	4*12($inp),@x[12]
	xor	4*13($inp),@x[13]
	xor	4*14($inp),@x[14]
	xor	4*15($inp),@x[15]
	lea	4*16($inp),$inp		# inp+=64
	movdqa	%xmm2,4*8(%rsp)
	movd	%xmm3,4*12(%rsp)
	mov	@x[0],4*0($out)		# write output
	movdqu	%xmm0,4*8($out)
	mov	@x[12],4*12($out)
	mov	@x[13],4*13($out)
	mov	@x[14],4*14($out)
	mov	@x[15],4*15($out)
	lea	4*16($out),$out		# out+=64
	movdqa	%xmm1,4*8(%rsp)
	mov	@x[12],4*12(%rsp)
	mov	@x[13],4*13(%rsp)
	mov	@x[14],4*14(%rsp)
	mov	@x[15],4*15(%rsp)
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	mov	%al,-1($out,%rbx)
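	# The loop above consumes the remaining 1..63 bytes one at a
	# time, combining input bytes with the keystream block that was
	# spilled to the stack.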
	lea	64+24+48(%rsp),%rsi
.cfi_def_cfa_register	%rsp
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
########################################################################
# SSSE3 code path that handles shorter lengths
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));

sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round

my $xframe = $win64 ? 32+8 : 8;

.type	ChaCha20_ssse3,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
$code.=<<___ if ($avx);
	test	\$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use 1/4
	cmp	\$128,$len		# we might throw away some data,
	ja	.LChaCha20_4x		# but overall it won't be slower
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
	movdqa	.Lsigma(%rip),$a
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24
	mov	\$10,$counter		# reuse $counter
	movdqa	.Lone(%rip),$d

	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
545 &jnz (".Loop_ssse3");
557 movdqu 0x10($inp),$t1
558 pxor $t,$a # xor with input
561 movdqu 0x30($inp),$t1
562 lea 0x40($inp),$inp # inp+=64
566 movdqu $a,0x00($out) # write output
570 lea 0x40($out),$out # out+=64
573 jnz .Loop_outer_ssse3
583 xor $counter,$counter
586 movzb ($inp,$counter),%eax
587 movzb (%rsp,$counter),%ecx
588 lea 1($counter),$counter
590 mov %al,-1($out,$counter)
596 $code.=<<___ if ($win64);
597 movaps -0x28(%r9),%xmm6
598 movaps -0x18(%r9),%xmm7
602 .cfi_def_cfa_register %rsp
606 .size ChaCha20_ssse3,.-ChaCha20_ssse3
########################################################################
# SSSE3 code path that handles 128-byte inputs
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));

my $xframe = $win64 ? 0x68 : 8;

.type	ChaCha20_128,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0x68(%r9)
	movaps	%xmm7,-0x58(%r9)
	movaps	%xmm8,-0x48(%r9)
	movaps	%xmm9,-0x38(%r9)
	movaps	%xmm10,-0x28(%r9)
	movaps	%xmm11,-0x18(%r9)
	movdqa	.Lsigma(%rip),$a
	movdqa	.Lone(%rip),$d1
	movdqa	.Lrot16(%rip),$rot16
	movdqa	.Lrot24(%rip),$rot24
	mov	\$10,$counter		# reuse $counter

	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b00111001);
	&pshufd	($d,$d,0b10010011);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b00111001);
	&pshufd	($d1,$d1,0b10010011);
	&pshufd	($c,$c,0b01001110);
	&pshufd	($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);
	&pshufd	($c1,$c1,0b01001110);
	&pshufd	($b1,$b1,0b10010011);
	&pshufd	($d1,$d1,0b00111001);

	paddd	.Lone(%rip),$d1
	movdqu	0x10($inp),$t1
	pxor	$t,$a			# xor with input
	movdqu	0x30($inp),$t1
	movdqu	0x50($inp),$t1
	movdqu	0x70($inp),$t1
	movdqu	$a,0x00($out)		# write output
	movdqu	$a1,0x40($out)
	movdqu	$b1,0x50($out)
	movdqu	$c1,0x60($out)
	movdqu	$d1,0x70($out)
$code.=<<___ if ($win64);
	movaps	-0x68(%r9),%xmm6
	movaps	-0x58(%r9),%xmm7
	movaps	-0x48(%r9),%xmm8
	movaps	-0x38(%r9),%xmm9
	movaps	-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
.cfi_def_cfa_register	%rsp
.size	ChaCha20_128,.-ChaCha20_128
########################################################################
# SSSE3 code path that handles longer messages.

# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	#	0  4  8 12 < even round
	#	0  5 10 15 < odd round
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them once per round, in the middle. This is why
	# you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
816 "&paddd (@x[$a0],@x[$b0])", # Q1
817 "&paddd (@x[$a1],@x[$b1])", # Q2
818 "&pxor (@x[$d0],@x[$a0])",
819 "&pxor (@x[$d1],@x[$a1])",
820 "&pshufb (@x[$d0],$t1)",
821 "&pshufb (@x[$d1],$t1)",
823 "&paddd ($xc,@x[$d0])",
824 "&paddd ($xc_,@x[$d1])",
825 "&pxor (@x[$b0],$xc)",
826 "&pxor (@x[$b1],$xc_)",
827 "&movdqa ($t0,@x[$b0])",
828 "&pslld (@x[$b0],12)",
830 "&movdqa ($t1,@x[$b1])",
831 "&pslld (@x[$b1],12)",
832 "&por (@x[$b0],$t0)",
834 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
835 "&por (@x[$b1],$t1)",
837 "&paddd (@x[$a0],@x[$b0])",
838 "&paddd (@x[$a1],@x[$b1])",
839 "&pxor (@x[$d0],@x[$a0])",
840 "&pxor (@x[$d1],@x[$a1])",
841 "&pshufb (@x[$d0],$t0)",
842 "&pshufb (@x[$d1],$t0)",
844 "&paddd ($xc,@x[$d0])",
845 "&paddd ($xc_,@x[$d1])",
846 "&pxor (@x[$b0],$xc)",
847 "&pxor (@x[$b1],$xc_)",
848 "&movdqa ($t1,@x[$b0])",
849 "&pslld (@x[$b0],7)",
851 "&movdqa ($t0,@x[$b1])",
852 "&pslld (@x[$b1],7)",
853 "&por (@x[$b0],$t1)",
855 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
856 "&por (@x[$b1],$t0)",
858 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
859 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
860 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
861 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
863 "&paddd (@x[$a2],@x[$b2])", # Q3
864 "&paddd (@x[$a3],@x[$b3])", # Q4
865 "&pxor (@x[$d2],@x[$a2])",
866 "&pxor (@x[$d3],@x[$a3])",
867 "&pshufb (@x[$d2],$t1)",
868 "&pshufb (@x[$d3],$t1)",
870 "&paddd ($xc,@x[$d2])",
871 "&paddd ($xc_,@x[$d3])",
872 "&pxor (@x[$b2],$xc)",
873 "&pxor (@x[$b3],$xc_)",
874 "&movdqa ($t0,@x[$b2])",
875 "&pslld (@x[$b2],12)",
877 "&movdqa ($t1,@x[$b3])",
878 "&pslld (@x[$b3],12)",
879 "&por (@x[$b2],$t0)",
881 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
882 "&por (@x[$b3],$t1)",
884 "&paddd (@x[$a2],@x[$b2])",
885 "&paddd (@x[$a3],@x[$b3])",
886 "&pxor (@x[$d2],@x[$a2])",
887 "&pxor (@x[$d3],@x[$a3])",
888 "&pshufb (@x[$d2],$t0)",
889 "&pshufb (@x[$d3],$t0)",
891 "&paddd ($xc,@x[$d2])",
892 "&paddd ($xc_,@x[$d3])",
893 "&pxor (@x[$b2],$xc)",
894 "&pxor (@x[$b3],$xc_)",
895 "&movdqa ($t1,@x[$b2])",
896 "&pslld (@x[$b2],7)",
898 "&movdqa ($t0,@x[$b3])",
899 "&pslld (@x[$b3],7)",
900 "&por (@x[$b2],$t1)",
902 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
my $xframe = $win64 ? 0xa8 : 8;

.type	ChaCha20_4x,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
$code.=<<___ if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom
	sub	\$0x140+$xframe,%rsp
################ stack layout
# +0x00		SIMD equivalent of @x[8-12]
# +0x40		constant copy of key[0-2] smashed by lanes
# +0x100	SIMD counters (with nonce smashed by lanes)
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
	movdqa	.Lsigma(%rip),$xa3	# key[0]
	movdqu	($key),$xb3		# key[1]
	movdqu	16($key),$xt3		# key[2]
	movdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11
	pshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd	\$0x55,$xa3,$xa1
	movdqa	$xa0,0x40(%rsp)		# ... and offload
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)
	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)
	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)
	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters

	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3
	movdqa	$xa0,$xt2		# "de-interlace" data
	punpcklqdq	$xa2,$xa0	# "a0"
	punpcklqdq	$xt3,$xt2	# "a2"
	punpckhqdq	$xa2,$xa1	# "a1"
	punpckhqdq	$xt3,$xa3	# "a3"

	($xa2,$xt2)=($xt2,$xa2);

	paddd	0x80-0x100(%rcx),$xb0
	paddd	0x90-0x100(%rcx),$xb1
	paddd	0xa0-0x100(%rcx),$xb2
	paddd	0xb0-0x100(%rcx),$xb3
	movdqa	$xa0,0x00(%rsp)		# offload $xaN
	movdqa	$xa1,0x10(%rsp)
	movdqa	0x20(%rsp),$xa0		# "xc2"
	movdqa	0x30(%rsp),$xa1		# "xc3"
	punpcklqdq	$xb2,$xb0	# "b0"
	punpcklqdq	$xt3,$xt2	# "b2"
	punpckhqdq	$xb2,$xb1	# "b1"
	punpckhqdq	$xt3,$xb3	# "b3"

	($xb2,$xt2)=($xt2,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);

	paddd	0xc0-0x100(%rcx),$xc0
	paddd	0xd0-0x100(%rcx),$xc1
	paddd	0xe0-0x100(%rcx),$xc2
	paddd	0xf0-0x100(%rcx),$xc3
	movdqa	$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa	$xa3,0x30(%rsp)
	punpcklqdq	$xc2,$xc0	# "c0"
	punpcklqdq	$xt3,$xt2	# "c2"
	punpckhqdq	$xc2,$xc1	# "c1"
	punpckhqdq	$xt3,$xc3	# "c3"

	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);	# use $xaN as temporary

	paddd	0x100-0x100(%rcx),$xd0
	paddd	0x110-0x100(%rcx),$xd1
	paddd	0x120-0x100(%rcx),$xd2
	paddd	0x130-0x100(%rcx),$xd3
	punpcklqdq	$xd2,$xd0	# "d0"
	punpcklqdq	$xt3,$xt2	# "d2"
	punpckhqdq	$xd2,$xd1	# "d1"
	punpckhqdq	$xt3,$xd3	# "d3"

	($xd2,$xt2)=($xt2,$xd2);
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# inp+=64*4
	pxor	0x30(%rsp),$xt0
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# out+=64*4
	#movdqa	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	#movdqa	$xt0,0x00(%rsp)
	movdqa	$xb0,0x10(%rsp)
	movdqa	$xc0,0x20(%rsp)
	movdqa	$xd0,0x30(%rsp)
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	movdqa	0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*1
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	movdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	movdqa	$xd1,0x30(%rsp)
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	pxor	0x10(%rsp),$xt0
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	movdqa	0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x80($inp),$inp		# inp+=64*2
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	movdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	movdqa	$xd2,0x30(%rsp)
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	movdqa	0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*3
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	movdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	movdqa	$xd3,0x30(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	mov	%al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
.cfi_def_cfa_register	%rsp
.size	ChaCha20_4x,.-ChaCha20_4x

########################################################################
# XOP code path that handles all lengths.
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below you'll notice that
# the argument order sometimes varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5%
# performance improvement. This is on FX-4100...
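# Since vpxor and vpaddd are commutative, swapping their source
# operands (the spots marked "flip" below) cannot change the result;
# only the encoding, e.g. whether a 2- or 3-byte VEX prefix is needed,
# and hence the size and alignment of the instruction stream, changes.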
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],16)",
	"&vprotd	(@x[$d1],@x[$d1],16)",
	"&vprotd	(@x[$d2],@x[$d2],16)",
	"&vprotd	(@x[$d3],@x[$d3],16)",
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],12)",
	"&vprotd	(@x[$b1],@x[$b1],12)",
	"&vprotd	(@x[$b2],@x[$b2],12)",
	"&vprotd	(@x[$b3],@x[$b3],12)",
	"&vpaddd	(@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd	(@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor	(@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor	(@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor	(@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor	(@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd	(@x[$d0],@x[$d0],8)",
	"&vprotd	(@x[$d1],@x[$d1],8)",
	"&vprotd	(@x[$d2],@x[$d2],8)",
	"&vprotd	(@x[$d3],@x[$d3],8)",
	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor	(@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor	(@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor	(@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor	(@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd	(@x[$b0],@x[$b0],7)",
	"&vprotd	(@x[$b1],@x[$b1],7)",
	"&vprotd	(@x[$b2],@x[$b2],7)",
	"&vprotd	(@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xa8 : 8;

.type	ChaCha20_4xop,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$0x140+$xframe,%rsp
################ stack layout
# +0x00		SIMD equivalent of @x[8-12]
# +0x40		constant copy of key[0-2] smashed by lanes
# +0x100	SIMD counters (with nonce smashed by lanes)
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
	vmovdqa	.Lsigma(%rip),$xa3	# key[0]
	vmovdqu	($key),$xb3		# key[1]
	vmovdqu	16($key),$xt3		# key[2]
	vmovdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x40(%rsp)		# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0x50(%rsp)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0x60(%rsp)
	vmovdqa	$xa3,0x70(%rsp)
	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x80-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x90-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0xa0-0x100(%rcx)
	vmovdqa	$xb3,0xb0-0x100(%rcx)
	vpshufd	\$0x00,$xt3,$xt0	# "$xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "$xc1"
	vmovdqa	$xt0,0xc0-0x100(%rcx)
	vpshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	vmovdqa	$xt1,0xd0-0x100(%rcx)
	vpshufd	\$0xff,$xt3,$xt3	# "$xc3"
	vmovdqa	$xt2,0xe0-0x100(%rcx)
	vmovdqa	$xt3,0xf0-0x100(%rcx)
	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Linc(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x110-0x100(%rcx)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x120-0x100(%rcx)
	vmovdqa	$xd3,0x130-0x100(%rcx)
	vmovdqa	0x40(%rsp),$xa0		# re-load smashed key
	vmovdqa	0x50(%rsp),$xa1
	vmovdqa	0x60(%rsp),$xa2
	vmovdqa	0x70(%rsp),$xa3
	vmovdqa	0x80-0x100(%rcx),$xb0
	vmovdqa	0x90-0x100(%rcx),$xb1
	vmovdqa	0xa0-0x100(%rcx),$xb2
	vmovdqa	0xb0-0x100(%rcx),$xb3
	vmovdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	vmovdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	vmovdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	vmovdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	vmovdqa	0x100-0x100(%rcx),$xd0
	vmovdqa	0x110-0x100(%rcx),$xd1
	vmovdqa	0x120-0x100(%rcx),$xd2
	vmovdqa	0x130-0x100(%rcx),$xd3
	vpaddd	.Lfour(%rip),$xd0,$xd0	# next SIMD counters
	vmovdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters

	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }

	vpaddd	0x40(%rsp),$xa0,$xa0	# accumulate key material
	vpaddd	0x50(%rsp),$xa1,$xa1
	vpaddd	0x60(%rsp),$xa2,$xa2
	vpaddd	0x70(%rsp),$xa3,$xa3
	vmovdqa	$xt2,0x20(%rsp)		# offload $xc2,3
	vmovdqa	$xt3,0x30(%rsp)
	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"

	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);

	vpaddd	0x80-0x100(%rcx),$xb0,$xb0
	vpaddd	0x90-0x100(%rcx),$xb1,$xb1
	vpaddd	0xa0-0x100(%rcx),$xb2,$xb2
	vpaddd	0xb0-0x100(%rcx),$xb3,$xb3
	vmovdqa	$xa0,0x00(%rsp)		# offload $xa0,1
	vmovdqa	$xa1,0x10(%rsp)
	vmovdqa	0x20(%rsp),$xa0		# "xc2"
	vmovdqa	0x30(%rsp),$xa1		# "xc3"
	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);

	vpaddd	0xc0-0x100(%rcx),$xc0,$xc0
	vpaddd	0xd0-0x100(%rcx),$xc1,$xc1
	vpaddd	0xe0-0x100(%rcx),$xc2,$xc2
	vpaddd	0xf0-0x100(%rcx),$xc3,$xc3
	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	0x100-0x100(%rcx),$xd0,$xd0
	vpaddd	0x110-0x100(%rcx),$xd1,$xd1
	vpaddd	0x120-0x100(%rcx),$xd2,$xd2
	vpaddd	0x130-0x100(%rcx),$xd3,$xd3
	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	($xa0,$xa1)=($xt2,$xt3);
	vmovdqa	0x00(%rsp),$xa0		# restore $xa0,1
	vmovdqa	0x10(%rsp),$xa1
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vpxor	0x40($inp),$xa3,$xa3
	vpxor	0x50($inp),$xb3,$xb3
	vpxor	0x60($inp),$xc3,$xc3
	vpxor	0x70($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# inp+=64*4
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	vmovdqu	$xa3,0x40($out)
	vmovdqu	$xb3,0x50($out)
	vmovdqu	$xc3,0x60($out)
	vmovdqu	$xd3,0x70($out)
	lea	0x80($out),$out		# out+=64*4
	jae	.L192_or_more4xop
	jae	.L128_or_more4xop
	jae	.L64_or_more4xop

	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x10(%rsp)
	vmovdqa	$xc0,0x20(%rsp)
	vmovdqa	$xd0,0x30(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	lea	0x40($inp),$inp		# inp+=64*1
	vmovdqa	$xa1,0x00(%rsp)
	vmovdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	vmovdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd1,0x30(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($inp),$inp		# inp+=64*2
	vmovdqa	$xa2,0x00(%rsp)
	vmovdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	vmovdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xd2,0x30(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	lea	0x40($inp),$inp		# inp+=64*3
	vmovdqa	$xa3,0x00(%rsp)
	vmovdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	vmovdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd3,0x30(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	mov	%al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
.cfi_def_cfa_register	%rsp
.size	ChaCha20_4xop,.-ChaCha20_4xop
########################################################################
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	#	0  4  8 12 < even round
	#	0  5 10 15 < odd round
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have to
	# reload them once per round, in the middle. This is why
	# you'll see a bunch of 'c' stores and loads in the middle,
	# but none at the beginning or end.
1859 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1860 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1861 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1862 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1863 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1864 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1866 "&vpaddd ($xc,$xc,@x[$d0])",
1867 "&vpxor (@x[$b0],$xc,@x[$b0])",
1868 "&vpslld ($t0,@x[$b0],12)",
1869 "&vpsrld (@x[$b0],@x[$b0],20)",
1870 "&vpor (@x[$b0],$t0,@x[$b0])",
1871 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1872 "&vpaddd ($xc_,$xc_,@x[$d1])",
1873 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1874 "&vpslld ($t1,@x[$b1],12)",
1875 "&vpsrld (@x[$b1],@x[$b1],20)",
1876 "&vpor (@x[$b1],$t1,@x[$b1])",
1878 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1879 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1880 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1881 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1882 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1883 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1885 "&vpaddd ($xc,$xc,@x[$d0])",
1886 "&vpxor (@x[$b0],$xc,@x[$b0])",
1887 "&vpslld ($t1,@x[$b0],7)",
1888 "&vpsrld (@x[$b0],@x[$b0],25)",
1889 "&vpor (@x[$b0],$t1,@x[$b0])",
1890 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1891 "&vpaddd ($xc_,$xc_,@x[$d1])",
1892 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1893 "&vpslld ($t0,@x[$b1],7)",
1894 "&vpsrld (@x[$b1],@x[$b1],25)",
1895 "&vpor (@x[$b1],$t0,@x[$b1])",
1897 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1898 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1899 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1900 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1902 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1903 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1904 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1905 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1906 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1907 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1909 "&vpaddd ($xc,$xc,@x[$d2])",
1910 "&vpxor (@x[$b2],$xc,@x[$b2])",
1911 "&vpslld ($t0,@x[$b2],12)",
1912 "&vpsrld (@x[$b2],@x[$b2],20)",
1913 "&vpor (@x[$b2],$t0,@x[$b2])",
1914 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1915 "&vpaddd ($xc_,$xc_,@x[$d3])",
1916 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1917 "&vpslld ($t1,@x[$b3],12)",
1918 "&vpsrld (@x[$b3],@x[$b3],20)",
1919 "&vpor (@x[$b3],$t1,@x[$b3])",
1921 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1922 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1923 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1924 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1925 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1926 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1928 "&vpaddd ($xc,$xc,@x[$d2])",
1929 "&vpxor (@x[$b2],$xc,@x[$b2])",
1930 "&vpslld ($t1,@x[$b2],7)",
1931 "&vpsrld (@x[$b2],@x[$b2],25)",
1932 "&vpor (@x[$b2],$t1,@x[$b2])",
1933 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1934 "&vpaddd ($xc_,$xc_,@x[$d3])",
1935 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1936 "&vpslld ($t0,@x[$b3],7)",
1937 "&vpsrld (@x[$b3],@x[$b3],25)",
1938 "&vpor (@x[$b3],$t0,@x[$b3])"
my $xframe = $win64 ? 0xa8 : 8;

.type	ChaCha20_8x,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$0x280+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
################ stack layout
# +0x00		SIMD equivalent of @x[8-12]
# +0x80		constant copy of key[0-2] smashed by lanes
# +0x200	SIMD counters (with nonce smashed by lanes)
	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	0x200(%rsp),%rax	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x80-0x100(%rcx)	# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0xa0-0x100(%rcx)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0xc0-0x100(%rcx)
	vmovdqa	$xa3,0xe0-0x100(%rcx)
	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x100-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x120-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0x140-0x100(%rcx)
	vmovdqa	$xb3,0x160-0x100(%rcx)
	vpshufd	\$0x00,$xt3,$xt0	# "xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "xc1"
	vmovdqa	$xt0,0x180-0x200(%rax)
	vpshufd	\$0xaa,$xt3,$xt2	# "xc2"
	vmovdqa	$xt1,0x1a0-0x200(%rax)
	vpshufd	\$0xff,$xt3,$xt3	# "xc3"
	vmovdqa	$xt2,0x1c0-0x200(%rax)
	vmovdqa	$xt3,0x1e0-0x200(%rax)
	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x220-0x200(%rax)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x240-0x200(%rax)
	vmovdqa	$xd3,0x260-0x200(%rax)
	vmovdqa	0x80-0x100(%rcx),$xa0	# re-load smashed key
	vmovdqa	0xa0-0x100(%rcx),$xa1
	vmovdqa	0xc0-0x100(%rcx),$xa2
	vmovdqa	0xe0-0x100(%rcx),$xa3
	vmovdqa	0x100-0x100(%rcx),$xb0
	vmovdqa	0x120-0x100(%rcx),$xb1
	vmovdqa	0x140-0x100(%rcx),$xb2
	vmovdqa	0x160-0x100(%rcx),$xb3
	vmovdqa	0x180-0x200(%rax),$xt0	# "xc0"
	vmovdqa	0x1a0-0x200(%rax),$xt1	# "xc1"
	vmovdqa	0x1c0-0x200(%rax),$xt2	# "xc2"
	vmovdqa	0x1e0-0x200(%rax),$xt3	# "xc3"
	vmovdqa	0x200-0x200(%rax),$xd0
	vmovdqa	0x220-0x200(%rax),$xd1
	vmovdqa	0x240-0x200(%rax),$xd2
	vmovdqa	0x260-0x200(%rax),$xd3
	vpaddd	.Leight(%rip),$xd0,$xd0	# next SIMD counters
	vmovdqa	$xt2,0x40(%rsp)		# SIMD equivalent of "@x[10]"
	vmovdqa	$xt3,0x60(%rsp)		# SIMD equivalent of "@x[11]"
	vbroadcasti128	(%r10),$xt3
	vmovdqa	$xd0,0x200-0x200(%rax)	# save SIMD counters

	foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
	lea	0x200(%rsp),%rax	# size optimization
	vpaddd	0x80-0x100(%rcx),$xa0,$xa0	# accumulate key
	vpaddd	0xa0-0x100(%rcx),$xa1,$xa1
	vpaddd	0xc0-0x100(%rcx),$xa2,$xa2
	vpaddd	0xe0-0x100(%rcx),$xa3,$xa3
	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"

	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);

	vpaddd	0x100-0x100(%rcx),$xb0,$xb0
	vpaddd	0x120-0x100(%rcx),$xb1,$xb1
	vpaddd	0x140-0x100(%rcx),$xb2,$xb2
	vpaddd	0x160-0x100(%rcx),$xb3,$xb3
	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);

	vperm2i128	\$0x20,$xb0,$xa0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xb0,$xa0,$xb0
	vperm2i128	\$0x20,$xb1,$xa1,$xa0
	vperm2i128	\$0x31,$xb1,$xa1,$xb1
	vperm2i128	\$0x20,$xb2,$xa2,$xa1
	vperm2i128	\$0x31,$xb2,$xa2,$xb2
	vperm2i128	\$0x20,$xb3,$xa3,$xa2
	vperm2i128	\$0x31,$xb3,$xa3,$xb3

	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);

	vmovdqa	$xa0,0x00(%rsp)		# offload $xaN
	vmovdqa	$xa1,0x20(%rsp)
	vmovdqa	0x40(%rsp),$xc2		# $xa0
	vmovdqa	0x60(%rsp),$xc3		# $xa1
	vpaddd	0x180-0x200(%rax),$xc0,$xc0
	vpaddd	0x1a0-0x200(%rax),$xc1,$xc1
	vpaddd	0x1c0-0x200(%rax),$xc2,$xc2
	vpaddd	0x1e0-0x200(%rax),$xc3,$xc3
	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	0x200-0x200(%rax),$xd0,$xd0
	vpaddd	0x220-0x200(%rax),$xd1,$xd1
	vpaddd	0x240-0x200(%rax),$xd2,$xd2
	vpaddd	0x260-0x200(%rax),$xd3,$xd3
	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3

	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
	($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
	($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
	($xa0,$xa1)=($xt2,$xt3);

	vmovdqa	0x00(%rsp),$xa0		# $xaN was offloaded, remember?
	vmovdqa	0x20(%rsp),$xa1
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	lea	0x80($out),$out		# size optimization
	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vpxor	0x40($inp),$xc1,$xc1
	vpxor	0x60($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa1,0x00($out)
	vmovdqu	$xb1,0x20($out)
	vmovdqu	$xc1,0x40($out)
	vmovdqu	$xd1,0x60($out)
	lea	0x80($out),$out		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vpxor	0x40($inp),$xc2,$xc2
	vpxor	0x60($inp),$xd2,$xd2
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x20($out)
	vmovdqu	$xc2,0x40($out)
	vmovdqu	$xd2,0x60($out)
	lea	0x80($out),$out		# size optimization
	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vpxor	0x40($inp),$xc3,$xc3
	vpxor	0x60($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# size optimization
	vmovdqu	$xa3,0x00($out)
	vmovdqu	$xb3,0x20($out)
	vmovdqu	$xc3,0x40($out)
	vmovdqu	$xd3,0x60($out)
	lea	0x80($out),$out		# size optimization
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	lea	0x40($inp),$inp		# inp+=64*1
	vmovdqa	$xc0,0x00(%rsp)
	lea	0x40($out),$out		# out+=64*1
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd0,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	lea	0x80($inp),$inp		# inp+=64*2
	vmovdqa	$xa1,0x00(%rsp)
	lea	0x80($out),$out		# out+=64*2
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xb1,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	lea	0xc0($inp),$inp		# inp+=64*3
	vmovdqa	$xc1,0x00(%rsp)
	lea	0xc0($out),$out		# out+=64*3
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd1,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	lea	0x100($inp),$inp	# inp+=64*4
	vmovdqa	$xa2,0x00(%rsp)
	lea	0x100($out),$out	# out+=64*4
	sub	\$256,$len		# len-=64*4
	vmovdqa	$xb2,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	lea	0x140($inp),$inp	# inp+=64*5
	vmovdqa	$xc2,0x00(%rsp)
	lea	0x140($out),$out	# out+=64*5
	sub	\$320,$len		# len-=64*5
	vmovdqa	$xd2,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vpxor	0x140($inp),$xc2,$xc2
	vpxor	0x160($inp),$xd2,$xd2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	vmovdqu	$xc2,0x140($out)
	vmovdqu	$xd2,0x160($out)
	lea	0x180($inp),$inp	# inp+=64*6
	vmovdqa	$xa3,0x00(%rsp)
	lea	0x180($out),$out	# out+=64*6
	sub	\$384,$len		# len-=64*6
	vmovdqa	$xb3,0x20(%rsp)
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	vpxor	0x80($inp),$xa1,$xa1
	vpxor	0xa0($inp),$xb1,$xb1
	vpxor	0xc0($inp),$xc1,$xc1
	vpxor	0xe0($inp),$xd1,$xd1
	vpxor	0x100($inp),$xa2,$xa2
	vpxor	0x120($inp),$xb2,$xb2
	vpxor	0x140($inp),$xc2,$xc2
	vpxor	0x160($inp),$xd2,$xd2
	vpxor	0x180($inp),$xa3,$xa3
	vpxor	0x1a0($inp),$xb3,$xb3
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	vmovdqu	$xa1,0x80($out)
	vmovdqu	$xb1,0xa0($out)
	vmovdqu	$xc1,0xc0($out)
	vmovdqu	$xd1,0xe0($out)
	vmovdqu	$xa2,0x100($out)
	vmovdqu	$xb2,0x120($out)
	vmovdqu	$xc2,0x140($out)
	vmovdqu	$xd2,0x160($out)
	vmovdqu	$xa3,0x180($out)
	vmovdqu	$xb3,0x1a0($out)
	lea	0x1c0($inp),$inp	# inp+=64*7
	vmovdqa	$xc3,0x00(%rsp)
	lea	0x1c0($out),$out	# out+=64*7
	sub	\$448,$len		# len-=64*7
	vmovdqa	$xd3,0x20(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	mov	%al,-1($out,%r10)
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
.cfi_def_cfa_register	%rsp
.size	ChaCha20_8x,.-ChaCha20_8x
########################################################################
# This one handles shorter inputs...
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

sub vpxord()		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible
    if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
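# (VEX-encoded vpxor cannot address %zmm registers or %xmm16 and above,
# so the EVEX-only vpxord is required there; everywhere else plain
# vpxor is preferred for its shorter encoding.)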
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round

my $xframe = $win64 ? 32+8 : 8;

.type	ChaCha20_avx512,\@function,5
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d
	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov	\$10,$counter		# reuse $counter
	vpaddd	$fourz,$d_,$d

	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);
	&jnz	(".Loop_avx512");

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp		# inp+=64
	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64
	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3
	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64
	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64
	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3
	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64
	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64
	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3
	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64
	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64
	jnz	.Loop_outer_avx512
2658 vmovdqa %x#$a,0x00(%rsp)
2659 vmovdqa %x#$b,0x10(%rsp)
2660 vmovdqa %x#$c,0x20(%rsp)
2661 vmovdqa %x#$d,0x30(%rsp)
2663 jmp .Loop_tail_avx512
2667 vmovdqa $t0,0x00(%rsp)
2668 vmovdqa $t1,0x10(%rsp)
2669 vmovdqa $t2,0x20(%rsp)
2670 vmovdqa $t3,0x30(%rsp)
2674 movzb ($inp,$counter),%eax
2675 movzb (%rsp,$counter),%ecx
2676 lea 1($counter),$counter
2678 mov %al,-1($out,$counter)
2680 jnz .Loop_tail_avx512
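	# the loop above is the byte-granular tail: out[i] = in[i] ^ ks[i],
	# with one keystream block spilled to the stack at (%rsp)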
	vmovdqu32	$a_,0x00(%rsp)
$code.=<<___ if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
.cfi_def_cfa_register	%rsp
.size	ChaCha20_avx512,.-ChaCha20_avx512

map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

.type	ChaCha20_avx512vl,\@function,5
.LChaCha20_avx512vl:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
	mov	\$10,$counter		# reuse $counter

.Loop_outer_avx512vl:
	vpaddd	$fourz,$d_,$d

	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&jnz	(".Loop_avx512vl");

	jb	.Ltail64_avx512vl

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp		# inp+=64

	vmovdqu	$t0,0x00($out)		# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out		# out+=64

	jnz	.Loop_outer_avx512vl

	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)
	jmp	.Loop_tail_avx512vl

	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)

.Loop_tail_avx512vl:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	mov	%al,-1($out,$counter)
	jnz	.Loop_tail_avx512vl

	vmovdqu32	$a_,0x00(%rsp)
	vmovdqu32	$a_,0x20(%rsp)
$code.=<<___ if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl

# This one handles longer inputs...

my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
my @key=map("%zmm$_",(16..31));
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
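# The three map()s rotate each index within its aligned group of four,
# so one call expands to all four parallel quarter-rounds, e.g.
# (0,4,8,12) -> (1,5,9,13) -> (2,6,10,14) -> (3,7,11,15) for columns and
# (0,5,10,15) -> (1,6,11,12) -> (2,7,8,13) -> (3,4,9,14) for diagonals.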
my @x=map("\"$_\"",@xx);

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
	"&vprold	(@x[$d0],@x[$d0],16)",
	"&vprold	(@x[$d1],@x[$d1],16)",
	"&vprold	(@x[$d2],@x[$d2],16)",
	"&vprold	(@x[$d3],@x[$d3],16)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
	"&vprold	(@x[$b0],@x[$b0],12)",
	"&vprold	(@x[$b1],@x[$b1],12)",
	"&vprold	(@x[$b2],@x[$b2],12)",
	"&vprold	(@x[$b3],@x[$b3],12)",

	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
	"&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
	"&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
	"&vprold	(@x[$d0],@x[$d0],8)",
	"&vprold	(@x[$d1],@x[$d1],8)",
	"&vprold	(@x[$d2],@x[$d2],8)",
	"&vprold	(@x[$d3],@x[$d3],8)",

	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
	"&vprold	(@x[$b0],@x[$b0],7)",
	"&vprold	(@x[$b1],@x[$b1],7)",
	"&vprold	(@x[$b2],@x[$b2],7)",
	"&vprold	(@x[$b3],@x[$b3],7)"
my $xframe = $win64 ? 0xa8 : 8;

.type	ChaCha20_16x,\@function,5
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
	lea	.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3		# key[0]
	vbroadcasti32x4	($key),$xb3		# key[1]
	vbroadcasti32x4	16($key),$xc3		# key[2]
	vbroadcasti32x4	($counter),$xd3		# key[3]
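	# the vpshufd immediates 0x00/0x55/0xaa/0xff below replicate dword
	# 0/1/2/3 of every 128-bit lane, yielding one register per ChaCha
	# state word ("smashing" the key by lanes)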
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vpshufd	\$0xaa,$xa3,$xa2
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vpshufd	\$0xaa,$xb3,$xb2
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd	\$0x00,$xc3,$xc0
	vpshufd	\$0x55,$xc3,$xc1
	vpshufd	\$0xaa,$xc3,$xc2
	vpshufd	\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpshufd	\$0xaa,$xd3,$xd2
	vpshufd	\$0xff,$xd3,$xd3
	vpaddd	.Lincz(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	vpbroadcastd	0(%r10),$xa0	# reload key
	vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd	.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }

	vpaddd	@key[0],$xa0,$xa0	# accumulate key
	vpaddd	@key[1],$xa1,$xa1
	vpaddd	@key[2],$xa2,$xa2
	vpaddd	@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"
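	# the unpack pairs above perform a 4x4 transpose of 32-bit words
	# within each 128-bit lane; the quoted names track where each
	# block's words land before the renaming below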
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);

	vpaddd	@key[4],$xb0,$xb0
	vpaddd	@key[5],$xb1,$xb1
	vpaddd	@key[6],$xb2,$xb2
	vpaddd	@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);

	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
	vshufi32x4	\$0xee,$xb3,$xa3,$xb3

($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);

	vpaddd	@key[8],$xc0,$xc0
	vpaddd	@key[9],$xc1,$xc1
	vpaddd	@key[10],$xc2,$xc2
	vpaddd	@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	@key[12],$xd0,$xd0
	vpaddd	@key[13],$xd1,$xd1
	vpaddd	@key[14],$xd2,$xd2
	vpaddd	@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);

	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
	vshufi32x4	\$0xee,$xd3,$xc3,$xd3

($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);

	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
	vshufi32x4	\$0x88,$xd0,$xb0,$xc0
	vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
	vshufi32x4	\$0x88,$xd1,$xb1,$xc1
	vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
	vshufi32x4	\$0x88,$xd2,$xb2,$xc2
	vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
	vshufi32x4	\$0x88,$xd3,$xb3,$xc3
	vshufi32x4	\$0xdd,$xd3,$xb3,$xd3

($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);

($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);

	vpxord	0x00($inp),$xa0,$xa0	# xor with input
	vpxord	0x40($inp),$xb0,$xb0
	vpxord	0x80($inp),$xc0,$xc0
	vpxord	0xc0($inp),$xd0,$xd0
	vmovdqu32	$xa0,0x00($out)
	vmovdqu32	$xb0,0x40($out)
	vmovdqu32	$xc0,0x80($out)
	vmovdqu32	$xd0,0xc0($out)

	vpxord	0x100($inp),$xa1,$xa1
	vpxord	0x140($inp),$xb1,$xb1
	vpxord	0x180($inp),$xc1,$xc1
	vpxord	0x1c0($inp),$xd1,$xd1
	vmovdqu32	$xa1,0x100($out)
	vmovdqu32	$xb1,0x140($out)
	vmovdqu32	$xc1,0x180($out)
	vmovdqu32	$xd1,0x1c0($out)

	vpxord	0x200($inp),$xa2,$xa2
	vpxord	0x240($inp),$xb2,$xb2
	vpxord	0x280($inp),$xc2,$xc2
	vpxord	0x2c0($inp),$xd2,$xd2
	vmovdqu32	$xa2,0x200($out)
	vmovdqu32	$xb2,0x240($out)
	vmovdqu32	$xc2,0x280($out)
	vmovdqu32	$xd2,0x2c0($out)

	vpxord	0x300($inp),$xa3,$xa3
	vpxord	0x340($inp),$xb3,$xb3
	vpxord	0x380($inp),$xc3,$xc3
	vpxord	0x3c0($inp),$xd3,$xd3
	lea	0x400($inp),$inp
	vmovdqu32	$xa3,0x300($out)
	vmovdqu32	$xb3,0x340($out)
	vmovdqu32	$xc3,0x380($out)
	vmovdqu32	$xd3,0x3c0($out)
	lea	0x400($out),$out

	jb	.Less_than_64_16x
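	# in the tail cascade below $out holds the out-in delta, so
	# ($out,$inp) addresses the output while $inp indexes both streams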
	vpxord	($inp),$xa0,$xa0	# xor with input
	vmovdqu32	$xa0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb0,$xb0
	vmovdqu32	$xb0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc0,$xc0
	vmovdqu32	$xc0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd0,$xd0
	vmovdqu32	$xd0,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa1,$xa1
	vmovdqu32	$xa1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb1,$xb1
	vmovdqu32	$xb1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc1,$xc1
	vmovdqu32	$xc1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd1,$xd1
	vmovdqu32	$xd1,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa2,$xa2
	vmovdqu32	$xa2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb2,$xb2
	vmovdqu32	$xb2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc2,$xc2
	vmovdqu32	$xc2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xd2,$xd2
	vmovdqu32	$xd2,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xa3,$xa3
	vmovdqu32	$xa3,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xb3,$xb3
	vmovdqu32	$xb3,($out,$inp)

	jb	.Less_than_64_16x
	vpxord	($inp),$xc3,$xc3
	vmovdqu32	$xc3,($out,$inp)

	vmovdqa32	$xa0,0x00(%rsp)
	lea	($out,$inp),$out

	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	mov	%al,-1($out,%r10)

	vpxord	$xa0,$xa0,$xa0
	vmovdqa32	$xa0,0(%rsp)
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
.cfi_def_cfa_register	%rsp
.size	ChaCha20_16x,.-ChaCha20_16x

# switch to %ymm domain
($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
     $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
@key=map("%ymm$_",(16..31));
($xt0,$xt1,$xt2,$xt3)=@key[0..3];

.type	ChaCha20_8xvl,\@function,5
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
	lea	.Lsigma(%rip),%r10
	vbroadcasti128	(%r10),$xa3		# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xc3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vpshufd	\$0xaa,$xa3,$xa2
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vpshufd	\$0xaa,$xb3,$xb2
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa64	$xb0,@key[4]
	vmovdqa64	$xb1,@key[5]
	vmovdqa64	$xb2,@key[6]
	vmovdqa64	$xb3,@key[7]

	vpshufd	\$0x00,$xc3,$xc0
	vpshufd	\$0x55,$xc3,$xc1
	vpshufd	\$0xaa,$xc3,$xc2
	vpshufd	\$0xff,$xc3,$xc3
	vmovdqa64	$xc0,@key[8]
	vmovdqa64	$xc1,@key[9]
	vmovdqa64	$xc2,@key[10]
	vmovdqa64	$xc3,@key[11]

	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpshufd	\$0xaa,$xd3,$xd2
	vpshufd	\$0xff,$xd3,$xd3
	vpaddd	.Lincy(%rip),$xd0,$xd0	# don't save counters yet
	vmovdqa64	$xd0,@key[12]
	vmovdqa64	$xd1,@key[13]
	vmovdqa64	$xd2,@key[14]
	vmovdqa64	$xd3,@key[15]

	#vpbroadcastd	0(%r10),$xa0	# reload key
	#vpbroadcastd	4(%r10),$xa1
	vpbroadcastd	8(%r10),$xa2
	vpbroadcastd	12(%r10),$xa3
	vpaddd	.Leight(%rip),@key[12],@key[12]	# next SIMD counters
	vmovdqa64	@key[4],$xb0
	vmovdqa64	@key[5],$xb1
	vmovdqa64	@key[6],$xb2
	vmovdqa64	@key[7],$xb3
	vmovdqa64	@key[8],$xc0
	vmovdqa64	@key[9],$xc1
	vmovdqa64	@key[10],$xc2
	vmovdqa64	@key[11],$xc3
	vmovdqa64	@key[12],$xd0
	vmovdqa64	@key[13],$xd1
	vmovdqa64	@key[14],$xd2
	vmovdqa64	@key[15],$xd3

	vmovdqa64	$xa0,@key[0]
	vmovdqa64	$xa1,@key[1]
	vmovdqa64	$xa2,@key[2]
	vmovdqa64	$xa3,@key[3]

	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }

	vpaddd	@key[0],$xa0,$xa0	# accumulate key
	vpaddd	@key[1],$xa1,$xa1
	vpaddd	@key[2],$xa2,$xa2
	vpaddd	@key[3],$xa3,$xa3

	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"

($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);

	vpaddd	@key[4],$xb0,$xb0
	vpaddd	@key[5],$xb1,$xb1
	vpaddd	@key[6],$xb2,$xb2
	vpaddd	@key[7],$xb3,$xb3

	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"

($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);

	vshufi32x4	\$0,$xb0,$xa0,$xt3	# "de-interlace" further
	vshufi32x4	\$3,$xb0,$xa0,$xb0
	vshufi32x4	\$0,$xb1,$xa1,$xa0
	vshufi32x4	\$3,$xb1,$xa1,$xb1
	vshufi32x4	\$0,$xb2,$xa2,$xa1
	vshufi32x4	\$3,$xb2,$xa2,$xb2
	vshufi32x4	\$0,$xb3,$xa3,$xa2
	vshufi32x4	\$3,$xb3,$xa3,$xb3
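	# on 256-bit operands vshufi32x4 selects 128-bit halves: imm 0 pairs
	# the two low halves, imm 3 the two high ones (same effect as
	# vperm2i128 with 0x20/0x31 below)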
($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);

	vpaddd	@key[8],$xc0,$xc0
	vpaddd	@key[9],$xc1,$xc1
	vpaddd	@key[10],$xc2,$xc2
	vpaddd	@key[11],$xc3,$xc3

	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"

($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);

	vpaddd	@key[12],$xd0,$xd0
	vpaddd	@key[13],$xd1,$xd1
	vpaddd	@key[14],$xd2,$xd2
	vpaddd	@key[15],$xd3,$xd3

	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"

($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);

	vperm2i128	\$0x20,$xd0,$xc0,$xt3	# "de-interlace" further
	vperm2i128	\$0x31,$xd0,$xc0,$xd0
	vperm2i128	\$0x20,$xd1,$xc1,$xc0
	vperm2i128	\$0x31,$xd1,$xc1,$xd1
	vperm2i128	\$0x20,$xd2,$xc2,$xc1
	vperm2i128	\$0x31,$xd2,$xc2,$xd2
	vperm2i128	\$0x20,$xd3,$xc3,$xc2
	vperm2i128	\$0x31,$xd3,$xc3,$xd3

($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);

	mov	\$0x80,%eax		# size optimization
	vpxord	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vpxor	0x40($inp),$xc0,$xc0
	vpxor	0x60($inp),$xd0,$xd0
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa0,0x00($out)
	vmovdqu	$xb0,0x20($out)
	vmovdqu	$xc0,0x40($out)
	vmovdqu	$xd0,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vpxor	0x40($inp),$xc1,$xc1
	vpxor	0x60($inp),$xd1,$xd1
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu	$xa1,0x00($out)
	vmovdqu	$xb1,0x20($out)
	vmovdqu	$xc1,0x40($out)
	vmovdqu	$xd1,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpxord	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vpxor	0x40($inp),$xc2,$xc2
	vpxor	0x60($inp),$xd2,$xd2
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu32	$xa2,0x00($out)
	vmovdqu	$xb2,0x20($out)
	vmovdqu	$xc2,0x40($out)
	vmovdqu	$xd2,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vpxor	0x40($inp),$xc3,$xc3
	vpxor	0x60($inp),$xd3,$xd3
	lea	($inp,%rax),$inp	# size optimization
	vmovdqu	$xa3,0x00($out)
	vmovdqu	$xb3,0x20($out)
	vmovdqu	$xc3,0x40($out)
	vmovdqu	$xd3,0x60($out)
	lea	($out,%rax),$out	# size optimization

	vpbroadcastd	0(%r10),%ymm0	# reload key
	vpbroadcastd	4(%r10),%ymm1

	vmovdqa64	$xa0,%ymm8	# size optimization

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x20($inp),$xb0,$xb0
	vmovdqu	$xa0,0x00($out,$inp)
	vmovdqu	$xb0,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc0,$xc0
	vpxor	0x20($inp),$xd0,$xd0
	vmovdqu	$xc0,0x00($out,$inp)
	vmovdqu	$xd0,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa1,$xa1
	vpxor	0x20($inp),$xb1,$xb1
	vmovdqu	$xa1,0x00($out,$inp)
	vmovdqu	$xb1,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc1,$xc1
	vpxor	0x20($inp),$xd1,$xd1
	vmovdqu	$xc1,0x00($out,$inp)
	vmovdqu	$xd1,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxord	0x00($inp),$xa2,$xa2
	vpxor	0x20($inp),$xb2,$xb2
	vmovdqu32	$xa2,0x00($out,$inp)
	vmovdqu	$xb2,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xc2,$xc2
	vpxor	0x20($inp),$xd2,$xd2
	vmovdqu	$xc2,0x00($out,$inp)
	vmovdqu	$xd2,0x20($out,$inp)

	jb	.Less_than_64_8xvl
	vpxor	0x00($inp),$xa3,$xa3
	vpxor	0x20($inp),$xb3,$xb3
	vmovdqu	$xa3,0x00($out,$inp)
	vmovdqu	$xb3,0x20($out,$inp)

	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x20(%rsp)
	lea	($out,$inp),$out

	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	mov	%al,-1($out,%r10)

	vpxor	$xa0,$xa0,$xa0
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xa0,0x20(%rsp)
$code.=<<___ if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
.cfi_def_cfa_register	%rsp
.size	ChaCha20_8xvl,.-ChaCha20_8xvl

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#	CONTEXT *context,DISPATCHER_CONTEXT *disp)

.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	lea	.Lctr32_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lno_data(%rip),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	64+24+48(%rax),%rax

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords, for rep movsq
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

.type	simd_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	192($context),%rax	# pull context->R9

	mov	4(%r11),%r10d		# HandlerData[1]
	mov	8(%r11),%ecx		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-8(%rax,%rcx),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	simd_handler,.-simd_handler
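# HandlerData[] layout consumed by simd_handler (see .LSEH_info_* below):
# [0] .rva of the prologue label, [1] .rva of the epilogue label,
# [2] byte count of the xmm save area addressed from the %r9 frame pointer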
	.rva	.LSEH_begin_ChaCha20_ctr32
	.rva	.LSEH_end_ChaCha20_ctr32
	.rva	.LSEH_info_ChaCha20_ctr32

	.rva	.LSEH_begin_ChaCha20_ssse3
	.rva	.LSEH_end_ChaCha20_ssse3
	.rva	.LSEH_info_ChaCha20_ssse3

	.rva	.LSEH_begin_ChaCha20_128
	.rva	.LSEH_end_ChaCha20_128
	.rva	.LSEH_info_ChaCha20_128

	.rva	.LSEH_begin_ChaCha20_4x
	.rva	.LSEH_end_ChaCha20_4x
	.rva	.LSEH_info_ChaCha20_4x
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_ChaCha20_4xop
	.rva	.LSEH_end_ChaCha20_4xop
	.rva	.LSEH_info_ChaCha20_4xop
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_ChaCha20_8x
	.rva	.LSEH_end_ChaCha20_8x
	.rva	.LSEH_info_ChaCha20_8x
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_ChaCha20_avx512
	.rva	.LSEH_end_ChaCha20_avx512
	.rva	.LSEH_info_ChaCha20_avx512

	.rva	.LSEH_begin_ChaCha20_avx512vl
	.rva	.LSEH_end_ChaCha20_avx512vl
	.rva	.LSEH_info_ChaCha20_avx512vl

	.rva	.LSEH_begin_ChaCha20_16x
	.rva	.LSEH_end_ChaCha20_16x
	.rva	.LSEH_info_ChaCha20_16x

	.rva	.LSEH_begin_ChaCha20_8xvl
	.rva	.LSEH_end_ChaCha20_8xvl
	.rva	.LSEH_info_ChaCha20_8xvl

.LSEH_info_ChaCha20_ctr32:

.LSEH_info_ChaCha20_ssse3:
	.rva	.Lssse3_body,.Lssse3_epilogue

.LSEH_info_ChaCha20_128:
	.rva	.L128_body,.L128_epilogue

.LSEH_info_ChaCha20_4x:
	.rva	.L4x_body,.L4x_epilogue
$code.=<<___ if ($avx);
.LSEH_info_ChaCha20_4xop:
	.rva	.L4xop_body,.L4xop_epilogue	# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_ChaCha20_8x:
	.rva	.L8x_body,.L8x_epilogue		# HandlerData[]
$code.=<<___ if ($avx>2);
.LSEH_info_ChaCha20_avx512:
	.rva	.Lavx512_body,.Lavx512_epilogue	# HandlerData[]

.LSEH_info_ChaCha20_avx512vl:
	.rva	.Lavx512vl_body,.Lavx512vl_epilogue	# HandlerData[]

.LSEH_info_ChaCha20_16x:
	.rva	.L16x_body,.L16x_epilogue	# HandlerData[]

.LSEH_info_ChaCha20_8xvl:
	.rva	.L8xvl_body,.L8xvl_epilogue	# HandlerData[]

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/%x#%[yz]/%x/g;	# "down-shift"
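	# e.g. an interpolated "%x#%ymm0" operand comes out as plain "%xmm0"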
close STDOUT or die "error closing STDOUT: $!";