2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # ChaCha20 for x86_64.
23 # Add AVX512F code path.
27 # Add AVX512VL code path.
# Performance in cycles per byte out of a large buffer.
31 # IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v)
33 # P4 9.48/+99% -/22.7(ii) -
34 # Core2 7.83/+55% 7.90/8.08 4.35
35 # Westmere 7.19/+50% 5.60/6.70 3.00
36 # Sandy Bridge 8.31/+42% 5.45/6.76 2.72
37 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41
38 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23
39 # Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.80(vi)]
40 # Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
41 # Knights L 11.7/- - 9.60(iii) 0.80
42 # Goldmont 10.6/+17% 5.10/- 3.28
43 # Sledgehammer 7.28/+52% -/14.2(ii) -
44 # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
45 # Ryzen 5.96/+50% 5.19/- 2.40 2.09
46 # VIA Nano 10.5/+46% 6.72/8.60 6.05
# (i)	compared to older gcc 3.x one can observe >2x improvement on
#	most platforms;
# (ii)	as can be seen, SSE2 performance is too low on legacy
51 # processors; NxSSE2 results are naturally better, but not
52 # impressively better than IALU ones, which is why you won't
53 # find SSE2 code below;
# (iii)	this is not an optimal result for Atom because of MSROM
#	limitations; SSE2 can do better, but the gain is considered too
#	low to justify the [maintenance] effort;
57 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
58 # (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
59 # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
60 # cpb in single thread, the corresponding capability is suppressed;
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
66 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
68 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
69 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
70 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
71 die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
}
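# $avx encodes the most advanced extension the assembler can cope with:
# 1 = AVX, 2 = AVX2, 3 = AVX512; the NASM/MASM/LLVM probes below map
# their own version numbers onto the same scale.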
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
	$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
	$avx += 1 if ($1==2.11 && $2>=8);
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
96 # input parameter block
97 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
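# On the C side the entry point is declared (a sketch, matching the
# usual OpenSSL prototype) as:
#
#	void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#	                    size_t len, const unsigned int key[8],
#	                    const unsigned int counter[4]);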
.extern	OPENSSL_ia32cap_P

.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.asciz	"expand 32-byte k"
.asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
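# .Lsigma is the standard "expand 32-byte k" ChaCha constant; .Lrot16
# and .Lrot24 are pshufb byte-permutation masks implementing the
# quarter-round's left-rotates by 16 and by 8 bits (the latter named
# after the equivalent right-rotate by 24).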
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
    "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
@t=("%esi","%edi");
148 sub ROUND { # critical path is 24 cycles per round
149 my ($a0,$b0,$c0,$d0)=@_;
150 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
151 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
152 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
153 my ($xc,$xc_)=map("\"$_\"",@t);
154 my @x=map("\"$_\"",@x);
	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's stays invariant between rounds. This means that we
	# have to reload them only once per round, in the middle.
	# This is why you'll see a bunch of 'c' stores and loads in
	# the middle, but none at the beginning or end.
	# Normally instructions would be interleaved to favour in-order
	# execution. Generally out-of-order cores manage it gracefully,
	# but not this time for some reason. As in-order cores are a
	# dying breed and old Atom is the only one around, instructions
	# are left uninterleaved. Besides, Atom is better off executing
	# the 1xSSSE3 code anyway...
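	# For reference, the scalar quarter-round being scheduled below
	# (two independent ones are processed side by side) is:
	#
	#	a += b; d ^= a; d <<<= 16;
	#	c += d; b ^= c; b <<<= 12;
	#	a += b; d ^= a; d <<<=  8;
	#	c += d; b ^= c; b <<<=  7;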
186 "&add (@x[$a0],@x[$b0])", # Q1
187 "&xor (@x[$d0],@x[$a0])",
189 "&add (@x[$a1],@x[$b1])", # Q2
190 "&xor (@x[$d1],@x[$a1])",
193 "&add ($xc,@x[$d0])",
194 "&xor (@x[$b0],$xc)",
196 "&add ($xc_,@x[$d1])",
197 "&xor (@x[$b1],$xc_)",
200 "&add (@x[$a0],@x[$b0])",
201 "&xor (@x[$d0],@x[$a0])",
203 "&add (@x[$a1],@x[$b1])",
204 "&xor (@x[$d1],@x[$a1])",
207 "&add ($xc,@x[$d0])",
208 "&xor (@x[$b0],$xc)",
210 "&add ($xc_,@x[$d1])",
211 "&xor (@x[$b1],$xc_)",
214 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
215 "&mov (\"4*$c1(%rsp)\",$xc_)",
216 "&mov ($xc,\"4*$c2(%rsp)\")",
217 "&mov ($xc_,\"4*$c3(%rsp)\")",
219 "&add (@x[$a2],@x[$b2])", # Q3
220 "&xor (@x[$d2],@x[$a2])",
222 "&add (@x[$a3],@x[$b3])", # Q4
223 "&xor (@x[$d3],@x[$a3])",
226 "&add ($xc,@x[$d2])",
227 "&xor (@x[$b2],$xc)",
229 "&add ($xc_,@x[$d3])",
230 "&xor (@x[$b3],$xc_)",
233 "&add (@x[$a2],@x[$b2])",
234 "&xor (@x[$d2],@x[$a2])",
236 "&add (@x[$a3],@x[$b3])",
237 "&xor (@x[$d3],@x[$a3])",
240 "&add ($xc,@x[$d2])",
241 "&xor (@x[$b2],$xc)",
243 "&add ($xc_,@x[$d3])",
244 "&xor (@x[$b3],$xc_)",
249 ########################################################################
250 # Generic code path that handles all lengths on pre-SSSE3 processors.
252 .globl ChaCha20_ctr32
253 .type ChaCha20_ctr32,\@function,5
259 mov OPENSSL_ia32cap_P+4(%rip),%r10
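	# %r10 now holds the 2nd and 3rd 32-bit words of the capability
	# vector, so features are tested relative to bit 32: bit 41-32
	# is SSSE3, bit 48 lands on AVX512F (CPUID.7 EBX bit 16) and the
	# sign bit on AVX512VL (EBX bit 31).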
261 $code.=<<___ if ($avx>2);
	bt	\$48,%r10		# check for AVX512F
	jc	.LChaCha20_avx512
	test	%r10,%r10		# check for AVX512VL
265 js .LChaCha20_avx512vl
	test	\$`1<<(41-32)`,%r10d
	jnz	.LChaCha20_ssse3
284 .cfi_adjust_cfa_offset 64+24
287 #movdqa .Lsigma(%rip),%xmm0
289 movdqu 16($key),%xmm2
290 movdqu ($counter),%xmm3
291 movdqa .Lone(%rip),%xmm4
293 #movdqa %xmm0,4*0(%rsp) # key[0]
294 movdqa %xmm1,4*4(%rsp) # key[1]
295 movdqa %xmm2,4*8(%rsp) # key[2]
296 movdqa %xmm3,4*12(%rsp) # key[3]
297 mov $len,%rbp # reassign $len
302 mov \$0x61707865,@x[0] # 'expa'
303 mov \$0x3320646e,@x[1] # 'nd 3'
304 mov \$0x79622d32,@x[2] # '2-by'
305 mov \$0x6b206574,@x[3] # 'te k'
311 mov 4*13(%rsp),@x[13]
312 mov 4*14(%rsp),@x[14]
313 mov 4*15(%rsp),@x[15]
315 mov %rbp,64+0(%rsp) # save len
317 mov $inp,64+8(%rsp) # save inp
318 movq %xmm2,%rsi # "@x[8]"
319 mov $out,64+16(%rsp) # save out
321 shr \$32,%rdi # "@x[9]"
327 foreach (&ROUND (0, 4, 8,12)) { eval; }
328 foreach (&ROUND (0, 5,10,15)) { eval; }
333 mov @t[1],4*9(%rsp) # modulo-scheduled
335 mov 64(%rsp),%rbp # load len
337 mov 64+8(%rsp),$inp # load inp
338 paddd %xmm4,%xmm3 # increment counter
339 mov 64+16(%rsp),$out # load out
341 add \$0x61707865,@x[0] # 'expa'
342 add \$0x3320646e,@x[1] # 'nd 3'
343 add \$0x79622d32,@x[2] # '2-by'
344 add \$0x6b206574,@x[3] # 'te k'
349 add 4*12(%rsp),@x[12]
350 add 4*13(%rsp),@x[13]
351 add 4*14(%rsp),@x[14]
352 add 4*15(%rsp),@x[15]
353 paddd 4*8(%rsp),%xmm1
358 xor 4*0($inp),@x[0] # xor with input
366 movdqu 4*8($inp),%xmm0
367 xor 4*12($inp),@x[12]
368 xor 4*13($inp),@x[13]
369 xor 4*14($inp),@x[14]
370 xor 4*15($inp),@x[15]
371 lea 4*16($inp),$inp # inp+=64
374 movdqa %xmm2,4*8(%rsp)
375 movd %xmm3,4*12(%rsp)
377 mov @x[0],4*0($out) # write output
385 movdqu %xmm0,4*8($out)
386 mov @x[12],4*12($out)
387 mov @x[13],4*13($out)
388 mov @x[14],4*14($out)
389 mov @x[15],4*15($out)
390 lea 4*16($out),$out # out+=64
408 movdqa %xmm1,4*8(%rsp)
409 mov @x[12],4*12(%rsp)
410 mov @x[13],4*13(%rsp)
411 mov @x[14],4*14(%rsp)
412 mov @x[15],4*15(%rsp)
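	# partial block: the last keystream block now sits on the stack,
	# so the remaining bytes are xored in one at a time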
	movzb	($inp,%rbx),%eax
	movzb	(%rsp,%rbx),%edx
	lea	1(%rbx),%rbx
	xor	%edx,%eax
	mov	%al,-1($out,%rbx)
424 lea 64+24+48(%rsp),%rsi
439 .cfi_def_cfa_register %rsp
443 .size ChaCha20_ctr32,.-ChaCha20_ctr32
446 ########################################################################
447 # SSSE3 code path that handles shorter lengths
449 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
451 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
475 my $xframe = $win64 ? 32+8 : 8;
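# Win64 reserves 32 bytes to preserve the non-volatile xmm6/xmm7 plus 8
# for alignment; the SysV ABI treats all xmm registers as volatile, so
# 8 bytes suffice there.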
478 .type ChaCha20_ssse3,\@function,5
483 mov %rsp,%r9 # frame pointer
484 .cfi_def_cfa_register %r9
486 $code.=<<___ if ($avx);
487 test \$`1<<(43-32)`,%r10d
488 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
491 cmp \$128,$len # we might throw away some data,
492 ja .LChaCha20_4x # but overall it won't be slower
495 sub \$64+$xframe,%rsp
497 $code.=<<___ if ($win64);
498 movaps %xmm6,-0x28(%r9)
499 movaps %xmm7,-0x18(%r9)
503 movdqa .Lsigma(%rip),$a
507 movdqa .Lrot16(%rip),$rot16
508 movdqa .Lrot24(%rip),$rot24
514 mov \$10,$counter # reuse $counter
519 movdqa .Lone(%rip),$d
532 &pshufd ($c,$c,0b01001110);
533 &pshufd ($b,$b,0b00111001);
534 &pshufd ($d,$d,0b10010011);
538 &pshufd ($c,$c,0b01001110);
539 &pshufd ($b,$b,0b10010011);
540 &pshufd ($d,$d,0b00111001);
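	# The first pshufd triplet above rotates the b/c/d rows so that
	# the quarter-round acts on the diagonals; the second triplet
	# rotates them back into column order.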
543 &jnz (".Loop_ssse3");
555 movdqu 0x10($inp),$t1
556 pxor $t,$a # xor with input
559 movdqu 0x30($inp),$t1
560 lea 0x40($inp),$inp # inp+=64
564 movdqu $a,0x00($out) # write output
568 lea 0x40($out),$out # out+=64
571 jnz .Loop_outer_ssse3
581 xor $counter,$counter
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
594 $code.=<<___ if ($win64);
595 movaps -0x28(%r9),%xmm6
596 movaps -0x18(%r9),%xmm7
600 .cfi_def_cfa_register %rsp
604 .size ChaCha20_ssse3,.-ChaCha20_ssse3
608 ########################################################################
609 # SSSE3 code path that handles longer messages.
611 # assign variables to favor Atom front-end
612 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
613 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
614 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
615 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
617 sub SSSE3_lane_ROUND {
618 my ($a0,$b0,$c0,$d0)=@_;
619 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
620 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
621 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
622 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
623 my @x=map("\"$_\"",@xx);
	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's stays invariant between rounds. This means that we
	# have to reload them only once per round, in the middle.
	# This is why you'll see a bunch of 'c' stores and loads in
	# the middle, but none at the beginning or end.
648 "&paddd (@x[$a0],@x[$b0])", # Q1
649 "&paddd (@x[$a1],@x[$b1])", # Q2
650 "&pxor (@x[$d0],@x[$a0])",
651 "&pxor (@x[$d1],@x[$a1])",
652 "&pshufb (@x[$d0],$t1)",
653 "&pshufb (@x[$d1],$t1)",
655 "&paddd ($xc,@x[$d0])",
656 "&paddd ($xc_,@x[$d1])",
657 "&pxor (@x[$b0],$xc)",
658 "&pxor (@x[$b1],$xc_)",
659 "&movdqa ($t0,@x[$b0])",
660 "&pslld (@x[$b0],12)",
662 "&movdqa ($t1,@x[$b1])",
663 "&pslld (@x[$b1],12)",
664 "&por (@x[$b0],$t0)",
666 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
667 "&por (@x[$b1],$t1)",
669 "&paddd (@x[$a0],@x[$b0])",
670 "&paddd (@x[$a1],@x[$b1])",
671 "&pxor (@x[$d0],@x[$a0])",
672 "&pxor (@x[$d1],@x[$a1])",
673 "&pshufb (@x[$d0],$t0)",
674 "&pshufb (@x[$d1],$t0)",
676 "&paddd ($xc,@x[$d0])",
677 "&paddd ($xc_,@x[$d1])",
678 "&pxor (@x[$b0],$xc)",
679 "&pxor (@x[$b1],$xc_)",
680 "&movdqa ($t1,@x[$b0])",
681 "&pslld (@x[$b0],7)",
683 "&movdqa ($t0,@x[$b1])",
684 "&pslld (@x[$b1],7)",
685 "&por (@x[$b0],$t1)",
687 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
688 "&por (@x[$b1],$t0)",
690 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
691 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
692 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
693 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
695 "&paddd (@x[$a2],@x[$b2])", # Q3
696 "&paddd (@x[$a3],@x[$b3])", # Q4
697 "&pxor (@x[$d2],@x[$a2])",
698 "&pxor (@x[$d3],@x[$a3])",
699 "&pshufb (@x[$d2],$t1)",
700 "&pshufb (@x[$d3],$t1)",
702 "&paddd ($xc,@x[$d2])",
703 "&paddd ($xc_,@x[$d3])",
704 "&pxor (@x[$b2],$xc)",
705 "&pxor (@x[$b3],$xc_)",
706 "&movdqa ($t0,@x[$b2])",
707 "&pslld (@x[$b2],12)",
709 "&movdqa ($t1,@x[$b3])",
710 "&pslld (@x[$b3],12)",
711 "&por (@x[$b2],$t0)",
713 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
714 "&por (@x[$b3],$t1)",
716 "&paddd (@x[$a2],@x[$b2])",
717 "&paddd (@x[$a3],@x[$b3])",
718 "&pxor (@x[$d2],@x[$a2])",
719 "&pxor (@x[$d3],@x[$a3])",
720 "&pshufb (@x[$d2],$t0)",
721 "&pshufb (@x[$d3],$t0)",
723 "&paddd ($xc,@x[$d2])",
724 "&paddd ($xc_,@x[$d3])",
725 "&pxor (@x[$b2],$xc)",
726 "&pxor (@x[$b3],$xc_)",
727 "&movdqa ($t1,@x[$b2])",
728 "&pslld (@x[$b2],7)",
730 "&movdqa ($t0,@x[$b3])",
731 "&pslld (@x[$b3],7)",
732 "&por (@x[$b2],$t1)",
734 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
739 my $xframe = $win64 ? 0xa8 : 8;
742 .type ChaCha20_4x,\@function,5
747 mov %rsp,%r9 # frame pointer
748 .cfi_def_cfa_register %r9
751 $code.=<<___ if ($avx>1);
752 shr \$32,%r10 # OPENSSL_ia32cap_P+8
753 test \$`1<<5`,%r10 # test AVX2
760 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
761 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
762 je .Ldo_sse3_after_all # to detect Atom
765 sub \$0x140+$xframe,%rsp
767 ################ stack layout
768 # +0x00 SIMD equivalent of @x[8-12]
770 # +0x40 constant copy of key[0-2] smashed by lanes
772 # +0x100 SIMD counters (with nonce smashed by lanes)
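	# "smashed by lanes" means each 32-bit state word is replicated
	# across all four 128-bit lanes, so register/slot N carries word
	# N of four independent 64-byte blocks processed in parallel.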
775 $code.=<<___ if ($win64);
776 movaps %xmm6,-0xa8(%r9)
777 movaps %xmm7,-0x98(%r9)
778 movaps %xmm8,-0x88(%r9)
779 movaps %xmm9,-0x78(%r9)
780 movaps %xmm10,-0x68(%r9)
781 movaps %xmm11,-0x58(%r9)
782 movaps %xmm12,-0x48(%r9)
783 movaps %xmm13,-0x38(%r9)
784 movaps %xmm14,-0x28(%r9)
785 movaps %xmm15,-0x18(%r9)
789 movdqa .Lsigma(%rip),$xa3 # key[0]
790 movdqu ($key),$xb3 # key[1]
791 movdqu 16($key),$xt3 # key[2]
792 movdqu ($counter),$xd3 # key[3]
793 lea 0x100(%rsp),%rcx # size optimization
794 lea .Lrot16(%rip),%r10
795 lea .Lrot24(%rip),%r11
797 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
798 pshufd \$0x55,$xa3,$xa1
799 movdqa $xa0,0x40(%rsp) # ... and offload
800 pshufd \$0xaa,$xa3,$xa2
801 movdqa $xa1,0x50(%rsp)
802 pshufd \$0xff,$xa3,$xa3
803 movdqa $xa2,0x60(%rsp)
804 movdqa $xa3,0x70(%rsp)
806 pshufd \$0x00,$xb3,$xb0
807 pshufd \$0x55,$xb3,$xb1
808 movdqa $xb0,0x80-0x100(%rcx)
809 pshufd \$0xaa,$xb3,$xb2
810 movdqa $xb1,0x90-0x100(%rcx)
811 pshufd \$0xff,$xb3,$xb3
812 movdqa $xb2,0xa0-0x100(%rcx)
813 movdqa $xb3,0xb0-0x100(%rcx)
815 pshufd \$0x00,$xt3,$xt0 # "$xc0"
816 pshufd \$0x55,$xt3,$xt1 # "$xc1"
817 movdqa $xt0,0xc0-0x100(%rcx)
818 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
819 movdqa $xt1,0xd0-0x100(%rcx)
820 pshufd \$0xff,$xt3,$xt3 # "$xc3"
821 movdqa $xt2,0xe0-0x100(%rcx)
822 movdqa $xt3,0xf0-0x100(%rcx)
824 pshufd \$0x00,$xd3,$xd0
825 pshufd \$0x55,$xd3,$xd1
826 paddd .Linc(%rip),$xd0 # don't save counters yet
827 pshufd \$0xaa,$xd3,$xd2
828 movdqa $xd1,0x110-0x100(%rcx)
829 pshufd \$0xff,$xd3,$xd3
830 movdqa $xd2,0x120-0x100(%rcx)
831 movdqa $xd3,0x130-0x100(%rcx)
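	# $xd0 now carries counters n..n+3, one per lane, courtesy of
	# .Linc above; they are saved to the stack inside the loop.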
837 movdqa 0x40(%rsp),$xa0 # re-load smashed key
838 movdqa 0x50(%rsp),$xa1
839 movdqa 0x60(%rsp),$xa2
840 movdqa 0x70(%rsp),$xa3
841 movdqa 0x80-0x100(%rcx),$xb0
842 movdqa 0x90-0x100(%rcx),$xb1
843 movdqa 0xa0-0x100(%rcx),$xb2
844 movdqa 0xb0-0x100(%rcx),$xb3
845 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
846 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
847 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
848 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
849 movdqa 0x100-0x100(%rcx),$xd0
850 movdqa 0x110-0x100(%rcx),$xd1
851 movdqa 0x120-0x100(%rcx),$xd2
852 movdqa 0x130-0x100(%rcx),$xd3
853 paddd .Lfour(%rip),$xd0 # next SIMD counters
856 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
857 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
858 movdqa (%r10),$xt3 # .Lrot16(%rip)
860 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
866 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
867 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
872 paddd 0x40(%rsp),$xa0 # accumulate key material
873 paddd 0x50(%rsp),$xa1
874 paddd 0x60(%rsp),$xa2
875 paddd 0x70(%rsp),$xa3
877 movdqa $xa0,$xt2 # "de-interlace" data
884 punpcklqdq $xa2,$xa0 # "a0"
886 punpcklqdq $xt3,$xt2 # "a2"
887 punpckhqdq $xa2,$xa1 # "a1"
888 punpckhqdq $xt3,$xa3 # "a3"
890 ($xa2,$xt2)=($xt2,$xa2);
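	# punpckl/hdq followed by punpckl/hqdq is the standard two-step
	# 4x4 transpose of 32-bit words: it converts the lane-sliced
	# state back into four byte-contiguous 64-byte blocks.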
892 paddd 0x80-0x100(%rcx),$xb0
893 paddd 0x90-0x100(%rcx),$xb1
894 paddd 0xa0-0x100(%rcx),$xb2
895 paddd 0xb0-0x100(%rcx),$xb3
897 movdqa $xa0,0x00(%rsp) # offload $xaN
898 movdqa $xa1,0x10(%rsp)
899 movdqa 0x20(%rsp),$xa0 # "xc2"
900 movdqa 0x30(%rsp),$xa1 # "xc3"
909 punpcklqdq $xb2,$xb0 # "b0"
911 punpcklqdq $xt3,$xt2 # "b2"
912 punpckhqdq $xb2,$xb1 # "b1"
913 punpckhqdq $xt3,$xb3 # "b3"
915 ($xb2,$xt2)=($xt2,$xb2);
916 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
918 paddd 0xc0-0x100(%rcx),$xc0
919 paddd 0xd0-0x100(%rcx),$xc1
920 paddd 0xe0-0x100(%rcx),$xc2
921 paddd 0xf0-0x100(%rcx),$xc3
923 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
924 movdqa $xa3,0x30(%rsp)
933 punpcklqdq $xc2,$xc0 # "c0"
935 punpcklqdq $xt3,$xt2 # "c2"
936 punpckhqdq $xc2,$xc1 # "c1"
937 punpckhqdq $xt3,$xc3 # "c3"
939 ($xc2,$xt2)=($xt2,$xc2);
940 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
942 paddd 0x100-0x100(%rcx),$xd0
943 paddd 0x110-0x100(%rcx),$xd1
944 paddd 0x120-0x100(%rcx),$xd2
945 paddd 0x130-0x100(%rcx),$xd3
954 punpcklqdq $xd2,$xd0 # "d0"
956 punpcklqdq $xt3,$xt2 # "d2"
957 punpckhqdq $xd2,$xd1 # "d1"
958 punpckhqdq $xt3,$xd3 # "d3"
960 ($xd2,$xt2)=($xt2,$xd2);
965 movdqu 0x00($inp),$xt0 # xor with input
966 movdqu 0x10($inp),$xt1
967 movdqu 0x20($inp),$xt2
968 movdqu 0x30($inp),$xt3
969 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
974 movdqu $xt0,0x00($out)
975 movdqu 0x40($inp),$xt0
976 movdqu $xt1,0x10($out)
977 movdqu 0x50($inp),$xt1
978 movdqu $xt2,0x20($out)
979 movdqu 0x60($inp),$xt2
980 movdqu $xt3,0x30($out)
981 movdqu 0x70($inp),$xt3
982 lea 0x80($inp),$inp # size optimization
988 movdqu $xt0,0x40($out)
989 movdqu 0x00($inp),$xt0
990 movdqu $xt1,0x50($out)
991 movdqu 0x10($inp),$xt1
992 movdqu $xt2,0x60($out)
993 movdqu 0x20($inp),$xt2
994 movdqu $xt3,0x70($out)
995 lea 0x80($out),$out # size optimization
996 movdqu 0x30($inp),$xt3
1002 movdqu $xt0,0x00($out)
1003 movdqu 0x40($inp),$xt0
1004 movdqu $xt1,0x10($out)
1005 movdqu 0x50($inp),$xt1
1006 movdqu $xt2,0x20($out)
1007 movdqu 0x60($inp),$xt2
1008 movdqu $xt3,0x30($out)
1009 movdqu 0x70($inp),$xt3
1010 lea 0x80($inp),$inp # inp+=64*4
1011 pxor 0x30(%rsp),$xt0
1015 movdqu $xt0,0x40($out)
1016 movdqu $xt1,0x50($out)
1017 movdqu $xt2,0x60($out)
1018 movdqu $xt3,0x70($out)
1019 lea 0x80($out),$out # out+=64*4
1034 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1036 #movdqa $xt0,0x00(%rsp)
1037 movdqa $xb0,0x10(%rsp)
1038 movdqa $xc0,0x20(%rsp)
1039 movdqa $xd0,0x30(%rsp)
1044 movdqu 0x00($inp),$xt0 # xor with input
1045 movdqu 0x10($inp),$xt1
1046 movdqu 0x20($inp),$xt2
1047 movdqu 0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1052 movdqu $xt0,0x00($out)
1053 movdqu $xt1,0x10($out)
1054 movdqu $xt2,0x20($out)
1055 movdqu $xt3,0x30($out)
1058 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1059 lea 0x40($inp),$inp # inp+=64*1
1061 movdqa $xt0,0x00(%rsp)
1062 movdqa $xb1,0x10(%rsp)
1063 lea 0x40($out),$out # out+=64*1
1064 movdqa $xc1,0x20(%rsp)
1065 sub \$64,$len # len-=64*1
1066 movdqa $xd1,0x30(%rsp)
1071 movdqu 0x00($inp),$xt0 # xor with input
1072 movdqu 0x10($inp),$xt1
1073 movdqu 0x20($inp),$xt2
1074 movdqu 0x30($inp),$xt3
1075 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1080 movdqu $xt0,0x00($out)
1081 movdqu 0x40($inp),$xt0
1082 movdqu $xt1,0x10($out)
1083 movdqu 0x50($inp),$xt1
1084 movdqu $xt2,0x20($out)
1085 movdqu 0x60($inp),$xt2
1086 movdqu $xt3,0x30($out)
1087 movdqu 0x70($inp),$xt3
1088 pxor 0x10(%rsp),$xt0
1092 movdqu $xt0,0x40($out)
1093 movdqu $xt1,0x50($out)
1094 movdqu $xt2,0x60($out)
1095 movdqu $xt3,0x70($out)
1098 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1099 lea 0x80($inp),$inp # inp+=64*2
1101 movdqa $xt0,0x00(%rsp)
1102 movdqa $xb2,0x10(%rsp)
1103 lea 0x80($out),$out # out+=64*2
1104 movdqa $xc2,0x20(%rsp)
1105 sub \$128,$len # len-=64*2
1106 movdqa $xd2,0x30(%rsp)
1111 movdqu 0x00($inp),$xt0 # xor with input
1112 movdqu 0x10($inp),$xt1
1113 movdqu 0x20($inp),$xt2
1114 movdqu 0x30($inp),$xt3
1115 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1120 movdqu $xt0,0x00($out)
1121 movdqu 0x40($inp),$xt0
1122 movdqu $xt1,0x10($out)
1123 movdqu 0x50($inp),$xt1
1124 movdqu $xt2,0x20($out)
1125 movdqu 0x60($inp),$xt2
1126 movdqu $xt3,0x30($out)
1127 movdqu 0x70($inp),$xt3
1128 lea 0x80($inp),$inp # size optimization
1129 pxor 0x10(%rsp),$xt0
1134 movdqu $xt0,0x40($out)
1135 movdqu 0x00($inp),$xt0
1136 movdqu $xt1,0x50($out)
1137 movdqu 0x10($inp),$xt1
1138 movdqu $xt2,0x60($out)
1139 movdqu 0x20($inp),$xt2
1140 movdqu $xt3,0x70($out)
1141 lea 0x80($out),$out # size optimization
1142 movdqu 0x30($inp),$xt3
1143 pxor 0x20(%rsp),$xt0
1147 movdqu $xt0,0x00($out)
1148 movdqu $xt1,0x10($out)
1149 movdqu $xt2,0x20($out)
1150 movdqu $xt3,0x30($out)
1153 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1154 lea 0x40($inp),$inp # inp+=64*3
1156 movdqa $xt0,0x00(%rsp)
1157 movdqa $xb3,0x10(%rsp)
1158 lea 0x40($out),$out # out+=64*3
1159 movdqa $xc3,0x20(%rsp)
1160 sub \$192,$len # len-=64*3
1161 movdqa $xd3,0x30(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
1174 $code.=<<___ if ($win64);
1175 movaps -0xa8(%r9),%xmm6
1176 movaps -0x98(%r9),%xmm7
1177 movaps -0x88(%r9),%xmm8
1178 movaps -0x78(%r9),%xmm9
1179 movaps -0x68(%r9),%xmm10
1180 movaps -0x58(%r9),%xmm11
1181 movaps -0x48(%r9),%xmm12
1182 movaps -0x38(%r9),%xmm13
1183 movaps -0x28(%r9),%xmm14
1184 movaps -0x18(%r9),%xmm15
1188 .cfi_def_cfa_register %rsp
1192 .size ChaCha20_4x,.-ChaCha20_4x
1196 ########################################################################
1197 # XOP code path that handles all lengths.
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at the code below, you'll notice that
# the argument order sometimes varies. The order affects instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on FX-4100...
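# (XOP's vprotd is a native 32-bit rotate, so this path needs neither
# the pslld/psrld/por triplets nor the pshufb masks of the other SIMD
# paths; that is what lets 4xXOP deliver the 2.20 cpb quoted in note
# (iv) for Bulldozer.)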
1205 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1206 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1207 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1208 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1210 sub XOP_lane_ROUND {
1211 my ($a0,$b0,$c0,$d0)=@_;
1212 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1213 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1214 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1215 my @x=map("\"$_\"",@xx);
1218 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1219 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1220 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1221 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1222 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1223 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1224 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1225 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1226 "&vprotd (@x[$d0],@x[$d0],16)",
1227 "&vprotd (@x[$d1],@x[$d1],16)",
1228 "&vprotd (@x[$d2],@x[$d2],16)",
1229 "&vprotd (@x[$d3],@x[$d3],16)",
1231 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1232 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1233 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1234 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1235 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1236 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1237 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1238 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1239 "&vprotd (@x[$b0],@x[$b0],12)",
1240 "&vprotd (@x[$b1],@x[$b1],12)",
1241 "&vprotd (@x[$b2],@x[$b2],12)",
1242 "&vprotd (@x[$b3],@x[$b3],12)",
1244 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1245 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1246 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1247 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1248 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1249 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1250 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1251 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1252 "&vprotd (@x[$d0],@x[$d0],8)",
1253 "&vprotd (@x[$d1],@x[$d1],8)",
1254 "&vprotd (@x[$d2],@x[$d2],8)",
1255 "&vprotd (@x[$d3],@x[$d3],8)",
1257 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1258 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1259 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1260 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1261 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1262 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1263 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1264 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1265 "&vprotd (@x[$b0],@x[$b0],7)",
1266 "&vprotd (@x[$b1],@x[$b1],7)",
1267 "&vprotd (@x[$b2],@x[$b2],7)",
1268 "&vprotd (@x[$b3],@x[$b3],7)"
1272 my $xframe = $win64 ? 0xa8 : 8;
1275 .type ChaCha20_4xop,\@function,5
1280 mov %rsp,%r9 # frame pointer
1281 .cfi_def_cfa_register %r9
1282 sub \$0x140+$xframe,%rsp
1284 ################ stack layout
1285 # +0x00 SIMD equivalent of @x[8-12]
1287 # +0x40 constant copy of key[0-2] smashed by lanes
1289 # +0x100 SIMD counters (with nonce smashed by lanes)
1292 $code.=<<___ if ($win64);
1293 movaps %xmm6,-0xa8(%r9)
1294 movaps %xmm7,-0x98(%r9)
1295 movaps %xmm8,-0x88(%r9)
1296 movaps %xmm9,-0x78(%r9)
1297 movaps %xmm10,-0x68(%r9)
1298 movaps %xmm11,-0x58(%r9)
1299 movaps %xmm12,-0x48(%r9)
1300 movaps %xmm13,-0x38(%r9)
1301 movaps %xmm14,-0x28(%r9)
1302 movaps %xmm15,-0x18(%r9)
1308 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1309 vmovdqu ($key),$xb3 # key[1]
1310 vmovdqu 16($key),$xt3 # key[2]
1311 vmovdqu ($counter),$xd3 # key[3]
1312 lea 0x100(%rsp),%rcx # size optimization
1314 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1315 vpshufd \$0x55,$xa3,$xa1
1316 vmovdqa $xa0,0x40(%rsp) # ... and offload
1317 vpshufd \$0xaa,$xa3,$xa2
1318 vmovdqa $xa1,0x50(%rsp)
1319 vpshufd \$0xff,$xa3,$xa3
1320 vmovdqa $xa2,0x60(%rsp)
1321 vmovdqa $xa3,0x70(%rsp)
1323 vpshufd \$0x00,$xb3,$xb0
1324 vpshufd \$0x55,$xb3,$xb1
1325 vmovdqa $xb0,0x80-0x100(%rcx)
1326 vpshufd \$0xaa,$xb3,$xb2
1327 vmovdqa $xb1,0x90-0x100(%rcx)
1328 vpshufd \$0xff,$xb3,$xb3
1329 vmovdqa $xb2,0xa0-0x100(%rcx)
1330 vmovdqa $xb3,0xb0-0x100(%rcx)
1332 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1333 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1334 vmovdqa $xt0,0xc0-0x100(%rcx)
1335 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1336 vmovdqa $xt1,0xd0-0x100(%rcx)
1337 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1338 vmovdqa $xt2,0xe0-0x100(%rcx)
1339 vmovdqa $xt3,0xf0-0x100(%rcx)
1341 vpshufd \$0x00,$xd3,$xd0
1342 vpshufd \$0x55,$xd3,$xd1
1343 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1344 vpshufd \$0xaa,$xd3,$xd2
1345 vmovdqa $xd1,0x110-0x100(%rcx)
1346 vpshufd \$0xff,$xd3,$xd3
1347 vmovdqa $xd2,0x120-0x100(%rcx)
1348 vmovdqa $xd3,0x130-0x100(%rcx)
1354 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1355 vmovdqa 0x50(%rsp),$xa1
1356 vmovdqa 0x60(%rsp),$xa2
1357 vmovdqa 0x70(%rsp),$xa3
1358 vmovdqa 0x80-0x100(%rcx),$xb0
1359 vmovdqa 0x90-0x100(%rcx),$xb1
1360 vmovdqa 0xa0-0x100(%rcx),$xb2
1361 vmovdqa 0xb0-0x100(%rcx),$xb3
1362 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1363 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1364 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1365 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1366 vmovdqa 0x100-0x100(%rcx),$xd0
1367 vmovdqa 0x110-0x100(%rcx),$xd1
1368 vmovdqa 0x120-0x100(%rcx),$xd2
1369 vmovdqa 0x130-0x100(%rcx),$xd3
1370 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1374 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1380 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1381 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1386 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1387 vpaddd 0x50(%rsp),$xa1,$xa1
1388 vpaddd 0x60(%rsp),$xa2,$xa2
1389 vpaddd 0x70(%rsp),$xa3,$xa3
1391 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1392 vmovdqa $xt3,0x30(%rsp)
1394 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1395 vpunpckldq $xa3,$xa2,$xt3
1396 vpunpckhdq $xa1,$xa0,$xa0
1397 vpunpckhdq $xa3,$xa2,$xa2
1398 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1399 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1400 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1401 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1403 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1405 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1406 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1407 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1408 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1410 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1411 vmovdqa $xa1,0x10(%rsp)
1412 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1413 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1415 vpunpckldq $xb1,$xb0,$xt2
1416 vpunpckldq $xb3,$xb2,$xt3
1417 vpunpckhdq $xb1,$xb0,$xb0
1418 vpunpckhdq $xb3,$xb2,$xb2
1419 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1420 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1421 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1422 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1424 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1425 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1427 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1428 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1429 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1430 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1432 vpunpckldq $xc1,$xc0,$xt2
1433 vpunpckldq $xc3,$xc2,$xt3
1434 vpunpckhdq $xc1,$xc0,$xc0
1435 vpunpckhdq $xc3,$xc2,$xc2
1436 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1437 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1438 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1439 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1441 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1443 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1444 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1445 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1446 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1448 vpunpckldq $xd1,$xd0,$xt2
1449 vpunpckldq $xd3,$xd2,$xt3
1450 vpunpckhdq $xd1,$xd0,$xd0
1451 vpunpckhdq $xd3,$xd2,$xd2
1452 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1453 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1454 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1455 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1457 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1458 ($xa0,$xa1)=($xt2,$xt3);
1460 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1461 vmovdqa 0x10(%rsp),$xa1
1466 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1467 vpxor 0x10($inp),$xb0,$xb0
1468 vpxor 0x20($inp),$xc0,$xc0
1469 vpxor 0x30($inp),$xd0,$xd0
1470 vpxor 0x40($inp),$xa1,$xa1
1471 vpxor 0x50($inp),$xb1,$xb1
1472 vpxor 0x60($inp),$xc1,$xc1
1473 vpxor 0x70($inp),$xd1,$xd1
1474 lea 0x80($inp),$inp # size optimization
1475 vpxor 0x00($inp),$xa2,$xa2
1476 vpxor 0x10($inp),$xb2,$xb2
1477 vpxor 0x20($inp),$xc2,$xc2
1478 vpxor 0x30($inp),$xd2,$xd2
1479 vpxor 0x40($inp),$xa3,$xa3
1480 vpxor 0x50($inp),$xb3,$xb3
1481 vpxor 0x60($inp),$xc3,$xc3
1482 vpxor 0x70($inp),$xd3,$xd3
1483 lea 0x80($inp),$inp # inp+=64*4
1485 vmovdqu $xa0,0x00($out)
1486 vmovdqu $xb0,0x10($out)
1487 vmovdqu $xc0,0x20($out)
1488 vmovdqu $xd0,0x30($out)
1489 vmovdqu $xa1,0x40($out)
1490 vmovdqu $xb1,0x50($out)
1491 vmovdqu $xc1,0x60($out)
1492 vmovdqu $xd1,0x70($out)
1493 lea 0x80($out),$out # size optimization
1494 vmovdqu $xa2,0x00($out)
1495 vmovdqu $xb2,0x10($out)
1496 vmovdqu $xc2,0x20($out)
1497 vmovdqu $xd2,0x30($out)
1498 vmovdqu $xa3,0x40($out)
1499 vmovdqu $xb3,0x50($out)
1500 vmovdqu $xc3,0x60($out)
1501 vmovdqu $xd3,0x70($out)
1502 lea 0x80($out),$out # out+=64*4
1512 jae .L192_or_more4xop
1514 jae .L128_or_more4xop
1516 jae .L64_or_more4xop
1519 vmovdqa $xa0,0x00(%rsp)
1520 vmovdqa $xb0,0x10(%rsp)
1521 vmovdqa $xc0,0x20(%rsp)
1522 vmovdqa $xd0,0x30(%rsp)
1527 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1528 vpxor 0x10($inp),$xb0,$xb0
1529 vpxor 0x20($inp),$xc0,$xc0
1530 vpxor 0x30($inp),$xd0,$xd0
1531 vmovdqu $xa0,0x00($out)
1532 vmovdqu $xb0,0x10($out)
1533 vmovdqu $xc0,0x20($out)
1534 vmovdqu $xd0,0x30($out)
1537 lea 0x40($inp),$inp # inp+=64*1
1538 vmovdqa $xa1,0x00(%rsp)
1540 vmovdqa $xb1,0x10(%rsp)
1541 lea 0x40($out),$out # out+=64*1
1542 vmovdqa $xc1,0x20(%rsp)
1543 sub \$64,$len # len-=64*1
1544 vmovdqa $xd1,0x30(%rsp)
1549 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1550 vpxor 0x10($inp),$xb0,$xb0
1551 vpxor 0x20($inp),$xc0,$xc0
1552 vpxor 0x30($inp),$xd0,$xd0
1553 vpxor 0x40($inp),$xa1,$xa1
1554 vpxor 0x50($inp),$xb1,$xb1
1555 vpxor 0x60($inp),$xc1,$xc1
1556 vpxor 0x70($inp),$xd1,$xd1
1558 vmovdqu $xa0,0x00($out)
1559 vmovdqu $xb0,0x10($out)
1560 vmovdqu $xc0,0x20($out)
1561 vmovdqu $xd0,0x30($out)
1562 vmovdqu $xa1,0x40($out)
1563 vmovdqu $xb1,0x50($out)
1564 vmovdqu $xc1,0x60($out)
1565 vmovdqu $xd1,0x70($out)
1568 lea 0x80($inp),$inp # inp+=64*2
1569 vmovdqa $xa2,0x00(%rsp)
1571 vmovdqa $xb2,0x10(%rsp)
1572 lea 0x80($out),$out # out+=64*2
1573 vmovdqa $xc2,0x20(%rsp)
1574 sub \$128,$len # len-=64*2
1575 vmovdqa $xd2,0x30(%rsp)
1580 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1581 vpxor 0x10($inp),$xb0,$xb0
1582 vpxor 0x20($inp),$xc0,$xc0
1583 vpxor 0x30($inp),$xd0,$xd0
1584 vpxor 0x40($inp),$xa1,$xa1
1585 vpxor 0x50($inp),$xb1,$xb1
1586 vpxor 0x60($inp),$xc1,$xc1
1587 vpxor 0x70($inp),$xd1,$xd1
1588 lea 0x80($inp),$inp # size optimization
1589 vpxor 0x00($inp),$xa2,$xa2
1590 vpxor 0x10($inp),$xb2,$xb2
1591 vpxor 0x20($inp),$xc2,$xc2
1592 vpxor 0x30($inp),$xd2,$xd2
1594 vmovdqu $xa0,0x00($out)
1595 vmovdqu $xb0,0x10($out)
1596 vmovdqu $xc0,0x20($out)
1597 vmovdqu $xd0,0x30($out)
1598 vmovdqu $xa1,0x40($out)
1599 vmovdqu $xb1,0x50($out)
1600 vmovdqu $xc1,0x60($out)
1601 vmovdqu $xd1,0x70($out)
1602 lea 0x80($out),$out # size optimization
1603 vmovdqu $xa2,0x00($out)
1604 vmovdqu $xb2,0x10($out)
1605 vmovdqu $xc2,0x20($out)
1606 vmovdqu $xd2,0x30($out)
1609 lea 0x40($inp),$inp # inp+=64*3
1610 vmovdqa $xa3,0x00(%rsp)
1612 vmovdqa $xb3,0x10(%rsp)
1613 lea 0x40($out),$out # out+=64*3
1614 vmovdqa $xc3,0x20(%rsp)
1615 sub \$192,$len # len-=64*3
1616 vmovdqa $xd3,0x30(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
1630 $code.=<<___ if ($win64);
1631 movaps -0xa8(%r9),%xmm6
1632 movaps -0x98(%r9),%xmm7
1633 movaps -0x88(%r9),%xmm8
1634 movaps -0x78(%r9),%xmm9
1635 movaps -0x68(%r9),%xmm10
1636 movaps -0x58(%r9),%xmm11
1637 movaps -0x48(%r9),%xmm12
1638 movaps -0x38(%r9),%xmm13
1639 movaps -0x28(%r9),%xmm14
1640 movaps -0x18(%r9),%xmm15
1644 .cfi_def_cfa_register %rsp
1648 .size ChaCha20_4xop,.-ChaCha20_4xop
1652 ########################################################################
1655 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1656 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1657 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1658 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1660 sub AVX2_lane_ROUND {
1661 my ($a0,$b0,$c0,$d0)=@_;
1662 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1663 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1664 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1665 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1666 my @x=map("\"$_\"",@xx);
	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's stays invariant between rounds. This means that we
	# have to reload them only once per round, in the middle.
	# This is why you'll see a bunch of 'c' stores and loads in
	# the middle, but none at the beginning or end.
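	# $t0/$t1 double as rotate scratch and as holders of the
	# .Lrot16/.Lrot24 pshufb masks, which is why the masks are
	# re-broadcast from %r10/%r11 mid-sequence below.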
1691 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1692 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1693 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1694 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1695 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1696 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1698 "&vpaddd ($xc,$xc,@x[$d0])",
1699 "&vpxor (@x[$b0],$xc,@x[$b0])",
1700 "&vpslld ($t0,@x[$b0],12)",
1701 "&vpsrld (@x[$b0],@x[$b0],20)",
1702 "&vpor (@x[$b0],$t0,@x[$b0])",
1703 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1704 "&vpaddd ($xc_,$xc_,@x[$d1])",
1705 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1706 "&vpslld ($t1,@x[$b1],12)",
1707 "&vpsrld (@x[$b1],@x[$b1],20)",
1708 "&vpor (@x[$b1],$t1,@x[$b1])",
1710 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1711 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1712 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1713 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1714 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1715 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1717 "&vpaddd ($xc,$xc,@x[$d0])",
1718 "&vpxor (@x[$b0],$xc,@x[$b0])",
1719 "&vpslld ($t1,@x[$b0],7)",
1720 "&vpsrld (@x[$b0],@x[$b0],25)",
1721 "&vpor (@x[$b0],$t1,@x[$b0])",
1722 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1723 "&vpaddd ($xc_,$xc_,@x[$d1])",
1724 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1725 "&vpslld ($t0,@x[$b1],7)",
1726 "&vpsrld (@x[$b1],@x[$b1],25)",
1727 "&vpor (@x[$b1],$t0,@x[$b1])",
1729 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1730 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1731 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1732 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1734 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1735 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1736 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1737 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1738 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1739 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1741 "&vpaddd ($xc,$xc,@x[$d2])",
1742 "&vpxor (@x[$b2],$xc,@x[$b2])",
1743 "&vpslld ($t0,@x[$b2],12)",
1744 "&vpsrld (@x[$b2],@x[$b2],20)",
1745 "&vpor (@x[$b2],$t0,@x[$b2])",
1746 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1747 "&vpaddd ($xc_,$xc_,@x[$d3])",
1748 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1749 "&vpslld ($t1,@x[$b3],12)",
1750 "&vpsrld (@x[$b3],@x[$b3],20)",
1751 "&vpor (@x[$b3],$t1,@x[$b3])",
1753 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1754 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1755 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1756 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1757 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1758 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1760 "&vpaddd ($xc,$xc,@x[$d2])",
1761 "&vpxor (@x[$b2],$xc,@x[$b2])",
1762 "&vpslld ($t1,@x[$b2],7)",
1763 "&vpsrld (@x[$b2],@x[$b2],25)",
1764 "&vpor (@x[$b2],$t1,@x[$b2])",
1765 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1766 "&vpaddd ($xc_,$xc_,@x[$d3])",
1767 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1768 "&vpslld ($t0,@x[$b3],7)",
1769 "&vpsrld (@x[$b3],@x[$b3],25)",
1770 "&vpor (@x[$b3],$t0,@x[$b3])"
1774 my $xframe = $win64 ? 0xa8 : 8;
1777 .type ChaCha20_8x,\@function,5
1782 mov %rsp,%r9 # frame register
1783 .cfi_def_cfa_register %r9
1784 sub \$0x280+$xframe,%rsp
1787 $code.=<<___ if ($win64);
1788 movaps %xmm6,-0xa8(%r9)
1789 movaps %xmm7,-0x98(%r9)
1790 movaps %xmm8,-0x88(%r9)
1791 movaps %xmm9,-0x78(%r9)
1792 movaps %xmm10,-0x68(%r9)
1793 movaps %xmm11,-0x58(%r9)
1794 movaps %xmm12,-0x48(%r9)
1795 movaps %xmm13,-0x38(%r9)
1796 movaps %xmm14,-0x28(%r9)
1797 movaps %xmm15,-0x18(%r9)
1803 ################ stack layout
1804 # +0x00 SIMD equivalent of @x[8-12]
1806 # +0x80 constant copy of key[0-2] smashed by lanes
1808 # +0x200 SIMD counters (with nonce smashed by lanes)
1812 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1813 vbroadcasti128 ($key),$xb3 # key[1]
1814 vbroadcasti128 16($key),$xt3 # key[2]
1815 vbroadcasti128 ($counter),$xd3 # key[3]
1816 lea 0x100(%rsp),%rcx # size optimization
1817 lea 0x200(%rsp),%rax # size optimization
1818 lea .Lrot16(%rip),%r10
1819 lea .Lrot24(%rip),%r11
1821 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1822 vpshufd \$0x55,$xa3,$xa1
1823 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1824 vpshufd \$0xaa,$xa3,$xa2
1825 vmovdqa $xa1,0xa0-0x100(%rcx)
1826 vpshufd \$0xff,$xa3,$xa3
1827 vmovdqa $xa2,0xc0-0x100(%rcx)
1828 vmovdqa $xa3,0xe0-0x100(%rcx)
1830 vpshufd \$0x00,$xb3,$xb0
1831 vpshufd \$0x55,$xb3,$xb1
1832 vmovdqa $xb0,0x100-0x100(%rcx)
1833 vpshufd \$0xaa,$xb3,$xb2
1834 vmovdqa $xb1,0x120-0x100(%rcx)
1835 vpshufd \$0xff,$xb3,$xb3
1836 vmovdqa $xb2,0x140-0x100(%rcx)
1837 vmovdqa $xb3,0x160-0x100(%rcx)
1839 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1840 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1841 vmovdqa $xt0,0x180-0x200(%rax)
1842 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1843 vmovdqa $xt1,0x1a0-0x200(%rax)
1844 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1845 vmovdqa $xt2,0x1c0-0x200(%rax)
1846 vmovdqa $xt3,0x1e0-0x200(%rax)
1848 vpshufd \$0x00,$xd3,$xd0
1849 vpshufd \$0x55,$xd3,$xd1
1850 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1851 vpshufd \$0xaa,$xd3,$xd2
1852 vmovdqa $xd1,0x220-0x200(%rax)
1853 vpshufd \$0xff,$xd3,$xd3
1854 vmovdqa $xd2,0x240-0x200(%rax)
1855 vmovdqa $xd3,0x260-0x200(%rax)
1861 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1862 vmovdqa 0xa0-0x100(%rcx),$xa1
1863 vmovdqa 0xc0-0x100(%rcx),$xa2
1864 vmovdqa 0xe0-0x100(%rcx),$xa3
1865 vmovdqa 0x100-0x100(%rcx),$xb0
1866 vmovdqa 0x120-0x100(%rcx),$xb1
1867 vmovdqa 0x140-0x100(%rcx),$xb2
1868 vmovdqa 0x160-0x100(%rcx),$xb3
1869 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1870 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1871 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1872 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1873 vmovdqa 0x200-0x200(%rax),$xd0
1874 vmovdqa 0x220-0x200(%rax),$xd1
1875 vmovdqa 0x240-0x200(%rax),$xd2
1876 vmovdqa 0x260-0x200(%rax),$xd3
1877 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1880 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1881 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1882 vbroadcasti128 (%r10),$xt3
1883 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1890 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1891 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1896 lea 0x200(%rsp),%rax # size optimization
1897 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1898 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1899 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1900 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1902 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1903 vpunpckldq $xa3,$xa2,$xt3
1904 vpunpckhdq $xa1,$xa0,$xa0
1905 vpunpckhdq $xa3,$xa2,$xa2
1906 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1907 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1908 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1909 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1911 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1913 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1914 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1915 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1916 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1918 vpunpckldq $xb1,$xb0,$xt2
1919 vpunpckldq $xb3,$xb2,$xt3
1920 vpunpckhdq $xb1,$xb0,$xb0
1921 vpunpckhdq $xb3,$xb2,$xb2
1922 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1923 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1924 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1925 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1927 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1929 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1930 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1931 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1932 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1933 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1934 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1935 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1936 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1938 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
1939 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1941 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1942 vmovdqa $xa1,0x20(%rsp)
1943 vmovdqa 0x40(%rsp),$xc2 # $xa0
1944 vmovdqa 0x60(%rsp),$xc3 # $xa1
1946 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1947 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1948 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1949 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1951 vpunpckldq $xc1,$xc0,$xt2
1952 vpunpckldq $xc3,$xc2,$xt3
1953 vpunpckhdq $xc1,$xc0,$xc0
1954 vpunpckhdq $xc3,$xc2,$xc2
1955 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1956 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1957 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1958 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1960 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1962 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1963 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1964 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1965 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1967 vpunpckldq $xd1,$xd0,$xt2
1968 vpunpckldq $xd3,$xd2,$xt3
1969 vpunpckhdq $xd1,$xd0,$xd0
1970 vpunpckhdq $xd3,$xd2,$xd2
1971 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1972 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1973 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1974 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1976 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1978 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1979 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1980 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1981 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1982 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1983 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1984 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1985 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1987 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1988 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1989 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1990 ($xa0,$xa1)=($xt2,$xt3);
1992 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1993 vmovdqa 0x20(%rsp),$xa1
1998 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1999 vpxor 0x20($inp),$xb0,$xb0
2000 vpxor 0x40($inp),$xc0,$xc0
2001 vpxor 0x60($inp),$xd0,$xd0
2002 lea 0x80($inp),$inp # size optimization
2003 vmovdqu $xa0,0x00($out)
2004 vmovdqu $xb0,0x20($out)
2005 vmovdqu $xc0,0x40($out)
2006 vmovdqu $xd0,0x60($out)
2007 lea 0x80($out),$out # size optimization
2009 vpxor 0x00($inp),$xa1,$xa1
2010 vpxor 0x20($inp),$xb1,$xb1
2011 vpxor 0x40($inp),$xc1,$xc1
2012 vpxor 0x60($inp),$xd1,$xd1
2013 lea 0x80($inp),$inp # size optimization
2014 vmovdqu $xa1,0x00($out)
2015 vmovdqu $xb1,0x20($out)
2016 vmovdqu $xc1,0x40($out)
2017 vmovdqu $xd1,0x60($out)
2018 lea 0x80($out),$out # size optimization
2020 vpxor 0x00($inp),$xa2,$xa2
2021 vpxor 0x20($inp),$xb2,$xb2
2022 vpxor 0x40($inp),$xc2,$xc2
2023 vpxor 0x60($inp),$xd2,$xd2
2024 lea 0x80($inp),$inp # size optimization
2025 vmovdqu $xa2,0x00($out)
2026 vmovdqu $xb2,0x20($out)
2027 vmovdqu $xc2,0x40($out)
2028 vmovdqu $xd2,0x60($out)
2029 lea 0x80($out),$out # size optimization
2031 vpxor 0x00($inp),$xa3,$xa3
2032 vpxor 0x20($inp),$xb3,$xb3
2033 vpxor 0x40($inp),$xc3,$xc3
2034 vpxor 0x60($inp),$xd3,$xd3
2035 lea 0x80($inp),$inp # size optimization
2036 vmovdqu $xa3,0x00($out)
2037 vmovdqu $xb3,0x20($out)
2038 vmovdqu $xc3,0x40($out)
2039 vmovdqu $xd3,0x60($out)
2040 lea 0x80($out),$out # size optimization
2064 vmovdqa $xa0,0x00(%rsp)
2065 vmovdqa $xb0,0x20(%rsp)
2070 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2071 vpxor 0x20($inp),$xb0,$xb0
2072 vmovdqu $xa0,0x00($out)
2073 vmovdqu $xb0,0x20($out)
2076 lea 0x40($inp),$inp # inp+=64*1
2078 vmovdqa $xc0,0x00(%rsp)
2079 lea 0x40($out),$out # out+=64*1
2080 sub \$64,$len # len-=64*1
2081 vmovdqa $xd0,0x20(%rsp)
2086 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2087 vpxor 0x20($inp),$xb0,$xb0
2088 vpxor 0x40($inp),$xc0,$xc0
2089 vpxor 0x60($inp),$xd0,$xd0
2090 vmovdqu $xa0,0x00($out)
2091 vmovdqu $xb0,0x20($out)
2092 vmovdqu $xc0,0x40($out)
2093 vmovdqu $xd0,0x60($out)
2096 lea 0x80($inp),$inp # inp+=64*2
2098 vmovdqa $xa1,0x00(%rsp)
2099 lea 0x80($out),$out # out+=64*2
2100 sub \$128,$len # len-=64*2
2101 vmovdqa $xb1,0x20(%rsp)
2106 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2107 vpxor 0x20($inp),$xb0,$xb0
2108 vpxor 0x40($inp),$xc0,$xc0
2109 vpxor 0x60($inp),$xd0,$xd0
2110 vpxor 0x80($inp),$xa1,$xa1
2111 vpxor 0xa0($inp),$xb1,$xb1
2112 vmovdqu $xa0,0x00($out)
2113 vmovdqu $xb0,0x20($out)
2114 vmovdqu $xc0,0x40($out)
2115 vmovdqu $xd0,0x60($out)
2116 vmovdqu $xa1,0x80($out)
2117 vmovdqu $xb1,0xa0($out)
2120 lea 0xc0($inp),$inp # inp+=64*3
2122 vmovdqa $xc1,0x00(%rsp)
2123 lea 0xc0($out),$out # out+=64*3
2124 sub \$192,$len # len-=64*3
2125 vmovdqa $xd1,0x20(%rsp)
2130 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2131 vpxor 0x20($inp),$xb0,$xb0
2132 vpxor 0x40($inp),$xc0,$xc0
2133 vpxor 0x60($inp),$xd0,$xd0
2134 vpxor 0x80($inp),$xa1,$xa1
2135 vpxor 0xa0($inp),$xb1,$xb1
2136 vpxor 0xc0($inp),$xc1,$xc1
2137 vpxor 0xe0($inp),$xd1,$xd1
2138 vmovdqu $xa0,0x00($out)
2139 vmovdqu $xb0,0x20($out)
2140 vmovdqu $xc0,0x40($out)
2141 vmovdqu $xd0,0x60($out)
2142 vmovdqu $xa1,0x80($out)
2143 vmovdqu $xb1,0xa0($out)
2144 vmovdqu $xc1,0xc0($out)
2145 vmovdqu $xd1,0xe0($out)
2148 lea 0x100($inp),$inp # inp+=64*4
2150 vmovdqa $xa2,0x00(%rsp)
2151 lea 0x100($out),$out # out+=64*4
2152 sub \$256,$len # len-=64*4
2153 vmovdqa $xb2,0x20(%rsp)
2158 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2159 vpxor 0x20($inp),$xb0,$xb0
2160 vpxor 0x40($inp),$xc0,$xc0
2161 vpxor 0x60($inp),$xd0,$xd0
2162 vpxor 0x80($inp),$xa1,$xa1
2163 vpxor 0xa0($inp),$xb1,$xb1
2164 vpxor 0xc0($inp),$xc1,$xc1
2165 vpxor 0xe0($inp),$xd1,$xd1
2166 vpxor 0x100($inp),$xa2,$xa2
2167 vpxor 0x120($inp),$xb2,$xb2
2168 vmovdqu $xa0,0x00($out)
2169 vmovdqu $xb0,0x20($out)
2170 vmovdqu $xc0,0x40($out)
2171 vmovdqu $xd0,0x60($out)
2172 vmovdqu $xa1,0x80($out)
2173 vmovdqu $xb1,0xa0($out)
2174 vmovdqu $xc1,0xc0($out)
2175 vmovdqu $xd1,0xe0($out)
2176 vmovdqu $xa2,0x100($out)
2177 vmovdqu $xb2,0x120($out)
2180 lea 0x140($inp),$inp # inp+=64*5
2182 vmovdqa $xc2,0x00(%rsp)
2183 lea 0x140($out),$out # out+=64*5
2184 sub \$320,$len # len-=64*5
2185 vmovdqa $xd2,0x20(%rsp)
2190 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2191 vpxor 0x20($inp),$xb0,$xb0
2192 vpxor 0x40($inp),$xc0,$xc0
2193 vpxor 0x60($inp),$xd0,$xd0
2194 vpxor 0x80($inp),$xa1,$xa1
2195 vpxor 0xa0($inp),$xb1,$xb1
2196 vpxor 0xc0($inp),$xc1,$xc1
2197 vpxor 0xe0($inp),$xd1,$xd1
2198 vpxor 0x100($inp),$xa2,$xa2
2199 vpxor 0x120($inp),$xb2,$xb2
2200 vpxor 0x140($inp),$xc2,$xc2
2201 vpxor 0x160($inp),$xd2,$xd2
2202 vmovdqu $xa0,0x00($out)
2203 vmovdqu $xb0,0x20($out)
2204 vmovdqu $xc0,0x40($out)
2205 vmovdqu $xd0,0x60($out)
2206 vmovdqu $xa1,0x80($out)
2207 vmovdqu $xb1,0xa0($out)
2208 vmovdqu $xc1,0xc0($out)
2209 vmovdqu $xd1,0xe0($out)
2210 vmovdqu $xa2,0x100($out)
2211 vmovdqu $xb2,0x120($out)
2212 vmovdqu $xc2,0x140($out)
2213 vmovdqu $xd2,0x160($out)
2216 lea 0x180($inp),$inp # inp+=64*6
2218 vmovdqa $xa3,0x00(%rsp)
2219 lea 0x180($out),$out # out+=64*6
2220 sub \$384,$len # len-=64*6
2221 vmovdqa $xb3,0x20(%rsp)
2226 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2227 vpxor 0x20($inp),$xb0,$xb0
2228 vpxor 0x40($inp),$xc0,$xc0
2229 vpxor 0x60($inp),$xd0,$xd0
2230 vpxor 0x80($inp),$xa1,$xa1
2231 vpxor 0xa0($inp),$xb1,$xb1
2232 vpxor 0xc0($inp),$xc1,$xc1
2233 vpxor 0xe0($inp),$xd1,$xd1
2234 vpxor 0x100($inp),$xa2,$xa2
2235 vpxor 0x120($inp),$xb2,$xb2
2236 vpxor 0x140($inp),$xc2,$xc2
2237 vpxor 0x160($inp),$xd2,$xd2
2238 vpxor 0x180($inp),$xa3,$xa3
2239 vpxor 0x1a0($inp),$xb3,$xb3
2240 vmovdqu $xa0,0x00($out)
2241 vmovdqu $xb0,0x20($out)
2242 vmovdqu $xc0,0x40($out)
2243 vmovdqu $xd0,0x60($out)
2244 vmovdqu $xa1,0x80($out)
2245 vmovdqu $xb1,0xa0($out)
2246 vmovdqu $xc1,0xc0($out)
2247 vmovdqu $xd1,0xe0($out)
2248 vmovdqu $xa2,0x100($out)
2249 vmovdqu $xb2,0x120($out)
2250 vmovdqu $xc2,0x140($out)
2251 vmovdqu $xd2,0x160($out)
2252 vmovdqu $xa3,0x180($out)
2253 vmovdqu $xb3,0x1a0($out)
2256 lea 0x1c0($inp),$inp # inp+=64*7
2258 vmovdqa $xc3,0x00(%rsp)
2259 lea 0x1c0($out),$out # out+=64*7
2260 sub \$448,$len # len-=64*7
2261 vmovdqa $xd3,0x20(%rsp)
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
2275 $code.=<<___ if ($win64);
2276 movaps -0xa8(%r9),%xmm6
2277 movaps -0x98(%r9),%xmm7
2278 movaps -0x88(%r9),%xmm8
2279 movaps -0x78(%r9),%xmm9
2280 movaps -0x68(%r9),%xmm10
2281 movaps -0x58(%r9),%xmm11
2282 movaps -0x48(%r9),%xmm12
2283 movaps -0x38(%r9),%xmm13
2284 movaps -0x28(%r9),%xmm14
2285 movaps -0x18(%r9),%xmm15
2289 .cfi_def_cfa_register %rsp
2293 .size ChaCha20_8x,.-ChaCha20_8x
2297 ########################################################################
2300 # This one handles shorter inputs...
2302 my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
2303 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
sub vpxord()		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
	    $opcode = "vpxord";
	    last;
	}
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}
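# vpxor enjoys the shorter VEX encoding, but VEX cannot encode zmm
# registers or [xy]mm16..31; for such operands the EVEX-only vpxord
# spelling has to be emitted instead, which is what the test above
# implements.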
2318 sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
2336 my $xframe = $win64 ? 32+8 : 8;
2339 .type ChaCha20_avx512,\@function,5
2344 mov %rsp,%r9 # frame pointer
2345 .cfi_def_cfa_register %r9
2349 sub \$64+$xframe,%rsp
2351 $code.=<<___ if ($win64);
2352 movaps %xmm6,-0x28(%r9)
2353 movaps %xmm7,-0x18(%r9)
2357 vbroadcasti32x4 .Lsigma(%rip),$a
2358 vbroadcasti32x4 ($key),$b
2359 vbroadcasti32x4 16($key),$c
2360 vbroadcasti32x4 ($counter),$d
2365 vpaddd .Lzeroz(%rip),$d,$d
2366 vmovdqa32 .Lfourz(%rip),$fourz
2367 mov \$10,$counter # reuse $counter
2376 vpaddd $fourz,$d_,$d
2385 &vpshufd ($c,$c,0b01001110);
2386 &vpshufd ($b,$b,0b00111001);
2387 &vpshufd ($d,$d,0b10010011);
2390 &vpshufd ($c,$c,0b01001110);
2391 &vpshufd ($b,$b,0b10010011);
2392 &vpshufd ($d,$d,0b00111001);
2395 &jnz (".Loop_avx512");
2406 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2407 vpxor 0x10($inp),%x#$b,$t1
2408 vpxor 0x20($inp),%x#$c,$t2
2409 vpxor 0x30($inp),%x#$d,$t3
2410 lea 0x40($inp),$inp # inp+=64
2412 vmovdqu $t0,0x00($out) # write output
2413 vmovdqu $t1,0x10($out)
2414 vmovdqu $t2,0x20($out)
2415 vmovdqu $t3,0x30($out)
2416 lea 0x40($out),$out # out+=64
2420 vextracti32x4 \$1,$a,$t0
2421 vextracti32x4 \$1,$b,$t1
2422 vextracti32x4 \$1,$c,$t2
2423 vextracti32x4 \$1,$d,$t3
2428 vpxor 0x00($inp),$t0,$t0 # xor with input
2429 vpxor 0x10($inp),$t1,$t1
2430 vpxor 0x20($inp),$t2,$t2
2431 vpxor 0x30($inp),$t3,$t3
2432 lea 0x40($inp),$inp # inp+=64
2434 vmovdqu $t0,0x00($out) # write output
2435 vmovdqu $t1,0x10($out)
2436 vmovdqu $t2,0x20($out)
2437 vmovdqu $t3,0x30($out)
2438 lea 0x40($out),$out # out+=64
2442 vextracti32x4 \$2,$a,$t0
2443 vextracti32x4 \$2,$b,$t1
2444 vextracti32x4 \$2,$c,$t2
2445 vextracti32x4 \$2,$d,$t3
2450 vpxor 0x00($inp),$t0,$t0 # xor with input
2451 vpxor 0x10($inp),$t1,$t1
2452 vpxor 0x20($inp),$t2,$t2
2453 vpxor 0x30($inp),$t3,$t3
2454 lea 0x40($inp),$inp # inp+=64
2456 vmovdqu $t0,0x00($out) # write output
2457 vmovdqu $t1,0x10($out)
2458 vmovdqu $t2,0x20($out)
2459 vmovdqu $t3,0x30($out)
2460 lea 0x40($out),$out # out+=64
2464 vextracti32x4 \$3,$a,$t0
2465 vextracti32x4 \$3,$b,$t1
2466 vextracti32x4 \$3,$c,$t2
2467 vextracti32x4 \$3,$d,$t3
2472 vpxor 0x00($inp),$t0,$t0 # xor with input
2473 vpxor 0x10($inp),$t1,$t1
2474 vpxor 0x20($inp),$t2,$t2
2475 vpxor 0x30($inp),$t3,$t3
2476 lea 0x40($inp),$inp # inp+=64
2478 vmovdqu $t0,0x00($out) # write output
2479 vmovdqu $t1,0x10($out)
2480 vmovdqu $t2,0x20($out)
2481 vmovdqu $t3,0x30($out)
2482 lea 0x40($out),$out # out+=64
2484 jnz .Loop_outer_avx512
2490 vmovdqa %x#$a,0x00(%rsp)
2491 vmovdqa %x#$b,0x10(%rsp)
2492 vmovdqa %x#$c,0x20(%rsp)
2493 vmovdqa %x#$d,0x30(%rsp)
2495 jmp .Loop_tail_avx512
2499 vmovdqa $t0,0x00(%rsp)
2500 vmovdqa $t1,0x10(%rsp)
2501 vmovdqa $t2,0x20(%rsp)
2502 vmovdqa $t3,0x30(%rsp)
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
2512 jnz .Loop_tail_avx512
2514 vmovdqu32 $a_,0x00(%rsp)
2519 $code.=<<___ if ($win64);
2520 movaps -0x28(%r9),%xmm6
2521 movaps -0x18(%r9),%xmm7
2525 .cfi_def_cfa_register %rsp
2529 .size ChaCha20_avx512,.-ChaCha20_avx512
2532 map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
2535 .type ChaCha20_avx512vl,\@function,5
2539 .LChaCha20_avx512vl:
2540 mov %rsp,%r9 # frame pointer
2541 .cfi_def_cfa_register %r9
2545 sub \$64+$xframe,%rsp
2547 $code.=<<___ if ($win64);
2548 movaps %xmm6,-0x28(%r9)
2549 movaps %xmm7,-0x18(%r9)
2553 vbroadcasti128 .Lsigma(%rip),$a
2554 vbroadcasti128 ($key),$b
2555 vbroadcasti128 16($key),$c
2556 vbroadcasti128 ($counter),$d
2561 vpaddd .Lzeroz(%rip),$d,$d
2562 vmovdqa32 .Ltwoy(%rip),$fourz
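	# despite its name, $fourz holds .Ltwoy here: the ymm path runs
	# two blocks per iteration, so the counters advance by 2.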
2563 mov \$10,$counter # reuse $counter
2568 .Loop_outer_avx512vl:
2570 vpaddd $fourz,$d_,$d
2579 &vpshufd ($c,$c,0b01001110);
2580 &vpshufd ($b,$b,0b00111001);
2581 &vpshufd ($d,$d,0b10010011);
2584 &vpshufd ($c,$c,0b01001110);
2585 &vpshufd ($b,$b,0b10010011);
2586 &vpshufd ($d,$d,0b00111001);
2589 &jnz (".Loop_avx512vl");
2598 jb .Ltail64_avx512vl
2600 vpxor 0x00($inp),%x#$a,$t0 # xor with input
2601 vpxor 0x10($inp),%x#$b,$t1
2602 vpxor 0x20($inp),%x#$c,$t2
2603 vpxor 0x30($inp),%x#$d,$t3
2604 lea 0x40($inp),$inp # inp+=64
2606 vmovdqu $t0,0x00($out) # write output
2607 vmovdqu $t1,0x10($out)
2608 vmovdqu $t2,0x20($out)
2609 vmovdqu $t3,0x30($out)
2610 lea 0x40($out),$out # out+=64
2614 vextracti128 \$1,$a,$t0
2615 vextracti128 \$1,$b,$t1
2616 vextracti128 \$1,$c,$t2
2617 vextracti128 \$1,$d,$t3
2622 vpxor 0x00($inp),$t0,$t0 # xor with input
2623 vpxor 0x10($inp),$t1,$t1
2624 vpxor 0x20($inp),$t2,$t2
2625 vpxor 0x30($inp),$t3,$t3
2626 lea 0x40($inp),$inp # inp+=64
2628 vmovdqu $t0,0x00($out) # write output
2629 vmovdqu $t1,0x10($out)
2630 vmovdqu $t2,0x20($out)
2631 vmovdqu $t3,0x30($out)
2632 lea 0x40($out),$out # out+=64
2636 jnz .Loop_outer_avx512vl
2642 vmovdqa %x#$a,0x00(%rsp)
2643 vmovdqa %x#$b,0x10(%rsp)
2644 vmovdqa %x#$c,0x20(%rsp)
2645 vmovdqa %x#$d,0x30(%rsp)
2647 jmp .Loop_tail_avx512vl
2651 vmovdqa $t0,0x00(%rsp)
2652 vmovdqa $t1,0x10(%rsp)
2653 vmovdqa $t2,0x20(%rsp)
2654 vmovdqa $t3,0x30(%rsp)
2657 .Loop_tail_avx512vl:
2658 movzb ($inp,$counter),%eax
2659 movzb (%rsp,$counter),%ecx
2660 lea 1($counter),$counter
2662 mov %al,-1($out,$counter)
2664 jnz .Loop_tail_avx512vl
2666 vmovdqu32 $a_,0x00(%rsp)
2667 vmovdqu32 $a_,0x20(%rsp)
2672 $code.=<<___ if ($win64);
2673 movaps -0x28(%r9),%xmm6
2674 movaps -0x18(%r9),%xmm7
2678 .cfi_def_cfa_register %rsp
2679 .Lavx512vl_epilogue:
2682 .size ChaCha20_avx512vl,.-ChaCha20_avx512vl
2686 # This one handles longer inputs...
2688 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2689 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2690 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2691 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2692 my @key=map("%zmm$_",(16..31));
2693 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
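# A minimal pure-Perl model of one ChaCha quarter-round, kept here only
# as a readable reference for the vpaddd/vpxord/vprold groups that
# AVX512_lane_ROUND() below emits, four quarter-rounds at a time and
# across 16 dwords of a zmm register each. The sub name is ours and
# nothing in this file calls it.
sub ref_quarter_round {
my ($a,$b,$c,$d)=@_;
my $rotl = sub { my ($x,$n)=@_; (($x<<$n)|($x>>(32-$n)))&0xffffffff; };

	$a=($a+$b)&0xffffffff;	$d=$rotl->($d^$a,16);
	$c=($c+$d)&0xffffffff;	$b=$rotl->($b^$c,12);
	$a=($a+$b)&0xffffffff;	$d=$rotl->($d^$a,8);
	$c=($c+$d)&0xffffffff;	$b=$rotl->($b^$c,7);

	($a,$b,$c,$d);
}
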
2695 sub AVX512_lane_ROUND {
2696 my ($a0,$b0,$c0,$d0)=@_;
2697 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2698 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2699 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
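# ($_&~3)+(($_+1)&3) rotates each index within its group of four, so a
# single call covers all four quartets: (0,4,8,12) also yields
# (1,5,9,13), (2,6,10,14) and (3,7,11,15), while (0,5,10,15) yields the
# remaining diagonals (1,6,11,12), (2,7,8,13) and (3,4,9,14).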
2700 my @x=map("\"$_\"",@xx);
2703 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2704 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2705 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2706 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2707 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2708 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2709 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2710 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2711 "&vprold (@x[$d0],@x[$d0],16)",
2712 "&vprold (@x[$d1],@x[$d1],16)",
2713 "&vprold (@x[$d2],@x[$d2],16)",
2714 "&vprold (@x[$d3],@x[$d3],16)",
2716 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2717 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2718 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2719 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2720 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2721 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2722 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2723 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2724 "&vprold (@x[$b0],@x[$b0],12)",
2725 "&vprold (@x[$b1],@x[$b1],12)",
2726 "&vprold (@x[$b2],@x[$b2],12)",
2727 "&vprold (@x[$b3],@x[$b3],12)",
2729 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2730 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2731 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2732 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2733 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2734 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2735 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2736 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2737 "&vprold (@x[$d0],@x[$d0],8)",
2738 "&vprold (@x[$d1],@x[$d1],8)",
2739 "&vprold (@x[$d2],@x[$d2],8)",
2740 "&vprold (@x[$d3],@x[$d3],8)",
2742 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2743 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2744 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2745 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2746 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2747 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2748 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2749 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2750 "&vprold (@x[$b0],@x[$b0],7)",
2751 "&vprold (@x[$b1],@x[$b1],7)",
2752 "&vprold (@x[$b2],@x[$b2],7)",
2753 "&vprold (@x[$b3],@x[$b3],7)"
2757 my $xframe = $win64 ? 0xa8 : 8;
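# 0xa8 covers the Win64-mandated save area for the ten non-volatile
# registers xmm6-xmm15 (10 regs x 16 bytes) plus 8 bytes of padding;
# 8 bytes suffice elsewhere.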
2760 .type ChaCha20_16x,\@function,5
2765 mov %rsp,%r9 # frame register
2766 .cfi_def_cfa_register %r9
2767 sub \$64+$xframe,%rsp
2770 $code.=<<___ if ($win64);
2771 movaps %xmm6,-0xa8(%r9)
2772 movaps %xmm7,-0x98(%r9)
2773 movaps %xmm8,-0x88(%r9)
2774 movaps %xmm9,-0x78(%r9)
2775 movaps %xmm10,-0x68(%r9)
2776 movaps %xmm11,-0x58(%r9)
2777 movaps %xmm12,-0x48(%r9)
2778 movaps %xmm13,-0x38(%r9)
2779 movaps %xmm14,-0x28(%r9)
2780 movaps %xmm15,-0x18(%r9)
2786 lea .Lsigma(%rip),%r10
2787 vbroadcasti32x4 (%r10),$xa3 # key[0]
2788 vbroadcasti32x4 ($key),$xb3 # key[1]
2789 vbroadcasti32x4 16($key),$xc3 # key[2]
2790 vbroadcasti32x4 ($counter),$xd3 # key[3]
2792 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2793 vpshufd \$0x55,$xa3,$xa1
2794 vpshufd \$0xaa,$xa3,$xa2
2795 vpshufd \$0xff,$xa3,$xa3
2796 vmovdqa64 $xa0,@key[0]
2797 vmovdqa64 $xa1,@key[1]
2798 vmovdqa64 $xa2,@key[2]
2799 vmovdqa64 $xa3,@key[3]
2801 vpshufd \$0x00,$xb3,$xb0
2802 vpshufd \$0x55,$xb3,$xb1
2803 vpshufd \$0xaa,$xb3,$xb2
2804 vpshufd \$0xff,$xb3,$xb3
2805 vmovdqa64 $xb0,@key[4]
2806 vmovdqa64 $xb1,@key[5]
2807 vmovdqa64 $xb2,@key[6]
2808 vmovdqa64 $xb3,@key[7]
2810 vpshufd \$0x00,$xc3,$xc0
2811 vpshufd \$0x55,$xc3,$xc1
2812 vpshufd \$0xaa,$xc3,$xc2
2813 vpshufd \$0xff,$xc3,$xc3
2814 vmovdqa64 $xc0,@key[8]
2815 vmovdqa64 $xc1,@key[9]
2816 vmovdqa64 $xc2,@key[10]
2817 vmovdqa64 $xc3,@key[11]
2819 vpshufd \$0x00,$xd3,$xd0
2820 vpshufd \$0x55,$xd3,$xd1
2821 vpshufd \$0xaa,$xd3,$xd2
2822 vpshufd \$0xff,$xd3,$xd3
2823 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2824 vmovdqa64 $xd0,@key[12]
2825 vmovdqa64 $xd1,@key[13]
2826 vmovdqa64 $xd2,@key[14]
2827 vmovdqa64 $xd3,@key[15]
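
	# Each vpshufd \$0x00/0x55/0xaa/0xff above replicates one dword
	# of its broadcast 128-bit source row across a whole register,
	# so @key[0..15] now hold the 16 ChaCha state words, each spread
	# over 16 independent blocks; .Lincz added the lane indices
	# 0..15 so every block has its own counter.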
2834 vpbroadcastd 0(%r10),$xa0 # reload key
2835 vpbroadcastd 4(%r10),$xa1
2836 vpbroadcastd 8(%r10),$xa2
2837 vpbroadcastd 12(%r10),$xa3
2838 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2839 vmovdqa64 @key[4],$xb0
2840 vmovdqa64 @key[5],$xb1
2841 vmovdqa64 @key[6],$xb2
2842 vmovdqa64 @key[7],$xb3
2843 vmovdqa64 @key[8],$xc0
2844 vmovdqa64 @key[9],$xc1
2845 vmovdqa64 @key[10],$xc2
2846 vmovdqa64 @key[11],$xc3
2847 vmovdqa64 @key[12],$xd0
2848 vmovdqa64 @key[13],$xd1
2849 vmovdqa64 @key[14],$xd2
2850 vmovdqa64 @key[15],$xd3
2852 vmovdqa64 $xa0,@key[0]
2853 vmovdqa64 $xa1,@key[1]
2854 vmovdqa64 $xa2,@key[2]
2855 vmovdqa64 $xa3,@key[3]
2863 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2864 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
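# Each pass through these two calls emits one column round plus one
# diagonal round, i.e. a double round over all 16 blocks; the
# surrounding loop control (elided in this listing) repeats it ten
# times for ChaCha20's 20 rounds.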
2869 vpaddd @key[0],$xa0,$xa0 # accumulate key
2870 vpaddd @key[1],$xa1,$xa1
2871 vpaddd @key[2],$xa2,$xa2
2872 vpaddd @key[3],$xa3,$xa3
2874 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2875 vpunpckldq $xa3,$xa2,$xt3
2876 vpunpckhdq $xa1,$xa0,$xa0
2877 vpunpckhdq $xa3,$xa2,$xa2
2878 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2879 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2880 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2881 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2883 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
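# The dword unpacks interleave 32-bit words and the qword unpacks
# interleave 64-bit halves, a 4x4 dword transpose within each 128-bit
# lane; the Perl list assignment above merely renames registers instead
# of moving data. The same pattern repeats for the b/c/d quartets
# below.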
2885 vpaddd @key[4],$xb0,$xb0
2886 vpaddd @key[5],$xb1,$xb1
2887 vpaddd @key[6],$xb2,$xb2
2888 vpaddd @key[7],$xb3,$xb3
2890 vpunpckldq $xb1,$xb0,$xt2
2891 vpunpckldq $xb3,$xb2,$xt3
2892 vpunpckhdq $xb1,$xb0,$xb0
2893 vpunpckhdq $xb3,$xb2,$xb2
2894 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2895 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2896 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2897 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2899 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2901 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2902 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2903 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2904 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2905 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2906 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2907 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2908 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2910 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2912 vpaddd @key[8],$xc0,$xc0
2913 vpaddd @key[9],$xc1,$xc1
2914 vpaddd @key[10],$xc2,$xc2
2915 vpaddd @key[11],$xc3,$xc3
2917 vpunpckldq $xc1,$xc0,$xt2
2918 vpunpckldq $xc3,$xc2,$xt3
2919 vpunpckhdq $xc1,$xc0,$xc0
2920 vpunpckhdq $xc3,$xc2,$xc2
2921 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2922 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2923 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2924 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2926 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2928 vpaddd @key[12],$xd0,$xd0
2929 vpaddd @key[13],$xd1,$xd1
2930 vpaddd @key[14],$xd2,$xd2
2931 vpaddd @key[15],$xd3,$xd3
2933 vpunpckldq $xd1,$xd0,$xt2
2934 vpunpckldq $xd3,$xd2,$xt3
2935 vpunpckhdq $xd1,$xd0,$xd0
2936 vpunpckhdq $xd3,$xd2,$xd2
2937 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2938 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2939 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2940 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2942 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2944 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2945 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2946 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2947 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2948 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2949 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2950 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2951 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2953 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2955 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2956 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2957 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2958 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2959 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2960 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2961 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2962 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2963 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2964 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2965 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2966 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2967 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2968 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2969 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2970 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
2972 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2973 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2975 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2976 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2977 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2978 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
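# After the final vshufi32x4 \$0x88/\$0xdd pass the registers hold 16
# consecutive 64-byte key-stream blocks in output order, which is why
# the stores below land 0x40 bytes apart.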
2983 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2984 vpxord 0x40($inp),$xb0,$xb0
2985 vpxord 0x80($inp),$xc0,$xc0
2986 vpxord 0xc0($inp),$xd0,$xd0
2987 vmovdqu32 $xa0,0x00($out)
2988 vmovdqu32 $xb0,0x40($out)
2989 vmovdqu32 $xc0,0x80($out)
2990 vmovdqu32 $xd0,0xc0($out)
2992 vpxord 0x100($inp),$xa1,$xa1
2993 vpxord 0x140($inp),$xb1,$xb1
2994 vpxord 0x180($inp),$xc1,$xc1
2995 vpxord 0x1c0($inp),$xd1,$xd1
2996 vmovdqu32 $xa1,0x100($out)
2997 vmovdqu32 $xb1,0x140($out)
2998 vmovdqu32 $xc1,0x180($out)
2999 vmovdqu32 $xd1,0x1c0($out)
3001 vpxord 0x200($inp),$xa2,$xa2
3002 vpxord 0x240($inp),$xb2,$xb2
3003 vpxord 0x280($inp),$xc2,$xc2
3004 vpxord 0x2c0($inp),$xd2,$xd2
3005 vmovdqu32 $xa2,0x200($out)
3006 vmovdqu32 $xb2,0x240($out)
3007 vmovdqu32 $xc2,0x280($out)
3008 vmovdqu32 $xd2,0x2c0($out)
3010 vpxord 0x300($inp),$xa3,$xa3
3011 vpxord 0x340($inp),$xb3,$xb3
3012 vpxord 0x380($inp),$xc3,$xc3
3013 vpxord 0x3c0($inp),$xd3,$xd3
3014 lea 0x400($inp),$inp
3015 vmovdqu32 $xa3,0x300($out)
3016 vmovdqu32 $xb3,0x340($out)
3017 vmovdqu32 $xc3,0x380($out)
3018 vmovdqu32 $xd3,0x3c0($out)
3019 lea 0x400($out),$out
3031 jb .Less_than_64_16x
3032 vpxord ($inp),$xa0,$xa0 # xor with input
3033 vmovdqu32 $xa0,($out,$inp)
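	# stores in this tail address ($out,$inp) while only $inp
	# advances, which implies $out was reduced by $inp in the elided
	# tail setup; each 64-byte unit first checks whether that much
	# input remains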
3039 jb .Less_than_64_16x
3040 vpxord ($inp),$xb0,$xb0
3041 vmovdqu32 $xb0,($out,$inp)
3047 jb .Less_than_64_16x
3048 vpxord ($inp),$xc0,$xc0
3049 vmovdqu32 $xc0,($out,$inp)
3055 jb .Less_than_64_16x
3056 vpxord ($inp),$xd0,$xd0
3057 vmovdqu32 $xd0,($out,$inp)
3063 jb .Less_than_64_16x
3064 vpxord ($inp),$xa1,$xa1
3065 vmovdqu32 $xa1,($out,$inp)
3071 jb .Less_than_64_16x
3072 vpxord ($inp),$xb1,$xb1
3073 vmovdqu32 $xb1,($out,$inp)
3079 jb .Less_than_64_16x
3080 vpxord ($inp),$xc1,$xc1
3081 vmovdqu32 $xc1,($out,$inp)
3087 jb .Less_than_64_16x
3088 vpxord ($inp),$xd1,$xd1
3089 vmovdqu32 $xd1,($out,$inp)
3095 jb .Less_than_64_16x
3096 vpxord ($inp),$xa2,$xa2
3097 vmovdqu32 $xa2,($out,$inp)
3103 jb .Less_than_64_16x
3104 vpxord ($inp),$xb2,$xb2
3105 vmovdqu32 $xb2,($out,$inp)
3111 jb .Less_than_64_16x
3112 vpxord ($inp),$xc2,$xc2
3113 vmovdqu32 $xc2,($out,$inp)
3119 jb .Less_than_64_16x
3120 vpxord ($inp),$xd2,$xd2
3121 vmovdqu32 $xd2,($out,$inp)
3127 jb .Less_than_64_16x
3128 vpxord ($inp),$xa3,$xa3
3129 vmovdqu32 $xa3,($out,$inp)
3135 jb .Less_than_64_16x
3136 vpxord ($inp),$xb3,$xb3
3137 vmovdqu32 $xb3,($out,$inp)
3143 jb .Less_than_64_16x
3144 vpxord ($inp),$xc3,$xc3
3145 vmovdqu32 $xc3,($out,$inp)
3151 vmovdqa32 $xa0,0x00(%rsp)
3152 lea ($out,$inp),$out
3156 movzb ($inp,%r10),%eax
3157 movzb (%rsp,%r10),%ecx
3160 mov %al,-1($out,%r10)
3164 vpxord $xa0,$xa0,$xa0
3165 vmovdqa32 $xa0,0(%rsp)
3170 $code.=<<___ if ($win64);
3171 movaps -0xa8(%r9),%xmm6
3172 movaps -0x98(%r9),%xmm7
3173 movaps -0x88(%r9),%xmm8
3174 movaps -0x78(%r9),%xmm9
3175 movaps -0x68(%r9),%xmm10
3176 movaps -0x58(%r9),%xmm11
3177 movaps -0x48(%r9),%xmm12
3178 movaps -0x38(%r9),%xmm13
3179 movaps -0x28(%r9),%xmm14
3180 movaps -0x18(%r9),%xmm15
3184 .cfi_def_cfa_register %rsp
3188 .size ChaCha20_16x,.-ChaCha20_16x
3191 # switch to %ymm domain
3192 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3193 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15));
3194 @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
3195 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
3196 @key=map("%ymm$_",(16..31));
3197 ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
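# Same 16x machinery re-dimensioned to %ymm. Note that ymm16-31 exist
# only in EVEX encoding (hence AVX512VL, and the vmovdqa64/vpxord forms
# wherever @key registers are touched), while ymm0-15 can also use the
# shorter VEX vpxor/vmovdqu forms, which is what the "size
# optimization" comments below refer to.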
3200 .type ChaCha20_8xvl,\@function,5
3205 mov %rsp,%r9 # frame register
3206 .cfi_def_cfa_register %r9
3207 sub \$64+$xframe,%rsp
3210 $code.=<<___ if ($win64);
3211 movaps %xmm6,-0xa8(%r9)
3212 movaps %xmm7,-0x98(%r9)
3213 movaps %xmm8,-0x88(%r9)
3214 movaps %xmm9,-0x78(%r9)
3215 movaps %xmm10,-0x68(%r9)
3216 movaps %xmm11,-0x58(%r9)
3217 movaps %xmm12,-0x48(%r9)
3218 movaps %xmm13,-0x38(%r9)
3219 movaps %xmm14,-0x28(%r9)
3220 movaps %xmm15,-0x18(%r9)
3226 lea .Lsigma(%rip),%r10
3227 vbroadcasti128 (%r10),$xa3 # key[0]
3228 vbroadcasti128 ($key),$xb3 # key[1]
3229 vbroadcasti128 16($key),$xc3 # key[2]
3230 vbroadcasti128 ($counter),$xd3 # key[3]
3232 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
3233 vpshufd \$0x55,$xa3,$xa1
3234 vpshufd \$0xaa,$xa3,$xa2
3235 vpshufd \$0xff,$xa3,$xa3
3236 vmovdqa64 $xa0,@key[0]
3237 vmovdqa64 $xa1,@key[1]
3238 vmovdqa64 $xa2,@key[2]
3239 vmovdqa64 $xa3,@key[3]
3241 vpshufd \$0x00,$xb3,$xb0
3242 vpshufd \$0x55,$xb3,$xb1
3243 vpshufd \$0xaa,$xb3,$xb2
3244 vpshufd \$0xff,$xb3,$xb3
3245 vmovdqa64 $xb0,@key[4]
3246 vmovdqa64 $xb1,@key[5]
3247 vmovdqa64 $xb2,@key[6]
3248 vmovdqa64 $xb3,@key[7]
3250 vpshufd \$0x00,$xc3,$xc0
3251 vpshufd \$0x55,$xc3,$xc1
3252 vpshufd \$0xaa,$xc3,$xc2
3253 vpshufd \$0xff,$xc3,$xc3
3254 vmovdqa64 $xc0,@key[8]
3255 vmovdqa64 $xc1,@key[9]
3256 vmovdqa64 $xc2,@key[10]
3257 vmovdqa64 $xc3,@key[11]
3259 vpshufd \$0x00,$xd3,$xd0
3260 vpshufd \$0x55,$xd3,$xd1
3261 vpshufd \$0xaa,$xd3,$xd2
3262 vpshufd \$0xff,$xd3,$xd3
3263 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
3264 vmovdqa64 $xd0,@key[12]
3265 vmovdqa64 $xd1,@key[13]
3266 vmovdqa64 $xd2,@key[14]
3267 vmovdqa64 $xd3,@key[15]
3274 #vpbroadcastd 0(%r10),$xa0 # reload key
3275 #vpbroadcastd 4(%r10),$xa1
3276 vpbroadcastd 8(%r10),$xa2
3277 vpbroadcastd 12(%r10),$xa3
3278 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
3279 vmovdqa64 @key[4],$xb0
3280 vmovdqa64 @key[5],$xb1
3281 vmovdqa64 @key[6],$xb2
3282 vmovdqa64 @key[7],$xb3
3283 vmovdqa64 @key[8],$xc0
3284 vmovdqa64 @key[9],$xc1
3285 vmovdqa64 @key[10],$xc2
3286 vmovdqa64 @key[11],$xc3
3287 vmovdqa64 @key[12],$xd0
3288 vmovdqa64 @key[13],$xd1
3289 vmovdqa64 @key[14],$xd2
3290 vmovdqa64 @key[15],$xd3
3292 vmovdqa64 $xa0,@key[0]
3293 vmovdqa64 $xa1,@key[1]
3294 vmovdqa64 $xa2,@key[2]
3295 vmovdqa64 $xa3,@key[3]
3303 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
3304 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
3309 vpaddd @key[0],$xa0,$xa0 # accumulate key
3310 vpaddd @key[1],$xa1,$xa1
3311 vpaddd @key[2],$xa2,$xa2
3312 vpaddd @key[3],$xa3,$xa3
3314 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
3315 vpunpckldq $xa3,$xa2,$xt3
3316 vpunpckhdq $xa1,$xa0,$xa0
3317 vpunpckhdq $xa3,$xa2,$xa2
3318 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
3319 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
3320 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
3321 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
3323 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
3325 vpaddd @key[4],$xb0,$xb0
3326 vpaddd @key[5],$xb1,$xb1
3327 vpaddd @key[6],$xb2,$xb2
3328 vpaddd @key[7],$xb3,$xb3
3330 vpunpckldq $xb1,$xb0,$xt2
3331 vpunpckldq $xb3,$xb2,$xt3
3332 vpunpckhdq $xb1,$xb0,$xb0
3333 vpunpckhdq $xb3,$xb2,$xb2
3334 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
3335 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
3336 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
3337 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
3339 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
3341 vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further
3342 vshufi32x4 \$3,$xb0,$xa0,$xb0
3343 vshufi32x4 \$0,$xb1,$xa1,$xa0
3344 vshufi32x4 \$3,$xb1,$xa1,$xb1
3345 vshufi32x4 \$0,$xb2,$xa2,$xa1
3346 vshufi32x4 \$3,$xb2,$xa2,$xb2
3347 vshufi32x4 \$0,$xb3,$xa3,$xa2
3348 vshufi32x4 \$3,$xb3,$xa3,$xb3
3350 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
3352 vpaddd @key[8],$xc0,$xc0
3353 vpaddd @key[9],$xc1,$xc1
3354 vpaddd @key[10],$xc2,$xc2
3355 vpaddd @key[11],$xc3,$xc3
3357 vpunpckldq $xc1,$xc0,$xt2
3358 vpunpckldq $xc3,$xc2,$xt3
3359 vpunpckhdq $xc1,$xc0,$xc0
3360 vpunpckhdq $xc3,$xc2,$xc2
3361 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
3362 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
3363 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
3364 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
3366 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
3368 vpaddd @key[12],$xd0,$xd0
3369 vpaddd @key[13],$xd1,$xd1
3370 vpaddd @key[14],$xd2,$xd2
3371 vpaddd @key[15],$xd3,$xd3
3373 vpunpckldq $xd1,$xd0,$xt2
3374 vpunpckldq $xd3,$xd2,$xt3
3375 vpunpckhdq $xd1,$xd0,$xd0
3376 vpunpckhdq $xd3,$xd2,$xd2
3377 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
3378 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
3379 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
3380 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
3382 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
3384 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
3385 vperm2i128 \$0x31,$xd0,$xc0,$xd0
3386 vperm2i128 \$0x20,$xd1,$xc1,$xc0
3387 vperm2i128 \$0x31,$xd1,$xc1,$xd1
3388 vperm2i128 \$0x20,$xd2,$xc2,$xc1
3389 vperm2i128 \$0x31,$xd2,$xc2,$xd2
3390 vperm2i128 \$0x20,$xd3,$xc3,$xc2
3391 vperm2i128 \$0x31,$xd3,$xc3,$xd3
3393 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
3394 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
3395 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
3400 mov \$0x80,%eax # size optimization
3401 vpxord 0x00($inp),$xa0,$xa0 # xor with input
3402 vpxor 0x20($inp),$xb0,$xb0
3403 vpxor 0x40($inp),$xc0,$xc0
3404 vpxor 0x60($inp),$xd0,$xd0
3405 lea ($inp,%rax),$inp # size optimization
3406 vmovdqu32 $xa0,0x00($out)
3407 vmovdqu $xb0,0x20($out)
3408 vmovdqu $xc0,0x40($out)
3409 vmovdqu $xd0,0x60($out)
3410 lea ($out,%rax),$out # size optimization
3412 vpxor 0x00($inp),$xa1,$xa1
3413 vpxor 0x20($inp),$xb1,$xb1
3414 vpxor 0x40($inp),$xc1,$xc1
3415 vpxor 0x60($inp),$xd1,$xd1
3416 lea ($inp,%rax),$inp # size optimization
3417 vmovdqu $xa1,0x00($out)
3418 vmovdqu $xb1,0x20($out)
3419 vmovdqu $xc1,0x40($out)
3420 vmovdqu $xd1,0x60($out)
3421 lea ($out,%rax),$out # size optimization
3423 vpxord 0x00($inp),$xa2,$xa2
3424 vpxor 0x20($inp),$xb2,$xb2
3425 vpxor 0x40($inp),$xc2,$xc2
3426 vpxor 0x60($inp),$xd2,$xd2
3427 lea ($inp,%rax),$inp # size optimization
3428 vmovdqu32 $xa2,0x00($out)
3429 vmovdqu $xb2,0x20($out)
3430 vmovdqu $xc2,0x40($out)
3431 vmovdqu $xd2,0x60($out)
3432 lea ($out,%rax),$out # size optimization
3434 vpxor 0x00($inp),$xa3,$xa3
3435 vpxor 0x20($inp),$xb3,$xb3
3436 vpxor 0x40($inp),$xc3,$xc3
3437 vpxor 0x60($inp),$xd3,$xd3
3438 lea ($inp,%rax),$inp # size optimization
3439 vmovdqu $xa3,0x00($out)
3440 vmovdqu $xb3,0x20($out)
3441 vmovdqu $xc3,0x40($out)
3442 vmovdqu $xd3,0x60($out)
3443 lea ($out,%rax),$out # size optimization
3445 vpbroadcastd 0(%r10),%ymm0 # reload key
3446 vpbroadcastd 4(%r10),%ymm1
3455 vmovdqa64 $xa0,%ymm8 # size optimization
3462 jb .Less_than_64_8xvl
3463 vpxor 0x00($inp),$xa0,$xa0 # xor with input
3464 vpxor 0x20($inp),$xb0,$xb0
3465 vmovdqu $xa0,0x00($out,$inp)
3466 vmovdqu $xb0,0x20($out,$inp)
3473 jb .Less_than_64_8xvl
3474 vpxor 0x00($inp),$xc0,$xc0
3475 vpxor 0x20($inp),$xd0,$xd0
3476 vmovdqu $xc0,0x00($out,$inp)
3477 vmovdqu $xd0,0x20($out,$inp)
3484 jb .Less_than_64_8xvl
3485 vpxor 0x00($inp),$xa1,$xa1
3486 vpxor 0x20($inp),$xb1,$xb1
3487 vmovdqu $xa1,0x00($out,$inp)
3488 vmovdqu $xb1,0x20($out,$inp)
3495 jb .Less_than_64_8xvl
3496 vpxor 0x00($inp),$xc1,$xc1
3497 vpxor 0x20($inp),$xd1,$xd1
3498 vmovdqu $xc1,0x00($out,$inp)
3499 vmovdqu $xd1,0x20($out,$inp)
3506 jb .Less_than_64_8xvl
3507 vpxord 0x00($inp),$xa2,$xa2
3508 vpxor 0x20($inp),$xb2,$xb2
3509 vmovdqu32 $xa2,0x00($out,$inp)
3510 vmovdqu $xb2,0x20($out,$inp)
3517 jb .Less_than_64_8xvl
3518 vpxor 0x00($inp),$xc2,$xc2
3519 vpxor 0x20($inp),$xd2,$xd2
3520 vmovdqu $xc2,0x00($out,$inp)
3521 vmovdqu $xd2,0x20($out,$inp)
3528 jb .Less_than_64_8xvl
3529 vpxor 0x00($inp),$xa3,$xa3
3530 vpxor 0x20($inp),$xb3,$xb3
3531 vmovdqu $xa3,0x00($out,$inp)
3532 vmovdqu $xb3,0x20($out,$inp)
3539 vmovdqa $xa0,0x00(%rsp)
3540 vmovdqa $xb0,0x20(%rsp)
3541 lea ($out,$inp),$out
3545 movzb ($inp,%r10),%eax
3546 movzb (%rsp,%r10),%ecx
3549 mov %al,-1($out,%r10)
3553 vpxor $xa0,$xa0,$xa0
3554 vmovdqa $xa0,0x00(%rsp)
3555 vmovdqa $xa0,0x20(%rsp)
3560 $code.=<<___ if ($win64);
3561 movaps -0xa8(%r9),%xmm6
3562 movaps -0x98(%r9),%xmm7
3563 movaps -0x88(%r9),%xmm8
3564 movaps -0x78(%r9),%xmm9
3565 movaps -0x68(%r9),%xmm10
3566 movaps -0x58(%r9),%xmm11
3567 movaps -0x48(%r9),%xmm12
3568 movaps -0x38(%r9),%xmm13
3569 movaps -0x28(%r9),%xmm14
3570 movaps -0x18(%r9),%xmm15
3574 .cfi_def_cfa_register %rsp
3578 .size ChaCha20_8xvl,.-ChaCha20_8xvl
3582 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3583 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3591 .extern __imp_RtlVirtualUnwind
3592 .type se_handler,\@abi-omnipotent
3606 mov 120($context),%rax # pull context->Rax
3607 mov 248($context),%rbx # pull context->Rip
3609 mov 8($disp),%rsi # disp->ImageBase
3610 mov 56($disp),%r11 # disp->HandlerData
3612 lea .Lctr32_body(%rip),%r10
3613 cmp %r10,%rbx # context->Rip<.Lprologue
3614 jb .Lcommon_seh_tail
3616 mov 152($context),%rax # pull context->Rsp
3618 lea .Lno_data(%rip),%r10 # epilogue label
3619 cmp %r10,%rbx # context->Rip>=.Lepilogue
3620 jae .Lcommon_seh_tail
3622 lea 64+24+48(%rax),%rax
3630 mov %rbx,144($context) # restore context->Rbx
3631 mov %rbp,160($context) # restore context->Rbp
3632 mov %r12,216($context) # restore context->R12
3633 mov %r13,224($context) # restore context->R13
3634 mov %r14,232($context) # restore context->R14
3635	mov	%r15,240($context)	# restore context->R15
3640 mov %rax,152($context) # restore context->Rsp
3641 mov %rsi,168($context) # restore context->Rsi
3642 mov %rdi,176($context) # restore context->Rdi
3644 mov 40($disp),%rdi # disp->ContextRecord
3645 mov $context,%rsi # context
3646 mov \$154,%ecx # sizeof(CONTEXT)
3647 .long 0xa548f3fc # cld; rep movsq
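
	# the rep movsq above copied the captured CONTEXT (154
	# quadwords, i.e. the 1232-byte x64 CONTEXT structure) into
	# disp->ContextRecord; RtlVirtualUnwind below continues the
	# unwind from there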
3650 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3651 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3652 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3653 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3654 mov 40(%rsi),%r10 # disp->ContextRecord
3655 lea 56(%rsi),%r11 # &disp->HandlerData
3656 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3657 mov %r10,32(%rsp) # arg5
3658 mov %r11,40(%rsp) # arg6
3659 mov %r12,48(%rsp) # arg7
3660 mov %rcx,56(%rsp) # arg8, (NULL)
3661 call *__imp_RtlVirtualUnwind(%rip)
3663 mov \$1,%eax # ExceptionContinueSearch
3675 .size se_handler,.-se_handler
3677 .type ssse3_handler,\@abi-omnipotent
3691 mov 120($context),%rax # pull context->Rax
3692 mov 248($context),%rbx # pull context->Rip
3694 mov 8($disp),%rsi # disp->ImageBase
3695 mov 56($disp),%r11 # disp->HandlerData
3697 mov 0(%r11),%r10d # HandlerData[0]
3698 lea (%rsi,%r10),%r10 # prologue label
3699 cmp %r10,%rbx # context->Rip<prologue label
3700 jb .Lcommon_seh_tail
3702 mov 192($context),%rax # pull context->R9
3704 mov 4(%r11),%r10d # HandlerData[1]
3705 lea (%rsi,%r10),%r10 # epilogue label
3706 cmp %r10,%rbx # context->Rip>=epilogue label
3707 jae .Lcommon_seh_tail
3709 lea -0x28(%rax),%rsi
3710 lea 512($context),%rdi # &context.Xmm6
3712 .long 0xa548f3fc # cld; rep movsq
3714 jmp .Lcommon_seh_tail
3715 .size ssse3_handler,.-ssse3_handler
3717 .type full_handler,\@abi-omnipotent
3731 mov 120($context),%rax # pull context->Rax
3732 mov 248($context),%rbx # pull context->Rip
3734 mov 8($disp),%rsi # disp->ImageBase
3735 mov 56($disp),%r11 # disp->HandlerData
3737 mov 0(%r11),%r10d # HandlerData[0]
3738 lea (%rsi,%r10),%r10 # prologue label
3739 cmp %r10,%rbx # context->Rip<prologue label
3740 jb .Lcommon_seh_tail
3742 mov 192($context),%rax # pull context->R9
3744 mov 4(%r11),%r10d # HandlerData[1]
3745 lea (%rsi,%r10),%r10 # epilogue label
3746 cmp %r10,%rbx # context->Rip>=epilogue label
3747 jae .Lcommon_seh_tail
3749 lea -0xa8(%rax),%rsi
3750 lea 512($context),%rdi # &context.Xmm6
3752 .long 0xa548f3fc # cld; rep movsq
3754 jmp .Lcommon_seh_tail
3755 .size full_handler,.-full_handler
3759 .rva .LSEH_begin_ChaCha20_ctr32
3760 .rva .LSEH_end_ChaCha20_ctr32
3761 .rva .LSEH_info_ChaCha20_ctr32
3763 .rva .LSEH_begin_ChaCha20_ssse3
3764 .rva .LSEH_end_ChaCha20_ssse3
3765 .rva .LSEH_info_ChaCha20_ssse3
3767 .rva .LSEH_begin_ChaCha20_4x
3768 .rva .LSEH_end_ChaCha20_4x
3769 .rva .LSEH_info_ChaCha20_4x
3771 $code.=<<___ if ($avx);
3772 .rva .LSEH_begin_ChaCha20_4xop
3773 .rva .LSEH_end_ChaCha20_4xop
3774 .rva .LSEH_info_ChaCha20_4xop
3776 $code.=<<___ if ($avx>1);
3777 .rva .LSEH_begin_ChaCha20_8x
3778 .rva .LSEH_end_ChaCha20_8x
3779 .rva .LSEH_info_ChaCha20_8x
3781 $code.=<<___ if ($avx>2);
3782 .rva .LSEH_begin_ChaCha20_avx512
3783 .rva .LSEH_end_ChaCha20_avx512
3784 .rva .LSEH_info_ChaCha20_avx512
3786 .rva .LSEH_begin_ChaCha20_avx512vl
3787 .rva .LSEH_end_ChaCha20_avx512vl
3788 .rva .LSEH_info_ChaCha20_avx512vl
3790 .rva .LSEH_begin_ChaCha20_16x
3791 .rva .LSEH_end_ChaCha20_16x
3792 .rva .LSEH_info_ChaCha20_16x
3794 .rva .LSEH_begin_ChaCha20_8xvl
3795 .rva .LSEH_end_ChaCha20_8xvl
3796 .rva .LSEH_info_ChaCha20_8xvl
3801 .LSEH_info_ChaCha20_ctr32:
3805 .LSEH_info_ChaCha20_ssse3:
3808	.rva	.Lssse3_body,.Lssse3_epilogue	# HandlerData[]
3810 .LSEH_info_ChaCha20_4x:
3813	.rva	.L4x_body,.L4x_epilogue		# HandlerData[]
3815 $code.=<<___ if ($avx);
3816 .LSEH_info_ChaCha20_4xop:
3819 .rva .L4xop_body,.L4xop_epilogue # HandlerData[]
3821 $code.=<<___ if ($avx>1);
3822 .LSEH_info_ChaCha20_8x:
3825 .rva .L8x_body,.L8x_epilogue # HandlerData[]
3827 $code.=<<___ if ($avx>2);
3828 .LSEH_info_ChaCha20_avx512:
3831 .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[]
3833 .LSEH_info_ChaCha20_avx512vl:
3836 .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[]
3838 .LSEH_info_ChaCha20_16x:
3841 .rva .L16x_body,.L16x_epilogue # HandlerData[]
3843 .LSEH_info_ChaCha20_8xvl:
3846 .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[]
3850 foreach (split("\n",$code)) {
3851 s/\`([^\`]*)\`/eval $1/ge;
3853 s/%x#%[yz]/%x/g; # "down-shift"
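	# "%x#" is this file's request for the xmm form of a ymm/zmm
	# variable: e.g. "%x#$a" interpolates to "%x#%zmm0", which the
	# substitution above rewrites to "%xmm0"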