2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # ChaCha20 for x86_64.
23 # Add AVX512F code path.
25 # Performance in cycles per byte out of large buffer.
27 #                  IALU/gcc 4.8(i)  1xSSSE3/SSE2  4xSSSE3    8xAVX2
29 # P4               9.48/+99%        -/22.7(ii)    -
30 # Core2            7.83/+55%        7.90/8.08     4.35
31 # Westmere         7.19/+50%        5.60/6.70     3.00
32 # Sandy Bridge     8.31/+42%        5.45/6.76     2.72
33 # Ivy Bridge       6.71/+46%        5.40/6.49     2.41
34 # Haswell          5.92/+43%        5.20/6.45     2.42       1.23
35 # Skylake          5.87/+39%        4.70/-        2.31       1.19
36 # Silvermont       12.0/+33%        7.75/7.40     7.03(iii)
37 # Goldmont         10.6/+17%        5.10/-        3.28
38 # Sledgehammer     7.28/+52%        -/14.2(ii)    -
39 # Bulldozer        9.66/+28%        9.85/11.1     3.06(iv)
40 # VIA Nano         10.5/+46%        6.72/8.60     6.05
42 # (i)   compared to older gcc 3.x one can observe >2x improvement on most platforms;
44 # (ii)  as can be seen, SSE2 performance is too low on legacy
45 #       processors; NxSSE2 results are naturally better, but not
46 #       impressively better than IALU ones, which is why you won't
47 #       find SSE2 code below;
48 # (iii) this is not an optimal result for Atom because of MSROM
49 #       limitations; SSE2 can do better, but the gain is considered
50 #       too low to justify the [maintenance] effort;
51 # (iv)  Bulldozer actually executes the 4xXOP code path, which delivers 2.20;
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
71 $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
72 $avx += 1 if ($1==2.11 && $2>=8);
75 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
76 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
77 $avx = ($1>=10) + ($1>=11);
80 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
81 $avx = ($2>=3.0) + ($2>3.0);
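#
# Note that $avx reflects what the *assembler* can encode, not what the CPU
# supports; run-time CPU dispatch happens separately via OPENSSL_ia32cap_P.
# Roughly, the conditional code below treats $avx>=1 as "AVX/XOP can be
# emitted", $avx>=2 as AVX2 and $avx>=3 as AVX512F.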
84 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
87 # input parameter block
88 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
93 .extern OPENSSL_ia32cap_P
105 .long 0,2,4,6,1,3,5,7
107 .long 8,8,8,8,8,8,8,8
109 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
111 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
113 .asciz "expand 32-byte k"
116 .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
118 .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
119 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
122 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
123 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
125 $arg = "\$$arg" if ($arg*1 eq $arg);
126 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
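#
# For illustration, the thunk above turns calls such as
#
#	&add	("%eax","%r8d");	# appends "add	%r8d,%eax"
#	&pslld	("%xmm4",12);		# appends "pslld	$12,%xmm4"
#
# i.e. the last argument is emitted first, AT&T style, and a bare number
# is promoted to an immediate.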
129 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
130 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
133 sub ROUND { # critical path is 24 cycles per round
134 my ($a0,$b0,$c0,$d0)=@_;
135 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
136 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
137 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
138 my ($xc,$xc_)=map("\"$_\"",@t);
139 my @x=map("\"$_\"",@x);
141 # Consider order in which variables are addressed by their
146 # 0 4 8 12 < even round
150 # 0 5 10 15 < odd round
155 # 'a', 'b' and 'd's are permanently allocated in registers,
156 # @x[0..7,12..15], while 'c's are maintained in memory. If
157 # you observe the 'c' column, you'll notice that a pair of 'c's
158 # is invariant between rounds. This means that we have to
159 # reload them only once per round, in the middle. This is why
160 # you'll see a bunch of 'c' stores and loads in the middle, but
161 # none at the beginning or end.
163 # Normally instructions would be interleaved to favour in-order
164 # execution. Generally out-of-order cores manage it gracefully,
165 # but not this time, for some reason. As in-order execution
166 # cores are a dying breed and old Atom is the only one around,
167 # instructions are left uninterleaved. Besides, Atom is better
168 # off executing the 1xSSSE3 code anyway...
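#
# For reference, a plain-Perl sketch of the quarter-round that the string
# list below unrolls four of (32-bit words; purely illustrative, the
# generator never calls it):
#
#	sub quarter_round {
#	    my ($a,$b,$c,$d) = @_;
#	    $a = ($a+$b)&0xffffffff; $d ^= $a; $d = (($d<<16)|($d>>16))&0xffffffff;
#	    $c = ($c+$d)&0xffffffff; $b ^= $c; $b = (($b<<12)|($b>>20))&0xffffffff;
#	    $a = ($a+$b)&0xffffffff; $d ^= $a; $d = (($d<<8)|($d>>24))&0xffffffff;
#	    $c = ($c+$d)&0xffffffff; $b ^= $c; $b = (($b<<7)|($b>>25))&0xffffffff;
#	    return ($a,$b,$c,$d);
#	}
#
# The index arithmetic at the top of this sub derives the remaining three
# column (or diagonal) tuples from the (0,4,8,12) / (0,5,10,15) seed passed
# by the caller.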
171 "&add (@x[$a0],@x[$b0])", # Q1
172 "&xor (@x[$d0],@x[$a0])",
174 "&add (@x[$a1],@x[$b1])", # Q2
175 "&xor (@x[$d1],@x[$a1])",
178 "&add ($xc,@x[$d0])",
179 "&xor (@x[$b0],$xc)",
181 "&add ($xc_,@x[$d1])",
182 "&xor (@x[$b1],$xc_)",
185 "&add (@x[$a0],@x[$b0])",
186 "&xor (@x[$d0],@x[$a0])",
188 "&add (@x[$a1],@x[$b1])",
189 "&xor (@x[$d1],@x[$a1])",
192 "&add ($xc,@x[$d0])",
193 "&xor (@x[$b0],$xc)",
195 "&add ($xc_,@x[$d1])",
196 "&xor (@x[$b1],$xc_)",
199 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
200 "&mov (\"4*$c1(%rsp)\",$xc_)",
201 "&mov ($xc,\"4*$c2(%rsp)\")",
202 "&mov ($xc_,\"4*$c3(%rsp)\")",
204 "&add (@x[$a2],@x[$b2])", # Q3
205 "&xor (@x[$d2],@x[$a2])",
207 "&add (@x[$a3],@x[$b3])", # Q4
208 "&xor (@x[$d3],@x[$a3])",
211 "&add ($xc,@x[$d2])",
212 "&xor (@x[$b2],$xc)",
214 "&add ($xc_,@x[$d3])",
215 "&xor (@x[$b3],$xc_)",
218 "&add (@x[$a2],@x[$b2])",
219 "&xor (@x[$d2],@x[$a2])",
221 "&add (@x[$a3],@x[$b3])",
222 "&xor (@x[$d3],@x[$a3])",
225 "&add ($xc,@x[$d2])",
226 "&xor (@x[$b2],$xc)",
228 "&add ($xc_,@x[$d3])",
229 "&xor (@x[$b3],$xc_)",
234 ########################################################################
235 # Generic code path that handles all lengths on pre-SSSE3 processors.
237 .globl ChaCha20_ctr32
238 .type ChaCha20_ctr32,\@function,5
243 mov OPENSSL_ia32cap_P+4(%rip),%r10
244 test \$`1<<(41-32)`,%r10d
255 #movdqa .Lsigma(%rip),%xmm0
257 movdqu 16($key),%xmm2
258 movdqu ($counter),%xmm3
259 movdqa .Lone(%rip),%xmm4
261 #movdqa %xmm0,4*0(%rsp) # key[0]
262 movdqa %xmm1,4*4(%rsp) # key[1]
263 movdqa %xmm2,4*8(%rsp) # key[2]
264 movdqa %xmm3,4*12(%rsp) # key[3]
265 mov $len,%rbp # reassign $len
270 mov \$0x61707865,@x[0] # 'expa'
271 mov \$0x3320646e,@x[1] # 'nd 3'
272 mov \$0x79622d32,@x[2] # '2-by'
273 mov \$0x6b206574,@x[3] # 'te k'
279 mov 4*13(%rsp),@x[13]
280 mov 4*14(%rsp),@x[14]
281 mov 4*15(%rsp),@x[15]
283 mov %rbp,64+0(%rsp) # save len
285 mov $inp,64+8(%rsp) # save inp
286 movq %xmm2,%rsi # "@x[8]"
287 mov $out,64+16(%rsp) # save out
289 shr \$32,%rdi # "@x[9]"
295 foreach (&ROUND (0, 4, 8,12)) { eval; }
296 foreach (&ROUND (0, 5,10,15)) { eval; }
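#
# The two calls above emit one column round and one diagonal round; the
# surrounding loop runs that pair 10 times, i.e. the standard 20 ChaCha20
# rounds.  In terms of the quarter_round sketch earlier, one iteration is
# roughly:
#
#	for my $q ([0,4,8,12],[1,5,9,13],[2,6,10,14],[3,7,11,15],	# even
#		   [0,5,10,15],[1,6,11,12],[2,7,8,13],[3,4,9,14]) {	# odd
#	    @s[@$q] = quarter_round(@s[@$q]);
#	}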
301 mov @t[1],4*9(%rsp) # modulo-scheduled
303 mov 64(%rsp),%rbp # load len
305 mov 64+8(%rsp),$inp # load inp
306 paddd %xmm4,%xmm3 # increment counter
307 mov 64+16(%rsp),$out # load out
309 add \$0x61707865,@x[0] # 'expa'
310 add \$0x3320646e,@x[1] # 'nd 3'
311 add \$0x79622d32,@x[2] # '2-by'
312 add \$0x6b206574,@x[3] # 'te k'
317 add 4*12(%rsp),@x[12]
318 add 4*13(%rsp),@x[13]
319 add 4*14(%rsp),@x[14]
320 add 4*15(%rsp),@x[15]
321 paddd 4*8(%rsp),%xmm1
326 xor 4*0($inp),@x[0] # xor with input
334 movdqu 4*8($inp),%xmm0
335 xor 4*12($inp),@x[12]
336 xor 4*13($inp),@x[13]
337 xor 4*14($inp),@x[14]
338 xor 4*15($inp),@x[15]
339 lea 4*16($inp),$inp # inp+=64
342 movdqa %xmm2,4*8(%rsp)
343 movd %xmm3,4*12(%rsp)
345 mov @x[0],4*0($out) # write output
353 movdqu %xmm0,4*8($out)
354 mov @x[12],4*12($out)
355 mov @x[13],4*13($out)
356 mov @x[14],4*14($out)
357 mov @x[15],4*15($out)
358 lea 4*16($out),$out # out+=64
376 movdqa %xmm1,4*8(%rsp)
377 mov @x[12],4*12(%rsp)
378 mov @x[13],4*13(%rsp)
379 mov @x[14],4*14(%rsp)
380 mov @x[15],4*15(%rsp)
383 movzb ($inp,%rbx),%eax
384 movzb (%rsp,%rbx),%edx
387 mov %al,-1($out,%rbx)
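# a rough sketch of the tail handling around these loads: the final
# keystream block was parked at the bottom of the stack frame above, and
# the leftover (<64) bytes are then processed one at a time as
#	out[i] = inp[i] ^ keystream[i]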
401 .size ChaCha20_ctr32,.-ChaCha20_ctr32
404 ########################################################################
405 # SSSE3 code path that handles shorter lengths
407 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
409 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
433 my $xframe = $win64 ? 32+32+8 : 24;
436 .type ChaCha20_ssse3,\@function,5
441 $code.=<<___ if ($avx);
442 test \$`1<<(43-32)`,%r10d
443 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
446 cmp \$128,$len # we might throw away some data,
447 ja .LChaCha20_4x # but overall it won't be slower
457 sub \$64+$xframe,%rsp
459 $code.=<<___ if ($win64);
460 movaps %xmm6,64+32(%rsp)
461 movaps %xmm7,64+48(%rsp)
464 movdqa .Lsigma(%rip),$a
468 movdqa .Lrot16(%rip),$rot16
469 movdqa .Lrot24(%rip),$rot24
480 movdqa .Lone(%rip),$d
493 &pshufd ($c,$c,0b01001110);
494 &pshufd ($b,$b,0b00111001);
495 &pshufd ($d,$d,0b10010011);
499 &pshufd ($c,$c,0b01001110);
500 &pshufd ($b,$b,0b10010011);
501 &pshufd ($d,$d,0b00111001);
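#
# Conceptually, the first group of pshufd's rotates rows b, c and d within
# their xmm registers so that the diagonal quadruples occupy the same lane
# positions the columns did, letting the same SSSE3ROUND code serve as the
# odd round; the second group rotates the rows back before looping.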
504 &jnz (".Loop_ssse3");
516 movdqu 0x10($inp),$t1
517 pxor $t,$a # xor with input
520 movdqu 0x30($inp),$t1
521 lea 0x40($inp),$inp # inp+=64
525 movdqu $a,0x00($out) # write output
529 lea 0x40($out),$out # out+=64
532 jnz .Loop_outer_ssse3
545 movzb ($inp,%rbx),%eax
546 movzb (%rsp,%rbx),%ecx
549 mov %al,-1($out,%rbx)
555 $code.=<<___ if ($win64);
556 movaps 64+32(%rsp),%xmm6
557 movaps 64+48(%rsp),%xmm7
560 add \$64+$xframe,%rsp
568 .size ChaCha20_ssse3,.-ChaCha20_ssse3
572 ########################################################################
573 # SSSE3 code path that handles longer messages.
575 # assign variables to favor Atom front-end
576 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
577 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
578 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
579 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
581 sub SSSE3_lane_ROUND {
582 my ($a0,$b0,$c0,$d0)=@_;
583 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
584 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
585 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
586 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
587 my @x=map("\"$_\"",@xx);
589 # Consider order in which variables are addressed by their
594 # 0 4 8 12 < even round
598 # 0 5 10 15 < odd round
603 # 'a', 'b' and 'd's are permanently allocated in registers,
604 # @x[0..7,12..15], while 'c's are maintained in memory. If
605 # you observe the 'c' column, you'll notice that a pair of 'c's
606 # is invariant between rounds. This means that we have to
607 # reload them only once per round, in the middle. This is why
608 # you'll see a bunch of 'c' stores and loads in the middle, but
609 # none at the beginning or end.
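#
# Roughly, the rotation strategy in this 4-way version: the 16- and 8-bit
# rotates are byte-granular, so they are done with pshufb against the
# .Lrot16/.Lrot24 masks kept addressable through %r10/%r11, while the 12-
# and 7-bit rotates use the classic two-shift sequence, i.e. in effect
#
#	t = x;  x <<= 12;  t >>= 20;  x |= t;		# x = rotl32(x,12)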
612 "&paddd (@x[$a0],@x[$b0])", # Q1
613 "&paddd (@x[$a1],@x[$b1])", # Q2
614 "&pxor (@x[$d0],@x[$a0])",
615 "&pxor (@x[$d1],@x[$a1])",
616 "&pshufb (@x[$d0],$t1)",
617 "&pshufb (@x[$d1],$t1)",
619 "&paddd ($xc,@x[$d0])",
620 "&paddd ($xc_,@x[$d1])",
621 "&pxor (@x[$b0],$xc)",
622 "&pxor (@x[$b1],$xc_)",
623 "&movdqa ($t0,@x[$b0])",
624 "&pslld (@x[$b0],12)",
626 "&movdqa ($t1,@x[$b1])",
627 "&pslld (@x[$b1],12)",
628 "&por (@x[$b0],$t0)",
630 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
631 "&por (@x[$b1],$t1)",
633 "&paddd (@x[$a0],@x[$b0])",
634 "&paddd (@x[$a1],@x[$b1])",
635 "&pxor (@x[$d0],@x[$a0])",
636 "&pxor (@x[$d1],@x[$a1])",
637 "&pshufb (@x[$d0],$t0)",
638 "&pshufb (@x[$d1],$t0)",
640 "&paddd ($xc,@x[$d0])",
641 "&paddd ($xc_,@x[$d1])",
642 "&pxor (@x[$b0],$xc)",
643 "&pxor (@x[$b1],$xc_)",
644 "&movdqa ($t1,@x[$b0])",
645 "&pslld (@x[$b0],7)",
647 "&movdqa ($t0,@x[$b1])",
648 "&pslld (@x[$b1],7)",
649 "&por (@x[$b0],$t1)",
651 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
652 "&por (@x[$b1],$t0)",
654 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
655 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
656 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
657 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
659 "&paddd (@x[$a2],@x[$b2])", # Q3
660 "&paddd (@x[$a3],@x[$b3])", # Q4
661 "&pxor (@x[$d2],@x[$a2])",
662 "&pxor (@x[$d3],@x[$a3])",
663 "&pshufb (@x[$d2],$t1)",
664 "&pshufb (@x[$d3],$t1)",
666 "&paddd ($xc,@x[$d2])",
667 "&paddd ($xc_,@x[$d3])",
668 "&pxor (@x[$b2],$xc)",
669 "&pxor (@x[$b3],$xc_)",
670 "&movdqa ($t0,@x[$b2])",
671 "&pslld (@x[$b2],12)",
673 "&movdqa ($t1,@x[$b3])",
674 "&pslld (@x[$b3],12)",
675 "&por (@x[$b2],$t0)",
677 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
678 "&por (@x[$b3],$t1)",
680 "&paddd (@x[$a2],@x[$b2])",
681 "&paddd (@x[$a3],@x[$b3])",
682 "&pxor (@x[$d2],@x[$a2])",
683 "&pxor (@x[$d3],@x[$a3])",
684 "&pshufb (@x[$d2],$t0)",
685 "&pshufb (@x[$d3],$t0)",
687 "&paddd ($xc,@x[$d2])",
688 "&paddd ($xc_,@x[$d3])",
689 "&pxor (@x[$b2],$xc)",
690 "&pxor (@x[$b3],$xc_)",
691 "&movdqa ($t1,@x[$b2])",
692 "&pslld (@x[$b2],7)",
694 "&movdqa ($t0,@x[$b3])",
695 "&pslld (@x[$b3],7)",
696 "&por (@x[$b2],$t1)",
698 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
703 my $xframe = $win64 ? 0xa0 : 0;
706 .type ChaCha20_4x,\@function,5
712 $code.=<<___ if ($avx>1);
713 shr \$32,%r10 # OPENSSL_ia32cap_P+8
714 test \$`1<<5`,%r10 # test AVX2
721 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
722 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
723 je .Ldo_sse3_after_all # to detect Atom
727 sub \$0x148+$xframe,%rsp
729 ################ stack layout
730 # +0x00 SIMD equivalent of @x[8-12]
732 # +0x40 constant copy of key[0-2] smashed by lanes
734 # +0x100 SIMD counters (with nonce smashed by lanes)
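	# (for orientation: the 16*(c-8) displacements used by
	#  SSSE3_lane_ROUND below land in the +0x00..+0x30 slice of this
	#  frame, i.e. the four 'c' rows of the lane-smashed state)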
737 $code.=<<___ if ($win64);
738 movaps %xmm6,-0x30(%r11)
739 movaps %xmm7,-0x20(%r11)
740 movaps %xmm8,-0x10(%r11)
741 movaps %xmm9,0x00(%r11)
742 movaps %xmm10,0x10(%r11)
743 movaps %xmm11,0x20(%r11)
744 movaps %xmm12,0x30(%r11)
745 movaps %xmm13,0x40(%r11)
746 movaps %xmm14,0x50(%r11)
747 movaps %xmm15,0x60(%r11)
750 movdqa .Lsigma(%rip),$xa3 # key[0]
751 movdqu ($key),$xb3 # key[1]
752 movdqu 16($key),$xt3 # key[2]
753 movdqu ($counter),$xd3 # key[3]
754 lea 0x100(%rsp),%rcx # size optimization
755 lea .Lrot16(%rip),%r10
756 lea .Lrot24(%rip),%r11
758 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
759 pshufd \$0x55,$xa3,$xa1
760 movdqa $xa0,0x40(%rsp) # ... and offload
761 pshufd \$0xaa,$xa3,$xa2
762 movdqa $xa1,0x50(%rsp)
763 pshufd \$0xff,$xa3,$xa3
764 movdqa $xa2,0x60(%rsp)
765 movdqa $xa3,0x70(%rsp)
767 pshufd \$0x00,$xb3,$xb0
768 pshufd \$0x55,$xb3,$xb1
769 movdqa $xb0,0x80-0x100(%rcx)
770 pshufd \$0xaa,$xb3,$xb2
771 movdqa $xb1,0x90-0x100(%rcx)
772 pshufd \$0xff,$xb3,$xb3
773 movdqa $xb2,0xa0-0x100(%rcx)
774 movdqa $xb3,0xb0-0x100(%rcx)
776 pshufd \$0x00,$xt3,$xt0 # "$xc0"
777 pshufd \$0x55,$xt3,$xt1 # "$xc1"
778 movdqa $xt0,0xc0-0x100(%rcx)
779 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
780 movdqa $xt1,0xd0-0x100(%rcx)
781 pshufd \$0xff,$xt3,$xt3 # "$xc3"
782 movdqa $xt2,0xe0-0x100(%rcx)
783 movdqa $xt3,0xf0-0x100(%rcx)
785 pshufd \$0x00,$xd3,$xd0
786 pshufd \$0x55,$xd3,$xd1
787 paddd .Linc(%rip),$xd0 # don't save counters yet
788 pshufd \$0xaa,$xd3,$xd2
789 movdqa $xd1,0x110-0x100(%rcx)
790 pshufd \$0xff,$xd3,$xd3
791 movdqa $xd2,0x120-0x100(%rcx)
792 movdqa $xd3,0x130-0x100(%rcx)
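# at this point every register used above holds one state word replicated
# across the four parallel blocks ("smashed by lanes"): pshufd with
# selectors 0x00/0x55/0xaa/0xff broadcasts dword 0/1/2/3 of its source,
# and adding .Linc to the low counter word gives the four blocks the
# consecutive counters n, n+1, n+2, n+3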
798 movdqa 0x40(%rsp),$xa0 # re-load smashed key
799 movdqa 0x50(%rsp),$xa1
800 movdqa 0x60(%rsp),$xa2
801 movdqa 0x70(%rsp),$xa3
802 movdqa 0x80-0x100(%rcx),$xb0
803 movdqa 0x90-0x100(%rcx),$xb1
804 movdqa 0xa0-0x100(%rcx),$xb2
805 movdqa 0xb0-0x100(%rcx),$xb3
806 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
807 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
808 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
809 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
810 movdqa 0x100-0x100(%rcx),$xd0
811 movdqa 0x110-0x100(%rcx),$xd1
812 movdqa 0x120-0x100(%rcx),$xd2
813 movdqa 0x130-0x100(%rcx),$xd3
814 paddd .Lfour(%rip),$xd0 # next SIMD counters
817 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
818 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
819 movdqa (%r10),$xt3 # .Lrot16(%rip)
821 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
827 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
828 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
833 paddd 0x40(%rsp),$xa0 # accumulate key material
834 paddd 0x50(%rsp),$xa1
835 paddd 0x60(%rsp),$xa2
836 paddd 0x70(%rsp),$xa3
838 movdqa $xa0,$xt2 # "de-interlace" data
845 punpcklqdq $xa2,$xa0 # "a0"
847 punpcklqdq $xt3,$xt2 # "a2"
848 punpckhqdq $xa2,$xa1 # "a1"
849 punpckhqdq $xt3,$xa3 # "a3"
851 ($xa2,$xt2)=($xt2,$xa2);
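#
# The punpck{l,h}dq / punpck{l,h}qdq ladder above is a 4x4 transpose of
# 32-bit lanes (repeated below for the b, c and d groups): on entry each
# register holds the same state word for blocks 0..3, on exit "a0".."a3"
# each hold the four 'a' words of a single block, so the keystream can be
# xored against contiguous 16-byte chunks of input further down.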
853 paddd 0x80-0x100(%rcx),$xb0
854 paddd 0x90-0x100(%rcx),$xb1
855 paddd 0xa0-0x100(%rcx),$xb2
856 paddd 0xb0-0x100(%rcx),$xb3
858 movdqa $xa0,0x00(%rsp) # offload $xaN
859 movdqa $xa1,0x10(%rsp)
860 movdqa 0x20(%rsp),$xa0 # "xc2"
861 movdqa 0x30(%rsp),$xa1 # "xc3"
870 punpcklqdq $xb2,$xb0 # "b0"
872 punpcklqdq $xt3,$xt2 # "b2"
873 punpckhqdq $xb2,$xb1 # "b1"
874 punpckhqdq $xt3,$xb3 # "b3"
876 ($xb2,$xt2)=($xt2,$xb2);
877 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
879 paddd 0xc0-0x100(%rcx),$xc0
880 paddd 0xd0-0x100(%rcx),$xc1
881 paddd 0xe0-0x100(%rcx),$xc2
882 paddd 0xf0-0x100(%rcx),$xc3
884 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
885 movdqa $xa3,0x30(%rsp)
894 punpcklqdq $xc2,$xc0 # "c0"
896 punpcklqdq $xt3,$xt2 # "c2"
897 punpckhqdq $xc2,$xc1 # "c1"
898 punpckhqdq $xt3,$xc3 # "c3"
900 ($xc2,$xt2)=($xt2,$xc2);
901 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
903 paddd 0x100-0x100(%rcx),$xd0
904 paddd 0x110-0x100(%rcx),$xd1
905 paddd 0x120-0x100(%rcx),$xd2
906 paddd 0x130-0x100(%rcx),$xd3
915 punpcklqdq $xd2,$xd0 # "d0"
917 punpcklqdq $xt3,$xt2 # "d2"
918 punpckhqdq $xd2,$xd1 # "d1"
919 punpckhqdq $xt3,$xd3 # "d3"
921 ($xd2,$xt2)=($xt2,$xd2);
926 movdqu 0x00($inp),$xt0 # xor with input
927 movdqu 0x10($inp),$xt1
928 movdqu 0x20($inp),$xt2
929 movdqu 0x30($inp),$xt3
930 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
935 movdqu $xt0,0x00($out)
936 movdqu 0x40($inp),$xt0
937 movdqu $xt1,0x10($out)
938 movdqu 0x50($inp),$xt1
939 movdqu $xt2,0x20($out)
940 movdqu 0x60($inp),$xt2
941 movdqu $xt3,0x30($out)
942 movdqu 0x70($inp),$xt3
943 lea 0x80($inp),$inp # size optimization
949 movdqu $xt0,0x40($out)
950 movdqu 0x00($inp),$xt0
951 movdqu $xt1,0x50($out)
952 movdqu 0x10($inp),$xt1
953 movdqu $xt2,0x60($out)
954 movdqu 0x20($inp),$xt2
955 movdqu $xt3,0x70($out)
956 lea 0x80($out),$out # size optimization
957 movdqu 0x30($inp),$xt3
963 movdqu $xt0,0x00($out)
964 movdqu 0x40($inp),$xt0
965 movdqu $xt1,0x10($out)
966 movdqu 0x50($inp),$xt1
967 movdqu $xt2,0x20($out)
968 movdqu 0x60($inp),$xt2
969 movdqu $xt3,0x30($out)
970 movdqu 0x70($inp),$xt3
971 lea 0x80($inp),$inp # inp+=64*4
976 movdqu $xt0,0x40($out)
977 movdqu $xt1,0x50($out)
978 movdqu $xt2,0x60($out)
979 movdqu $xt3,0x70($out)
980 lea 0x80($out),$out # out+=64*4
995 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
997 #movdqa $xt0,0x00(%rsp)
998 movdqa $xb0,0x10(%rsp)
999 movdqa $xc0,0x20(%rsp)
1000 movdqa $xd0,0x30(%rsp)
1005 movdqu 0x00($inp),$xt0 # xor with input
1006 movdqu 0x10($inp),$xt1
1007 movdqu 0x20($inp),$xt2
1008 movdqu 0x30($inp),$xt3
1009 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1013 movdqu $xt0,0x00($out)
1014 movdqu $xt1,0x10($out)
1015 movdqu $xt2,0x20($out)
1016 movdqu $xt3,0x30($out)
1019 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1020 lea 0x40($inp),$inp # inp+=64*1
1022 movdqa $xt0,0x00(%rsp)
1023 movdqa $xb1,0x10(%rsp)
1024 lea 0x40($out),$out # out+=64*1
1025 movdqa $xc1,0x20(%rsp)
1026 sub \$64,$len # len-=64*1
1027 movdqa $xd1,0x30(%rsp)
1032 movdqu 0x00($inp),$xt0 # xor with input
1033 movdqu 0x10($inp),$xt1
1034 movdqu 0x20($inp),$xt2
1035 movdqu 0x30($inp),$xt3
1036 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1041 movdqu $xt0,0x00($out)
1042 movdqu 0x40($inp),$xt0
1043 movdqu $xt1,0x10($out)
1044 movdqu 0x50($inp),$xt1
1045 movdqu $xt2,0x20($out)
1046 movdqu 0x60($inp),$xt2
1047 movdqu $xt3,0x30($out)
1048 movdqu 0x70($inp),$xt3
1049 pxor 0x10(%rsp),$xt0
1053 movdqu $xt0,0x40($out)
1054 movdqu $xt1,0x50($out)
1055 movdqu $xt2,0x60($out)
1056 movdqu $xt3,0x70($out)
1059 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1060 lea 0x80($inp),$inp # inp+=64*2
1062 movdqa $xt0,0x00(%rsp)
1063 movdqa $xb2,0x10(%rsp)
1064 lea 0x80($out),$out # out+=64*2
1065 movdqa $xc2,0x20(%rsp)
1066 sub \$128,$len # len-=64*2
1067 movdqa $xd2,0x30(%rsp)
1072 movdqu 0x00($inp),$xt0 # xor with input
1073 movdqu 0x10($inp),$xt1
1074 movdqu 0x20($inp),$xt2
1075 movdqu 0x30($inp),$xt3
1076 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1081 movdqu $xt0,0x00($out)
1082 movdqu 0x40($inp),$xt0
1083 movdqu $xt1,0x10($out)
1084 movdqu 0x50($inp),$xt1
1085 movdqu $xt2,0x20($out)
1086 movdqu 0x60($inp),$xt2
1087 movdqu $xt3,0x30($out)
1088 movdqu 0x70($inp),$xt3
1089 lea 0x80($inp),$inp # size optimization
1090 pxor 0x10(%rsp),$xt0
1095 movdqu $xt0,0x40($out)
1096 movdqu 0x00($inp),$xt0
1097 movdqu $xt1,0x50($out)
1098 movdqu 0x10($inp),$xt1
1099 movdqu $xt2,0x60($out)
1100 movdqu 0x20($inp),$xt2
1101 movdqu $xt3,0x70($out)
1102 lea 0x80($out),$out # size optimization
1103 movdqu 0x30($inp),$xt3
1104 pxor 0x20(%rsp),$xt0
1108 movdqu $xt0,0x00($out)
1109 movdqu $xt1,0x10($out)
1110 movdqu $xt2,0x20($out)
1111 movdqu $xt3,0x30($out)
1114 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1115 lea 0x40($inp),$inp # inp+=64*3
1117 movdqa $xt0,0x00(%rsp)
1118 movdqa $xb3,0x10(%rsp)
1119 lea 0x40($out),$out # out+=64*3
1120 movdqa $xc3,0x20(%rsp)
1121 sub \$192,$len # len-=64*3
1122 movdqa $xd3,0x30(%rsp)
1125 movzb ($inp,%r10),%eax
1126 movzb (%rsp,%r10),%ecx
1129 mov %al,-1($out,%r10)
1135 $code.=<<___ if ($win64);
1136 lea 0x140+0x30(%rsp),%r11
1137 movaps -0x30(%r11),%xmm6
1138 movaps -0x20(%r11),%xmm7
1139 movaps -0x10(%r11),%xmm8
1140 movaps 0x00(%r11),%xmm9
1141 movaps 0x10(%r11),%xmm10
1142 movaps 0x20(%r11),%xmm11
1143 movaps 0x30(%r11),%xmm12
1144 movaps 0x40(%r11),%xmm13
1145 movaps 0x50(%r11),%xmm14
1146 movaps 0x60(%r11),%xmm15
1149 add \$0x148+$xframe,%rsp
1151 .size ChaCha20_4x,.-ChaCha20_4x
1155 ########################################################################
1156 # XOP code path that handles all lengths.
1158 # There is some "anomaly" observed depending on instruction size or
1159 # alignment. If you look closely at the code below you'll notice that
1160 # the argument order sometimes varies. The order affects instruction
1161 # encoding by making it larger, and such fiddling gives a 5% performance
1162 # improvement. This is on FX-4100...
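#
# The other thing XOP buys here is vprotd, a genuine 32-bit vector rotate,
# so each quarter-round step shrinks from the add/xor/shift/shift/or
# sequence of the SSSE3 path to, in effect,
#
#	a += b;  d ^= a;  d = rotl32(d,16);
#
# which is why Bulldozer-class CPUs are steered to this path (note (iv)
# at the top of the file).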
1164 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1165 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1166 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1167 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1169 sub XOP_lane_ROUND {
1170 my ($a0,$b0,$c0,$d0)=@_;
1171 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1172 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1173 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1174 my @x=map("\"$_\"",@xx);
1177 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1178 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1179 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1180 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1181 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1182 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1183 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1184 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1185 "&vprotd (@x[$d0],@x[$d0],16)",
1186 "&vprotd (@x[$d1],@x[$d1],16)",
1187 "&vprotd (@x[$d2],@x[$d2],16)",
1188 "&vprotd (@x[$d3],@x[$d3],16)",
1190 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1191 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1192 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1193 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1194 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1195 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1196 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1197 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1198 "&vprotd (@x[$b0],@x[$b0],12)",
1199 "&vprotd (@x[$b1],@x[$b1],12)",
1200 "&vprotd (@x[$b2],@x[$b2],12)",
1201 "&vprotd (@x[$b3],@x[$b3],12)",
1203 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1204 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1205 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1206 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1207 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1208 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1209 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1210 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1211 "&vprotd (@x[$d0],@x[$d0],8)",
1212 "&vprotd (@x[$d1],@x[$d1],8)",
1213 "&vprotd (@x[$d2],@x[$d2],8)",
1214 "&vprotd (@x[$d3],@x[$d3],8)",
1216 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1217 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1218 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1219 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1220 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1221 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1222 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1223 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1224 "&vprotd (@x[$b0],@x[$b0],7)",
1225 "&vprotd (@x[$b1],@x[$b1],7)",
1226 "&vprotd (@x[$b2],@x[$b2],7)",
1227 "&vprotd (@x[$b3],@x[$b3],7)"
1231 my $xframe = $win64 ? 0xa0 : 0;
1234 .type ChaCha20_4xop,\@function,5
1238 lea -0x78(%rsp),%r11
1239 sub \$0x148+$xframe,%rsp
1241 ################ stack layout
1242 # +0x00 SIMD equivalent of @x[8-12]
1244 # +0x40 constant copy of key[0-2] smashed by lanes
1246 # +0x100 SIMD counters (with nonce smashed by lanes)
1249 $code.=<<___ if ($win64);
1250 movaps %xmm6,-0x30(%r11)
1251 movaps %xmm7,-0x20(%r11)
1252 movaps %xmm8,-0x10(%r11)
1253 movaps %xmm9,0x00(%r11)
1254 movaps %xmm10,0x10(%r11)
1255 movaps %xmm11,0x20(%r11)
1256 movaps %xmm12,0x30(%r11)
1257 movaps %xmm13,0x40(%r11)
1258 movaps %xmm14,0x50(%r11)
1259 movaps %xmm15,0x60(%r11)
1264 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1265 vmovdqu ($key),$xb3 # key[1]
1266 vmovdqu 16($key),$xt3 # key[2]
1267 vmovdqu ($counter),$xd3 # key[3]
1268 lea 0x100(%rsp),%rcx # size optimization
1270 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1271 vpshufd \$0x55,$xa3,$xa1
1272 vmovdqa $xa0,0x40(%rsp) # ... and offload
1273 vpshufd \$0xaa,$xa3,$xa2
1274 vmovdqa $xa1,0x50(%rsp)
1275 vpshufd \$0xff,$xa3,$xa3
1276 vmovdqa $xa2,0x60(%rsp)
1277 vmovdqa $xa3,0x70(%rsp)
1279 vpshufd \$0x00,$xb3,$xb0
1280 vpshufd \$0x55,$xb3,$xb1
1281 vmovdqa $xb0,0x80-0x100(%rcx)
1282 vpshufd \$0xaa,$xb3,$xb2
1283 vmovdqa $xb1,0x90-0x100(%rcx)
1284 vpshufd \$0xff,$xb3,$xb3
1285 vmovdqa $xb2,0xa0-0x100(%rcx)
1286 vmovdqa $xb3,0xb0-0x100(%rcx)
1288 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1289 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1290 vmovdqa $xt0,0xc0-0x100(%rcx)
1291 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1292 vmovdqa $xt1,0xd0-0x100(%rcx)
1293 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1294 vmovdqa $xt2,0xe0-0x100(%rcx)
1295 vmovdqa $xt3,0xf0-0x100(%rcx)
1297 vpshufd \$0x00,$xd3,$xd0
1298 vpshufd \$0x55,$xd3,$xd1
1299 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1300 vpshufd \$0xaa,$xd3,$xd2
1301 vmovdqa $xd1,0x110-0x100(%rcx)
1302 vpshufd \$0xff,$xd3,$xd3
1303 vmovdqa $xd2,0x120-0x100(%rcx)
1304 vmovdqa $xd3,0x130-0x100(%rcx)
1310 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1311 vmovdqa 0x50(%rsp),$xa1
1312 vmovdqa 0x60(%rsp),$xa2
1313 vmovdqa 0x70(%rsp),$xa3
1314 vmovdqa 0x80-0x100(%rcx),$xb0
1315 vmovdqa 0x90-0x100(%rcx),$xb1
1316 vmovdqa 0xa0-0x100(%rcx),$xb2
1317 vmovdqa 0xb0-0x100(%rcx),$xb3
1318 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1319 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1320 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1321 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1322 vmovdqa 0x100-0x100(%rcx),$xd0
1323 vmovdqa 0x110-0x100(%rcx),$xd1
1324 vmovdqa 0x120-0x100(%rcx),$xd2
1325 vmovdqa 0x130-0x100(%rcx),$xd3
1326 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1330 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1336 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1337 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1342 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1343 vpaddd 0x50(%rsp),$xa1,$xa1
1344 vpaddd 0x60(%rsp),$xa2,$xa2
1345 vpaddd 0x70(%rsp),$xa3,$xa3
1347 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1348 vmovdqa $xt3,0x30(%rsp)
1350 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1351 vpunpckldq $xa3,$xa2,$xt3
1352 vpunpckhdq $xa1,$xa0,$xa0
1353 vpunpckhdq $xa3,$xa2,$xa2
1354 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1355 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1356 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1357 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1359 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1361 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1362 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1363 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1364 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1366 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1367 vmovdqa $xa1,0x10(%rsp)
1368 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1369 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1371 vpunpckldq $xb1,$xb0,$xt2
1372 vpunpckldq $xb3,$xb2,$xt3
1373 vpunpckhdq $xb1,$xb0,$xb0
1374 vpunpckhdq $xb3,$xb2,$xb2
1375 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1376 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1377 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1378 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1380 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1381 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1383 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1384 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1385 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1386 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1388 vpunpckldq $xc1,$xc0,$xt2
1389 vpunpckldq $xc3,$xc2,$xt3
1390 vpunpckhdq $xc1,$xc0,$xc0
1391 vpunpckhdq $xc3,$xc2,$xc2
1392 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1393 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1394 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1395 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1397 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1399 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1400 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1401 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1402 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1404 vpunpckldq $xd1,$xd0,$xt2
1405 vpunpckldq $xd3,$xd2,$xt3
1406 vpunpckhdq $xd1,$xd0,$xd0
1407 vpunpckhdq $xd3,$xd2,$xd2
1408 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1409 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1410 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1411 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1413 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1414 ($xa0,$xa1)=($xt2,$xt3);
1416 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1417 vmovdqa 0x10(%rsp),$xa1
1422 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1423 vpxor 0x10($inp),$xb0,$xb0
1424 vpxor 0x20($inp),$xc0,$xc0
1425 vpxor 0x30($inp),$xd0,$xd0
1426 vpxor 0x40($inp),$xa1,$xa1
1427 vpxor 0x50($inp),$xb1,$xb1
1428 vpxor 0x60($inp),$xc1,$xc1
1429 vpxor 0x70($inp),$xd1,$xd1
1430 lea 0x80($inp),$inp # size optimization
1431 vpxor 0x00($inp),$xa2,$xa2
1432 vpxor 0x10($inp),$xb2,$xb2
1433 vpxor 0x20($inp),$xc2,$xc2
1434 vpxor 0x30($inp),$xd2,$xd2
1435 vpxor 0x40($inp),$xa3,$xa3
1436 vpxor 0x50($inp),$xb3,$xb3
1437 vpxor 0x60($inp),$xc3,$xc3
1438 vpxor 0x70($inp),$xd3,$xd3
1439 lea 0x80($inp),$inp # inp+=64*4
1441 vmovdqu $xa0,0x00($out)
1442 vmovdqu $xb0,0x10($out)
1443 vmovdqu $xc0,0x20($out)
1444 vmovdqu $xd0,0x30($out)
1445 vmovdqu $xa1,0x40($out)
1446 vmovdqu $xb1,0x50($out)
1447 vmovdqu $xc1,0x60($out)
1448 vmovdqu $xd1,0x70($out)
1449 lea 0x80($out),$out # size optimization
1450 vmovdqu $xa2,0x00($out)
1451 vmovdqu $xb2,0x10($out)
1452 vmovdqu $xc2,0x20($out)
1453 vmovdqu $xd2,0x30($out)
1454 vmovdqu $xa3,0x40($out)
1455 vmovdqu $xb3,0x50($out)
1456 vmovdqu $xc3,0x60($out)
1457 vmovdqu $xd3,0x70($out)
1458 lea 0x80($out),$out # out+=64*4
1468 jae .L192_or_more4xop
1470 jae .L128_or_more4xop
1472 jae .L64_or_more4xop
1475 vmovdqa $xa0,0x00(%rsp)
1476 vmovdqa $xb0,0x10(%rsp)
1477 vmovdqa $xc0,0x20(%rsp)
1478 vmovdqa $xd0,0x30(%rsp)
1483 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1484 vpxor 0x10($inp),$xb0,$xb0
1485 vpxor 0x20($inp),$xc0,$xc0
1486 vpxor 0x30($inp),$xd0,$xd0
1487 vmovdqu $xa0,0x00($out)
1488 vmovdqu $xb0,0x10($out)
1489 vmovdqu $xc0,0x20($out)
1490 vmovdqu $xd0,0x30($out)
1493 lea 0x40($inp),$inp # inp+=64*1
1494 vmovdqa $xa1,0x00(%rsp)
1496 vmovdqa $xb1,0x10(%rsp)
1497 lea 0x40($out),$out # out+=64*1
1498 vmovdqa $xc1,0x20(%rsp)
1499 sub \$64,$len # len-=64*1
1500 vmovdqa $xd1,0x30(%rsp)
1505 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1506 vpxor 0x10($inp),$xb0,$xb0
1507 vpxor 0x20($inp),$xc0,$xc0
1508 vpxor 0x30($inp),$xd0,$xd0
1509 vpxor 0x40($inp),$xa1,$xa1
1510 vpxor 0x50($inp),$xb1,$xb1
1511 vpxor 0x60($inp),$xc1,$xc1
1512 vpxor 0x70($inp),$xd1,$xd1
1514 vmovdqu $xa0,0x00($out)
1515 vmovdqu $xb0,0x10($out)
1516 vmovdqu $xc0,0x20($out)
1517 vmovdqu $xd0,0x30($out)
1518 vmovdqu $xa1,0x40($out)
1519 vmovdqu $xb1,0x50($out)
1520 vmovdqu $xc1,0x60($out)
1521 vmovdqu $xd1,0x70($out)
1524 lea 0x80($inp),$inp # inp+=64*2
1525 vmovdqa $xa2,0x00(%rsp)
1527 vmovdqa $xb2,0x10(%rsp)
1528 lea 0x80($out),$out # out+=64*2
1529 vmovdqa $xc2,0x20(%rsp)
1530 sub \$128,$len # len-=64*2
1531 vmovdqa $xd2,0x30(%rsp)
1536 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1537 vpxor 0x10($inp),$xb0,$xb0
1538 vpxor 0x20($inp),$xc0,$xc0
1539 vpxor 0x30($inp),$xd0,$xd0
1540 vpxor 0x40($inp),$xa1,$xa1
1541 vpxor 0x50($inp),$xb1,$xb1
1542 vpxor 0x60($inp),$xc1,$xc1
1543 vpxor 0x70($inp),$xd1,$xd1
1544 lea 0x80($inp),$inp # size optimization
1545 vpxor 0x00($inp),$xa2,$xa2
1546 vpxor 0x10($inp),$xb2,$xb2
1547 vpxor 0x20($inp),$xc2,$xc2
1548 vpxor 0x30($inp),$xd2,$xd2
1550 vmovdqu $xa0,0x00($out)
1551 vmovdqu $xb0,0x10($out)
1552 vmovdqu $xc0,0x20($out)
1553 vmovdqu $xd0,0x30($out)
1554 vmovdqu $xa1,0x40($out)
1555 vmovdqu $xb1,0x50($out)
1556 vmovdqu $xc1,0x60($out)
1557 vmovdqu $xd1,0x70($out)
1558 lea 0x80($out),$out # size optimization
1559 vmovdqu $xa2,0x00($out)
1560 vmovdqu $xb2,0x10($out)
1561 vmovdqu $xc2,0x20($out)
1562 vmovdqu $xd2,0x30($out)
1565 lea 0x40($inp),$inp # inp+=64*3
1566 vmovdqa $xa3,0x00(%rsp)
1568 vmovdqa $xb3,0x10(%rsp)
1569 lea 0x40($out),$out # out+=64*3
1570 vmovdqa $xc3,0x20(%rsp)
1571 sub \$192,$len # len-=64*3
1572 vmovdqa $xd3,0x30(%rsp)
1575 movzb ($inp,%r10),%eax
1576 movzb (%rsp,%r10),%ecx
1579 mov %al,-1($out,%r10)
1586 $code.=<<___ if ($win64);
1587 lea 0x140+0x30(%rsp),%r11
1588 movaps -0x30(%r11),%xmm6
1589 movaps -0x20(%r11),%xmm7
1590 movaps -0x10(%r11),%xmm8
1591 movaps 0x00(%r11),%xmm9
1592 movaps 0x10(%r11),%xmm10
1593 movaps 0x20(%r11),%xmm11
1594 movaps 0x30(%r11),%xmm12
1595 movaps 0x40(%r11),%xmm13
1596 movaps 0x50(%r11),%xmm14
1597 movaps 0x60(%r11),%xmm15
1600 add \$0x148+$xframe,%rsp
1602 .size ChaCha20_4xop,.-ChaCha20_4xop
1606 ########################################################################
1609 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1610 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1611 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1612 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1614 sub AVX2_lane_ROUND {
1615 my ($a0,$b0,$c0,$d0)=@_;
1616 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1617 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1618 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1619 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1620 my @x=map("\"$_\"",@xx);
1622 # Consider order in which variables are addressed by their
1627 # 0 4 8 12 < even round
1631 # 0 5 10 15 < odd round
1636 # 'a', 'b' and 'd's are permanently allocated in registers,
1637 # @x[0..7,12..15], while 'c's are maintained in memory. If
1638 # you observe the 'c' column, you'll notice that a pair of 'c's
1639 # is invariant between rounds. This means that we have to
1640 # reload them only once per round, in the middle. This is why
1641 # you'll see a bunch of 'c' stores and loads in the middle, but
1642 # none at the beginning or end.
1645 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1646 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1647 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1648 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1649 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1650 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1652 "&vpaddd ($xc,$xc,@x[$d0])",
1653 "&vpxor (@x[$b0],$xc,@x[$b0])",
1654 "&vpslld ($t0,@x[$b0],12)",
1655 "&vpsrld (@x[$b0],@x[$b0],20)",
1656 "&vpor (@x[$b0],$t0,@x[$b0])",
1657 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1658 "&vpaddd ($xc_,$xc_,@x[$d1])",
1659 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1660 "&vpslld ($t1,@x[$b1],12)",
1661 "&vpsrld (@x[$b1],@x[$b1],20)",
1662 "&vpor (@x[$b1],$t1,@x[$b1])",
1664 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1665 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1666 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1667 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1668 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1669 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1671 "&vpaddd ($xc,$xc,@x[$d0])",
1672 "&vpxor (@x[$b0],$xc,@x[$b0])",
1673 "&vpslld ($t1,@x[$b0],7)",
1674 "&vpsrld (@x[$b0],@x[$b0],25)",
1675 "&vpor (@x[$b0],$t1,@x[$b0])",
1676 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1677 "&vpaddd ($xc_,$xc_,@x[$d1])",
1678 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1679 "&vpslld ($t0,@x[$b1],7)",
1680 "&vpsrld (@x[$b1],@x[$b1],25)",
1681 "&vpor (@x[$b1],$t0,@x[$b1])",
1683 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1684 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1685 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1686 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1688 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1689 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1690 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1691 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1692 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1693 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1695 "&vpaddd ($xc,$xc,@x[$d2])",
1696 "&vpxor (@x[$b2],$xc,@x[$b2])",
1697 "&vpslld ($t0,@x[$b2],12)",
1698 "&vpsrld (@x[$b2],@x[$b2],20)",
1699 "&vpor (@x[$b2],$t0,@x[$b2])",
1700 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1701 "&vpaddd ($xc_,$xc_,@x[$d3])",
1702 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1703 "&vpslld ($t1,@x[$b3],12)",
1704 "&vpsrld (@x[$b3],@x[$b3],20)",
1705 "&vpor (@x[$b3],$t1,@x[$b3])",
1707 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1708 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1709 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1710 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1711 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1712 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1714 "&vpaddd ($xc,$xc,@x[$d2])",
1715 "&vpxor (@x[$b2],$xc,@x[$b2])",
1716 "&vpslld ($t1,@x[$b2],7)",
1717 "&vpsrld (@x[$b2],@x[$b2],25)",
1718 "&vpor (@x[$b2],$t1,@x[$b2])",
1719 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1720 "&vpaddd ($xc_,$xc_,@x[$d3])",
1721 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1722 "&vpslld ($t0,@x[$b3],7)",
1723 "&vpsrld (@x[$b3],@x[$b3],25)",
1724 "&vpor (@x[$b3],$t0,@x[$b3])"
1728 my $xframe = $win64 ? 0xb0 : 8;
1731 .type ChaCha20_8x,\@function,5
1736 $code.=<<___ if ($avx>2);
1737 test \$`1<<16`,%r10d # check for AVX512F
1742 sub \$0x280+$xframe,%rsp
1745 $code.=<<___ if ($win64);
1746 lea 0x290+0x30(%rsp),%r11
1747 movaps %xmm6,-0x30(%r11)
1748 movaps %xmm7,-0x20(%r11)
1749 movaps %xmm8,-0x10(%r11)
1750 movaps %xmm9,0x00(%r11)
1751 movaps %xmm10,0x10(%r11)
1752 movaps %xmm11,0x20(%r11)
1753 movaps %xmm12,0x30(%r11)
1754 movaps %xmm13,0x40(%r11)
1755 movaps %xmm14,0x50(%r11)
1756 movaps %xmm15,0x60(%r11)
1760 mov %r10,0x280(%rsp)
1762 ################ stack layout
1763 # +0x00 SIMD equivalent of @x[8-12]
1765 # +0x80 constant copy of key[0-2] smashed by lanes
1767 # +0x200 SIMD counters (with nonce smashed by lanes)
1771 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1772 vbroadcasti128 ($key),$xb3 # key[1]
1773 vbroadcasti128 16($key),$xt3 # key[2]
1774 vbroadcasti128 ($counter),$xd3 # key[3]
1775 lea 0x100(%rsp),%rcx # size optimization
1776 lea 0x200(%rsp),%rax # size optimization
1777 lea .Lrot16(%rip),%r10
1778 lea .Lrot24(%rip),%r11
1780 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1781 vpshufd \$0x55,$xa3,$xa1
1782 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1783 vpshufd \$0xaa,$xa3,$xa2
1784 vmovdqa $xa1,0xa0-0x100(%rcx)
1785 vpshufd \$0xff,$xa3,$xa3
1786 vmovdqa $xa2,0xc0-0x100(%rcx)
1787 vmovdqa $xa3,0xe0-0x100(%rcx)
1789 vpshufd \$0x00,$xb3,$xb0
1790 vpshufd \$0x55,$xb3,$xb1
1791 vmovdqa $xb0,0x100-0x100(%rcx)
1792 vpshufd \$0xaa,$xb3,$xb2
1793 vmovdqa $xb1,0x120-0x100(%rcx)
1794 vpshufd \$0xff,$xb3,$xb3
1795 vmovdqa $xb2,0x140-0x100(%rcx)
1796 vmovdqa $xb3,0x160-0x100(%rcx)
1798 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1799 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1800 vmovdqa $xt0,0x180-0x200(%rax)
1801 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1802 vmovdqa $xt1,0x1a0-0x200(%rax)
1803 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1804 vmovdqa $xt2,0x1c0-0x200(%rax)
1805 vmovdqa $xt3,0x1e0-0x200(%rax)
1807 vpshufd \$0x00,$xd3,$xd0
1808 vpshufd \$0x55,$xd3,$xd1
1809 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1810 vpshufd \$0xaa,$xd3,$xd2
1811 vmovdqa $xd1,0x220-0x200(%rax)
1812 vpshufd \$0xff,$xd3,$xd3
1813 vmovdqa $xd2,0x240-0x200(%rax)
1814 vmovdqa $xd3,0x260-0x200(%rax)
1820 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1821 vmovdqa 0xa0-0x100(%rcx),$xa1
1822 vmovdqa 0xc0-0x100(%rcx),$xa2
1823 vmovdqa 0xe0-0x100(%rcx),$xa3
1824 vmovdqa 0x100-0x100(%rcx),$xb0
1825 vmovdqa 0x120-0x100(%rcx),$xb1
1826 vmovdqa 0x140-0x100(%rcx),$xb2
1827 vmovdqa 0x160-0x100(%rcx),$xb3
1828 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1829 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1830 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1831 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1832 vmovdqa 0x200-0x200(%rax),$xd0
1833 vmovdqa 0x220-0x200(%rax),$xd1
1834 vmovdqa 0x240-0x200(%rax),$xd2
1835 vmovdqa 0x260-0x200(%rax),$xd3
1836 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1839 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1840 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1841 vbroadcasti128 (%r10),$xt3
1842 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1849 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1850 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1855 lea 0x200(%rsp),%rax # size optimization
1856 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1857 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1858 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1859 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1861 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1862 vpunpckldq $xa3,$xa2,$xt3
1863 vpunpckhdq $xa1,$xa0,$xa0
1864 vpunpckhdq $xa3,$xa2,$xa2
1865 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1866 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1867 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1868 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1870 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1872 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1873 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1874 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1875 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1877 vpunpckldq $xb1,$xb0,$xt2
1878 vpunpckldq $xb3,$xb2,$xt3
1879 vpunpckhdq $xb1,$xb0,$xb0
1880 vpunpckhdq $xb3,$xb2,$xb2
1881 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1882 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1883 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1884 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1886 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1888 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1889 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1890 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1891 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1892 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1893 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1894 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1895 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1897 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
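#
# vperm2i128 finishes what the dword/qword unpacks started: it regroups the
# 128-bit halves so that each ymm register now covers 32 contiguous bytes
# of a single block's keystream, which is what allows the plain 0x20-byte
# strides when xoring with the input below.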
1898 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1900 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1901 vmovdqa $xa1,0x20(%rsp)
1902 vmovdqa 0x40(%rsp),$xc2 # $xa0
1903 vmovdqa 0x60(%rsp),$xc3 # $xa1
1905 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1906 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1907 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1908 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1910 vpunpckldq $xc1,$xc0,$xt2
1911 vpunpckldq $xc3,$xc2,$xt3
1912 vpunpckhdq $xc1,$xc0,$xc0
1913 vpunpckhdq $xc3,$xc2,$xc2
1914 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1915 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1916 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1917 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1919 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1921 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1922 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1923 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1924 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1926 vpunpckldq $xd1,$xd0,$xt2
1927 vpunpckldq $xd3,$xd2,$xt3
1928 vpunpckhdq $xd1,$xd0,$xd0
1929 vpunpckhdq $xd3,$xd2,$xd2
1930 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1931 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1932 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1933 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1935 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1937 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1938 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1939 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1940 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1941 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1942 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1943 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1944 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1946 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1947 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1948 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1949 ($xa0,$xa1)=($xt2,$xt3);
1951 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1952 vmovdqa 0x20(%rsp),$xa1
1957 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1958 vpxor 0x20($inp),$xb0,$xb0
1959 vpxor 0x40($inp),$xc0,$xc0
1960 vpxor 0x60($inp),$xd0,$xd0
1961 lea 0x80($inp),$inp # size optimization
1962 vmovdqu $xa0,0x00($out)
1963 vmovdqu $xb0,0x20($out)
1964 vmovdqu $xc0,0x40($out)
1965 vmovdqu $xd0,0x60($out)
1966 lea 0x80($out),$out # size optimization
1968 vpxor 0x00($inp),$xa1,$xa1
1969 vpxor 0x20($inp),$xb1,$xb1
1970 vpxor 0x40($inp),$xc1,$xc1
1971 vpxor 0x60($inp),$xd1,$xd1
1972 lea 0x80($inp),$inp # size optimization
1973 vmovdqu $xa1,0x00($out)
1974 vmovdqu $xb1,0x20($out)
1975 vmovdqu $xc1,0x40($out)
1976 vmovdqu $xd1,0x60($out)
1977 lea 0x80($out),$out # size optimization
1979 vpxor 0x00($inp),$xa2,$xa2
1980 vpxor 0x20($inp),$xb2,$xb2
1981 vpxor 0x40($inp),$xc2,$xc2
1982 vpxor 0x60($inp),$xd2,$xd2
1983 lea 0x80($inp),$inp # size optimization
1984 vmovdqu $xa2,0x00($out)
1985 vmovdqu $xb2,0x20($out)
1986 vmovdqu $xc2,0x40($out)
1987 vmovdqu $xd2,0x60($out)
1988 lea 0x80($out),$out # size optimization
1990 vpxor 0x00($inp),$xa3,$xa3
1991 vpxor 0x20($inp),$xb3,$xb3
1992 vpxor 0x40($inp),$xc3,$xc3
1993 vpxor 0x60($inp),$xd3,$xd3
1994 lea 0x80($inp),$inp # size optimization
1995 vmovdqu $xa3,0x00($out)
1996 vmovdqu $xb3,0x20($out)
1997 vmovdqu $xc3,0x40($out)
1998 vmovdqu $xd3,0x60($out)
1999 lea 0x80($out),$out # size optimization
2023 vmovdqa $xa0,0x00(%rsp)
2024 vmovdqa $xb0,0x20(%rsp)
2029 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2030 vpxor 0x20($inp),$xb0,$xb0
2031 vmovdqu $xa0,0x00($out)
2032 vmovdqu $xb0,0x20($out)
2035 lea 0x40($inp),$inp # inp+=64*1
2037 vmovdqa $xc0,0x00(%rsp)
2038 lea 0x40($out),$out # out+=64*1
2039 sub \$64,$len # len-=64*1
2040 vmovdqa $xd0,0x20(%rsp)
2045 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2046 vpxor 0x20($inp),$xb0,$xb0
2047 vpxor 0x40($inp),$xc0,$xc0
2048 vpxor 0x60($inp),$xd0,$xd0
2049 vmovdqu $xa0,0x00($out)
2050 vmovdqu $xb0,0x20($out)
2051 vmovdqu $xc0,0x40($out)
2052 vmovdqu $xd0,0x60($out)
2055 lea 0x80($inp),$inp # inp+=64*2
2057 vmovdqa $xa1,0x00(%rsp)
2058 lea 0x80($out),$out # out+=64*2
2059 sub \$128,$len # len-=64*2
2060 vmovdqa $xb1,0x20(%rsp)
2065 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2066 vpxor 0x20($inp),$xb0,$xb0
2067 vpxor 0x40($inp),$xc0,$xc0
2068 vpxor 0x60($inp),$xd0,$xd0
2069 vpxor 0x80($inp),$xa1,$xa1
2070 vpxor 0xa0($inp),$xb1,$xb1
2071 vmovdqu $xa0,0x00($out)
2072 vmovdqu $xb0,0x20($out)
2073 vmovdqu $xc0,0x40($out)
2074 vmovdqu $xd0,0x60($out)
2075 vmovdqu $xa1,0x80($out)
2076 vmovdqu $xb1,0xa0($out)
2079 lea 0xc0($inp),$inp # inp+=64*3
2081 vmovdqa $xc1,0x00(%rsp)
2082 lea 0xc0($out),$out # out+=64*3
2083 sub \$192,$len # len-=64*3
2084 vmovdqa $xd1,0x20(%rsp)
2089 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2090 vpxor 0x20($inp),$xb0,$xb0
2091 vpxor 0x40($inp),$xc0,$xc0
2092 vpxor 0x60($inp),$xd0,$xd0
2093 vpxor 0x80($inp),$xa1,$xa1
2094 vpxor 0xa0($inp),$xb1,$xb1
2095 vpxor 0xc0($inp),$xc1,$xc1
2096 vpxor 0xe0($inp),$xd1,$xd1
2097 vmovdqu $xa0,0x00($out)
2098 vmovdqu $xb0,0x20($out)
2099 vmovdqu $xc0,0x40($out)
2100 vmovdqu $xd0,0x60($out)
2101 vmovdqu $xa1,0x80($out)
2102 vmovdqu $xb1,0xa0($out)
2103 vmovdqu $xc1,0xc0($out)
2104 vmovdqu $xd1,0xe0($out)
2107 lea 0x100($inp),$inp # inp+=64*4
2109 vmovdqa $xa2,0x00(%rsp)
2110 lea 0x100($out),$out # out+=64*4
2111 sub \$256,$len # len-=64*4
2112 vmovdqa $xb2,0x20(%rsp)
2117 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2118 vpxor 0x20($inp),$xb0,$xb0
2119 vpxor 0x40($inp),$xc0,$xc0
2120 vpxor 0x60($inp),$xd0,$xd0
2121 vpxor 0x80($inp),$xa1,$xa1
2122 vpxor 0xa0($inp),$xb1,$xb1
2123 vpxor 0xc0($inp),$xc1,$xc1
2124 vpxor 0xe0($inp),$xd1,$xd1
2125 vpxor 0x100($inp),$xa2,$xa2
2126 vpxor 0x120($inp),$xb2,$xb2
2127 vmovdqu $xa0,0x00($out)
2128 vmovdqu $xb0,0x20($out)
2129 vmovdqu $xc0,0x40($out)
2130 vmovdqu $xd0,0x60($out)
2131 vmovdqu $xa1,0x80($out)
2132 vmovdqu $xb1,0xa0($out)
2133 vmovdqu $xc1,0xc0($out)
2134 vmovdqu $xd1,0xe0($out)
2135 vmovdqu $xa2,0x100($out)
2136 vmovdqu $xb2,0x120($out)
2139 lea 0x140($inp),$inp # inp+=64*5
2141 vmovdqa $xc2,0x00(%rsp)
2142 lea 0x140($out),$out # out+=64*5
2143 sub \$320,$len # len-=64*5
2144 vmovdqa $xd2,0x20(%rsp)
2149 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2150 vpxor 0x20($inp),$xb0,$xb0
2151 vpxor 0x40($inp),$xc0,$xc0
2152 vpxor 0x60($inp),$xd0,$xd0
2153 vpxor 0x80($inp),$xa1,$xa1
2154 vpxor 0xa0($inp),$xb1,$xb1
2155 vpxor 0xc0($inp),$xc1,$xc1
2156 vpxor 0xe0($inp),$xd1,$xd1
2157 vpxor 0x100($inp),$xa2,$xa2
2158 vpxor 0x120($inp),$xb2,$xb2
2159 vpxor 0x140($inp),$xc2,$xc2
2160 vpxor 0x160($inp),$xd2,$xd2
2161 vmovdqu $xa0,0x00($out)
2162 vmovdqu $xb0,0x20($out)
2163 vmovdqu $xc0,0x40($out)
2164 vmovdqu $xd0,0x60($out)
2165 vmovdqu $xa1,0x80($out)
2166 vmovdqu $xb1,0xa0($out)
2167 vmovdqu $xc1,0xc0($out)
2168 vmovdqu $xd1,0xe0($out)
2169 vmovdqu $xa2,0x100($out)
2170 vmovdqu $xb2,0x120($out)
2171 vmovdqu $xc2,0x140($out)
2172 vmovdqu $xd2,0x160($out)
2175 lea 0x180($inp),$inp # inp+=64*6
2177 vmovdqa $xa3,0x00(%rsp)
2178 lea 0x180($out),$out # out+=64*6
2179 sub \$384,$len # len-=64*6
2180 vmovdqa $xb3,0x20(%rsp)
2185 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2186 vpxor 0x20($inp),$xb0,$xb0
2187 vpxor 0x40($inp),$xc0,$xc0
2188 vpxor 0x60($inp),$xd0,$xd0
2189 vpxor 0x80($inp),$xa1,$xa1
2190 vpxor 0xa0($inp),$xb1,$xb1
2191 vpxor 0xc0($inp),$xc1,$xc1
2192 vpxor 0xe0($inp),$xd1,$xd1
2193 vpxor 0x100($inp),$xa2,$xa2
2194 vpxor 0x120($inp),$xb2,$xb2
2195 vpxor 0x140($inp),$xc2,$xc2
2196 vpxor 0x160($inp),$xd2,$xd2
2197 vpxor 0x180($inp),$xa3,$xa3
2198 vpxor 0x1a0($inp),$xb3,$xb3
2199 vmovdqu $xa0,0x00($out)
2200 vmovdqu $xb0,0x20($out)
2201 vmovdqu $xc0,0x40($out)
2202 vmovdqu $xd0,0x60($out)
2203 vmovdqu $xa1,0x80($out)
2204 vmovdqu $xb1,0xa0($out)
2205 vmovdqu $xc1,0xc0($out)
2206 vmovdqu $xd1,0xe0($out)
2207 vmovdqu $xa2,0x100($out)
2208 vmovdqu $xb2,0x120($out)
2209 vmovdqu $xc2,0x140($out)
2210 vmovdqu $xd2,0x160($out)
2211 vmovdqu $xa3,0x180($out)
2212 vmovdqu $xb3,0x1a0($out)
2215 lea 0x1c0($inp),$inp # inp+=64*7
2217 vmovdqa $xc3,0x00(%rsp)
2218 lea 0x1c0($out),$out # out+=64*7
2219 sub \$448,$len # len-=64*7
2220 vmovdqa $xd3,0x20(%rsp)
2223 movzb ($inp,%r10),%eax
2224 movzb (%rsp,%r10),%ecx
2227 mov %al,-1($out,%r10)
2234 $code.=<<___ if ($win64);
2235 lea 0x290+0x30(%rsp),%r11
2236 movaps -0x30(%r11),%xmm6
2237 movaps -0x20(%r11),%xmm7
2238 movaps -0x10(%r11),%xmm8
2239 movaps 0x00(%r11),%xmm9
2240 movaps 0x10(%r11),%xmm10
2241 movaps 0x20(%r11),%xmm11
2242 movaps 0x30(%r11),%xmm12
2243 movaps 0x40(%r11),%xmm13
2244 movaps 0x50(%r11),%xmm14
2245 movaps 0x60(%r11),%xmm15
2248 mov 0x280(%rsp),%rsp
2250 .size ChaCha20_8x,.-ChaCha20_8x
2254 ########################################################################
2257 my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2258 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
2259 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2260 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2261 my @key=map("%zmm$_",(16..31));
2262 my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
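#
# With AVX512F the rotates collapse into single vprold instructions and,
# with 32 zmm registers available, the lane-smashed key/counter material
# stays resident in @key (%zmm16..%zmm31) instead of being spilled to the
# stack, so the round loop below runs with no memory operands at all.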
2264 sub AVX512_lane_ROUND {
2265 my ($a0,$b0,$c0,$d0)=@_;
2266 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
2267 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
2268 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
2269 my @x=map("\"$_\"",@xx);
2272 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
2273 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
2274 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
2275 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
2276 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2277 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2278 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2279 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2280 "&vprold (@x[$d0],@x[$d0],16)",
2281 "&vprold (@x[$d1],@x[$d1],16)",
2282 "&vprold (@x[$d2],@x[$d2],16)",
2283 "&vprold (@x[$d3],@x[$d3],16)",
2285 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2286 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2287 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2288 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2289 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2290 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2291 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2292 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2293 "&vprold (@x[$b0],@x[$b0],12)",
2294 "&vprold (@x[$b1],@x[$b1],12)",
2295 "&vprold (@x[$b2],@x[$b2],12)",
2296 "&vprold (@x[$b3],@x[$b3],12)",
2298 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
2299 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
2300 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
2301 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
2302 "&vpxord (@x[$d0],@x[$d0],@x[$a0])",
2303 "&vpxord (@x[$d1],@x[$d1],@x[$a1])",
2304 "&vpxord (@x[$d2],@x[$d2],@x[$a2])",
2305 "&vpxord (@x[$d3],@x[$d3],@x[$a3])",
2306 "&vprold (@x[$d0],@x[$d0],8)",
2307 "&vprold (@x[$d1],@x[$d1],8)",
2308 "&vprold (@x[$d2],@x[$d2],8)",
2309 "&vprold (@x[$d3],@x[$d3],8)",
2311 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
2312 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
2313 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
2314 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
2315 "&vpxord (@x[$b0],@x[$b0],@x[$c0])",
2316 "&vpxord (@x[$b1],@x[$b1],@x[$c1])",
2317 "&vpxord (@x[$b2],@x[$b2],@x[$c2])",
2318 "&vpxord (@x[$b3],@x[$b3],@x[$c3])",
2319 "&vprold (@x[$b0],@x[$b0],7)",
2320 "&vprold (@x[$b1],@x[$b1],7)",
2321 "&vprold (@x[$b2],@x[$b2],7)",
2322 "&vprold (@x[$b3],@x[$b3],7)"
2326 my $xframe = $win64 ? 0xb0 : 8;
2329 .type ChaCha20_16x,\@function,5
2334 sub \$64+$xframe,%rsp
2337 $code.=<<___ if ($win64);
2338 lea 0x290+0x30(%rsp),%r11
2339 movaps %xmm6,-0x30(%r11)
2340 movaps %xmm7,-0x20(%r11)
2341 movaps %xmm8,-0x10(%r11)
2342 movaps %xmm9,0x00(%r11)
2343 movaps %xmm10,0x10(%r11)
2344 movaps %xmm11,0x20(%r11)
2345 movaps %xmm12,0x30(%r11)
2346 movaps %xmm13,0x40(%r11)
2347 movaps %xmm14,0x50(%r11)
2348 movaps %xmm15,0x60(%r11)
2353 lea .Lsigma(%rip),%r10
2354 vbroadcasti32x4 (%r10),$xa3 # key[0]
2355 vbroadcasti32x4 ($key),$xb3 # key[1]
2356 vbroadcasti32x4 16($key),$xc3 # key[2]
2357 vbroadcasti32x4 ($counter),$xd3 # key[3]
2359 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
2360 vpshufd \$0x55,$xa3,$xa1
2361 vpshufd \$0xaa,$xa3,$xa2
2362 vpshufd \$0xff,$xa3,$xa3
2363 vmovdqa64 $xa0,@key[0]
2364 vmovdqa64 $xa1,@key[1]
2365 vmovdqa64 $xa2,@key[2]
2366 vmovdqa64 $xa3,@key[3]
2368 vpshufd \$0x00,$xb3,$xb0
2369 vpshufd \$0x55,$xb3,$xb1
2370 vpshufd \$0xaa,$xb3,$xb2
2371 vpshufd \$0xff,$xb3,$xb3
2372 vmovdqa64 $xb0,@key[4]
2373 vmovdqa64 $xb1,@key[5]
2374 vmovdqa64 $xb2,@key[6]
2375 vmovdqa64 $xb3,@key[7]
2377 vpshufd \$0x00,$xc3,$xc0
2378 vpshufd \$0x55,$xc3,$xc1
2379 vpshufd \$0xaa,$xc3,$xc2
2380 vpshufd \$0xff,$xc3,$xc3
2381 vmovdqa64 $xc0,@key[8]
2382 vmovdqa64 $xc1,@key[9]
2383 vmovdqa64 $xc2,@key[10]
2384 vmovdqa64 $xc3,@key[11]
2386 vpshufd \$0x00,$xd3,$xd0
2387 vpshufd \$0x55,$xd3,$xd1
2388 vpshufd \$0xaa,$xd3,$xd2
2389 vpshufd \$0xff,$xd3,$xd3
2390 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet
2391 vmovdqa64 $xd0,@key[12]
2392 vmovdqa64 $xd1,@key[13]
2393 vmovdqa64 $xd2,@key[14]
2394 vmovdqa64 $xd3,@key[15]
2401 vpbroadcastd 0(%r10),$xa0 # reload key
2402 vpbroadcastd 4(%r10),$xa1
2403 vpbroadcastd 8(%r10),$xa2
2404 vpbroadcastd 12(%r10),$xa3
2405 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
2406 vmovdqa64 @key[4],$xb0
2407 vmovdqa64 @key[5],$xb1
2408 vmovdqa64 @key[6],$xb2
2409 vmovdqa64 @key[7],$xb3
2410 vmovdqa64 @key[8],$xc0
2411 vmovdqa64 @key[9],$xc1
2412 vmovdqa64 @key[10],$xc2
2413 vmovdqa64 @key[11],$xc3
2414 vmovdqa64 @key[12],$xd0
2415 vmovdqa64 @key[13],$xd1
2416 vmovdqa64 @key[14],$xd2
2417 vmovdqa64 @key[15],$xd3
2419 vmovdqa64 $xa0,@key[0]
2420 vmovdqa64 $xa1,@key[1]
2421 vmovdqa64 $xa2,@key[2]
2422 vmovdqa64 $xa3,@key[3]
2430 foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
2431 foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
2436 vpaddd @key[0],$xa0,$xa0 # accumulate key
2437 vpaddd @key[1],$xa1,$xa1
2438 vpaddd @key[2],$xa2,$xa2
2439 vpaddd @key[3],$xa3,$xa3
2441 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
2442 vpunpckldq $xa3,$xa2,$xt3
2443 vpunpckhdq $xa1,$xa0,$xa0
2444 vpunpckhdq $xa3,$xa2,$xa2
2445 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
2446 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
2447 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
2448 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
2450 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
2452 vpaddd @key[4],$xb0,$xb0
2453 vpaddd @key[5],$xb1,$xb1
2454 vpaddd @key[6],$xb2,$xb2
2455 vpaddd @key[7],$xb3,$xb3
2457 vpunpckldq $xb1,$xb0,$xt2
2458 vpunpckldq $xb3,$xb2,$xt3
2459 vpunpckhdq $xb1,$xb0,$xb0
2460 vpunpckhdq $xb3,$xb2,$xb2
2461 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
2462 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
2463 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
2464 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
2466 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
2468 vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further
2469 vshufi32x4 \$0xee,$xb0,$xa0,$xb0
2470 vshufi32x4 \$0x44,$xb1,$xa1,$xa0
2471 vshufi32x4 \$0xee,$xb1,$xa1,$xb1
2472 vshufi32x4 \$0x44,$xb2,$xa2,$xa1
2473 vshufi32x4 \$0xee,$xb2,$xa2,$xb2
2474 vshufi32x4 \$0x44,$xb3,$xa3,$xa2
2475 vshufi32x4 \$0xee,$xb3,$xa3,$xb3
2477 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
2479 vpaddd @key[8],$xc0,$xc0
2480 vpaddd @key[9],$xc1,$xc1
2481 vpaddd @key[10],$xc2,$xc2
2482 vpaddd @key[11],$xc3,$xc3
2484 vpunpckldq $xc1,$xc0,$xt2
2485 vpunpckldq $xc3,$xc2,$xt3
2486 vpunpckhdq $xc1,$xc0,$xc0
2487 vpunpckhdq $xc3,$xc2,$xc2
2488 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
2489 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
2490 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
2491 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
2493 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
2495 vpaddd @key[12],$xd0,$xd0
2496 vpaddd @key[13],$xd1,$xd1
2497 vpaddd @key[14],$xd2,$xd2
2498 vpaddd @key[15],$xd3,$xd3
2500 vpunpckldq $xd1,$xd0,$xt2
2501 vpunpckldq $xd3,$xd2,$xt3
2502 vpunpckhdq $xd1,$xd0,$xd0
2503 vpunpckhdq $xd3,$xd2,$xd2
2504 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
2505 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
2506 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
2507 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
2509 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
2511 vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further
2512 vshufi32x4 \$0xee,$xd0,$xc0,$xd0
2513 vshufi32x4 \$0x44,$xd1,$xc1,$xc0
2514 vshufi32x4 \$0xee,$xd1,$xc1,$xd1
2515 vshufi32x4 \$0x44,$xd2,$xc2,$xc1
2516 vshufi32x4 \$0xee,$xd2,$xc2,$xd2
2517 vshufi32x4 \$0x44,$xd3,$xc3,$xc2
2518 vshufi32x4 \$0xee,$xd3,$xc3,$xd3
2520 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
2522 vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further
2523 vshufi32x4 \$0xdd,$xc0,$xa0,$xa0
2524 vshufi32x4 \$0x88,$xd0,$xb0,$xc0
2525 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0
2526 vshufi32x4 \$0x88,$xc1,$xa1,$xt1
2527 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1
2528 vshufi32x4 \$0x88,$xd1,$xb1,$xc1
2529 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1
2530 vshufi32x4 \$0x88,$xc2,$xa2,$xt2
2531 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2
2532 vshufi32x4 \$0x88,$xd2,$xb2,$xc2
2533 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2
2534 vshufi32x4 \$0x88,$xc3,$xa3,$xt3
2535 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3
2536 vshufi32x4 \$0x88,$xd3,$xb3,$xc3
2537 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3
2539 ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
2540 ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
2542 ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
2543 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
2544 ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
2545 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
2550 vpxord 0x00($inp),$xa0,$xa0 # xor with input
2551 vpxord 0x40($inp),$xb0,$xb0
2552 vpxord 0x80($inp),$xc0,$xc0
2553 vpxord 0xc0($inp),$xd0,$xd0
2554 vmovdqu32 $xa0,0x00($out)
2555 vmovdqu32 $xb0,0x40($out)
2556 vmovdqu32 $xc0,0x80($out)
2557 vmovdqu32 $xd0,0xc0($out)
2559 vpxord 0x100($inp),$xa1,$xa1
2560 vpxord 0x140($inp),$xb1,$xb1
2561 vpxord 0x180($inp),$xc1,$xc1
2562 vpxord 0x1c0($inp),$xd1,$xd1
2563 vmovdqu32 $xa1,0x100($out)
2564 vmovdqu32 $xb1,0x140($out)
2565 vmovdqu32 $xc1,0x180($out)
2566 vmovdqu32 $xd1,0x1c0($out)
2568 vpxord 0x200($inp),$xa2,$xa2
2569 vpxord 0x240($inp),$xb2,$xb2
2570 vpxord 0x280($inp),$xc2,$xc2
2571 vpxord 0x2c0($inp),$xd2,$xd2
2572 vmovdqu32 $xa2,0x200($out)
2573 vmovdqu32 $xb2,0x240($out)
2574 vmovdqu32 $xc2,0x280($out)
2575 vmovdqu32 $xd2,0x2c0($out)
2577 vpxord 0x300($inp),$xa3,$xa3
2578 vpxord 0x340($inp),$xb3,$xb3
2579 vpxord 0x380($inp),$xc3,$xc3
2580 vpxord 0x3c0($inp),$xd3,$xd3
2581 lea 0x400($inp),$inp
2582 vmovdqu32 $xa3,0x300($out)
2583 vmovdqu32 $xb3,0x340($out)
2584 vmovdqu32 $xc3,0x380($out)
2585 vmovdqu32 $xd3,0x3c0($out)
2586 lea 0x400($out),$out
2598 jb .Less_than_64_16x
2599 vpxord ($inp),$xa0,$xa0 # xor with input
2600 vmovdqu32 $xa0,($out,$inp)
2606 jb .Less_than_64_16x
2607 vpxord ($inp),$xb0,$xb0
2608 vmovdqu32 $xb0,($out,$inp)
2614 jb .Less_than_64_16x
2615 vpxord ($inp),$xc0,$xc0
2616 vmovdqu32 $xc0,($out,$inp)
2622 jb .Less_than_64_16x
2623 vpxord ($inp),$xd0,$xd0
2624 vmovdqu32 $xd0,($out,$inp)
2630 jb .Less_than_64_16x
2631 vpxord ($inp),$xa1,$xa1
2632 vmovdqu32 $xa1,($out,$inp)
2638 jb .Less_than_64_16x
2639 vpxord ($inp),$xb1,$xb1
2640 vmovdqu32 $xb1,($out,$inp)
2646 jb .Less_than_64_16x
2647 vpxord ($inp),$xc1,$xc1
2648 vmovdqu32 $xc1,($out,$inp)
2654 jb .Less_than_64_16x
2655 vpxord ($inp),$xd1,$xd1
2656 vmovdqu32 $xd1,($out,$inp)
2662 jb .Less_than_64_16x
2663 vpxord ($inp),$xa2,$xa2
2664 vmovdqu32 $xa2,($out,$inp)
2670 jb .Less_than_64_16x
2671 vpxord ($inp),$xb2,$xb2
2672 vmovdqu32 $xb2,($out,$inp)
2678 jb .Less_than_64_16x
2679 vpxord ($inp),$xc2,$xc2
2680 vmovdqu32 $xc2,($out,$inp)
2686 jb .Less_than_64_16x
2687 vpxord ($inp),$xd2,$xd2
2688 vmovdqu32 $xd2,($out,$inp)
2694 jb .Less_than_64_16x
2695 vpxord ($inp),$xa3,$xa3
2696 vmovdqu32 $xa3,($out,$inp)
2702 jb .Less_than_64_16x
2703 vpxord ($inp),$xb3,$xb3
2704 vmovdqu32 $xb3,($out,$inp)
2710 jb .Less_than_64_16x
2711 vpxord ($inp),$xc3,$xc3
2712 vmovdqu32 $xc3,($out,$inp)
2718 vmovdqa32 $xa0,0x00(%rsp)
2719 lea ($out,$inp),$out
2723 movzb ($inp,%r10),%eax
2724 movzb (%rsp,%r10),%ecx
2727 mov %al,-1($out,%r10)
2734 $code.=<<___ if ($win64);
2735 lea 0x290+0x30(%rsp),%r11
2736 movaps -0x30(%r11),%xmm6
2737 movaps -0x20(%r11),%xmm7
2738 movaps -0x10(%r11),%xmm8
2739 movaps 0x00(%r11),%xmm9
2740 movaps 0x10(%r11),%xmm10
2741 movaps 0x20(%r11),%xmm11
2742 movaps 0x30(%r11),%xmm12
2743 movaps 0x40(%r11),%xmm13
2744 movaps 0x50(%r11),%xmm14
2745 movaps 0x60(%r11),%xmm15
2750 .size ChaCha20_16x,.-ChaCha20_16x
2754 foreach (split("\n",$code)) {
2755 s/\`([^\`]*)\`/eval $1/geo;