2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # ChaCha20 for x86_64.
# Performance in cycles per byte out of large buffer.
#
#               IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3 8xAVX2
#
# P4            9.48/+99%       -/22.7(ii)      -
# Core2         7.83/+55%       7.90/8.08       4.35
# Westmere      7.19/+50%       5.60/6.70       3.00
# Sandy Bridge  8.31/+42%       5.45/6.76       2.72
# Ivy Bridge    6.71/+46%       5.40/6.49       2.41
# Haswell       5.92/+43%       5.20/6.45       2.42    1.23
# Silvermont    12.0/+33%       7.75/7.40       7.03(iii)
# Sledgehammer  7.28/+52%       -/14.2(ii)      -
# Bulldozer     9.66/+28%       9.85/11.1       3.06(iv)
# VIA Nano      10.5/+46%       6.72/8.60       6.05
# (i)   compared to older gcc 3.x, one can observe a >2x improvement;
# (ii)  as can be seen, SSE2 performance is too low on legacy
#       processors; NxSSE2 results are naturally better, but not
#       impressively better than IALU ones, which is why you won't
#       find SSE2 code below;
# (iii) this is not an optimal result for Atom because of MSROM
#       limitations; SSE2 can do better, but the gain is considered too
#       low to justify the [maintenance] effort;
# (iv)  Bulldozer actually executes the 4xXOP code path, which
#       delivers 2.20;
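#
# For reference, all of the code paths below are different SIMD widths of
# the same ChaCha20 quarter-round.  The sketch below is an illustration
# only (it is not used by the generated code) and assumes 32-bit words
# emulated with explicit masking:
#
#	sub ROTL32 { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n))) & 0xffffffff; }
#
#	sub QUARTERROUND {
#	    my ($a,$b,$c,$d)=@_;
#	    $a=($a+$b)&0xffffffff; $d=ROTL32($d^$a,16);
#	    $c=($c+$d)&0xffffffff; $b=ROTL32($b^$c,12);
#	    $a=($a+$b)&0xffffffff; $d=ROTL32($d^$a, 8);
#	    $c=($c+$d)&0xffffffff; $b=ROTL32($b^$c, 7);
#	    ($a,$b,$c,$d);
#	}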
49 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56 die "can't locate x86_64-xlate.pl";
58 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60 $avx = ($1>=2.19) + ($1>=2.22);
63 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.09) + ($1>=2.10);
68 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70 $avx = ($1>=10) + ($1>=11);
73 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
74 $avx = ($2>=3.0) + ($2>3.0);
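# ($avx ends up as 0, 1 or 2: 0 means the assembler can't be trusted with
# AVX/XOP instructions at all, 1 enables the XOP code path below, and 2
# additionally enables the AVX2 code path.)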
77 open OUT,"| \"$^X\" $xlate $flavour $output";
80 # input parameter block
81 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
86 .extern OPENSSL_ia32cap_P
100 .long 8,8,8,8,8,8,8,8
102 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
104 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
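# (the two .byte tables above are pshufb masks: the first rotates each
#  32-bit lane left by 16 bits, the second left by 8 bits; they cover the
#  ChaCha rotations by 16 and 8, while the 12- and 7-bit rotations are
#  done with shift/shift/or, or with a single vprotd on XOP)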
106 .asciz "expand 32-byte k"
107 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
110 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
111 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
113 $arg = "\$$arg" if ($arg*1 eq $arg);
114 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
117 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
118 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
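# (the four "%nox" entries are deliberate placeholders: state words 8-11,
# the 'c' row, never live in general-purpose registers and are kept on the
# stack instead; see the ROUND comments below)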
121 sub ROUND { # critical path is 24 cycles per round
122 my ($a0,$b0,$c0,$d0)=@_;
123 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
124 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
125 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
126 my ($xc,$xc_)=map("\"$_\"",@t);
127 my @x=map("\"$_\"",@x);
# Consider the order in which the variables are addressed by their
# index:
#
#	a   b   c   d
#
#	0   4   8  12 < even round
#	1   5   9  13
#	2   6  10  14
#	3   7  11  15
#	0   5  10  15 < odd round
#	1   6  11  12
#	2   7   8  13
#	3   4   9  14
#
# The 'a', 'b' and 'd' values are permanently allocated in registers,
# @x[0..7,12..15], while the 'c' values are maintained in memory. If
# you look at the 'c' column, you'll notice that a pair of 'c's is
# invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see a
# bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
#
# Normally instructions would be interleaved to favour in-order
# execution. Out-of-order cores generally manage it gracefully,
# but not this time, for some reason. As in-order cores are a dying
# breed and old Atom is the only one around, the instructions are
# left uninterleaved. Besides, Atom is better off executing the
# 1xSSSE3 code anyway...
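#
# As a concrete expansion of the index arithmetic above:
# &ROUND(0,4,8,12) emits the four column quarter-rounds on
# (0,4,8,12), (1,5,9,13), (2,6,10,14), (3,7,11,15), while
# &ROUND(0,5,10,15) emits the four diagonal quarter-rounds on
# (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).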
159 "&add (@x[$a0],@x[$b0])", # Q1
160 "&xor (@x[$d0],@x[$a0])",
162 "&add (@x[$a1],@x[$b1])", # Q2
163 "&xor (@x[$d1],@x[$a1])",
166 "&add ($xc,@x[$d0])",
167 "&xor (@x[$b0],$xc)",
169 "&add ($xc_,@x[$d1])",
170 "&xor (@x[$b1],$xc_)",
173 "&add (@x[$a0],@x[$b0])",
174 "&xor (@x[$d0],@x[$a0])",
176 "&add (@x[$a1],@x[$b1])",
177 "&xor (@x[$d1],@x[$a1])",
180 "&add ($xc,@x[$d0])",
181 "&xor (@x[$b0],$xc)",
183 "&add ($xc_,@x[$d1])",
184 "&xor (@x[$b1],$xc_)",
187 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
188 "&mov (\"4*$c1(%rsp)\",$xc_)",
189 "&mov ($xc,\"4*$c2(%rsp)\")",
190 "&mov ($xc_,\"4*$c3(%rsp)\")",
192 "&add (@x[$a2],@x[$b2])", # Q3
193 "&xor (@x[$d2],@x[$a2])",
195 "&add (@x[$a3],@x[$b3])", # Q4
196 "&xor (@x[$d3],@x[$a3])",
199 "&add ($xc,@x[$d2])",
200 "&xor (@x[$b2],$xc)",
202 "&add ($xc_,@x[$d3])",
203 "&xor (@x[$b3],$xc_)",
206 "&add (@x[$a2],@x[$b2])",
207 "&xor (@x[$d2],@x[$a2])",
209 "&add (@x[$a3],@x[$b3])",
210 "&xor (@x[$d3],@x[$a3])",
213 "&add ($xc,@x[$d2])",
214 "&xor (@x[$b2],$xc)",
216 "&add ($xc_,@x[$d3])",
217 "&xor (@x[$b3],$xc_)",
222 ########################################################################
223 # Generic code path that handles all lengths on pre-SSSE3 processors.
225 .globl ChaCha20_ctr32
226 .type ChaCha20_ctr32,\@function,5
231 mov OPENSSL_ia32cap_P+4(%rip),%r10
232 test \$`1<<(41-32)`,%r10d
243 #movdqa .Lsigma(%rip),%xmm0
245 movdqu 16($key),%xmm2
246 movdqu ($counter),%xmm3
247 movdqa .Lone(%rip),%xmm4
249 #movdqa %xmm0,4*0(%rsp) # key[0]
250 movdqa %xmm1,4*4(%rsp) # key[1]
251 movdqa %xmm2,4*8(%rsp) # key[2]
252 movdqa %xmm3,4*12(%rsp) # key[3]
253 mov $len,%rbp # reassign $len
258 mov \$0x61707865,@x[0] # 'expa'
259 mov \$0x3320646e,@x[1] # 'nd 3'
260 mov \$0x79622d32,@x[2] # '2-by'
261 mov \$0x6b206574,@x[3] # 'te k'
267 mov 4*13(%rsp),@x[13]
268 mov 4*14(%rsp),@x[14]
269 mov 4*15(%rsp),@x[15]
271 mov %rbp,64+0(%rsp) # save len
273 mov $inp,64+8(%rsp) # save inp
274 movq %xmm2,%rsi # "@x[8]"
275 mov $out,64+16(%rsp) # save out
277 shr \$32,%rdi # "@x[9]"
283 foreach (&ROUND (0, 4, 8,12)) { eval; }
284 foreach (&ROUND (0, 5,10,15)) { eval; }
289 mov @t[1],4*9(%rsp) # modulo-scheduled
291 mov 64(%rsp),%rbp # load len
293 mov 64+8(%rsp),$inp # load inp
294 paddd %xmm4,%xmm3 # increment counter
295 mov 64+16(%rsp),$out # load out
297 add \$0x61707865,@x[0] # 'expa'
298 add \$0x3320646e,@x[1] # 'nd 3'
299 add \$0x79622d32,@x[2] # '2-by'
300 add \$0x6b206574,@x[3] # 'te k'
305 add 4*12(%rsp),@x[12]
306 add 4*13(%rsp),@x[13]
307 add 4*14(%rsp),@x[14]
308 add 4*15(%rsp),@x[15]
309 paddd 4*8(%rsp),%xmm1
314 xor 4*0($inp),@x[0] # xor with input
322 movdqu 4*8($inp),%xmm0
323 xor 4*12($inp),@x[12]
324 xor 4*13($inp),@x[13]
325 xor 4*14($inp),@x[14]
326 xor 4*15($inp),@x[15]
327 lea 4*16($inp),$inp # inp+=64
330 movdqa %xmm2,4*8(%rsp)
331 movd %xmm3,4*12(%rsp)
333 mov @x[0],4*0($out) # write output
341 movdqu %xmm0,4*8($out)
342 mov @x[12],4*12($out)
343 mov @x[13],4*13($out)
344 mov @x[14],4*14($out)
345 mov @x[15],4*15($out)
346 lea 4*16($out),$out # out+=64
364 movdqa %xmm1,4*8(%rsp)
365 mov @x[12],4*12(%rsp)
366 mov @x[13],4*13(%rsp)
367 mov @x[14],4*14(%rsp)
368 mov @x[15],4*15(%rsp)
371 movzb ($inp,%rbx),%eax
372 movzb (%rsp,%rbx),%edx
375 mov %al,-1($out,%rbx)
389 .size ChaCha20_ctr32,.-ChaCha20_ctr32
392 ########################################################################
393 # SSSE3 code path that handles shorter lengths
395 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
397 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
421 my $xframe = $win64 ? 32+32+8 : 24;
424 .type ChaCha20_ssse3,\@function,5
429 $code.=<<___ if ($avx);
430 test \$`1<<(43-32)`,%r10d
	jnz	.LChaCha20_4xop		# XOP is fastest even if we use only 1/4 of it
434 cmp \$128,$len # we might throw away some data,
435 ja .LChaCha20_4x # but overall it won't be slower
445 sub \$64+$xframe,%rsp
447 $code.=<<___ if ($win64);
448 movaps %xmm6,64+32(%rsp)
449 movaps %xmm7,64+48(%rsp)
452 movdqa .Lsigma(%rip),$a
456 movdqa .Lrot16(%rip),$rot16
457 movdqa .Lrot24(%rip),$rot24
468 movdqa .Lone(%rip),$d
481 &pshufd ($c,$c,0b01001110);
482 &pshufd ($b,$b,0b00111001);
483 &pshufd ($d,$d,0b10010011);
487 &pshufd ($c,$c,0b01001110);
488 &pshufd ($b,$b,0b10010011);
489 &pshufd ($d,$d,0b00111001);
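# the first group of pshufds rotated the 'b', 'c' and 'd' lanes so that
# the very same quarter-round code processed the diagonals; this second
# group rotates them back to column order before the next iteration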
492 &jnz (".Loop_ssse3");
504 movdqu 0x10($inp),$t1
505 pxor $t,$a # xor with input
508 movdqu 0x30($inp),$t1
509 lea 0x40($inp),$inp # inp+=64
513 movdqu $a,0x00($out) # write output
517 lea 0x40($out),$out # out+=64
520 jnz .Loop_outer_ssse3
533 movzb ($inp,%rbx),%eax
534 movzb (%rsp,%rbx),%ecx
537 mov %al,-1($out,%rbx)
543 $code.=<<___ if ($win64);
544 movaps 64+32(%rsp),%xmm6
545 movaps 64+48(%rsp),%xmm7
548 add \$64+$xframe,%rsp
556 .size ChaCha20_ssse3,.-ChaCha20_ssse3
560 ########################################################################
561 # SSSE3 code path that handles longer messages.
563 # assign variables to favor Atom front-end
564 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
565 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
566 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
567 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
569 sub SSSE3_lane_ROUND {
570 my ($a0,$b0,$c0,$d0)=@_;
571 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
572 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
573 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
574 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
575 my @x=map("\"$_\"",@xx);
# Consider the order in which the variables are addressed by their
# index (same pattern as in ROUND above):
#
#	0   4   8  12 < even round
#	0   5  10  15 < odd round
#
# The 'a', 'b' and 'd' values are permanently allocated in registers,
# @x[0..7,12..15], while the 'c' values are maintained in memory. If
# you look at the 'c' column, you'll notice that a pair of 'c's is
# invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see a
# bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
600 "&paddd (@x[$a0],@x[$b0])", # Q1
601 "&paddd (@x[$a1],@x[$b1])", # Q2
602 "&pxor (@x[$d0],@x[$a0])",
603 "&pxor (@x[$d1],@x[$a1])",
604 "&pshufb (@x[$d0],$t1)",
605 "&pshufb (@x[$d1],$t1)",
607 "&paddd ($xc,@x[$d0])",
608 "&paddd ($xc_,@x[$d1])",
609 "&pxor (@x[$b0],$xc)",
610 "&pxor (@x[$b1],$xc_)",
611 "&movdqa ($t0,@x[$b0])",
612 "&pslld (@x[$b0],12)",
614 "&movdqa ($t1,@x[$b1])",
615 "&pslld (@x[$b1],12)",
616 "&por (@x[$b0],$t0)",
618 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
619 "&por (@x[$b1],$t1)",
621 "&paddd (@x[$a0],@x[$b0])",
622 "&paddd (@x[$a1],@x[$b1])",
623 "&pxor (@x[$d0],@x[$a0])",
624 "&pxor (@x[$d1],@x[$a1])",
625 "&pshufb (@x[$d0],$t0)",
626 "&pshufb (@x[$d1],$t0)",
628 "&paddd ($xc,@x[$d0])",
629 "&paddd ($xc_,@x[$d1])",
630 "&pxor (@x[$b0],$xc)",
631 "&pxor (@x[$b1],$xc_)",
632 "&movdqa ($t1,@x[$b0])",
633 "&pslld (@x[$b0],7)",
635 "&movdqa ($t0,@x[$b1])",
636 "&pslld (@x[$b1],7)",
637 "&por (@x[$b0],$t1)",
639 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
640 "&por (@x[$b1],$t0)",
642 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
643 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
644 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
645 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
647 "&paddd (@x[$a2],@x[$b2])", # Q3
648 "&paddd (@x[$a3],@x[$b3])", # Q4
649 "&pxor (@x[$d2],@x[$a2])",
650 "&pxor (@x[$d3],@x[$a3])",
651 "&pshufb (@x[$d2],$t1)",
652 "&pshufb (@x[$d3],$t1)",
654 "&paddd ($xc,@x[$d2])",
655 "&paddd ($xc_,@x[$d3])",
656 "&pxor (@x[$b2],$xc)",
657 "&pxor (@x[$b3],$xc_)",
658 "&movdqa ($t0,@x[$b2])",
659 "&pslld (@x[$b2],12)",
661 "&movdqa ($t1,@x[$b3])",
662 "&pslld (@x[$b3],12)",
663 "&por (@x[$b2],$t0)",
665 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
666 "&por (@x[$b3],$t1)",
668 "&paddd (@x[$a2],@x[$b2])",
669 "&paddd (@x[$a3],@x[$b3])",
670 "&pxor (@x[$d2],@x[$a2])",
671 "&pxor (@x[$d3],@x[$a3])",
672 "&pshufb (@x[$d2],$t0)",
673 "&pshufb (@x[$d3],$t0)",
675 "&paddd ($xc,@x[$d2])",
676 "&paddd ($xc_,@x[$d3])",
677 "&pxor (@x[$b2],$xc)",
678 "&pxor (@x[$b3],$xc_)",
679 "&movdqa ($t1,@x[$b2])",
680 "&pslld (@x[$b2],7)",
682 "&movdqa ($t0,@x[$b3])",
683 "&pslld (@x[$b3],7)",
684 "&por (@x[$b2],$t1)",
686 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
691 my $xframe = $win64 ? 0xa0 : 0;
694 .type ChaCha20_4x,\@function,5
700 $code.=<<___ if ($avx>1);
701 shr \$32,%r10 # OPENSSL_ia32cap_P+8
702 test \$`1<<5`,%r10 # test AVX2
709 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
710 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
711 je .Ldo_sse3_after_all # to detect Atom
715 sub \$0x148+$xframe,%rsp
717 ################ stack layout
718 # +0x00 SIMD equivalent of @x[8-12]
720 # +0x40 constant copy of key[0-2] smashed by lanes
722 # +0x100 SIMD counters (with nonce smashed by lanes)
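# Throughout the 4x loop, each 16-byte register or stack slot holds the
# same state word for four independent blocks, one block per 32-bit lane.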
725 $code.=<<___ if ($win64);
726 movaps %xmm6,-0x30(%r11)
727 movaps %xmm7,-0x20(%r11)
728 movaps %xmm8,-0x10(%r11)
729 movaps %xmm9,0x00(%r11)
730 movaps %xmm10,0x10(%r11)
731 movaps %xmm11,0x20(%r11)
732 movaps %xmm12,0x30(%r11)
733 movaps %xmm13,0x40(%r11)
734 movaps %xmm14,0x50(%r11)
735 movaps %xmm15,0x60(%r11)
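# (Win64 ABI: xmm6-xmm15 are non-volatile, hence the spills above and the
#  matching restores in the epilogue)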
738 movdqa .Lsigma(%rip),$xa3 # key[0]
739 movdqu ($key),$xb3 # key[1]
740 movdqu 16($key),$xt3 # key[2]
741 movdqu ($counter),$xd3 # key[3]
742 lea 0x100(%rsp),%rcx # size optimization
743 lea .Lrot16(%rip),%r10
744 lea .Lrot24(%rip),%r11
746 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
747 pshufd \$0x55,$xa3,$xa1
748 movdqa $xa0,0x40(%rsp) # ... and offload
749 pshufd \$0xaa,$xa3,$xa2
750 movdqa $xa1,0x50(%rsp)
751 pshufd \$0xff,$xa3,$xa3
752 movdqa $xa2,0x60(%rsp)
753 movdqa $xa3,0x70(%rsp)
755 pshufd \$0x00,$xb3,$xb0
756 pshufd \$0x55,$xb3,$xb1
757 movdqa $xb0,0x80-0x100(%rcx)
758 pshufd \$0xaa,$xb3,$xb2
759 movdqa $xb1,0x90-0x100(%rcx)
760 pshufd \$0xff,$xb3,$xb3
761 movdqa $xb2,0xa0-0x100(%rcx)
762 movdqa $xb3,0xb0-0x100(%rcx)
764 pshufd \$0x00,$xt3,$xt0 # "$xc0"
765 pshufd \$0x55,$xt3,$xt1 # "$xc1"
766 movdqa $xt0,0xc0-0x100(%rcx)
767 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
768 movdqa $xt1,0xd0-0x100(%rcx)
769 pshufd \$0xff,$xt3,$xt3 # "$xc3"
770 movdqa $xt2,0xe0-0x100(%rcx)
771 movdqa $xt3,0xf0-0x100(%rcx)
773 pshufd \$0x00,$xd3,$xd0
774 pshufd \$0x55,$xd3,$xd1
775 paddd .Linc(%rip),$xd0 # don't save counters yet
776 pshufd \$0xaa,$xd3,$xd2
777 movdqa $xd1,0x110-0x100(%rcx)
778 pshufd \$0xff,$xd3,$xd3
779 movdqa $xd2,0x120-0x100(%rcx)
780 movdqa $xd3,0x130-0x100(%rcx)
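# at this point lane N (0..3) of every register/stack slot holds the state
# word for block N, and .Linc has given each lane its own block counter,
# so each iteration of the loop below produces four 64-byte blocks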
786 movdqa 0x40(%rsp),$xa0 # re-load smashed key
787 movdqa 0x50(%rsp),$xa1
788 movdqa 0x60(%rsp),$xa2
789 movdqa 0x70(%rsp),$xa3
790 movdqa 0x80-0x100(%rcx),$xb0
791 movdqa 0x90-0x100(%rcx),$xb1
792 movdqa 0xa0-0x100(%rcx),$xb2
793 movdqa 0xb0-0x100(%rcx),$xb3
794 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
795 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
796 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
797 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
798 movdqa 0x100-0x100(%rcx),$xd0
799 movdqa 0x110-0x100(%rcx),$xd1
800 movdqa 0x120-0x100(%rcx),$xd2
801 movdqa 0x130-0x100(%rcx),$xd3
802 paddd .Lfour(%rip),$xd0 # next SIMD counters
805 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
806 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
807 movdqa (%r10),$xt3 # .Lrot16(%rip)
809 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
815 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
816 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
821 paddd 0x40(%rsp),$xa0 # accumulate key material
822 paddd 0x50(%rsp),$xa1
823 paddd 0x60(%rsp),$xa2
824 paddd 0x70(%rsp),$xa3
826 movdqa $xa0,$xt2 # "de-interlace" data
833 punpcklqdq $xa2,$xa0 # "a0"
835 punpcklqdq $xt3,$xt2 # "a2"
836 punpckhqdq $xa2,$xa1 # "a1"
837 punpckhqdq $xt3,$xa3 # "a3"
839 ($xa2,$xt2)=($xt2,$xa2);
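# The punpck{l,h}dq/punpck{l,h}qdq sequences transpose the 4x4 matrix of
# 32-bit lanes: each register goes from holding one state word of four
# blocks to holding four consecutive words of a single block, ready to be
# xored against the input stream.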
841 paddd 0x80-0x100(%rcx),$xb0
842 paddd 0x90-0x100(%rcx),$xb1
843 paddd 0xa0-0x100(%rcx),$xb2
844 paddd 0xb0-0x100(%rcx),$xb3
846 movdqa $xa0,0x00(%rsp) # offload $xaN
847 movdqa $xa1,0x10(%rsp)
848 movdqa 0x20(%rsp),$xa0 # "xc2"
849 movdqa 0x30(%rsp),$xa1 # "xc3"
858 punpcklqdq $xb2,$xb0 # "b0"
860 punpcklqdq $xt3,$xt2 # "b2"
861 punpckhqdq $xb2,$xb1 # "b1"
862 punpckhqdq $xt3,$xb3 # "b3"
864 ($xb2,$xt2)=($xt2,$xb2);
865 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
867 paddd 0xc0-0x100(%rcx),$xc0
868 paddd 0xd0-0x100(%rcx),$xc1
869 paddd 0xe0-0x100(%rcx),$xc2
870 paddd 0xf0-0x100(%rcx),$xc3
872 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
873 movdqa $xa3,0x30(%rsp)
882 punpcklqdq $xc2,$xc0 # "c0"
884 punpcklqdq $xt3,$xt2 # "c2"
885 punpckhqdq $xc2,$xc1 # "c1"
886 punpckhqdq $xt3,$xc3 # "c3"
888 ($xc2,$xt2)=($xt2,$xc2);
889 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
891 paddd 0x100-0x100(%rcx),$xd0
892 paddd 0x110-0x100(%rcx),$xd1
893 paddd 0x120-0x100(%rcx),$xd2
894 paddd 0x130-0x100(%rcx),$xd3
903 punpcklqdq $xd2,$xd0 # "d0"
905 punpcklqdq $xt3,$xt2 # "d2"
906 punpckhqdq $xd2,$xd1 # "d1"
907 punpckhqdq $xt3,$xd3 # "d3"
909 ($xd2,$xt2)=($xt2,$xd2);
914 movdqu 0x00($inp),$xt0 # xor with input
915 movdqu 0x10($inp),$xt1
916 movdqu 0x20($inp),$xt2
917 movdqu 0x30($inp),$xt3
918 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
923 movdqu $xt0,0x00($out)
924 movdqu 0x40($inp),$xt0
925 movdqu $xt1,0x10($out)
926 movdqu 0x50($inp),$xt1
927 movdqu $xt2,0x20($out)
928 movdqu 0x60($inp),$xt2
929 movdqu $xt3,0x30($out)
930 movdqu 0x70($inp),$xt3
931 lea 0x80($inp),$inp # size optimization
937 movdqu $xt0,0x40($out)
938 movdqu 0x00($inp),$xt0
939 movdqu $xt1,0x50($out)
940 movdqu 0x10($inp),$xt1
941 movdqu $xt2,0x60($out)
942 movdqu 0x20($inp),$xt2
943 movdqu $xt3,0x70($out)
944 lea 0x80($out),$out # size optimization
945 movdqu 0x30($inp),$xt3
951 movdqu $xt0,0x00($out)
952 movdqu 0x40($inp),$xt0
953 movdqu $xt1,0x10($out)
954 movdqu 0x50($inp),$xt1
955 movdqu $xt2,0x20($out)
956 movdqu 0x60($inp),$xt2
957 movdqu $xt3,0x30($out)
958 movdqu 0x70($inp),$xt3
959 lea 0x80($inp),$inp # inp+=64*4
964 movdqu $xt0,0x40($out)
965 movdqu $xt1,0x50($out)
966 movdqu $xt2,0x60($out)
967 movdqu $xt3,0x70($out)
968 lea 0x80($out),$out # out+=64*4
983 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
985 #movdqa $xt0,0x00(%rsp)
986 movdqa $xb0,0x10(%rsp)
987 movdqa $xc0,0x20(%rsp)
988 movdqa $xd0,0x30(%rsp)
993 movdqu 0x00($inp),$xt0 # xor with input
994 movdqu 0x10($inp),$xt1
995 movdqu 0x20($inp),$xt2
996 movdqu 0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
1001 movdqu $xt0,0x00($out)
1002 movdqu $xt1,0x10($out)
1003 movdqu $xt2,0x20($out)
1004 movdqu $xt3,0x30($out)
1007 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1008 lea 0x40($inp),$inp # inp+=64*1
1010 movdqa $xt0,0x00(%rsp)
1011 movdqa $xb1,0x10(%rsp)
1012 lea 0x40($out),$out # out+=64*1
1013 movdqa $xc1,0x20(%rsp)
1014 sub \$64,$len # len-=64*1
1015 movdqa $xd1,0x30(%rsp)
1020 movdqu 0x00($inp),$xt0 # xor with input
1021 movdqu 0x10($inp),$xt1
1022 movdqu 0x20($inp),$xt2
1023 movdqu 0x30($inp),$xt3
1024 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1029 movdqu $xt0,0x00($out)
1030 movdqu 0x40($inp),$xt0
1031 movdqu $xt1,0x10($out)
1032 movdqu 0x50($inp),$xt1
1033 movdqu $xt2,0x20($out)
1034 movdqu 0x60($inp),$xt2
1035 movdqu $xt3,0x30($out)
1036 movdqu 0x70($inp),$xt3
1037 pxor 0x10(%rsp),$xt0
1041 movdqu $xt0,0x40($out)
1042 movdqu $xt1,0x50($out)
1043 movdqu $xt2,0x60($out)
1044 movdqu $xt3,0x70($out)
1047 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1048 lea 0x80($inp),$inp # inp+=64*2
1050 movdqa $xt0,0x00(%rsp)
1051 movdqa $xb2,0x10(%rsp)
1052 lea 0x80($out),$out # out+=64*2
1053 movdqa $xc2,0x20(%rsp)
1054 sub \$128,$len # len-=64*2
1055 movdqa $xd2,0x30(%rsp)
1060 movdqu 0x00($inp),$xt0 # xor with input
1061 movdqu 0x10($inp),$xt1
1062 movdqu 0x20($inp),$xt2
1063 movdqu 0x30($inp),$xt3
1064 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1069 movdqu $xt0,0x00($out)
1070 movdqu 0x40($inp),$xt0
1071 movdqu $xt1,0x10($out)
1072 movdqu 0x50($inp),$xt1
1073 movdqu $xt2,0x20($out)
1074 movdqu 0x60($inp),$xt2
1075 movdqu $xt3,0x30($out)
1076 movdqu 0x70($inp),$xt3
1077 lea 0x80($inp),$inp # size optimization
1078 pxor 0x10(%rsp),$xt0
1083 movdqu $xt0,0x40($out)
1084 movdqu 0x00($inp),$xt0
1085 movdqu $xt1,0x50($out)
1086 movdqu 0x10($inp),$xt1
1087 movdqu $xt2,0x60($out)
1088 movdqu 0x20($inp),$xt2
1089 movdqu $xt3,0x70($out)
1090 lea 0x80($out),$out # size optimization
1091 movdqu 0x30($inp),$xt3
1092 pxor 0x20(%rsp),$xt0
1096 movdqu $xt0,0x00($out)
1097 movdqu $xt1,0x10($out)
1098 movdqu $xt2,0x20($out)
1099 movdqu $xt3,0x30($out)
1102 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1103 lea 0x40($inp),$inp # inp+=64*3
1105 movdqa $xt0,0x00(%rsp)
1106 movdqa $xb3,0x10(%rsp)
1107 lea 0x40($out),$out # out+=64*3
1108 movdqa $xc3,0x20(%rsp)
1109 sub \$192,$len # len-=64*3
1110 movdqa $xd3,0x30(%rsp)
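# fewer than 64 bytes remain; the last keystream block was stored to the
# stack above, so xor it with the input one byte at a time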
1113 movzb ($inp,%r10),%eax
1114 movzb (%rsp,%r10),%ecx
1117 mov %al,-1($out,%r10)
1123 $code.=<<___ if ($win64);
1124 lea 0x140+0x30(%rsp),%r11
1125 movaps -0x30(%r11),%xmm6
1126 movaps -0x20(%r11),%xmm7
1127 movaps -0x10(%r11),%xmm8
1128 movaps 0x00(%r11),%xmm9
1129 movaps 0x10(%r11),%xmm10
1130 movaps 0x20(%r11),%xmm11
1131 movaps 0x30(%r11),%xmm12
1132 movaps 0x40(%r11),%xmm13
1133 movaps 0x50(%r11),%xmm14
1134 movaps 0x60(%r11),%xmm15
1137 add \$0x148+$xframe,%rsp
1139 .size ChaCha20_4x,.-ChaCha20_4x
1143 ########################################################################
1144 # XOP code path that handles all lengths.
# There is some "anomaly" observed depending on instruction size or
# alignment. If you look closely at the code below, you'll notice that
# the argument order sometimes varies. The order affects the instruction
# encoding by making it larger, and such fiddling gives a 5% performance
# improvement. This is on an FX-4100...
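#
# The XOP-specific advantage used here is vprotd, which performs each of
# the 16/12/8/7-bit rotations in a single instruction instead of the
# pshufb or shift/shift/or sequences needed in the other code paths.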
1152 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1153 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1154 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1155 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1157 sub XOP_lane_ROUND {
1158 my ($a0,$b0,$c0,$d0)=@_;
1159 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1160 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1161 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1162 my @x=map("\"$_\"",@xx);
1165 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1166 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1167 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1168 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1169 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1170 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1171 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1172 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1173 "&vprotd (@x[$d0],@x[$d0],16)",
1174 "&vprotd (@x[$d1],@x[$d1],16)",
1175 "&vprotd (@x[$d2],@x[$d2],16)",
1176 "&vprotd (@x[$d3],@x[$d3],16)",
1178 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1179 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1180 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1181 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1182 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1183 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1184 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1185 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1186 "&vprotd (@x[$b0],@x[$b0],12)",
1187 "&vprotd (@x[$b1],@x[$b1],12)",
1188 "&vprotd (@x[$b2],@x[$b2],12)",
1189 "&vprotd (@x[$b3],@x[$b3],12)",
1191 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1192 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1193 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1194 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1195 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1196 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1197 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1198 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1199 "&vprotd (@x[$d0],@x[$d0],8)",
1200 "&vprotd (@x[$d1],@x[$d1],8)",
1201 "&vprotd (@x[$d2],@x[$d2],8)",
1202 "&vprotd (@x[$d3],@x[$d3],8)",
1204 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1205 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1206 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1207 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1208 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1209 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1210 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1211 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1212 "&vprotd (@x[$b0],@x[$b0],7)",
1213 "&vprotd (@x[$b1],@x[$b1],7)",
1214 "&vprotd (@x[$b2],@x[$b2],7)",
1215 "&vprotd (@x[$b3],@x[$b3],7)"
1219 my $xframe = $win64 ? 0xa0 : 0;
1222 .type ChaCha20_4xop,\@function,5
1226 lea -0x78(%rsp),%r11
1227 sub \$0x148+$xframe,%rsp
1229 ################ stack layout
1230 # +0x00 SIMD equivalent of @x[8-12]
1232 # +0x40 constant copy of key[0-2] smashed by lanes
1234 # +0x100 SIMD counters (with nonce smashed by lanes)
1237 $code.=<<___ if ($win64);
1238 movaps %xmm6,-0x30(%r11)
1239 movaps %xmm7,-0x20(%r11)
1240 movaps %xmm8,-0x10(%r11)
1241 movaps %xmm9,0x00(%r11)
1242 movaps %xmm10,0x10(%r11)
1243 movaps %xmm11,0x20(%r11)
1244 movaps %xmm12,0x30(%r11)
1245 movaps %xmm13,0x40(%r11)
1246 movaps %xmm14,0x50(%r11)
1247 movaps %xmm15,0x60(%r11)
1252 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1253 vmovdqu ($key),$xb3 # key[1]
1254 vmovdqu 16($key),$xt3 # key[2]
1255 vmovdqu ($counter),$xd3 # key[3]
1256 lea 0x100(%rsp),%rcx # size optimization
1258 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1259 vpshufd \$0x55,$xa3,$xa1
1260 vmovdqa $xa0,0x40(%rsp) # ... and offload
1261 vpshufd \$0xaa,$xa3,$xa2
1262 vmovdqa $xa1,0x50(%rsp)
1263 vpshufd \$0xff,$xa3,$xa3
1264 vmovdqa $xa2,0x60(%rsp)
1265 vmovdqa $xa3,0x70(%rsp)
1267 vpshufd \$0x00,$xb3,$xb0
1268 vpshufd \$0x55,$xb3,$xb1
1269 vmovdqa $xb0,0x80-0x100(%rcx)
1270 vpshufd \$0xaa,$xb3,$xb2
1271 vmovdqa $xb1,0x90-0x100(%rcx)
1272 vpshufd \$0xff,$xb3,$xb3
1273 vmovdqa $xb2,0xa0-0x100(%rcx)
1274 vmovdqa $xb3,0xb0-0x100(%rcx)
1276 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1277 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1278 vmovdqa $xt0,0xc0-0x100(%rcx)
1279 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1280 vmovdqa $xt1,0xd0-0x100(%rcx)
1281 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1282 vmovdqa $xt2,0xe0-0x100(%rcx)
1283 vmovdqa $xt3,0xf0-0x100(%rcx)
1285 vpshufd \$0x00,$xd3,$xd0
1286 vpshufd \$0x55,$xd3,$xd1
1287 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1288 vpshufd \$0xaa,$xd3,$xd2
1289 vmovdqa $xd1,0x110-0x100(%rcx)
1290 vpshufd \$0xff,$xd3,$xd3
1291 vmovdqa $xd2,0x120-0x100(%rcx)
1292 vmovdqa $xd3,0x130-0x100(%rcx)
1298 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1299 vmovdqa 0x50(%rsp),$xa1
1300 vmovdqa 0x60(%rsp),$xa2
1301 vmovdqa 0x70(%rsp),$xa3
1302 vmovdqa 0x80-0x100(%rcx),$xb0
1303 vmovdqa 0x90-0x100(%rcx),$xb1
1304 vmovdqa 0xa0-0x100(%rcx),$xb2
1305 vmovdqa 0xb0-0x100(%rcx),$xb3
1306 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1307 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1308 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1309 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1310 vmovdqa 0x100-0x100(%rcx),$xd0
1311 vmovdqa 0x110-0x100(%rcx),$xd1
1312 vmovdqa 0x120-0x100(%rcx),$xd2
1313 vmovdqa 0x130-0x100(%rcx),$xd3
1314 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1318 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1324 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1325 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1330 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1331 vpaddd 0x50(%rsp),$xa1,$xa1
1332 vpaddd 0x60(%rsp),$xa2,$xa2
1333 vpaddd 0x70(%rsp),$xa3,$xa3
1335 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1336 vmovdqa $xt3,0x30(%rsp)
1338 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1339 vpunpckldq $xa3,$xa2,$xt3
1340 vpunpckhdq $xa1,$xa0,$xa0
1341 vpunpckhdq $xa3,$xa2,$xa2
1342 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1343 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1344 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1345 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1347 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1349 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1350 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1351 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1352 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1354 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1355 vmovdqa $xa1,0x10(%rsp)
1356 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1357 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1359 vpunpckldq $xb1,$xb0,$xt2
1360 vpunpckldq $xb3,$xb2,$xt3
1361 vpunpckhdq $xb1,$xb0,$xb0
1362 vpunpckhdq $xb3,$xb2,$xb2
1363 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1364 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1365 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1366 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1368 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1369 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1371 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1372 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1373 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1374 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1376 vpunpckldq $xc1,$xc0,$xt2
1377 vpunpckldq $xc3,$xc2,$xt3
1378 vpunpckhdq $xc1,$xc0,$xc0
1379 vpunpckhdq $xc3,$xc2,$xc2
1380 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1381 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1382 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1383 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1385 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1387 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1388 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1389 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1390 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1392 vpunpckldq $xd1,$xd0,$xt2
1393 vpunpckldq $xd3,$xd2,$xt3
1394 vpunpckhdq $xd1,$xd0,$xd0
1395 vpunpckhdq $xd3,$xd2,$xd2
1396 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1397 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1398 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1399 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1401 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1402 ($xa0,$xa1)=($xt2,$xt3);
1404 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1405 vmovdqa 0x10(%rsp),$xa1
1410 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1411 vpxor 0x10($inp),$xb0,$xb0
1412 vpxor 0x20($inp),$xc0,$xc0
1413 vpxor 0x30($inp),$xd0,$xd0
1414 vpxor 0x40($inp),$xa1,$xa1
1415 vpxor 0x50($inp),$xb1,$xb1
1416 vpxor 0x60($inp),$xc1,$xc1
1417 vpxor 0x70($inp),$xd1,$xd1
1418 lea 0x80($inp),$inp # size optimization
1419 vpxor 0x00($inp),$xa2,$xa2
1420 vpxor 0x10($inp),$xb2,$xb2
1421 vpxor 0x20($inp),$xc2,$xc2
1422 vpxor 0x30($inp),$xd2,$xd2
1423 vpxor 0x40($inp),$xa3,$xa3
1424 vpxor 0x50($inp),$xb3,$xb3
1425 vpxor 0x60($inp),$xc3,$xc3
1426 vpxor 0x70($inp),$xd3,$xd3
1427 lea 0x80($inp),$inp # inp+=64*4
1429 vmovdqu $xa0,0x00($out)
1430 vmovdqu $xb0,0x10($out)
1431 vmovdqu $xc0,0x20($out)
1432 vmovdqu $xd0,0x30($out)
1433 vmovdqu $xa1,0x40($out)
1434 vmovdqu $xb1,0x50($out)
1435 vmovdqu $xc1,0x60($out)
1436 vmovdqu $xd1,0x70($out)
1437 lea 0x80($out),$out # size optimization
1438 vmovdqu $xa2,0x00($out)
1439 vmovdqu $xb2,0x10($out)
1440 vmovdqu $xc2,0x20($out)
1441 vmovdqu $xd2,0x30($out)
1442 vmovdqu $xa3,0x40($out)
1443 vmovdqu $xb3,0x50($out)
1444 vmovdqu $xc3,0x60($out)
1445 vmovdqu $xd3,0x70($out)
1446 lea 0x80($out),$out # out+=64*4
1456 jae .L192_or_more4xop
1458 jae .L128_or_more4xop
1460 jae .L64_or_more4xop
1463 vmovdqa $xa0,0x00(%rsp)
1464 vmovdqa $xb0,0x10(%rsp)
1465 vmovdqa $xc0,0x20(%rsp)
1466 vmovdqa $xd0,0x30(%rsp)
1471 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1472 vpxor 0x10($inp),$xb0,$xb0
1473 vpxor 0x20($inp),$xc0,$xc0
1474 vpxor 0x30($inp),$xd0,$xd0
1475 vmovdqu $xa0,0x00($out)
1476 vmovdqu $xb0,0x10($out)
1477 vmovdqu $xc0,0x20($out)
1478 vmovdqu $xd0,0x30($out)
1481 lea 0x40($inp),$inp # inp+=64*1
1482 vmovdqa $xa1,0x00(%rsp)
1484 vmovdqa $xb1,0x10(%rsp)
1485 lea 0x40($out),$out # out+=64*1
1486 vmovdqa $xc1,0x20(%rsp)
1487 sub \$64,$len # len-=64*1
1488 vmovdqa $xd1,0x30(%rsp)
1493 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1494 vpxor 0x10($inp),$xb0,$xb0
1495 vpxor 0x20($inp),$xc0,$xc0
1496 vpxor 0x30($inp),$xd0,$xd0
1497 vpxor 0x40($inp),$xa1,$xa1
1498 vpxor 0x50($inp),$xb1,$xb1
1499 vpxor 0x60($inp),$xc1,$xc1
1500 vpxor 0x70($inp),$xd1,$xd1
1502 vmovdqu $xa0,0x00($out)
1503 vmovdqu $xb0,0x10($out)
1504 vmovdqu $xc0,0x20($out)
1505 vmovdqu $xd0,0x30($out)
1506 vmovdqu $xa1,0x40($out)
1507 vmovdqu $xb1,0x50($out)
1508 vmovdqu $xc1,0x60($out)
1509 vmovdqu $xd1,0x70($out)
1512 lea 0x80($inp),$inp # inp+=64*2
1513 vmovdqa $xa2,0x00(%rsp)
1515 vmovdqa $xb2,0x10(%rsp)
1516 lea 0x80($out),$out # out+=64*2
1517 vmovdqa $xc2,0x20(%rsp)
1518 sub \$128,$len # len-=64*2
1519 vmovdqa $xd2,0x30(%rsp)
1524 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1525 vpxor 0x10($inp),$xb0,$xb0
1526 vpxor 0x20($inp),$xc0,$xc0
1527 vpxor 0x30($inp),$xd0,$xd0
1528 vpxor 0x40($inp),$xa1,$xa1
1529 vpxor 0x50($inp),$xb1,$xb1
1530 vpxor 0x60($inp),$xc1,$xc1
1531 vpxor 0x70($inp),$xd1,$xd1
1532 lea 0x80($inp),$inp # size optimization
1533 vpxor 0x00($inp),$xa2,$xa2
1534 vpxor 0x10($inp),$xb2,$xb2
1535 vpxor 0x20($inp),$xc2,$xc2
1536 vpxor 0x30($inp),$xd2,$xd2
1538 vmovdqu $xa0,0x00($out)
1539 vmovdqu $xb0,0x10($out)
1540 vmovdqu $xc0,0x20($out)
1541 vmovdqu $xd0,0x30($out)
1542 vmovdqu $xa1,0x40($out)
1543 vmovdqu $xb1,0x50($out)
1544 vmovdqu $xc1,0x60($out)
1545 vmovdqu $xd1,0x70($out)
1546 lea 0x80($out),$out # size optimization
1547 vmovdqu $xa2,0x00($out)
1548 vmovdqu $xb2,0x10($out)
1549 vmovdqu $xc2,0x20($out)
1550 vmovdqu $xd2,0x30($out)
1553 lea 0x40($inp),$inp # inp+=64*3
1554 vmovdqa $xa3,0x00(%rsp)
1556 vmovdqa $xb3,0x10(%rsp)
1557 lea 0x40($out),$out # out+=64*3
1558 vmovdqa $xc3,0x20(%rsp)
1559 sub \$192,$len # len-=64*3
1560 vmovdqa $xd3,0x30(%rsp)
1563 movzb ($inp,%r10),%eax
1564 movzb (%rsp,%r10),%ecx
1567 mov %al,-1($out,%r10)
1574 $code.=<<___ if ($win64);
1575 lea 0x140+0x30(%rsp),%r11
1576 movaps -0x30(%r11),%xmm6
1577 movaps -0x20(%r11),%xmm7
1578 movaps -0x10(%r11),%xmm8
1579 movaps 0x00(%r11),%xmm9
1580 movaps 0x10(%r11),%xmm10
1581 movaps 0x20(%r11),%xmm11
1582 movaps 0x30(%r11),%xmm12
1583 movaps 0x40(%r11),%xmm13
1584 movaps 0x50(%r11),%xmm14
1585 movaps 0x60(%r11),%xmm15
1588 add \$0x148+$xframe,%rsp
1590 .size ChaCha20_4xop,.-ChaCha20_4xop
1594 ########################################################################
1597 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1598 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1599 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1600 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1602 sub AVX2_lane_ROUND {
1603 my ($a0,$b0,$c0,$d0)=@_;
1604 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1605 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1606 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1607 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1608 my @x=map("\"$_\"",@xx);
# Consider the order in which the variables are addressed by their
# index (same pattern as in ROUND above):
#
#	0   4   8  12 < even round
#	0   5  10  15 < odd round
#
# The 'a', 'b' and 'd' values are permanently allocated in registers,
# @x[0..7,12..15], while the 'c' values are maintained in memory. If
# you look at the 'c' column, you'll notice that a pair of 'c's is
# invariant between rounds. This means that we have to reload
# them only once per round, in the middle. This is why you'll see a
# bunch of 'c' stores and loads in the middle, but none at
# the beginning or end.
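#
# As AVX2 has no rotate instruction, the 16- and 8-bit rotations below use
# vpshufb with the broadcast .Lrot16/.Lrot24 masks, while the 12- and
# 7-bit ones fall back to vpslld/vpsrld/vpor.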
1633 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1634 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1635 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1636 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1637 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1638 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1640 "&vpaddd ($xc,$xc,@x[$d0])",
1641 "&vpxor (@x[$b0],$xc,@x[$b0])",
1642 "&vpslld ($t0,@x[$b0],12)",
1643 "&vpsrld (@x[$b0],@x[$b0],20)",
1644 "&vpor (@x[$b0],$t0,@x[$b0])",
1645 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1646 "&vpaddd ($xc_,$xc_,@x[$d1])",
1647 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1648 "&vpslld ($t1,@x[$b1],12)",
1649 "&vpsrld (@x[$b1],@x[$b1],20)",
1650 "&vpor (@x[$b1],$t1,@x[$b1])",
1652 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1653 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1654 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1655 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1656 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1657 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1659 "&vpaddd ($xc,$xc,@x[$d0])",
1660 "&vpxor (@x[$b0],$xc,@x[$b0])",
1661 "&vpslld ($t1,@x[$b0],7)",
1662 "&vpsrld (@x[$b0],@x[$b0],25)",
1663 "&vpor (@x[$b0],$t1,@x[$b0])",
1664 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1665 "&vpaddd ($xc_,$xc_,@x[$d1])",
1666 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1667 "&vpslld ($t0,@x[$b1],7)",
1668 "&vpsrld (@x[$b1],@x[$b1],25)",
1669 "&vpor (@x[$b1],$t0,@x[$b1])",
1671 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1672 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1673 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1674 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1676 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1677 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1678 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1679 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1680 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1681 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1683 "&vpaddd ($xc,$xc,@x[$d2])",
1684 "&vpxor (@x[$b2],$xc,@x[$b2])",
1685 "&vpslld ($t0,@x[$b2],12)",
1686 "&vpsrld (@x[$b2],@x[$b2],20)",
1687 "&vpor (@x[$b2],$t0,@x[$b2])",
1688 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1689 "&vpaddd ($xc_,$xc_,@x[$d3])",
1690 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1691 "&vpslld ($t1,@x[$b3],12)",
1692 "&vpsrld (@x[$b3],@x[$b3],20)",
1693 "&vpor (@x[$b3],$t1,@x[$b3])",
1695 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1696 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1697 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1698 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1699 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1700 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1702 "&vpaddd ($xc,$xc,@x[$d2])",
1703 "&vpxor (@x[$b2],$xc,@x[$b2])",
1704 "&vpslld ($t1,@x[$b2],7)",
1705 "&vpsrld (@x[$b2],@x[$b2],25)",
1706 "&vpor (@x[$b2],$t1,@x[$b2])",
1707 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1708 "&vpaddd ($xc_,$xc_,@x[$d3])",
1709 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1710 "&vpslld ($t0,@x[$b3],7)",
1711 "&vpsrld (@x[$b3],@x[$b3],25)",
1712 "&vpor (@x[$b3],$t0,@x[$b3])"
1716 my $xframe = $win64 ? 0xb0 : 8;
1719 .type ChaCha20_8x,\@function,5
1724 sub \$0x280+$xframe,%rsp
1727 $code.=<<___ if ($win64);
1728 lea 0x290+0x30(%rsp),%r11
1729 movaps %xmm6,-0x30(%r11)
1730 movaps %xmm7,-0x20(%r11)
1731 movaps %xmm8,-0x10(%r11)
1732 movaps %xmm9,0x00(%r11)
1733 movaps %xmm10,0x10(%r11)
1734 movaps %xmm11,0x20(%r11)
1735 movaps %xmm12,0x30(%r11)
1736 movaps %xmm13,0x40(%r11)
1737 movaps %xmm14,0x50(%r11)
1738 movaps %xmm15,0x60(%r11)
1742 mov %r10,0x280(%rsp)
1744 ################ stack layout
1745 # +0x00 SIMD equivalent of @x[8-12]
1747 # +0x80 constant copy of key[0-2] smashed by lanes
1749 # +0x200 SIMD counters (with nonce smashed by lanes)
1753 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1754 vbroadcasti128 ($key),$xb3 # key[1]
1755 vbroadcasti128 16($key),$xt3 # key[2]
1756 vbroadcasti128 ($counter),$xd3 # key[3]
1757 lea 0x100(%rsp),%rcx # size optimization
1758 lea 0x200(%rsp),%rax # size optimization
1759 lea .Lrot16(%rip),%r10
1760 lea .Lrot24(%rip),%r11
1762 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1763 vpshufd \$0x55,$xa3,$xa1
1764 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1765 vpshufd \$0xaa,$xa3,$xa2
1766 vmovdqa $xa1,0xa0-0x100(%rcx)
1767 vpshufd \$0xff,$xa3,$xa3
1768 vmovdqa $xa2,0xc0-0x100(%rcx)
1769 vmovdqa $xa3,0xe0-0x100(%rcx)
1771 vpshufd \$0x00,$xb3,$xb0
1772 vpshufd \$0x55,$xb3,$xb1
1773 vmovdqa $xb0,0x100-0x100(%rcx)
1774 vpshufd \$0xaa,$xb3,$xb2
1775 vmovdqa $xb1,0x120-0x100(%rcx)
1776 vpshufd \$0xff,$xb3,$xb3
1777 vmovdqa $xb2,0x140-0x100(%rcx)
1778 vmovdqa $xb3,0x160-0x100(%rcx)
1780 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1781 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1782 vmovdqa $xt0,0x180-0x200(%rax)
1783 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1784 vmovdqa $xt1,0x1a0-0x200(%rax)
1785 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1786 vmovdqa $xt2,0x1c0-0x200(%rax)
1787 vmovdqa $xt3,0x1e0-0x200(%rax)
1789 vpshufd \$0x00,$xd3,$xd0
1790 vpshufd \$0x55,$xd3,$xd1
1791 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1792 vpshufd \$0xaa,$xd3,$xd2
1793 vmovdqa $xd1,0x220-0x200(%rax)
1794 vpshufd \$0xff,$xd3,$xd3
1795 vmovdqa $xd2,0x240-0x200(%rax)
1796 vmovdqa $xd3,0x260-0x200(%rax)
1802 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1803 vmovdqa 0xa0-0x100(%rcx),$xa1
1804 vmovdqa 0xc0-0x100(%rcx),$xa2
1805 vmovdqa 0xe0-0x100(%rcx),$xa3
1806 vmovdqa 0x100-0x100(%rcx),$xb0
1807 vmovdqa 0x120-0x100(%rcx),$xb1
1808 vmovdqa 0x140-0x100(%rcx),$xb2
1809 vmovdqa 0x160-0x100(%rcx),$xb3
1810 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1811 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1812 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1813 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1814 vmovdqa 0x200-0x200(%rax),$xd0
1815 vmovdqa 0x220-0x200(%rax),$xd1
1816 vmovdqa 0x240-0x200(%rax),$xd2
1817 vmovdqa 0x260-0x200(%rax),$xd3
1818 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1821 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1822 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1823 vbroadcasti128 (%r10),$xt3
1824 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1831 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1832 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1837 lea 0x200(%rsp),%rax # size optimization
1838 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1839 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1840 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1841 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1843 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1844 vpunpckldq $xa3,$xa2,$xt3
1845 vpunpckhdq $xa1,$xa0,$xa0
1846 vpunpckhdq $xa3,$xa2,$xa2
1847 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1848 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1849 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1850 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1852 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1854 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1855 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1856 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1857 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1859 vpunpckldq $xb1,$xb0,$xt2
1860 vpunpckldq $xb3,$xb2,$xt3
1861 vpunpckhdq $xb1,$xb0,$xb0
1862 vpunpckhdq $xb3,$xb2,$xb2
1863 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1864 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1865 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1866 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1868 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1870 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1871 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1872 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1873 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1874 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1875 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1876 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1877 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1879 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
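# The dword/qword unpacks above transpose only within each 128-bit half of
# the ymm registers; the vperm2i128 steps recombine matching halves so
# that every ymm finally holds 32 contiguous keystream bytes belonging to
# a single block.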
1880 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1882 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1883 vmovdqa $xa1,0x20(%rsp)
1884 vmovdqa 0x40(%rsp),$xc2 # $xa0
1885 vmovdqa 0x60(%rsp),$xc3 # $xa1
1887 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1888 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1889 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1890 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1892 vpunpckldq $xc1,$xc0,$xt2
1893 vpunpckldq $xc3,$xc2,$xt3
1894 vpunpckhdq $xc1,$xc0,$xc0
1895 vpunpckhdq $xc3,$xc2,$xc2
1896 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1897 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1898 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1899 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1901 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1903 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1904 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1905 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1906 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1908 vpunpckldq $xd1,$xd0,$xt2
1909 vpunpckldq $xd3,$xd2,$xt3
1910 vpunpckhdq $xd1,$xd0,$xd0
1911 vpunpckhdq $xd3,$xd2,$xd2
1912 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1913 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1914 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1915 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1917 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1919 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1920 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1921 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1922 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1923 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1924 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1925 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1926 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1928 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1929 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1930 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1931 ($xa0,$xa1)=($xt2,$xt3);
1933 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1934 vmovdqa 0x20(%rsp),$xa1
1939 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1940 vpxor 0x20($inp),$xb0,$xb0
1941 vpxor 0x40($inp),$xc0,$xc0
1942 vpxor 0x60($inp),$xd0,$xd0
1943 lea 0x80($inp),$inp # size optimization
1944 vmovdqu $xa0,0x00($out)
1945 vmovdqu $xb0,0x20($out)
1946 vmovdqu $xc0,0x40($out)
1947 vmovdqu $xd0,0x60($out)
1948 lea 0x80($out),$out # size optimization
1950 vpxor 0x00($inp),$xa1,$xa1
1951 vpxor 0x20($inp),$xb1,$xb1
1952 vpxor 0x40($inp),$xc1,$xc1
1953 vpxor 0x60($inp),$xd1,$xd1
1954 lea 0x80($inp),$inp # size optimization
1955 vmovdqu $xa1,0x00($out)
1956 vmovdqu $xb1,0x20($out)
1957 vmovdqu $xc1,0x40($out)
1958 vmovdqu $xd1,0x60($out)
1959 lea 0x80($out),$out # size optimization
1961 vpxor 0x00($inp),$xa2,$xa2
1962 vpxor 0x20($inp),$xb2,$xb2
1963 vpxor 0x40($inp),$xc2,$xc2
1964 vpxor 0x60($inp),$xd2,$xd2
1965 lea 0x80($inp),$inp # size optimization
1966 vmovdqu $xa2,0x00($out)
1967 vmovdqu $xb2,0x20($out)
1968 vmovdqu $xc2,0x40($out)
1969 vmovdqu $xd2,0x60($out)
1970 lea 0x80($out),$out # size optimization
1972 vpxor 0x00($inp),$xa3,$xa3
1973 vpxor 0x20($inp),$xb3,$xb3
1974 vpxor 0x40($inp),$xc3,$xc3
1975 vpxor 0x60($inp),$xd3,$xd3
1976 lea 0x80($inp),$inp # size optimization
1977 vmovdqu $xa3,0x00($out)
1978 vmovdqu $xb3,0x20($out)
1979 vmovdqu $xc3,0x40($out)
1980 vmovdqu $xd3,0x60($out)
1981 lea 0x80($out),$out # size optimization
2005 vmovdqa $xa0,0x00(%rsp)
2006 vmovdqa $xb0,0x20(%rsp)
2011 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2012 vpxor 0x20($inp),$xb0,$xb0
2013 vmovdqu $xa0,0x00($out)
2014 vmovdqu $xb0,0x20($out)
2017 lea 0x40($inp),$inp # inp+=64*1
2019 vmovdqa $xc0,0x00(%rsp)
2020 lea 0x40($out),$out # out+=64*1
2021 sub \$64,$len # len-=64*1
2022 vmovdqa $xd0,0x20(%rsp)
2027 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2028 vpxor 0x20($inp),$xb0,$xb0
2029 vpxor 0x40($inp),$xc0,$xc0
2030 vpxor 0x60($inp),$xd0,$xd0
2031 vmovdqu $xa0,0x00($out)
2032 vmovdqu $xb0,0x20($out)
2033 vmovdqu $xc0,0x40($out)
2034 vmovdqu $xd0,0x60($out)
2037 lea 0x80($inp),$inp # inp+=64*2
2039 vmovdqa $xa1,0x00(%rsp)
2040 lea 0x80($out),$out # out+=64*2
2041 sub \$128,$len # len-=64*2
2042 vmovdqa $xb1,0x20(%rsp)
2047 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2048 vpxor 0x20($inp),$xb0,$xb0
2049 vpxor 0x40($inp),$xc0,$xc0
2050 vpxor 0x60($inp),$xd0,$xd0
2051 vpxor 0x80($inp),$xa1,$xa1
2052 vpxor 0xa0($inp),$xb1,$xb1
2053 vmovdqu $xa0,0x00($out)
2054 vmovdqu $xb0,0x20($out)
2055 vmovdqu $xc0,0x40($out)
2056 vmovdqu $xd0,0x60($out)
2057 vmovdqu $xa1,0x80($out)
2058 vmovdqu $xb1,0xa0($out)
2061 lea 0xc0($inp),$inp # inp+=64*3
2063 vmovdqa $xc1,0x00(%rsp)
2064 lea 0xc0($out),$out # out+=64*3
2065 sub \$192,$len # len-=64*3
2066 vmovdqa $xd1,0x20(%rsp)
2071 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2072 vpxor 0x20($inp),$xb0,$xb0
2073 vpxor 0x40($inp),$xc0,$xc0
2074 vpxor 0x60($inp),$xd0,$xd0
2075 vpxor 0x80($inp),$xa1,$xa1
2076 vpxor 0xa0($inp),$xb1,$xb1
2077 vpxor 0xc0($inp),$xc1,$xc1
2078 vpxor 0xe0($inp),$xd1,$xd1
2079 vmovdqu $xa0,0x00($out)
2080 vmovdqu $xb0,0x20($out)
2081 vmovdqu $xc0,0x40($out)
2082 vmovdqu $xd0,0x60($out)
2083 vmovdqu $xa1,0x80($out)
2084 vmovdqu $xb1,0xa0($out)
2085 vmovdqu $xc1,0xc0($out)
2086 vmovdqu $xd1,0xe0($out)
2089 lea 0x100($inp),$inp # inp+=64*4
2091 vmovdqa $xa2,0x00(%rsp)
2092 lea 0x100($out),$out # out+=64*4
2093 sub \$256,$len # len-=64*4
2094 vmovdqa $xb2,0x20(%rsp)
2099 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2100 vpxor 0x20($inp),$xb0,$xb0
2101 vpxor 0x40($inp),$xc0,$xc0
2102 vpxor 0x60($inp),$xd0,$xd0
2103 vpxor 0x80($inp),$xa1,$xa1
2104 vpxor 0xa0($inp),$xb1,$xb1
2105 vpxor 0xc0($inp),$xc1,$xc1
2106 vpxor 0xe0($inp),$xd1,$xd1
2107 vpxor 0x100($inp),$xa2,$xa2
2108 vpxor 0x120($inp),$xb2,$xb2
2109 vmovdqu $xa0,0x00($out)
2110 vmovdqu $xb0,0x20($out)
2111 vmovdqu $xc0,0x40($out)
2112 vmovdqu $xd0,0x60($out)
2113 vmovdqu $xa1,0x80($out)
2114 vmovdqu $xb1,0xa0($out)
2115 vmovdqu $xc1,0xc0($out)
2116 vmovdqu $xd1,0xe0($out)
2117 vmovdqu $xa2,0x100($out)
2118 vmovdqu $xb2,0x120($out)
2121 lea 0x140($inp),$inp # inp+=64*5
2123 vmovdqa $xc2,0x00(%rsp)
2124 lea 0x140($out),$out # out+=64*5
2125 sub \$320,$len # len-=64*5
2126 vmovdqa $xd2,0x20(%rsp)
2131 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2132 vpxor 0x20($inp),$xb0,$xb0
2133 vpxor 0x40($inp),$xc0,$xc0
2134 vpxor 0x60($inp),$xd0,$xd0
2135 vpxor 0x80($inp),$xa1,$xa1
2136 vpxor 0xa0($inp),$xb1,$xb1
2137 vpxor 0xc0($inp),$xc1,$xc1
2138 vpxor 0xe0($inp),$xd1,$xd1
2139 vpxor 0x100($inp),$xa2,$xa2
2140 vpxor 0x120($inp),$xb2,$xb2
2141 vpxor 0x140($inp),$xc2,$xc2
2142 vpxor 0x160($inp),$xd2,$xd2
2143 vmovdqu $xa0,0x00($out)
2144 vmovdqu $xb0,0x20($out)
2145 vmovdqu $xc0,0x40($out)
2146 vmovdqu $xd0,0x60($out)
2147 vmovdqu $xa1,0x80($out)
2148 vmovdqu $xb1,0xa0($out)
2149 vmovdqu $xc1,0xc0($out)
2150 vmovdqu $xd1,0xe0($out)
2151 vmovdqu $xa2,0x100($out)
2152 vmovdqu $xb2,0x120($out)
2153 vmovdqu $xc2,0x140($out)
2154 vmovdqu $xd2,0x160($out)
2157 lea 0x180($inp),$inp # inp+=64*6
2159 vmovdqa $xa3,0x00(%rsp)
2160 lea 0x180($out),$out # out+=64*6
2161 sub \$384,$len # len-=64*6
2162 vmovdqa $xb3,0x20(%rsp)
2167 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2168 vpxor 0x20($inp),$xb0,$xb0
2169 vpxor 0x40($inp),$xc0,$xc0
2170 vpxor 0x60($inp),$xd0,$xd0
2171 vpxor 0x80($inp),$xa1,$xa1
2172 vpxor 0xa0($inp),$xb1,$xb1
2173 vpxor 0xc0($inp),$xc1,$xc1
2174 vpxor 0xe0($inp),$xd1,$xd1
2175 vpxor 0x100($inp),$xa2,$xa2
2176 vpxor 0x120($inp),$xb2,$xb2
2177 vpxor 0x140($inp),$xc2,$xc2
2178 vpxor 0x160($inp),$xd2,$xd2
2179 vpxor 0x180($inp),$xa3,$xa3
2180 vpxor 0x1a0($inp),$xb3,$xb3
2181 vmovdqu $xa0,0x00($out)
2182 vmovdqu $xb0,0x20($out)
2183 vmovdqu $xc0,0x40($out)
2184 vmovdqu $xd0,0x60($out)
2185 vmovdqu $xa1,0x80($out)
2186 vmovdqu $xb1,0xa0($out)
2187 vmovdqu $xc1,0xc0($out)
2188 vmovdqu $xd1,0xe0($out)
2189 vmovdqu $xa2,0x100($out)
2190 vmovdqu $xb2,0x120($out)
2191 vmovdqu $xc2,0x140($out)
2192 vmovdqu $xd2,0x160($out)
2193 vmovdqu $xa3,0x180($out)
2194 vmovdqu $xb3,0x1a0($out)
2197 lea 0x1c0($inp),$inp # inp+=64*7
2199 vmovdqa $xc3,0x00(%rsp)
2200 lea 0x1c0($out),$out # out+=64*7
2201 sub \$448,$len # len-=64*7
2202 vmovdqa $xd3,0x20(%rsp)
2205 movzb ($inp,%r10),%eax
2206 movzb (%rsp,%r10),%ecx
2209 mov %al,-1($out,%r10)
2216 $code.=<<___ if ($win64);
2217 lea 0x290+0x30(%rsp),%r11
2218 movaps -0x30(%r11),%xmm6
2219 movaps -0x20(%r11),%xmm7
2220 movaps -0x10(%r11),%xmm8
2221 movaps 0x00(%r11),%xmm9
2222 movaps 0x10(%r11),%xmm10
2223 movaps 0x20(%r11),%xmm11
2224 movaps 0x30(%r11),%xmm12
2225 movaps 0x40(%r11),%xmm13
2226 movaps 0x50(%r11),%xmm14
2227 movaps 0x60(%r11),%xmm15
2230 mov 0x280(%rsp),%rsp
2232 .size ChaCha20_8x,.-ChaCha20_8x
2236 foreach (split("\n",$code)) {
2237 s/\`([^\`]*)\`/eval $1/geo;