3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # ChaCha20 for x86_64.
14 # Performance in cycles per byte out of a large buffer.
16 #                IALU/gcc 4.8(i)  1xSSSE3/SSE2  4xSSSE3    8xAVX2
18 # P4             9.48/+99%        -/22.7(ii)    -
19 # Core2          7.83/+55%        7.90/8.08     4.35
20 # Westmere       7.19/+50%        5.60/6.70     3.00
21 # Sandy Bridge   8.31/+42%        5.45/6.76     2.72
22 # Ivy Bridge     6.71/+46%        5.40/6.49     2.41
23 # Haswell        5.92/+43%        5.20/6.45     2.42       1.23
24 # Silvermont     12.0/+33%        7.75/7.40     7.03(iii)
25 # Sledgehammer   7.28/+52%        -/14.2(ii)    -
26 # Bulldozer      9.66/+28%        9.85/11.1     3.06(iv)
27 # VIA Nano       10.5/+46%        6.72/8.60     6.05
29 # (i) compared to older gcc 3.x one can observe >2x improvement on most platforms;
31 # (ii) as can be seen, SSE2 performance is too low on legacy
32 # processors; NxSSE2 results are naturally better, but not
33 # impressively better than the IALU ones, which is why you won't
34 # find SSE2 code below;
35 # (iii) this is not an optimal result for Atom because of MSROM
36 # limitations; SSE2 can do better, but the gain is considered too
37 # low to justify the [maintenance] effort;
38 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
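#
# A note on the column naming above (as reflected by the code paths below):
# 1xSSSE3 streams one 64-byte block at a time through XMM registers, 4xSSSE3
# keeps four blocks "word-sliced" across XMM lanes, and 8xAVX2 does the same
# for eight blocks across YMM lanes.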
42 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
44 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
49 die "can't locate x86_64-xlate.pl";
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22);
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10);
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11);
66 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
67 $avx = ($2>=3.0) + ($2>3.0);
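# Judging from the $avx guards further down, the resulting $avx level gates
# which code paths are emitted: with $avx==0 only the integer and SSSE3 paths
# are generated, $avx>=1 additionally emits the XOP (ChaCha20_4xop) dispatch
# and body, and $avx>1 the AVX2 (ChaCha20_8x) one.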
70 open OUT,"| \"$^X\" $xlate $flavour $output";
73 # input parameter block
74 ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8");
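# In other words, the five arguments of ChaCha20_ctr32(out,inp,len,key,counter)
# arrive in the SysV order %rdi,%rsi,%rdx,%rcx,%r8; on Win64 builds the
# x86_64-xlate.pl translator is expected to take care of remapping from the
# Microsoft calling convention.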
79 .extern OPENSSL_ia32cap_P
95 .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
97 .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
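# The two byte-shuffle masks above (presumably .Lrot16 and .Lrot24, as used by
# the SSSE3 and AVX2 paths below) implement the ChaCha rotations with pshufb:
# the first rotates every dword left by 16 bits, the second left by 8 bits
# (equivalently, right by 24).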
99 .asciz "expand 32-byte k"
100 .asciz "ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
103 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
104 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
106 $arg = "\$$arg" if ($arg*1 eq $arg);
107 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
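  # For example, eval'ing the string "&add (@x[$a0],@x[$b0])" generated by
  # ROUND below lands here and appends something like "\tadd\t%r8d,%eax\n"
  # to $code, i.e. the first (destination) operand ends up last, AT&T style.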
110 @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)),
111 "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15)));
114 sub ROUND { # critical path is 24 cycles per round
115 my ($a0,$b0,$c0,$d0)=@_;
116 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
117 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
118 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
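  # The map above walks the quarter-rounds through the 4x4 state: starting
  # from (0,4,8,12) it yields the remaining columns (1,5,9,13), (2,6,10,14),
  # (3,7,11,15), and starting from (0,5,10,15) the remaining diagonals
  # (1,6,11,12), (2,7,8,13), (3,4,9,14).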
119 my ($xc,$xc_)=map("\"$_\"",@t);
120 my @x=map("\"$_\"",@x);
122 # Consider the order in which variables are addressed by their index:
127 # 0 4 8 12 < even round
131 # 0 5 10 15 < odd round
136 # 'a', 'b' and 'd's are permanently allocated in registers,
137 # @x[0..7,12..15], while the 'c's are maintained in memory. If
138 # you observe the 'c' column, you'll notice that a pair of 'c's is
139 # invariant between rounds. This means that we have to reload
140 # them once per round, in the middle. This is why you'll see a
141 # bunch of 'c' stores and loads in the middle, but none at
142 # the beginning or end.
144 # Normally instructions would be interleaved to favour in-order
145 # execution. Out-of-order cores generally manage it gracefully,
146 # but not this time, for some reason. As in-order execution
147 # cores are a dying breed and old Atom is the only one around,
148 # the instructions are left uninterleaved. Besides, Atom is better
149 # off executing the 1xSSSE3 code anyway...
152 "&add (@x[$a0],@x[$b0])", # Q1
153 "&xor (@x[$d0],@x[$a0])",
155 "&add (@x[$a1],@x[$b1])", # Q2
156 "&xor (@x[$d1],@x[$a1])",
159 "&add ($xc,@x[$d0])",
160 "&xor (@x[$b0],$xc)",
162 "&add ($xc_,@x[$d1])",
163 "&xor (@x[$b1],$xc_)",
166 "&add (@x[$a0],@x[$b0])",
167 "&xor (@x[$d0],@x[$a0])",
169 "&add (@x[$a1],@x[$b1])",
170 "&xor (@x[$d1],@x[$a1])",
173 "&add ($xc,@x[$d0])",
174 "&xor (@x[$b0],$xc)",
176 "&add ($xc_,@x[$d1])",
177 "&xor (@x[$b1],$xc_)",
180 "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's
181 "&mov (\"4*$c1(%rsp)\",$xc_)",
182 "&mov ($xc,\"4*$c2(%rsp)\")",
183 "&mov ($xc_,\"4*$c3(%rsp)\")",
185 "&add (@x[$a2],@x[$b2])", # Q3
186 "&xor (@x[$d2],@x[$a2])",
188 "&add (@x[$a3],@x[$b3])", # Q4
189 "&xor (@x[$d3],@x[$a3])",
192 "&add ($xc,@x[$d2])",
193 "&xor (@x[$b2],$xc)",
195 "&add ($xc_,@x[$d3])",
196 "&xor (@x[$b3],$xc_)",
199 "&add (@x[$a2],@x[$b2])",
200 "&xor (@x[$d2],@x[$a2])",
202 "&add (@x[$a3],@x[$b3])",
203 "&xor (@x[$d3],@x[$a3])",
206 "&add ($xc,@x[$d2])",
207 "&xor (@x[$b2],$xc)",
209 "&add ($xc_,@x[$d3])",
210 "&xor (@x[$b3],$xc_)",
215 ########################################################################
216 # Generic code path that handles all lengths on pre-SSSE3 processors.
218 .globl ChaCha20_ctr32
219 .type ChaCha20_ctr32,\@function,5
224 mov OPENSSL_ia32cap_P+4(%rip),%r10
225 test \$`1<<(41-32)`,%r10d
236 #movdqa .Lsigma(%rip),%xmm0
238 movdqu 16($key),%xmm2
239 movdqu ($counter),%xmm3
240 movdqa .Lone(%rip),%xmm4
242 #movdqa %xmm0,4*0(%rsp) # key[0]
243 movdqa %xmm1,4*4(%rsp) # key[1]
244 movdqa %xmm2,4*8(%rsp) # key[2]
245 movdqa %xmm3,4*12(%rsp) # key[3]
246 mov $len,%rbp # reassign $len
251 mov \$0x61707865,@x[0] # 'expa'
252 mov \$0x3320646e,@x[1] # 'nd 3'
253 mov \$0x79622d32,@x[2] # '2-by'
254 mov \$0x6b206574,@x[3] # 'te k'
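	# the four immediates above spell "expand 32-byte k", i.e. the standard
	# ChaCha constant occupying state words 0-3 (same data as .Lsigma)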
260 mov 4*13(%rsp),@x[13]
261 mov 4*14(%rsp),@x[14]
262 mov 4*15(%rsp),@x[15]
264 mov %rbp,64+0(%rsp) # save len
266 mov $inp,64+8(%rsp) # save inp
267 movq %xmm2,%rsi # "@x[8]"
268 mov $out,64+16(%rsp) # save out
270 shr \$32,%rdi # "@x[9]"
276 foreach (&ROUND (0, 4, 8,12)) { eval; }
277 foreach (&ROUND (0, 5,10,15)) { eval; }
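# Each pass through the two ROUND invocations above emits one ChaCha "double
# round" (a column round followed by a diagonal round); ChaCha20's 20 rounds
# amount to 10 such double rounds.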
282 mov @t[1],4*9(%rsp) # modulo-scheduled
284 mov 64(%rsp),%rbp # load len
286 mov 64+8(%rsp),$inp # load inp
287 paddd %xmm4,%xmm3 # increment counter
288 mov 64+16(%rsp),$out # load out
290 add \$0x61707865,@x[0] # 'expa'
291 add \$0x3320646e,@x[1] # 'nd 3'
292 add \$0x79622d32,@x[2] # '2-by'
293 add \$0x6b206574,@x[3] # 'te k'
298 add 4*12(%rsp),@x[12]
299 add 4*13(%rsp),@x[13]
300 add 4*14(%rsp),@x[14]
301 add 4*15(%rsp),@x[15]
302 paddd 4*8(%rsp),%xmm1
307 xor 4*0($inp),@x[0] # xor with input
315 movdqu 4*8($inp),%xmm0
316 xor 4*12($inp),@x[12]
317 xor 4*13($inp),@x[13]
318 xor 4*14($inp),@x[14]
319 xor 4*15($inp),@x[15]
320 lea 4*16($inp),$inp # inp+=64
323 movdqa %xmm2,4*8(%rsp)
324 movd %xmm3,4*12(%rsp)
326 mov @x[0],4*0($out) # write output
334 movdqu %xmm0,4*8($out)
335 mov @x[12],4*12($out)
336 mov @x[13],4*13($out)
337 mov @x[14],4*14($out)
338 mov @x[15],4*15($out)
339 lea 4*16($out),$out # out+=64
357 movdqa %xmm1,4*8(%rsp)
358 mov @x[12],4*12(%rsp)
359 mov @x[13],4*13(%rsp)
360 mov @x[14],4*14(%rsp)
361 mov @x[15],4*15(%rsp)
364 movzb ($inp,%rbx),%eax
365 movzb (%rsp,%rbx),%edx
368 mov %al,-1($out,%rbx)
382 .size ChaCha20_ctr32,.-ChaCha20_ctr32
385 ########################################################################
386 # SSSE3 code path that handles shorter lengths
388 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
390 sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
414 my $xframe = $win64 ? 32+32+8 : 24;
417 .type ChaCha20_ssse3,\@function,5
422 $code.=<<___ if ($avx);
423 test \$`1<<(43-32)`,%r10d
424 jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4
427 cmp \$128,$len # we might throw away some data,
428 ja .LChaCha20_4x # but overall it won't be slower
438 sub \$64+$xframe,%rsp
440 $code.=<<___ if ($win64);
441 movaps %xmm6,64+32(%rsp)
442 movaps %xmm7,64+48(%rsp)
445 movdqa .Lsigma(%rip),$a
449 movdqa .Lrot16(%rip),$rot16
450 movdqa .Lrot24(%rip),$rot24
461 movdqa .Lone(%rip),$d
474 &pshufd ($c,$c,0b01001110);
475 &pshufd ($b,$b,0b00111001);
476 &pshufd ($d,$d,0b10010011);
480 &pshufd ($c,$c,0b01001110);
481 &pshufd ($b,$b,0b10010011);
482 &pshufd ($d,$d,0b00111001);
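# The first pshufd triplet above rotates the 'c', 'b' and 'd' rows within
# their XMM registers by two, one and three 32-bit lanes respectively, turning
# the column quarter-rounds into diagonal ones; the second triplet undoes the
# rotation after the diagonal half of the double round.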
485 &jnz (".Loop_ssse3");
497 movdqu 0x10($inp),$t1
498 pxor $t,$a # xor with input
501 movdqu 0x30($inp),$t1
502 lea 0x40($inp),$inp # inp+=64
506 movdqu $a,0x00($out) # write output
510 lea 0x40($out),$out # out+=64
513 jnz .Loop_outer_ssse3
526 movzb ($inp,%rbx),%eax
527 movzb (%rsp,%rbx),%ecx
530 mov %al,-1($out,%rbx)
536 $code.=<<___ if ($win64);
537 movaps 64+32(%rsp),%xmm6
538 movaps 64+48(%rsp),%xmm7
541 add \$64+$xframe,%rsp
549 .size ChaCha20_ssse3,.-ChaCha20_ssse3
553 ########################################################################
554 # SSSE3 code path that handles longer messages.
556 # assign variables to favor Atom front-end
557 my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
558 $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
559 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
560 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
562 sub SSSE3_lane_ROUND {
563 my ($a0,$b0,$c0,$d0)=@_;
564 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
565 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
566 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
567 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
568 my @x=map("\"$_\"",@xx);
570 # Consider the order in which variables are addressed by their index:
575 # 0 4 8 12 < even round
579 # 0 5 10 15 < odd round
584 # 'a', 'b' and 'd's are permanently allocated in registers,
585 # @x[0..7,12..15], while the 'c's are maintained in memory. If
586 # you observe the 'c' column, you'll notice that a pair of 'c's is
587 # invariant between rounds. This means that we have to reload
588 # them once per round, in the middle. This is why you'll see a
589 # bunch of 'c' stores and loads in the middle, but none at
590 # the beginning or end.
593 "&paddd (@x[$a0],@x[$b0])", # Q1
594 "&paddd (@x[$a1],@x[$b1])", # Q2
595 "&pxor (@x[$d0],@x[$a0])",
596 "&pxor (@x[$d1],@x[$a1])",
597 "&pshufb (@x[$d0],$t1)",
598 "&pshufb (@x[$d1],$t1)",
600 "&paddd ($xc,@x[$d0])",
601 "&paddd ($xc_,@x[$d1])",
602 "&pxor (@x[$b0],$xc)",
603 "&pxor (@x[$b1],$xc_)",
604 "&movdqa ($t0,@x[$b0])",
605 "&pslld (@x[$b0],12)",
607 "&movdqa ($t1,@x[$b1])",
608 "&pslld (@x[$b1],12)",
609 "&por (@x[$b0],$t0)",
611 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
612 "&por (@x[$b1],$t1)",
614 "&paddd (@x[$a0],@x[$b0])",
615 "&paddd (@x[$a1],@x[$b1])",
616 "&pxor (@x[$d0],@x[$a0])",
617 "&pxor (@x[$d1],@x[$a1])",
618 "&pshufb (@x[$d0],$t0)",
619 "&pshufb (@x[$d1],$t0)",
621 "&paddd ($xc,@x[$d0])",
622 "&paddd ($xc_,@x[$d1])",
623 "&pxor (@x[$b0],$xc)",
624 "&pxor (@x[$b1],$xc_)",
625 "&movdqa ($t1,@x[$b0])",
626 "&pslld (@x[$b0],7)",
628 "&movdqa ($t0,@x[$b1])",
629 "&pslld (@x[$b1],7)",
630 "&por (@x[$b0],$t1)",
632 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
633 "&por (@x[$b1],$t0)",
635 "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
636 "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
637 "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
638 "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",
640 "&paddd (@x[$a2],@x[$b2])", # Q3
641 "&paddd (@x[$a3],@x[$b3])", # Q4
642 "&pxor (@x[$d2],@x[$a2])",
643 "&pxor (@x[$d3],@x[$a3])",
644 "&pshufb (@x[$d2],$t1)",
645 "&pshufb (@x[$d3],$t1)",
647 "&paddd ($xc,@x[$d2])",
648 "&paddd ($xc_,@x[$d3])",
649 "&pxor (@x[$b2],$xc)",
650 "&pxor (@x[$b3],$xc_)",
651 "&movdqa ($t0,@x[$b2])",
652 "&pslld (@x[$b2],12)",
654 "&movdqa ($t1,@x[$b3])",
655 "&pslld (@x[$b3],12)",
656 "&por (@x[$b2],$t0)",
658 "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip)
659 "&por (@x[$b3],$t1)",
661 "&paddd (@x[$a2],@x[$b2])",
662 "&paddd (@x[$a3],@x[$b3])",
663 "&pxor (@x[$d2],@x[$a2])",
664 "&pxor (@x[$d3],@x[$a3])",
665 "&pshufb (@x[$d2],$t0)",
666 "&pshufb (@x[$d3],$t0)",
668 "&paddd ($xc,@x[$d2])",
669 "&paddd ($xc_,@x[$d3])",
670 "&pxor (@x[$b2],$xc)",
671 "&pxor (@x[$b3],$xc_)",
672 "&movdqa ($t1,@x[$b2])",
673 "&pslld (@x[$b2],7)",
675 "&movdqa ($t0,@x[$b3])",
676 "&pslld (@x[$b3],7)",
677 "&por (@x[$b2],$t1)",
679 "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
684 my $xframe = $win64 ? 0xa0 : 0;
687 .type ChaCha20_4x,\@function,5
693 $code.=<<___ if ($avx>1);
694 shr \$32,%r10 # OPENSSL_ia32cap_P+8
695 test \$`1<<5`,%r10 # test AVX2
702 and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE
703 cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE
704 je .Ldo_sse3_after_all # to detect Atom
708 sub \$0x148+$xframe,%rsp
710 ################ stack layout
711 # +0x00 SIMD equivalent of @x[8-11]
713 # +0x40 constant copy of key[0-2] smashed by lanes
715 # +0x100 SIMD counters (with nonce smashed by lanes)
718 $code.=<<___ if ($win64);
719 movaps %xmm6,-0x30(%r11)
720 movaps %xmm7,-0x20(%r11)
721 movaps %xmm8,-0x10(%r11)
722 movaps %xmm9,0x00(%r11)
723 movaps %xmm10,0x10(%r11)
724 movaps %xmm11,0x20(%r11)
725 movaps %xmm12,0x30(%r11)
726 movaps %xmm13,0x40(%r11)
727 movaps %xmm14,0x50(%r11)
728 movaps %xmm15,0x60(%r11)
731 movdqa .Lsigma(%rip),$xa3 # key[0]
732 movdqu ($key),$xb3 # key[1]
733 movdqu 16($key),$xt3 # key[2]
734 movdqu ($counter),$xd3 # key[3]
735 lea 0x100(%rsp),%rcx # size optimization
736 lea .Lrot16(%rip),%r10
737 lea .Lrot24(%rip),%r11
739 pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
740 pshufd \$0x55,$xa3,$xa1
741 movdqa $xa0,0x40(%rsp) # ... and offload
742 pshufd \$0xaa,$xa3,$xa2
743 movdqa $xa1,0x50(%rsp)
744 pshufd \$0xff,$xa3,$xa3
745 movdqa $xa2,0x60(%rsp)
746 movdqa $xa3,0x70(%rsp)
748 pshufd \$0x00,$xb3,$xb0
749 pshufd \$0x55,$xb3,$xb1
750 movdqa $xb0,0x80-0x100(%rcx)
751 pshufd \$0xaa,$xb3,$xb2
752 movdqa $xb1,0x90-0x100(%rcx)
753 pshufd \$0xff,$xb3,$xb3
754 movdqa $xb2,0xa0-0x100(%rcx)
755 movdqa $xb3,0xb0-0x100(%rcx)
757 pshufd \$0x00,$xt3,$xt0 # "$xc0"
758 pshufd \$0x55,$xt3,$xt1 # "$xc1"
759 movdqa $xt0,0xc0-0x100(%rcx)
760 pshufd \$0xaa,$xt3,$xt2 # "$xc2"
761 movdqa $xt1,0xd0-0x100(%rcx)
762 pshufd \$0xff,$xt3,$xt3 # "$xc3"
763 movdqa $xt2,0xe0-0x100(%rcx)
764 movdqa $xt3,0xf0-0x100(%rcx)
766 pshufd \$0x00,$xd3,$xd0
767 pshufd \$0x55,$xd3,$xd1
768 paddd .Linc(%rip),$xd0 # don't save counters yet
769 pshufd \$0xaa,$xd3,$xd2
770 movdqa $xd1,0x110-0x100(%rcx)
771 pshufd \$0xff,$xd3,$xd3
772 movdqa $xd2,0x120-0x100(%rcx)
773 movdqa $xd3,0x130-0x100(%rcx)
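	# .Linc presumably holds {0,1,2,3}, giving the four lanes consecutive
	# block counters; .Lfour further down then advances all lanes by 4 per
	# outer-loop iteration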
779 movdqa 0x40(%rsp),$xa0 # re-load smashed key
780 movdqa 0x50(%rsp),$xa1
781 movdqa 0x60(%rsp),$xa2
782 movdqa 0x70(%rsp),$xa3
783 movdqa 0x80-0x100(%rcx),$xb0
784 movdqa 0x90-0x100(%rcx),$xb1
785 movdqa 0xa0-0x100(%rcx),$xb2
786 movdqa 0xb0-0x100(%rcx),$xb3
787 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
788 movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
789 movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
790 movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
791 movdqa 0x100-0x100(%rcx),$xd0
792 movdqa 0x110-0x100(%rcx),$xd1
793 movdqa 0x120-0x100(%rcx),$xd2
794 movdqa 0x130-0x100(%rcx),$xd3
795 paddd .Lfour(%rip),$xd0 # next SIMD counters
798 movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
799 movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
800 movdqa (%r10),$xt3 # .Lrot16(%rip)
802 movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
808 foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
809 foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
814 paddd 0x40(%rsp),$xa0 # accumulate key material
815 paddd 0x50(%rsp),$xa1
816 paddd 0x60(%rsp),$xa2
817 paddd 0x70(%rsp),$xa3
819 movdqa $xa0,$xt2 # "de-interlace" data
826 punpcklqdq $xa2,$xa0 # "a0"
828 punpcklqdq $xt3,$xt2 # "a2"
829 punpckhqdq $xa2,$xa1 # "a1"
830 punpckhqdq $xt3,$xa3 # "a3"
832 ($xa2,$xt2)=($xt2,$xa2);
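# The punpck{l,h}dq/punpck{l,h}qdq sequences above transpose the 4x4 matrix of
# 32-bit words held across lanes, so that each XMM register again carries 16
# contiguous bytes of one keystream block before it is XORed with the input.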
834 paddd 0x80-0x100(%rcx),$xb0
835 paddd 0x90-0x100(%rcx),$xb1
836 paddd 0xa0-0x100(%rcx),$xb2
837 paddd 0xb0-0x100(%rcx),$xb3
839 movdqa $xa0,0x00(%rsp) # offload $xaN
840 movdqa $xa1,0x10(%rsp)
841 movdqa 0x20(%rsp),$xa0 # "xc2"
842 movdqa 0x30(%rsp),$xa1 # "xc3"
851 punpcklqdq $xb2,$xb0 # "b0"
853 punpcklqdq $xt3,$xt2 # "b2"
854 punpckhqdq $xb2,$xb1 # "b1"
855 punpckhqdq $xt3,$xb3 # "b3"
857 ($xb2,$xt2)=($xt2,$xb2);
858 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
860 paddd 0xc0-0x100(%rcx),$xc0
861 paddd 0xd0-0x100(%rcx),$xc1
862 paddd 0xe0-0x100(%rcx),$xc2
863 paddd 0xf0-0x100(%rcx),$xc3
865 movdqa $xa2,0x20(%rsp) # keep offloading $xaN
866 movdqa $xa3,0x30(%rsp)
875 punpcklqdq $xc2,$xc0 # "c0"
877 punpcklqdq $xt3,$xt2 # "c2"
878 punpckhqdq $xc2,$xc1 # "c1"
879 punpckhqdq $xt3,$xc3 # "c3"
881 ($xc2,$xt2)=($xt2,$xc2);
882 ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary
884 paddd 0x100-0x100(%rcx),$xd0
885 paddd 0x110-0x100(%rcx),$xd1
886 paddd 0x120-0x100(%rcx),$xd2
887 paddd 0x130-0x100(%rcx),$xd3
896 punpcklqdq $xd2,$xd0 # "d0"
898 punpcklqdq $xt3,$xt2 # "d2"
899 punpckhqdq $xd2,$xd1 # "d1"
900 punpckhqdq $xt3,$xd3 # "d3"
902 ($xd2,$xt2)=($xt2,$xd2);
907 movdqu 0x00($inp),$xt0 # xor with input
908 movdqu 0x10($inp),$xt1
909 movdqu 0x20($inp),$xt2
910 movdqu 0x30($inp),$xt3
911 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
916 movdqu $xt0,0x00($out)
917 movdqu 0x40($inp),$xt0
918 movdqu $xt1,0x10($out)
919 movdqu 0x50($inp),$xt1
920 movdqu $xt2,0x20($out)
921 movdqu 0x60($inp),$xt2
922 movdqu $xt3,0x30($out)
923 movdqu 0x70($inp),$xt3
924 lea 0x80($inp),$inp # size optimization
930 movdqu $xt0,0x40($out)
931 movdqu 0x00($inp),$xt0
932 movdqu $xt1,0x50($out)
933 movdqu 0x10($inp),$xt1
934 movdqu $xt2,0x60($out)
935 movdqu 0x20($inp),$xt2
936 movdqu $xt3,0x70($out)
937 lea 0x80($out),$out # size optimization
938 movdqu 0x30($inp),$xt3
944 movdqu $xt0,0x00($out)
945 movdqu 0x40($inp),$xt0
946 movdqu $xt1,0x10($out)
947 movdqu 0x50($inp),$xt1
948 movdqu $xt2,0x20($out)
949 movdqu 0x60($inp),$xt2
950 movdqu $xt3,0x30($out)
951 movdqu 0x70($inp),$xt3
952 lea 0x80($inp),$inp # inp+=64*4
957 movdqu $xt0,0x40($out)
958 movdqu $xt1,0x50($out)
959 movdqu $xt2,0x60($out)
960 movdqu $xt3,0x70($out)
961 lea 0x80($out),$out # out+=64*4
976 #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
978 #movdqa $xt0,0x00(%rsp)
979 movdqa $xb0,0x10(%rsp)
980 movdqa $xc0,0x20(%rsp)
981 movdqa $xd0,0x30(%rsp)
986 movdqu 0x00($inp),$xt0 # xor with input
987 movdqu 0x10($inp),$xt1
988 movdqu 0x20($inp),$xt2
989 movdqu 0x30($inp),$xt3
990 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
994 movdqu $xt0,0x00($out)
995 movdqu $xt1,0x10($out)
996 movdqu $xt2,0x20($out)
997 movdqu $xt3,0x30($out)
1000 movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
1001 lea 0x40($inp),$inp # inp+=64*1
1003 movdqa $xt0,0x00(%rsp)
1004 movdqa $xb1,0x10(%rsp)
1005 lea 0x40($out),$out # out+=64*1
1006 movdqa $xc1,0x20(%rsp)
1007 sub \$64,$len # len-=64*1
1008 movdqa $xd1,0x30(%rsp)
1013 movdqu 0x00($inp),$xt0 # xor with input
1014 movdqu 0x10($inp),$xt1
1015 movdqu 0x20($inp),$xt2
1016 movdqu 0x30($inp),$xt3
1017 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1022 movdqu $xt0,0x00($out)
1023 movdqu 0x40($inp),$xt0
1024 movdqu $xt1,0x10($out)
1025 movdqu 0x50($inp),$xt1
1026 movdqu $xt2,0x20($out)
1027 movdqu 0x60($inp),$xt2
1028 movdqu $xt3,0x30($out)
1029 movdqu 0x70($inp),$xt3
1030 pxor 0x10(%rsp),$xt0
1034 movdqu $xt0,0x40($out)
1035 movdqu $xt1,0x50($out)
1036 movdqu $xt2,0x60($out)
1037 movdqu $xt3,0x70($out)
1040 movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
1041 lea 0x80($inp),$inp # inp+=64*2
1043 movdqa $xt0,0x00(%rsp)
1044 movdqa $xb2,0x10(%rsp)
1045 lea 0x80($out),$out # out+=64*2
1046 movdqa $xc2,0x20(%rsp)
1047 sub \$128,$len # len-=64*2
1048 movdqa $xd2,0x30(%rsp)
1053 movdqu 0x00($inp),$xt0 # xor with input
1054 movdqu 0x10($inp),$xt1
1055 movdqu 0x20($inp),$xt2
1056 movdqu 0x30($inp),$xt3
1057 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
1062 movdqu $xt0,0x00($out)
1063 movdqu 0x40($inp),$xt0
1064 movdqu $xt1,0x10($out)
1065 movdqu 0x50($inp),$xt1
1066 movdqu $xt2,0x20($out)
1067 movdqu 0x60($inp),$xt2
1068 movdqu $xt3,0x30($out)
1069 movdqu 0x70($inp),$xt3
1070 lea 0x80($inp),$inp # size optimization
1071 pxor 0x10(%rsp),$xt0
1076 movdqu $xt0,0x40($out)
1077 movdqu 0x00($inp),$xt0
1078 movdqu $xt1,0x50($out)
1079 movdqu 0x10($inp),$xt1
1080 movdqu $xt2,0x60($out)
1081 movdqu 0x20($inp),$xt2
1082 movdqu $xt3,0x70($out)
1083 lea 0x80($out),$out # size optimization
1084 movdqu 0x30($inp),$xt3
1085 pxor 0x20(%rsp),$xt0
1089 movdqu $xt0,0x00($out)
1090 movdqu $xt1,0x10($out)
1091 movdqu $xt2,0x20($out)
1092 movdqu $xt3,0x30($out)
1095 movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
1096 lea 0x40($inp),$inp # inp+=64*3
1098 movdqa $xt0,0x00(%rsp)
1099 movdqa $xb3,0x10(%rsp)
1100 lea 0x40($out),$out # out+=64*3
1101 movdqa $xc3,0x20(%rsp)
1102 sub \$192,$len # len-=64*3
1103 movdqa $xd3,0x30(%rsp)
1106 movzb ($inp,%r10),%eax
1107 movzb (%rsp,%r10),%ecx
1110 mov %al,-1($out,%r10)
1116 $code.=<<___ if ($win64);
1117 lea 0x140+0x30(%rsp),%r11
1118 movaps -0x30(%r11),%xmm6
1119 movaps -0x20(%r11),%xmm7
1120 movaps -0x10(%r11),%xmm8
1121 movaps 0x00(%r11),%xmm9
1122 movaps 0x10(%r11),%xmm10
1123 movaps 0x20(%r11),%xmm11
1124 movaps 0x30(%r11),%xmm12
1125 movaps 0x40(%r11),%xmm13
1126 movaps 0x50(%r11),%xmm14
1127 movaps 0x60(%r11),%xmm15
1130 add \$0x148+$xframe,%rsp
1132 .size ChaCha20_4x,.-ChaCha20_4x
1136 ########################################################################
1137 # XOP code path that handles all lengths.
1139 # There is some "anomaly" observed depending on instruction size or
1140 # alignment. If you look closely at the code below you'll notice that
1141 # the argument order sometimes varies. The order affects the instruction
1142 # encoding by making it larger, and such fiddling gives a 5% performance
1143 # improvement. This was observed on an FX-4100...
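#
# XOP's vprotd performs a full dword rotate in one instruction, which is why
# this path needs neither the shift/or pairs nor the pshufb masks used by the
# SSSE3 and AVX2 lane code.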
1145 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1146 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
1147 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1148 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);
1150 sub XOP_lane_ROUND {
1151 my ($a0,$b0,$c0,$d0)=@_;
1152 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1153 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1154 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1155 my @x=map("\"$_\"",@xx);
1158 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1159 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1160 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1161 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1162 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1163 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1164 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1165 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1166 "&vprotd (@x[$d0],@x[$d0],16)",
1167 "&vprotd (@x[$d1],@x[$d1],16)",
1168 "&vprotd (@x[$d2],@x[$d2],16)",
1169 "&vprotd (@x[$d3],@x[$d3],16)",
1171 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1172 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1173 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1174 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1175 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1176 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1177 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1178 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1179 "&vprotd (@x[$b0],@x[$b0],12)",
1180 "&vprotd (@x[$b1],@x[$b1],12)",
1181 "&vprotd (@x[$b2],@x[$b2],12)",
1182 "&vprotd (@x[$b3],@x[$b3],12)",
1184 "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip
1185 "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip
1186 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1187 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1188 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1189 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1190 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1191 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1192 "&vprotd (@x[$d0],@x[$d0],8)",
1193 "&vprotd (@x[$d1],@x[$d1],8)",
1194 "&vprotd (@x[$d2],@x[$d2],8)",
1195 "&vprotd (@x[$d3],@x[$d3],8)",
1197 "&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
1198 "&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
1199 "&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
1200 "&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
1201 "&vpxor (@x[$b0],@x[$c0],@x[$b0])",
1202 "&vpxor (@x[$b1],@x[$c1],@x[$b1])",
1203 "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip
1204 "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip
1205 "&vprotd (@x[$b0],@x[$b0],7)",
1206 "&vprotd (@x[$b1],@x[$b1],7)",
1207 "&vprotd (@x[$b2],@x[$b2],7)",
1208 "&vprotd (@x[$b3],@x[$b3],7)"
1212 my $xframe = $win64 ? 0xa0 : 0;
1215 .type ChaCha20_4xop,\@function,5
1219 lea -0x78(%rsp),%r11
1220 sub \$0x148+$xframe,%rsp
1222 ################ stack layout
1223 # +0x00 SIMD equivalent of @x[8-11]
1225 # +0x40 constant copy of key[0-2] smashed by lanes
1227 # +0x100 SIMD counters (with nonce smashed by lanes)
1230 $code.=<<___ if ($win64);
1231 movaps %xmm6,-0x30(%r11)
1232 movaps %xmm7,-0x20(%r11)
1233 movaps %xmm8,-0x10(%r11)
1234 movaps %xmm9,0x00(%r11)
1235 movaps %xmm10,0x10(%r11)
1236 movaps %xmm11,0x20(%r11)
1237 movaps %xmm12,0x30(%r11)
1238 movaps %xmm13,0x40(%r11)
1239 movaps %xmm14,0x50(%r11)
1240 movaps %xmm15,0x60(%r11)
1245 vmovdqa .Lsigma(%rip),$xa3 # key[0]
1246 vmovdqu ($key),$xb3 # key[1]
1247 vmovdqu 16($key),$xt3 # key[2]
1248 vmovdqu ($counter),$xd3 # key[3]
1249 lea 0x100(%rsp),%rcx # size optimization
1251 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1252 vpshufd \$0x55,$xa3,$xa1
1253 vmovdqa $xa0,0x40(%rsp) # ... and offload
1254 vpshufd \$0xaa,$xa3,$xa2
1255 vmovdqa $xa1,0x50(%rsp)
1256 vpshufd \$0xff,$xa3,$xa3
1257 vmovdqa $xa2,0x60(%rsp)
1258 vmovdqa $xa3,0x70(%rsp)
1260 vpshufd \$0x00,$xb3,$xb0
1261 vpshufd \$0x55,$xb3,$xb1
1262 vmovdqa $xb0,0x80-0x100(%rcx)
1263 vpshufd \$0xaa,$xb3,$xb2
1264 vmovdqa $xb1,0x90-0x100(%rcx)
1265 vpshufd \$0xff,$xb3,$xb3
1266 vmovdqa $xb2,0xa0-0x100(%rcx)
1267 vmovdqa $xb3,0xb0-0x100(%rcx)
1269 vpshufd \$0x00,$xt3,$xt0 # "$xc0"
1270 vpshufd \$0x55,$xt3,$xt1 # "$xc1"
1271 vmovdqa $xt0,0xc0-0x100(%rcx)
1272 vpshufd \$0xaa,$xt3,$xt2 # "$xc2"
1273 vmovdqa $xt1,0xd0-0x100(%rcx)
1274 vpshufd \$0xff,$xt3,$xt3 # "$xc3"
1275 vmovdqa $xt2,0xe0-0x100(%rcx)
1276 vmovdqa $xt3,0xf0-0x100(%rcx)
1278 vpshufd \$0x00,$xd3,$xd0
1279 vpshufd \$0x55,$xd3,$xd1
1280 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet
1281 vpshufd \$0xaa,$xd3,$xd2
1282 vmovdqa $xd1,0x110-0x100(%rcx)
1283 vpshufd \$0xff,$xd3,$xd3
1284 vmovdqa $xd2,0x120-0x100(%rcx)
1285 vmovdqa $xd3,0x130-0x100(%rcx)
1291 vmovdqa 0x40(%rsp),$xa0 # re-load smashed key
1292 vmovdqa 0x50(%rsp),$xa1
1293 vmovdqa 0x60(%rsp),$xa2
1294 vmovdqa 0x70(%rsp),$xa3
1295 vmovdqa 0x80-0x100(%rcx),$xb0
1296 vmovdqa 0x90-0x100(%rcx),$xb1
1297 vmovdqa 0xa0-0x100(%rcx),$xb2
1298 vmovdqa 0xb0-0x100(%rcx),$xb3
1299 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0"
1300 vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1"
1301 vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2"
1302 vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3"
1303 vmovdqa 0x100-0x100(%rcx),$xd0
1304 vmovdqa 0x110-0x100(%rcx),$xd1
1305 vmovdqa 0x120-0x100(%rcx),$xd2
1306 vmovdqa 0x130-0x100(%rcx),$xd3
1307 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters
1311 vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
1317 foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
1318 foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
1323 vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material
1324 vpaddd 0x50(%rsp),$xa1,$xa1
1325 vpaddd 0x60(%rsp),$xa2,$xa2
1326 vpaddd 0x70(%rsp),$xa3,$xa3
1328 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3
1329 vmovdqa $xt3,0x30(%rsp)
1331 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1332 vpunpckldq $xa3,$xa2,$xt3
1333 vpunpckhdq $xa1,$xa0,$xa0
1334 vpunpckhdq $xa3,$xa2,$xa2
1335 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1336 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1337 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1338 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1340 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1342 vpaddd 0x80-0x100(%rcx),$xb0,$xb0
1343 vpaddd 0x90-0x100(%rcx),$xb1,$xb1
1344 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2
1345 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3
1347 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1
1348 vmovdqa $xa1,0x10(%rsp)
1349 vmovdqa 0x20(%rsp),$xa0 # "xc2"
1350 vmovdqa 0x30(%rsp),$xa1 # "xc3"
1352 vpunpckldq $xb1,$xb0,$xt2
1353 vpunpckldq $xb3,$xb2,$xt3
1354 vpunpckhdq $xb1,$xb0,$xb0
1355 vpunpckhdq $xb3,$xb2,$xb2
1356 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1357 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1358 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1359 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1361 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1362 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1364 vpaddd 0xc0-0x100(%rcx),$xc0,$xc0
1365 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1
1366 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2
1367 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3
1369 vpunpckldq $xc1,$xc0,$xt2
1370 vpunpckldq $xc3,$xc2,$xt3
1371 vpunpckhdq $xc1,$xc0,$xc0
1372 vpunpckhdq $xc3,$xc2,$xc2
1373 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1374 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1375 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1376 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1378 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1380 vpaddd 0x100-0x100(%rcx),$xd0,$xd0
1381 vpaddd 0x110-0x100(%rcx),$xd1,$xd1
1382 vpaddd 0x120-0x100(%rcx),$xd2,$xd2
1383 vpaddd 0x130-0x100(%rcx),$xd3,$xd3
1385 vpunpckldq $xd1,$xd0,$xt2
1386 vpunpckldq $xd3,$xd2,$xt3
1387 vpunpckhdq $xd1,$xd0,$xd0
1388 vpunpckhdq $xd3,$xd2,$xd2
1389 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1390 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1391 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1392 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1394 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1395 ($xa0,$xa1)=($xt2,$xt3);
1397 vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1
1398 vmovdqa 0x10(%rsp),$xa1
1403 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1404 vpxor 0x10($inp),$xb0,$xb0
1405 vpxor 0x20($inp),$xc0,$xc0
1406 vpxor 0x30($inp),$xd0,$xd0
1407 vpxor 0x40($inp),$xa1,$xa1
1408 vpxor 0x50($inp),$xb1,$xb1
1409 vpxor 0x60($inp),$xc1,$xc1
1410 vpxor 0x70($inp),$xd1,$xd1
1411 lea 0x80($inp),$inp # size optimization
1412 vpxor 0x00($inp),$xa2,$xa2
1413 vpxor 0x10($inp),$xb2,$xb2
1414 vpxor 0x20($inp),$xc2,$xc2
1415 vpxor 0x30($inp),$xd2,$xd2
1416 vpxor 0x40($inp),$xa3,$xa3
1417 vpxor 0x50($inp),$xb3,$xb3
1418 vpxor 0x60($inp),$xc3,$xc3
1419 vpxor 0x70($inp),$xd3,$xd3
1420 lea 0x80($inp),$inp # inp+=64*4
1422 vmovdqu $xa0,0x00($out)
1423 vmovdqu $xb0,0x10($out)
1424 vmovdqu $xc0,0x20($out)
1425 vmovdqu $xd0,0x30($out)
1426 vmovdqu $xa1,0x40($out)
1427 vmovdqu $xb1,0x50($out)
1428 vmovdqu $xc1,0x60($out)
1429 vmovdqu $xd1,0x70($out)
1430 lea 0x80($out),$out # size optimization
1431 vmovdqu $xa2,0x00($out)
1432 vmovdqu $xb2,0x10($out)
1433 vmovdqu $xc2,0x20($out)
1434 vmovdqu $xd2,0x30($out)
1435 vmovdqu $xa3,0x40($out)
1436 vmovdqu $xb3,0x50($out)
1437 vmovdqu $xc3,0x60($out)
1438 vmovdqu $xd3,0x70($out)
1439 lea 0x80($out),$out # out+=64*4
1449 jae .L192_or_more4xop
1451 jae .L128_or_more4xop
1453 jae .L64_or_more4xop
1456 vmovdqa $xa0,0x00(%rsp)
1457 vmovdqa $xb0,0x10(%rsp)
1458 vmovdqa $xc0,0x20(%rsp)
1459 vmovdqa $xd0,0x30(%rsp)
1464 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1465 vpxor 0x10($inp),$xb0,$xb0
1466 vpxor 0x20($inp),$xc0,$xc0
1467 vpxor 0x30($inp),$xd0,$xd0
1468 vmovdqu $xa0,0x00($out)
1469 vmovdqu $xb0,0x10($out)
1470 vmovdqu $xc0,0x20($out)
1471 vmovdqu $xd0,0x30($out)
1474 lea 0x40($inp),$inp # inp+=64*1
1475 vmovdqa $xa1,0x00(%rsp)
1477 vmovdqa $xb1,0x10(%rsp)
1478 lea 0x40($out),$out # out+=64*1
1479 vmovdqa $xc1,0x20(%rsp)
1480 sub \$64,$len # len-=64*1
1481 vmovdqa $xd1,0x30(%rsp)
1486 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1487 vpxor 0x10($inp),$xb0,$xb0
1488 vpxor 0x20($inp),$xc0,$xc0
1489 vpxor 0x30($inp),$xd0,$xd0
1490 vpxor 0x40($inp),$xa1,$xa1
1491 vpxor 0x50($inp),$xb1,$xb1
1492 vpxor 0x60($inp),$xc1,$xc1
1493 vpxor 0x70($inp),$xd1,$xd1
1495 vmovdqu $xa0,0x00($out)
1496 vmovdqu $xb0,0x10($out)
1497 vmovdqu $xc0,0x20($out)
1498 vmovdqu $xd0,0x30($out)
1499 vmovdqu $xa1,0x40($out)
1500 vmovdqu $xb1,0x50($out)
1501 vmovdqu $xc1,0x60($out)
1502 vmovdqu $xd1,0x70($out)
1505 lea 0x80($inp),$inp # inp+=64*2
1506 vmovdqa $xa2,0x00(%rsp)
1508 vmovdqa $xb2,0x10(%rsp)
1509 lea 0x80($out),$out # out+=64*2
1510 vmovdqa $xc2,0x20(%rsp)
1511 sub \$128,$len # len-=64*2
1512 vmovdqa $xd2,0x30(%rsp)
1517 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1518 vpxor 0x10($inp),$xb0,$xb0
1519 vpxor 0x20($inp),$xc0,$xc0
1520 vpxor 0x30($inp),$xd0,$xd0
1521 vpxor 0x40($inp),$xa1,$xa1
1522 vpxor 0x50($inp),$xb1,$xb1
1523 vpxor 0x60($inp),$xc1,$xc1
1524 vpxor 0x70($inp),$xd1,$xd1
1525 lea 0x80($inp),$inp # size optimization
1526 vpxor 0x00($inp),$xa2,$xa2
1527 vpxor 0x10($inp),$xb2,$xb2
1528 vpxor 0x20($inp),$xc2,$xc2
1529 vpxor 0x30($inp),$xd2,$xd2
1531 vmovdqu $xa0,0x00($out)
1532 vmovdqu $xb0,0x10($out)
1533 vmovdqu $xc0,0x20($out)
1534 vmovdqu $xd0,0x30($out)
1535 vmovdqu $xa1,0x40($out)
1536 vmovdqu $xb1,0x50($out)
1537 vmovdqu $xc1,0x60($out)
1538 vmovdqu $xd1,0x70($out)
1539 lea 0x80($out),$out # size optimization
1540 vmovdqu $xa2,0x00($out)
1541 vmovdqu $xb2,0x10($out)
1542 vmovdqu $xc2,0x20($out)
1543 vmovdqu $xd2,0x30($out)
1546 lea 0x40($inp),$inp # inp+=64*3
1547 vmovdqa $xa3,0x00(%rsp)
1549 vmovdqa $xb3,0x10(%rsp)
1550 lea 0x40($out),$out # out+=64*3
1551 vmovdqa $xc3,0x20(%rsp)
1552 sub \$192,$len # len-=64*3
1553 vmovdqa $xd3,0x30(%rsp)
1556 movzb ($inp,%r10),%eax
1557 movzb (%rsp,%r10),%ecx
1560 mov %al,-1($out,%r10)
1567 $code.=<<___ if ($win64);
1568 lea 0x140+0x30(%rsp),%r11
1569 movaps -0x30(%r11),%xmm6
1570 movaps -0x20(%r11),%xmm7
1571 movaps -0x10(%r11),%xmm8
1572 movaps 0x00(%r11),%xmm9
1573 movaps 0x10(%r11),%xmm10
1574 movaps 0x20(%r11),%xmm11
1575 movaps 0x30(%r11),%xmm12
1576 movaps 0x40(%r11),%xmm13
1577 movaps 0x50(%r11),%xmm14
1578 movaps 0x60(%r11),%xmm15
1581 add \$0x148+$xframe,%rsp
1583 .size ChaCha20_4xop,.-ChaCha20_4xop
1587 ########################################################################
1590 my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
1591 $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
1592 my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
1593 "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);
1595 sub AVX2_lane_ROUND {
1596 my ($a0,$b0,$c0,$d0)=@_;
1597 my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
1598 my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
1599 my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
1600 my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
1601 my @x=map("\"$_\"",@xx);
1603 # Consider the order in which variables are addressed by their index:
1608 # 0 4 8 12 < even round
1612 # 0 5 10 15 < odd round
1617 # 'a', 'b' and 'd's are permanently allocated in registers,
1618 # @x[0..7,12..15], while the 'c's are maintained in memory. If
1619 # you observe the 'c' column, you'll notice that a pair of 'c's is
1620 # invariant between rounds. This means that we have to reload
1621 # them once per round, in the middle. This is why you'll see a
1622 # bunch of 'c' stores and loads in the middle, but none at
1623 # the beginning or end.
1626 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1
1627 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1628 "&vpshufb (@x[$d0],@x[$d0],$t1)",
1629 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2
1630 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1631 "&vpshufb (@x[$d1],@x[$d1],$t1)",
1633 "&vpaddd ($xc,$xc,@x[$d0])",
1634 "&vpxor (@x[$b0],$xc,@x[$b0])",
1635 "&vpslld ($t0,@x[$b0],12)",
1636 "&vpsrld (@x[$b0],@x[$b0],20)",
1637 "&vpor (@x[$b0],$t0,@x[$b0])",
1638 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1639 "&vpaddd ($xc_,$xc_,@x[$d1])",
1640 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1641 "&vpslld ($t1,@x[$b1],12)",
1642 "&vpsrld (@x[$b1],@x[$b1],20)",
1643 "&vpor (@x[$b1],$t1,@x[$b1])",
1645 "&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
1646 "&vpxor (@x[$d0],@x[$a0],@x[$d0])",
1647 "&vpshufb (@x[$d0],@x[$d0],$t0)",
1648 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
1649 "&vpxor (@x[$d1],@x[$a1],@x[$d1])",
1650 "&vpshufb (@x[$d1],@x[$d1],$t0)",
1652 "&vpaddd ($xc,$xc,@x[$d0])",
1653 "&vpxor (@x[$b0],$xc,@x[$b0])",
1654 "&vpslld ($t1,@x[$b0],7)",
1655 "&vpsrld (@x[$b0],@x[$b0],25)",
1656 "&vpor (@x[$b0],$t1,@x[$b0])",
1657 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1658 "&vpaddd ($xc_,$xc_,@x[$d1])",
1659 "&vpxor (@x[$b1],$xc_,@x[$b1])",
1660 "&vpslld ($t0,@x[$b1],7)",
1661 "&vpsrld (@x[$b1],@x[$b1],25)",
1662 "&vpor (@x[$b1],$t0,@x[$b1])",
1664 "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
1665 "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
1666 "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
1667 "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",
1669 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3
1670 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1671 "&vpshufb (@x[$d2],@x[$d2],$t1)",
1672 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4
1673 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1674 "&vpshufb (@x[$d3],@x[$d3],$t1)",
1676 "&vpaddd ($xc,$xc,@x[$d2])",
1677 "&vpxor (@x[$b2],$xc,@x[$b2])",
1678 "&vpslld ($t0,@x[$b2],12)",
1679 "&vpsrld (@x[$b2],@x[$b2],20)",
1680 "&vpor (@x[$b2],$t0,@x[$b2])",
1681 "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip)
1682 "&vpaddd ($xc_,$xc_,@x[$d3])",
1683 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1684 "&vpslld ($t1,@x[$b3],12)",
1685 "&vpsrld (@x[$b3],@x[$b3],20)",
1686 "&vpor (@x[$b3],$t1,@x[$b3])",
1688 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
1689 "&vpxor (@x[$d2],@x[$a2],@x[$d2])",
1690 "&vpshufb (@x[$d2],@x[$d2],$t0)",
1691 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
1692 "&vpxor (@x[$d3],@x[$a3],@x[$d3])",
1693 "&vpshufb (@x[$d3],@x[$d3],$t0)",
1695 "&vpaddd ($xc,$xc,@x[$d2])",
1696 "&vpxor (@x[$b2],$xc,@x[$b2])",
1697 "&vpslld ($t1,@x[$b2],7)",
1698 "&vpsrld (@x[$b2],@x[$b2],25)",
1699 "&vpor (@x[$b2],$t1,@x[$b2])",
1700 "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
1701 "&vpaddd ($xc_,$xc_,@x[$d3])",
1702 "&vpxor (@x[$b3],$xc_,@x[$b3])",
1703 "&vpslld ($t0,@x[$b3],7)",
1704 "&vpsrld (@x[$b3],@x[$b3],25)",
1705 "&vpor (@x[$b3],$t0,@x[$b3])"
1709 my $xframe = $win64 ? 0xb0 : 8;
1712 .type ChaCha20_8x,\@function,5
1717 sub \$0x280+$xframe,%rsp
1720 $code.=<<___ if ($win64);
1721 lea 0x290+0x30(%rsp),%r11
1722 movaps %xmm6,-0x30(%r11)
1723 movaps %xmm7,-0x20(%r11)
1724 movaps %xmm8,-0x10(%r11)
1725 movaps %xmm9,0x00(%r11)
1726 movaps %xmm10,0x10(%r11)
1727 movaps %xmm11,0x20(%r11)
1728 movaps %xmm12,0x30(%r11)
1729 movaps %xmm13,0x40(%r11)
1730 movaps %xmm14,0x50(%r11)
1731 movaps %xmm15,0x60(%r11)
1735 mov %r10,0x280(%rsp)
1737 ################ stack layout
1738 # +0x00 SIMD equivalent of @x[8-11]
1740 # +0x80 constant copy of key[0-2] smashed by lanes
1742 # +0x200 SIMD counters (with nonce smashed by lanes)
1746 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0]
1747 vbroadcasti128 ($key),$xb3 # key[1]
1748 vbroadcasti128 16($key),$xt3 # key[2]
1749 vbroadcasti128 ($counter),$xd3 # key[3]
1750 lea 0x100(%rsp),%rcx # size optimization
1751 lea 0x200(%rsp),%rax # size optimization
1752 lea .Lrot16(%rip),%r10
1753 lea .Lrot24(%rip),%r11
1755 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
1756 vpshufd \$0x55,$xa3,$xa1
1757 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload
1758 vpshufd \$0xaa,$xa3,$xa2
1759 vmovdqa $xa1,0xa0-0x100(%rcx)
1760 vpshufd \$0xff,$xa3,$xa3
1761 vmovdqa $xa2,0xc0-0x100(%rcx)
1762 vmovdqa $xa3,0xe0-0x100(%rcx)
1764 vpshufd \$0x00,$xb3,$xb0
1765 vpshufd \$0x55,$xb3,$xb1
1766 vmovdqa $xb0,0x100-0x100(%rcx)
1767 vpshufd \$0xaa,$xb3,$xb2
1768 vmovdqa $xb1,0x120-0x100(%rcx)
1769 vpshufd \$0xff,$xb3,$xb3
1770 vmovdqa $xb2,0x140-0x100(%rcx)
1771 vmovdqa $xb3,0x160-0x100(%rcx)
1773 vpshufd \$0x00,$xt3,$xt0 # "xc0"
1774 vpshufd \$0x55,$xt3,$xt1 # "xc1"
1775 vmovdqa $xt0,0x180-0x200(%rax)
1776 vpshufd \$0xaa,$xt3,$xt2 # "xc2"
1777 vmovdqa $xt1,0x1a0-0x200(%rax)
1778 vpshufd \$0xff,$xt3,$xt3 # "xc3"
1779 vmovdqa $xt2,0x1c0-0x200(%rax)
1780 vmovdqa $xt3,0x1e0-0x200(%rax)
1782 vpshufd \$0x00,$xd3,$xd0
1783 vpshufd \$0x55,$xd3,$xd1
1784 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet
1785 vpshufd \$0xaa,$xd3,$xd2
1786 vmovdqa $xd1,0x220-0x200(%rax)
1787 vpshufd \$0xff,$xd3,$xd3
1788 vmovdqa $xd2,0x240-0x200(%rax)
1789 vmovdqa $xd3,0x260-0x200(%rax)
1795 vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key
1796 vmovdqa 0xa0-0x100(%rcx),$xa1
1797 vmovdqa 0xc0-0x100(%rcx),$xa2
1798 vmovdqa 0xe0-0x100(%rcx),$xa3
1799 vmovdqa 0x100-0x100(%rcx),$xb0
1800 vmovdqa 0x120-0x100(%rcx),$xb1
1801 vmovdqa 0x140-0x100(%rcx),$xb2
1802 vmovdqa 0x160-0x100(%rcx),$xb3
1803 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0"
1804 vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1"
1805 vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2"
1806 vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3"
1807 vmovdqa 0x200-0x200(%rax),$xd0
1808 vmovdqa 0x220-0x200(%rax),$xd1
1809 vmovdqa 0x240-0x200(%rax),$xd2
1810 vmovdqa 0x260-0x200(%rax),$xd3
1811 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters
1814 vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
1815 vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
1816 vbroadcasti128 (%r10),$xt3
1817 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
1824 foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; }
1825 foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; }
1830 lea 0x200(%rsp),%rax # size optimization
1831 vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key
1832 vpaddd 0xa0-0x100(%rcx),$xa1,$xa1
1833 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2
1834 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3
1836 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data
1837 vpunpckldq $xa3,$xa2,$xt3
1838 vpunpckhdq $xa1,$xa0,$xa0
1839 vpunpckhdq $xa3,$xa2,$xa2
1840 vpunpcklqdq $xt3,$xt2,$xa1 # "a0"
1841 vpunpckhqdq $xt3,$xt2,$xt2 # "a1"
1842 vpunpcklqdq $xa2,$xa0,$xa3 # "a2"
1843 vpunpckhqdq $xa2,$xa0,$xa0 # "a3"
1845 ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
1847 vpaddd 0x100-0x100(%rcx),$xb0,$xb0
1848 vpaddd 0x120-0x100(%rcx),$xb1,$xb1
1849 vpaddd 0x140-0x100(%rcx),$xb2,$xb2
1850 vpaddd 0x160-0x100(%rcx),$xb3,$xb3
1852 vpunpckldq $xb1,$xb0,$xt2
1853 vpunpckldq $xb3,$xb2,$xt3
1854 vpunpckhdq $xb1,$xb0,$xb0
1855 vpunpckhdq $xb3,$xb2,$xb2
1856 vpunpcklqdq $xt3,$xt2,$xb1 # "b0"
1857 vpunpckhqdq $xt3,$xt2,$xt2 # "b1"
1858 vpunpcklqdq $xb2,$xb0,$xb3 # "b2"
1859 vpunpckhqdq $xb2,$xb0,$xb0 # "b3"
1861 ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
1863 vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further
1864 vperm2i128 \$0x31,$xb0,$xa0,$xb0
1865 vperm2i128 \$0x20,$xb1,$xa1,$xa0
1866 vperm2i128 \$0x31,$xb1,$xa1,$xb1
1867 vperm2i128 \$0x20,$xb2,$xa2,$xa1
1868 vperm2i128 \$0x31,$xb2,$xa2,$xb2
1869 vperm2i128 \$0x20,$xb3,$xa3,$xa2
1870 vperm2i128 \$0x31,$xb3,$xa3,$xb3
1872 ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
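# Because each YMM register spans two 128-bit halves, the dword/qword unpacks
# alone are not enough here; the vperm2i128 pass above completes the
# transposition so that the results can be XORed against contiguous input.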
1873 my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
1875 vmovdqa $xa0,0x00(%rsp) # offload $xaN
1876 vmovdqa $xa1,0x20(%rsp)
1877 vmovdqa 0x40(%rsp),$xc2 # $xa0
1878 vmovdqa 0x60(%rsp),$xc3 # $xa1
1880 vpaddd 0x180-0x200(%rax),$xc0,$xc0
1881 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1
1882 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2
1883 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3
1885 vpunpckldq $xc1,$xc0,$xt2
1886 vpunpckldq $xc3,$xc2,$xt3
1887 vpunpckhdq $xc1,$xc0,$xc0
1888 vpunpckhdq $xc3,$xc2,$xc2
1889 vpunpcklqdq $xt3,$xt2,$xc1 # "c0"
1890 vpunpckhqdq $xt3,$xt2,$xt2 # "c1"
1891 vpunpcklqdq $xc2,$xc0,$xc3 # "c2"
1892 vpunpckhqdq $xc2,$xc0,$xc0 # "c3"
1894 ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
1896 vpaddd 0x200-0x200(%rax),$xd0,$xd0
1897 vpaddd 0x220-0x200(%rax),$xd1,$xd1
1898 vpaddd 0x240-0x200(%rax),$xd2,$xd2
1899 vpaddd 0x260-0x200(%rax),$xd3,$xd3
1901 vpunpckldq $xd1,$xd0,$xt2
1902 vpunpckldq $xd3,$xd2,$xt3
1903 vpunpckhdq $xd1,$xd0,$xd0
1904 vpunpckhdq $xd3,$xd2,$xd2
1905 vpunpcklqdq $xt3,$xt2,$xd1 # "d0"
1906 vpunpckhqdq $xt3,$xt2,$xt2 # "d1"
1907 vpunpcklqdq $xd2,$xd0,$xd3 # "d2"
1908 vpunpckhqdq $xd2,$xd0,$xd0 # "d3"
1910 ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
1912 vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further
1913 vperm2i128 \$0x31,$xd0,$xc0,$xd0
1914 vperm2i128 \$0x20,$xd1,$xc1,$xc0
1915 vperm2i128 \$0x31,$xd1,$xc1,$xd1
1916 vperm2i128 \$0x20,$xd2,$xc2,$xc1
1917 vperm2i128 \$0x31,$xd2,$xc2,$xd2
1918 vperm2i128 \$0x20,$xd3,$xc3,$xc2
1919 vperm2i128 \$0x31,$xd3,$xc3,$xd3
1921 ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
1922 ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)=
1923 ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3);
1924 ($xa0,$xa1)=($xt2,$xt3);
1926 vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember?
1927 vmovdqa 0x20(%rsp),$xa1
1932 vpxor 0x00($inp),$xa0,$xa0 # xor with input
1933 vpxor 0x20($inp),$xb0,$xb0
1934 vpxor 0x40($inp),$xc0,$xc0
1935 vpxor 0x60($inp),$xd0,$xd0
1936 lea 0x80($inp),$inp # size optimization
1937 vmovdqu $xa0,0x00($out)
1938 vmovdqu $xb0,0x20($out)
1939 vmovdqu $xc0,0x40($out)
1940 vmovdqu $xd0,0x60($out)
1941 lea 0x80($out),$out # size optimization
1943 vpxor 0x00($inp),$xa1,$xa1
1944 vpxor 0x20($inp),$xb1,$xb1
1945 vpxor 0x40($inp),$xc1,$xc1
1946 vpxor 0x60($inp),$xd1,$xd1
1947 lea 0x80($inp),$inp # size optimization
1948 vmovdqu $xa1,0x00($out)
1949 vmovdqu $xb1,0x20($out)
1950 vmovdqu $xc1,0x40($out)
1951 vmovdqu $xd1,0x60($out)
1952 lea 0x80($out),$out # size optimization
1954 vpxor 0x00($inp),$xa2,$xa2
1955 vpxor 0x20($inp),$xb2,$xb2
1956 vpxor 0x40($inp),$xc2,$xc2
1957 vpxor 0x60($inp),$xd2,$xd2
1958 lea 0x80($inp),$inp # size optimization
1959 vmovdqu $xa2,0x00($out)
1960 vmovdqu $xb2,0x20($out)
1961 vmovdqu $xc2,0x40($out)
1962 vmovdqu $xd2,0x60($out)
1963 lea 0x80($out),$out # size optimization
1965 vpxor 0x00($inp),$xa3,$xa3
1966 vpxor 0x20($inp),$xb3,$xb3
1967 vpxor 0x40($inp),$xc3,$xc3
1968 vpxor 0x60($inp),$xd3,$xd3
1969 lea 0x80($inp),$inp # size optimization
1970 vmovdqu $xa3,0x00($out)
1971 vmovdqu $xb3,0x20($out)
1972 vmovdqu $xc3,0x40($out)
1973 vmovdqu $xd3,0x60($out)
1974 lea 0x80($out),$out # size optimization
1998 vmovdqa $xa0,0x00(%rsp)
1999 vmovdqa $xb0,0x20(%rsp)
2004 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2005 vpxor 0x20($inp),$xb0,$xb0
2006 vmovdqu $xa0,0x00($out)
2007 vmovdqu $xb0,0x20($out)
2010 lea 0x40($inp),$inp # inp+=64*1
2012 vmovdqa $xc0,0x00(%rsp)
2013 lea 0x40($out),$out # out+=64*1
2014 sub \$64,$len # len-=64*1
2015 vmovdqa $xd0,0x20(%rsp)
2020 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2021 vpxor 0x20($inp),$xb0,$xb0
2022 vpxor 0x40($inp),$xc0,$xc0
2023 vpxor 0x60($inp),$xd0,$xd0
2024 vmovdqu $xa0,0x00($out)
2025 vmovdqu $xb0,0x20($out)
2026 vmovdqu $xc0,0x40($out)
2027 vmovdqu $xd0,0x60($out)
2030 lea 0x80($inp),$inp # inp+=64*2
2032 vmovdqa $xa1,0x00(%rsp)
2033 lea 0x80($out),$out # out+=64*2
2034 sub \$128,$len # len-=64*2
2035 vmovdqa $xb1,0x20(%rsp)
2040 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2041 vpxor 0x20($inp),$xb0,$xb0
2042 vpxor 0x40($inp),$xc0,$xc0
2043 vpxor 0x60($inp),$xd0,$xd0
2044 vpxor 0x80($inp),$xa1,$xa1
2045 vpxor 0xa0($inp),$xb1,$xb1
2046 vmovdqu $xa0,0x00($out)
2047 vmovdqu $xb0,0x20($out)
2048 vmovdqu $xc0,0x40($out)
2049 vmovdqu $xd0,0x60($out)
2050 vmovdqu $xa1,0x80($out)
2051 vmovdqu $xb1,0xa0($out)
2054 lea 0xc0($inp),$inp # inp+=64*3
2056 vmovdqa $xc1,0x00(%rsp)
2057 lea 0xc0($out),$out # out+=64*3
2058 sub \$192,$len # len-=64*3
2059 vmovdqa $xd1,0x20(%rsp)
2064 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2065 vpxor 0x20($inp),$xb0,$xb0
2066 vpxor 0x40($inp),$xc0,$xc0
2067 vpxor 0x60($inp),$xd0,$xd0
2068 vpxor 0x80($inp),$xa1,$xa1
2069 vpxor 0xa0($inp),$xb1,$xb1
2070 vpxor 0xc0($inp),$xc1,$xc1
2071 vpxor 0xe0($inp),$xd1,$xd1
2072 vmovdqu $xa0,0x00($out)
2073 vmovdqu $xb0,0x20($out)
2074 vmovdqu $xc0,0x40($out)
2075 vmovdqu $xd0,0x60($out)
2076 vmovdqu $xa1,0x80($out)
2077 vmovdqu $xb1,0xa0($out)
2078 vmovdqu $xc1,0xc0($out)
2079 vmovdqu $xd1,0xe0($out)
2082 lea 0x100($inp),$inp # inp+=64*4
2084 vmovdqa $xa2,0x00(%rsp)
2085 lea 0x100($out),$out # out+=64*4
2086 sub \$256,$len # len-=64*4
2087 vmovdqa $xb2,0x20(%rsp)
2092 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2093 vpxor 0x20($inp),$xb0,$xb0
2094 vpxor 0x40($inp),$xc0,$xc0
2095 vpxor 0x60($inp),$xd0,$xd0
2096 vpxor 0x80($inp),$xa1,$xa1
2097 vpxor 0xa0($inp),$xb1,$xb1
2098 vpxor 0xc0($inp),$xc1,$xc1
2099 vpxor 0xe0($inp),$xd1,$xd1
2100 vpxor 0x100($inp),$xa2,$xa2
2101 vpxor 0x120($inp),$xb2,$xb2
2102 vmovdqu $xa0,0x00($out)
2103 vmovdqu $xb0,0x20($out)
2104 vmovdqu $xc0,0x40($out)
2105 vmovdqu $xd0,0x60($out)
2106 vmovdqu $xa1,0x80($out)
2107 vmovdqu $xb1,0xa0($out)
2108 vmovdqu $xc1,0xc0($out)
2109 vmovdqu $xd1,0xe0($out)
2110 vmovdqu $xa2,0x100($out)
2111 vmovdqu $xb2,0x120($out)
2114 lea 0x140($inp),$inp # inp+=64*5
2116 vmovdqa $xc2,0x00(%rsp)
2117 lea 0x140($out),$out # out+=64*5
2118 sub \$320,$len # len-=64*5
2119 vmovdqa $xd2,0x20(%rsp)
2124 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2125 vpxor 0x20($inp),$xb0,$xb0
2126 vpxor 0x40($inp),$xc0,$xc0
2127 vpxor 0x60($inp),$xd0,$xd0
2128 vpxor 0x80($inp),$xa1,$xa1
2129 vpxor 0xa0($inp),$xb1,$xb1
2130 vpxor 0xc0($inp),$xc1,$xc1
2131 vpxor 0xe0($inp),$xd1,$xd1
2132 vpxor 0x100($inp),$xa2,$xa2
2133 vpxor 0x120($inp),$xb2,$xb2
2134 vpxor 0x140($inp),$xc2,$xc2
2135 vpxor 0x160($inp),$xd2,$xd2
2136 vmovdqu $xa0,0x00($out)
2137 vmovdqu $xb0,0x20($out)
2138 vmovdqu $xc0,0x40($out)
2139 vmovdqu $xd0,0x60($out)
2140 vmovdqu $xa1,0x80($out)
2141 vmovdqu $xb1,0xa0($out)
2142 vmovdqu $xc1,0xc0($out)
2143 vmovdqu $xd1,0xe0($out)
2144 vmovdqu $xa2,0x100($out)
2145 vmovdqu $xb2,0x120($out)
2146 vmovdqu $xc2,0x140($out)
2147 vmovdqu $xd2,0x160($out)
2150 lea 0x180($inp),$inp # inp+=64*6
2152 vmovdqa $xa3,0x00(%rsp)
2153 lea 0x180($out),$out # out+=64*6
2154 sub \$384,$len # len-=64*6
2155 vmovdqa $xb3,0x20(%rsp)
2160 vpxor 0x00($inp),$xa0,$xa0 # xor with input
2161 vpxor 0x20($inp),$xb0,$xb0
2162 vpxor 0x40($inp),$xc0,$xc0
2163 vpxor 0x60($inp),$xd0,$xd0
2164 vpxor 0x80($inp),$xa1,$xa1
2165 vpxor 0xa0($inp),$xb1,$xb1
2166 vpxor 0xc0($inp),$xc1,$xc1
2167 vpxor 0xe0($inp),$xd1,$xd1
2168 vpxor 0x100($inp),$xa2,$xa2
2169 vpxor 0x120($inp),$xb2,$xb2
2170 vpxor 0x140($inp),$xc2,$xc2
2171 vpxor 0x160($inp),$xd2,$xd2
2172 vpxor 0x180($inp),$xa3,$xa3
2173 vpxor 0x1a0($inp),$xb3,$xb3
2174 vmovdqu $xa0,0x00($out)
2175 vmovdqu $xb0,0x20($out)
2176 vmovdqu $xc0,0x40($out)
2177 vmovdqu $xd0,0x60($out)
2178 vmovdqu $xa1,0x80($out)
2179 vmovdqu $xb1,0xa0($out)
2180 vmovdqu $xc1,0xc0($out)
2181 vmovdqu $xd1,0xe0($out)
2182 vmovdqu $xa2,0x100($out)
2183 vmovdqu $xb2,0x120($out)
2184 vmovdqu $xc2,0x140($out)
2185 vmovdqu $xd2,0x160($out)
2186 vmovdqu $xa3,0x180($out)
2187 vmovdqu $xb3,0x1a0($out)
2190 lea 0x1c0($inp),$inp # inp+=64*7
2192 vmovdqa $xc3,0x00(%rsp)
2193 lea 0x1c0($out),$out # out+=64*7
2194 sub \$448,$len # len-=64*7
2195 vmovdqa $xd3,0x20(%rsp)
2198 movzb ($inp,%r10),%eax
2199 movzb (%rsp,%r10),%ecx
2202 mov %al,-1($out,%r10)
2209 $code.=<<___ if ($win64);
2210 lea 0x290+0x30(%rsp),%r11
2211 movaps -0x30(%r11),%xmm6
2212 movaps -0x20(%r11),%xmm7
2213 movaps -0x10(%r11),%xmm8
2214 movaps 0x00(%r11),%xmm9
2215 movaps 0x10(%r11),%xmm10
2216 movaps 0x20(%r11),%xmm11
2217 movaps 0x30(%r11),%xmm12
2218 movaps 0x40(%r11),%xmm13
2219 movaps 0x50(%r11),%xmm14
2220 movaps 0x60(%r11),%xmm15
2223 mov 0x280(%rsp),%rsp
2225 .size ChaCha20_8x,.-ChaCha20_8x
2229 foreach (split("\n",$code)) {
2230 s/\`([^\`]*)\`/eval $1/geo;
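	# backquoted expressions such as `1<<5` are evaluated to their numeric
	# value here, presumably before each line is printed to the OUT pipe
	# (the x86_64-xlate.pl process opened above)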