2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for x86_64.
25 # Add AVX512F+VL+BW code path.
27 # Numbers are cycles per processed byte with poly1305_blocks alone,
28 # measured with rdtsc at fixed clock frequency.
30 #                  IALU/gcc-4.8(*)  AVX(**)   AVX2
33 # Westmere         1.88/+120%       -
34 # Sandy Bridge     1.39/+140%       1.10
35 # Haswell          1.14/+175%       1.11      0.65
36 # Skylake          1.13/+120%       0.96      0.51
37 # Silvermont       2.83/+95%        -
38 # Goldmont         1.70/+180%       -
39 # VIA Nano         1.82/+150%       -
40 # Sledgehammer     1.38/+160%       -
41 # Bulldozer        2.30/+130%       0.97
43 # (*) improvement coefficients relative to clang are more modest and
44 # are ~50% on most processors, in both cases we are comparing to __int128 code;
46 # (**) an SSE2 implementation was attempted, but among non-AVX processors
47 # it was faster than the integer-only code only on older Intel P4 and
48 # Core processors, by 30-50%, with the gain shrinking on newer parts,
49 # while slower on contemporary ones, e.g. almost 2x slower on Atom; as
50 # the former are naturally disappearing, SSE2 is deemed unnecessary;
54 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
56 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
60 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
61 die "can't locate x86_64-xlate.pl";
63 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
64 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
65 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
68 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
69 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
70 $avx = ($1>=2.09) + ($1>=2.10);
73 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
74 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
75 $avx = ($1>=10) + ($1>=12);
78 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
79 $avx = ($2>=3.0) + ($2>3.0);
82 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
85 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
86 my ($mac,$nonce)=($inp,$len); # *_emit arguments
87 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
88 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
90 sub poly1305_iteration {
91 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
92 # output: $h0-$h2 *= $r0-$r1
100 mov %rax,$h0 # future $h0
110 mov $h2,$h1 # borrow $h1
114 imulq $s1,$h1 # h2*s1
119 imulq $r0,$h2 # h2*r0
121 mov \$-4,%rax # mask value
124 and $d3,%rax # last reduction step
135 ########################################################################
136 # Layout of the opaque area is as follows.
138 # unsigned __int64 h[3]; # current hash value base 2^64
139 # unsigned __int64 r[2]; # key value base 2^64
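#
# For orientation, what the code below computes per 16-byte block m is,
# informally (a reference sketch, not part of the generated code):
#
#	h  = h + m + padbit*2^128		# 128-bit block, padded
#	h  = h * r mod 2^130-5			# poly1305_iteration
#
# The pre-scaled s1 = r1 + (r1>>2) kept alongside r1 equals 5*r1/4;
# key clamping clears the low two bits of r1 and 2^130 = 5 (mod p),
# so the h1*r1 partial product at weight 2^128 folds down to h1*s1
# at weight 2^0.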
144 .extern OPENSSL_ia32cap_P
147 .hidden poly1305_init
148 .globl poly1305_blocks
149 .hidden poly1305_blocks
151 .hidden poly1305_emit
153 .type poly1305_init,\@function,3
157 mov %rax,0($ctx) # initialize hash value
164 lea poly1305_blocks(%rip),%r10
165 lea poly1305_emit(%rip),%r11
167 $code.=<<___ if ($avx);
168 mov OPENSSL_ia32cap_P+4(%rip),%r9
169 lea poly1305_blocks_avx(%rip),%rax
170 lea poly1305_emit_avx(%rip),%rcx
171 bt \$`60-32`,%r9 # AVX?
175 $code.=<<___ if ($avx>1);
176 lea poly1305_blocks_avx2(%rip),%rax
177 bt \$`5+32`,%r9 # AVX2?
181 mov \$0x0ffffffc0fffffff,%rax
182 mov \$0x0ffffffc0ffffffc,%rcx
188 $code.=<<___ if ($flavour !~ /elf32/);
192 $code.=<<___ if ($flavour =~ /elf32/);
200 .size poly1305_init,.-poly1305_init
202 .type poly1305_blocks,\@function,4
207 jz .Lno_data # too short
217 mov $len,%r15 # reassign $len
219 mov 24($ctx),$r0 # load r
222 mov 0($ctx),$h0 # load hash value
229 add $r1,$s1 # s1 = r1 + (r1 >> 2)
234 add 0($inp),$h0 # accumulate input
239 &poly1305_iteration();
245 mov $h0,0($ctx) # store hash value
259 .size poly1305_blocks,.-poly1305_blocks
261 .type poly1305_emit,\@function,3
265 mov 0($ctx),%r8 # load hash value
270 add \$5,%r8 # compare to modulus
274 shr \$2,%r10 # did 130-bit value overflow?
278 add 0($nonce),%rax # accumulate nonce
280 mov %rax,0($mac) # write result
284 .size poly1305_emit,.-poly1305_emit
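#
# The final reduction in poly1305_emit works as follows (informally):
# compute t = h + 5; if the addition carries out of bit 130, then
# h >= 2^130-5 and t mod 2^130 = h - p is the fully reduced value,
# otherwise h already is. The carry drives the conditional selection,
# and only then is the 128-bit nonce added modulo 2^128.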
288 ########################################################################
289 # Layout of the opaque area is as follows.
291 # unsigned __int32 h[5]; # current hash value base 2^26
292 # unsigned __int32 is_base2_26;
293 # unsigned __int64 r[2]; # key value base 2^64
294 # unsigned __int64 pad;
295 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
297 # where r^n are the base 2^26 digits of powers of the multiplier key. There
298 # are 5 digits, but the last four are interleaved with their multiples of 5,
299 # totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
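#
# For reference (informal): a 130-bit value x is carried in this format as
#
#	x = x0 + x1*2^26 + x2*2^52 + x3*2^78 + x4*2^104,   0 <= xi < 2^26
#
# and since 2^130 = 5 (mod 2^130-5), any partial product whose limb
# indices sum to 5 or more wraps around with an extra factor of 5,
# which is why 5*r1..5*r4 are pre-computed next to r1..r4.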
301 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
302 map("%xmm$_",(0..15));
305 .type __poly1305_block,\@abi-omnipotent
309 &poly1305_iteration();
312 .size __poly1305_block,.-__poly1305_block
314 .type __poly1305_init_avx,\@abi-omnipotent
321 lea 48+64($ctx),$ctx # size optimization
324 call __poly1305_block # r^2
326 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
332 mov %eax,`16*0+0-64`($ctx)
334 mov %edx,`16*0+4-64`($ctx)
341 mov %eax,`16*1+0-64`($ctx)
342 lea (%rax,%rax,4),%eax # *5
343 mov %edx,`16*1+4-64`($ctx)
344 lea (%rdx,%rdx,4),%edx # *5
345 mov %eax,`16*2+0-64`($ctx)
347 mov %edx,`16*2+4-64`($ctx)
358 mov %eax,`16*3+0-64`($ctx)
359 lea (%rax,%rax,4),%eax # *5
360 mov %edx,`16*3+4-64`($ctx)
361 lea (%rdx,%rdx,4),%edx # *5
362 mov %eax,`16*4+0-64`($ctx)
364 mov %edx,`16*4+4-64`($ctx)
373 mov %eax,`16*5+0-64`($ctx)
374 lea (%rax,%rax,4),%eax # *5
375 mov %edx,`16*5+4-64`($ctx)
376 lea (%rdx,%rdx,4),%edx # *5
377 mov %eax,`16*6+0-64`($ctx)
379 mov %edx,`16*6+4-64`($ctx)
385 mov $d1#d,`16*7+0-64`($ctx)
386 lea ($d1,$d1,4),$d1 # *5
387 mov $d2#d,`16*7+4-64`($ctx)
388 lea ($d2,$d2,4),$d2 # *5
389 mov $d1#d,`16*8+0-64`($ctx)
390 mov $d2#d,`16*8+4-64`($ctx)
393 call __poly1305_block # r^3
395 mov \$0x3ffffff,%eax # save r^3 base 2^26
399 mov %eax,`16*0+12-64`($ctx)
403 mov %edx,`16*1+12-64`($ctx)
404 lea (%rdx,%rdx,4),%edx # *5
406 mov %edx,`16*2+12-64`($ctx)
412 mov %eax,`16*3+12-64`($ctx)
413 lea (%rax,%rax,4),%eax # *5
415 mov %eax,`16*4+12-64`($ctx)
420 mov %edx,`16*5+12-64`($ctx)
421 lea (%rdx,%rdx,4),%edx # *5
423 mov %edx,`16*6+12-64`($ctx)
428 mov $d1#d,`16*7+12-64`($ctx)
429 lea ($d1,$d1,4),$d1 # *5
430 mov $d1#d,`16*8+12-64`($ctx)
433 call __poly1305_block # r^4
435 mov \$0x3ffffff,%eax # save r^4 base 2^26
439 mov %eax,`16*0+8-64`($ctx)
443 mov %edx,`16*1+8-64`($ctx)
444 lea (%rdx,%rdx,4),%edx # *5
446 mov %edx,`16*2+8-64`($ctx)
452 mov %eax,`16*3+8-64`($ctx)
453 lea (%rax,%rax,4),%eax # *5
455 mov %eax,`16*4+8-64`($ctx)
460 mov %edx,`16*5+8-64`($ctx)
461 lea (%rdx,%rdx,4),%edx # *5
463 mov %edx,`16*6+8-64`($ctx)
468 mov $d1#d,`16*7+8-64`($ctx)
469 lea ($d1,$d1,4),$d1 # *5
470 mov $d1#d,`16*8+8-64`($ctx)
472 lea -48-64($ctx),$ctx # size [de-]optimization
474 .size __poly1305_init_avx,.-__poly1305_init_avx
476 .type poly1305_blocks_avx,\@function,4
479 mov 20($ctx),%r8d # is_base2_26
505 mov $len,%r15 # reassign $len
507 mov 0($ctx),$d1 # load hash value
511 mov 24($ctx),$r0 # load r
514 ################################# base 2^26 -> base 2^64
516 and \$`-1*(1<<31)`,$d1
517 mov $d2,$r1 # borrow $r1
519 and \$`-1*(1<<31)`,$d2
533 adc \$0,$h2 # can be partially reduced...
535 mov \$-4,$d2 # ... so reduce
548 add $r1,$s1 # s1 = r1 + (r1 >> 2)
550 add 0($inp),$h0 # accumulate input
555 call __poly1305_block
557 test $padbit,$padbit # if $padbit is zero,
558 jz .Lstore_base2_64_avx # store hash in base 2^64 format
560 ################################# base 2^64 -> base 2^26
567 and \$0x3ffffff,%rax # h[0]
569 and \$0x3ffffff,%rdx # h[1]
573 and \$0x3ffffff,$h0 # h[2]
575 and \$0x3ffffff,$h1 # h[3]
579 jz .Lstore_base2_26_avx
589 .Lstore_base2_64_avx:
592 mov $h2,16($ctx) # note that is_base2_26 is zeroed
596 .Lstore_base2_26_avx:
597 mov %rax#d,0($ctx) # store hash value base 2^26
612 .Lblocks_avx_epilogue:
625 mov $len,%r15 # reassign $len
627 mov 24($ctx),$r0 # load r
630 mov 0($ctx),$h0 # load hash value
637 add $r1,$s1 # s1 = r1 + (r1 >> 2)
642 add 0($inp),$h0 # accumulate input
648 call __poly1305_block
651 ################################# base 2^64 -> base 2^26
658 and \$0x3ffffff,%rax # h[0]
660 and \$0x3ffffff,%rdx # h[1]
664 and \$0x3ffffff,$h0 # h[2]
666 and \$0x3ffffff,$h1 # h[3]
674 movl \$1,20($ctx) # set is_base2_26
676 call __poly1305_init_avx
689 .Lbase2_64_avx_epilogue:
694 vmovd 4*0($ctx),$H0 # load hash value
702 $code.=<<___ if (!$win64);
706 $code.=<<___ if ($win64);
709 vmovdqa %xmm6,0x50(%r11)
710 vmovdqa %xmm7,0x60(%r11)
711 vmovdqa %xmm8,0x70(%r11)
712 vmovdqa %xmm9,0x80(%r11)
713 vmovdqa %xmm10,0x90(%r11)
714 vmovdqa %xmm11,0xa0(%r11)
715 vmovdqa %xmm12,0xb0(%r11)
716 vmovdqa %xmm13,0xc0(%r11)
717 vmovdqa %xmm14,0xd0(%r11)
718 vmovdqa %xmm15,0xe0(%r11)
726 vmovdqu `16*3`($ctx),$D4 # preload r0^2
727 lea `16*3+64`($ctx),$ctx # size optimization
728 lea .Lconst(%rip),%rcx
730 ################################################################
732 vmovdqu 16*2($inp),$T0
733 vmovdqu 16*3($inp),$T1
734 vmovdqa 64(%rcx),$MASK # .Lmask26
736 vpsrldq \$6,$T0,$T2 # splat input
738 vpunpckhqdq $T1,$T0,$T4 # 4
739 vpunpcklqdq $T1,$T0,$T0 # 0:1
740 vpunpcklqdq $T3,$T2,$T3 # 2:3
742 vpsrlq \$40,$T4,$T4 # 4
744 vpand $MASK,$T0,$T0 # 0
746 vpand $MASK,$T1,$T1 # 1
748 vpand $MASK,$T2,$T2 # 2
749 vpand $MASK,$T3,$T3 # 3
750 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
754 # expand and copy pre-calculated table to stack
755 vmovdqu `16*1-64`($ctx),$D1
756 vmovdqu `16*2-64`($ctx),$D2
757 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
758 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
759 vmovdqa $D3,-0x90(%r11)
760 vmovdqa $D0,0x00(%rsp)
761 vpshufd \$0xEE,$D1,$D4
762 vmovdqu `16*3-64`($ctx),$D0
763 vpshufd \$0x44,$D1,$D1
764 vmovdqa $D4,-0x80(%r11)
765 vmovdqa $D1,0x10(%rsp)
766 vpshufd \$0xEE,$D2,$D3
767 vmovdqu `16*4-64`($ctx),$D1
768 vpshufd \$0x44,$D2,$D2
769 vmovdqa $D3,-0x70(%r11)
770 vmovdqa $D2,0x20(%rsp)
771 vpshufd \$0xEE,$D0,$D4
772 vmovdqu `16*5-64`($ctx),$D2
773 vpshufd \$0x44,$D0,$D0
774 vmovdqa $D4,-0x60(%r11)
775 vmovdqa $D0,0x30(%rsp)
776 vpshufd \$0xEE,$D1,$D3
777 vmovdqu `16*6-64`($ctx),$D0
778 vpshufd \$0x44,$D1,$D1
779 vmovdqa $D3,-0x50(%r11)
780 vmovdqa $D1,0x40(%rsp)
781 vpshufd \$0xEE,$D2,$D4
782 vmovdqu `16*7-64`($ctx),$D1
783 vpshufd \$0x44,$D2,$D2
784 vmovdqa $D4,-0x40(%r11)
785 vmovdqa $D2,0x50(%rsp)
786 vpshufd \$0xEE,$D0,$D3
787 vmovdqu `16*8-64`($ctx),$D2
788 vpshufd \$0x44,$D0,$D0
789 vmovdqa $D3,-0x30(%r11)
790 vmovdqa $D0,0x60(%rsp)
791 vpshufd \$0xEE,$D1,$D4
792 vpshufd \$0x44,$D1,$D1
793 vmovdqa $D4,-0x20(%r11)
794 vmovdqa $D1,0x70(%rsp)
795 vpshufd \$0xEE,$D2,$D3
796 vmovdqa 0x00(%rsp),$D4 # preload r0^2
797 vpshufd \$0x44,$D2,$D2
798 vmovdqa $D3,-0x10(%r11)
799 vmovdqa $D2,0x80(%rsp)
805 ################################################################
806 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
807 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
808 # \___________________/
809 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
810 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
811 # \___________________/ \____________________/
813 # Note that we start with inp[2:3]*r^2. This is because it
814 # doesn't depend on reduction in previous iteration.
815 ################################################################
816 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
817 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
818 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
819 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
820 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
822 # though note that $Tx and $Hx are "reversed" in this section,
823 # and $D4 is preloaded with r0^2...
825 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
826 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
827 vmovdqa $H2,0x20(%r11) # offload hash
828 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
829 vmovdqa 0x10(%rsp),$H2 # r1^2
830 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
831 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
833 vmovdqa $H0,0x00(%r11) #
834 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
835 vmovdqa $H1,0x10(%r11) #
836 vpmuludq $T3,$H2,$H1 # h3*r1
837 vpaddq $H0,$D0,$D0 # d0 += h4*s1
838 vpaddq $H1,$D4,$D4 # d4 += h3*r1
839 vmovdqa $H3,0x30(%r11) #
840 vpmuludq $T2,$H2,$H0 # h2*r1
841 vpmuludq $T1,$H2,$H1 # h1*r1
842 vpaddq $H0,$D3,$D3 # d3 += h2*r1
843 vmovdqa 0x30(%rsp),$H3 # r2^2
844 vpaddq $H1,$D2,$D2 # d2 += h1*r1
845 vmovdqa $H4,0x40(%r11) #
846 vpmuludq $T0,$H2,$H2 # h0*r1
847 vpmuludq $T2,$H3,$H0 # h2*r2
848 vpaddq $H2,$D1,$D1 # d1 += h0*r1
850 vmovdqa 0x40(%rsp),$H4 # s2^2
851 vpaddq $H0,$D4,$D4 # d4 += h2*r2
852 vpmuludq $T1,$H3,$H1 # h1*r2
853 vpmuludq $T0,$H3,$H3 # h0*r2
854 vpaddq $H1,$D3,$D3 # d3 += h1*r2
855 vmovdqa 0x50(%rsp),$H2 # r3^2
856 vpaddq $H3,$D2,$D2 # d2 += h0*r2
857 vpmuludq $T4,$H4,$H0 # h4*s2
858 vpmuludq $T3,$H4,$H4 # h3*s2
859 vpaddq $H0,$D1,$D1 # d1 += h4*s2
860 vmovdqa 0x60(%rsp),$H3 # s3^2
861 vpaddq $H4,$D0,$D0 # d0 += h3*s2
863 vmovdqa 0x80(%rsp),$H4 # s4^2
864 vpmuludq $T1,$H2,$H1 # h1*r3
865 vpmuludq $T0,$H2,$H2 # h0*r3
866 vpaddq $H1,$D4,$D4 # d4 += h1*r3
867 vpaddq $H2,$D3,$D3 # d3 += h0*r3
868 vpmuludq $T4,$H3,$H0 # h4*s3
869 vpmuludq $T3,$H3,$H1 # h3*s3
870 vpaddq $H0,$D2,$D2 # d2 += h4*s3
871 vmovdqu 16*0($inp),$H0 # load input
872 vpaddq $H1,$D1,$D1 # d1 += h3*s3
873 vpmuludq $T2,$H3,$H3 # h2*s3
874 vpmuludq $T2,$H4,$T2 # h2*s4
875 vpaddq $H3,$D0,$D0 # d0 += h2*s3
877 vmovdqu 16*1($inp),$H1 #
878 vpaddq $T2,$D1,$D1 # d1 += h2*s4
879 vpmuludq $T3,$H4,$T3 # h3*s4
880 vpmuludq $T4,$H4,$T4 # h4*s4
881 vpsrldq \$6,$H0,$H2 # splat input
882 vpaddq $T3,$D2,$D2 # d2 += h3*s4
883 vpaddq $T4,$D3,$D3 # d3 += h4*s4
884 vpsrldq \$6,$H1,$H3 #
885 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
886 vpmuludq $T1,$H4,$T0 # h1*s4
887 vpunpckhqdq $H1,$H0,$H4 # 4
888 vpaddq $T4,$D4,$D4 # d4 += h0*r4
889 vmovdqa -0x90(%r11),$T4 # r0^4
890 vpaddq $T0,$D0,$D0 # d0 += h1*s4
892 vpunpcklqdq $H1,$H0,$H0 # 0:1
893 vpunpcklqdq $H3,$H2,$H3 # 2:3
895 #vpsrlq \$40,$H4,$H4 # 4
896 vpsrldq \$`40/8`,$H4,$H4 # 4
898 vpand $MASK,$H0,$H0 # 0
900 vpand $MASK,$H1,$H1 # 1
901 vpand 0(%rcx),$H4,$H4 # .Lmask24
903 vpand $MASK,$H2,$H2 # 2
904 vpand $MASK,$H3,$H3 # 3
905 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
907 vpaddq 0x00(%r11),$H0,$H0 # add hash value
908 vpaddq 0x10(%r11),$H1,$H1
909 vpaddq 0x20(%r11),$H2,$H2
910 vpaddq 0x30(%r11),$H3,$H3
911 vpaddq 0x40(%r11),$H4,$H4
918 ################################################################
919 # Now we accumulate (inp[0:1]+hash)*r^4
920 ################################################################
921 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
922 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
923 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
924 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
925 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
927 vpmuludq $H0,$T4,$T0 # h0*r0
928 vpmuludq $H1,$T4,$T1 # h1*r0
931 vmovdqa -0x80(%r11),$T2 # r1^4
932 vpmuludq $H2,$T4,$T0 # h2*r0
933 vpmuludq $H3,$T4,$T1 # h3*r0
936 vpmuludq $H4,$T4,$T4 # h4*r0
937 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
940 vpaddq $T0,$D0,$D0 # d0 += h4*s1
941 vpmuludq $H2,$T2,$T1 # h2*r1
942 vpmuludq $H3,$T2,$T0 # h3*r1
943 vpaddq $T1,$D3,$D3 # d3 += h2*r1
944 vmovdqa -0x60(%r11),$T3 # r2^4
945 vpaddq $T0,$D4,$D4 # d4 += h3*r1
946 vpmuludq $H1,$T2,$T1 # h1*r1
947 vpmuludq $H0,$T2,$T2 # h0*r1
948 vpaddq $T1,$D2,$D2 # d2 += h1*r1
949 vpaddq $T2,$D1,$D1 # d1 += h0*r1
951 vmovdqa -0x50(%r11),$T4 # s2^4
952 vpmuludq $H2,$T3,$T0 # h2*r2
953 vpmuludq $H1,$T3,$T1 # h1*r2
954 vpaddq $T0,$D4,$D4 # d4 += h2*r2
955 vpaddq $T1,$D3,$D3 # d3 += h1*r2
956 vmovdqa -0x40(%r11),$T2 # r3^4
957 vpmuludq $H0,$T3,$T3 # h0*r2
958 vpmuludq $H4,$T4,$T0 # h4*s2
959 vpaddq $T3,$D2,$D2 # d2 += h0*r2
960 vpaddq $T0,$D1,$D1 # d1 += h4*s2
961 vmovdqa -0x30(%r11),$T3 # s3^4
962 vpmuludq $H3,$T4,$T4 # h3*s2
963 vpmuludq $H1,$T2,$T1 # h1*r3
964 vpaddq $T4,$D0,$D0 # d0 += h3*s2
966 vmovdqa -0x10(%r11),$T4 # s4^4
967 vpaddq $T1,$D4,$D4 # d4 += h1*r3
968 vpmuludq $H0,$T2,$T2 # h0*r3
969 vpmuludq $H4,$T3,$T0 # h4*s3
970 vpaddq $T2,$D3,$D3 # d3 += h0*r3
971 vpaddq $T0,$D2,$D2 # d2 += h4*s3
972 vmovdqu 16*2($inp),$T0 # load input
973 vpmuludq $H3,$T3,$T2 # h3*s3
974 vpmuludq $H2,$T3,$T3 # h2*s3
975 vpaddq $T2,$D1,$D1 # d1 += h3*s3
976 vmovdqu 16*3($inp),$T1 #
977 vpaddq $T3,$D0,$D0 # d0 += h2*s3
979 vpmuludq $H2,$T4,$H2 # h2*s4
980 vpmuludq $H3,$T4,$H3 # h3*s4
981 vpsrldq \$6,$T0,$T2 # splat input
982 vpaddq $H2,$D1,$D1 # d1 += h2*s4
983 vpmuludq $H4,$T4,$H4 # h4*s4
984 vpsrldq \$6,$T1,$T3 #
985 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
986 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
987 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
989 vpunpckhqdq $T1,$T0,$T4 # 4
990 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
991 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
993 vpunpcklqdq $T1,$T0,$T0 # 0:1
994 vpunpcklqdq $T3,$T2,$T3 # 2:3
996 #vpsrlq \$40,$T4,$T4 # 4
997 vpsrldq \$`40/8`,$T4,$T4 # 4
999 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1000 vpand $MASK,$T0,$T0 # 0
1002 vpand $MASK,$T1,$T1 # 1
1003 vpand 0(%rcx),$T4,$T4 # .Lmask24
1005 vpand $MASK,$T2,$T2 # 2
1006 vpand $MASK,$T3,$T3 # 3
1007 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1009 ################################################################
1010 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1015 vpaddq $D3,$H4,$H4 # h3 -> h4
1019 vpaddq $D0,$D1,$H1 # h0 -> h1
1026 vpaddq $D1,$H2,$H2 # h1 -> h2
1030 vpaddq $D0,$H0,$H0 # h4 -> h0
1034 vpaddq $D2,$H3,$H3 # h2 -> h3
1038 vpaddq $D0,$H1,$H1 # h0 -> h1
1042 vpaddq $D3,$H4,$H4 # h3 -> h4
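#
# A reference sketch of one link in the chain above (informal): each
# limb keeps its low 26 bits and passes the excess to the next one,
# the h4 -> h0 step scaling the carry by 5 because 2^130 = 5 mod p:
#
#	c = d3 >> 26;	d3 &= 0x3ffffff;	h4 = d4 + c;	# h3 -> h4
#	c = h4 >> 26;	h4 &= 0x3ffffff;	h0 = d0 + c*5;	# h4 -> h0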
1047 ################################################################
1048 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1050 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1061 vmovdqa $H2,0x20(%r11)
1062 vmovdqa $H0,0x00(%r11)
1063 vmovdqa $H1,0x10(%r11)
1064 vmovdqa $H3,0x30(%r11)
1065 vmovdqa $H4,0x40(%r11)
1067 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1068 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1069 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1070 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1071 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1073 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1074 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1075 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1076 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1077 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1078 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1080 vpmuludq $T3,$H2,$H0 # h3*r1
1081 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1082 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1083 vpmuludq $T2,$H2,$H1 # h2*r1
1084 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1085 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1086 vpmuludq $T1,$H2,$H0 # h1*r1
1087 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1088 vpmuludq $T0,$H2,$H2 # h0*r1
1089 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1090 vpmuludq $T4,$H3,$H3 # h4*s1
1091 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1093 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1094 vpmuludq $T2,$H4,$H1 # h2*r2
1095 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1096 vpmuludq $T1,$H4,$H0 # h1*r2
1097 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1098 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1099 vpmuludq $T0,$H4,$H4 # h0*r2
1100 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1101 vpmuludq $T4,$H2,$H1 # h4*s2
1102 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1103 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1104 vpmuludq $T3,$H2,$H2 # h3*s2
1105 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1107 vpmuludq $T1,$H3,$H0 # h1*r3
1108 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1109 vpmuludq $T0,$H3,$H3 # h0*r3
1110 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1111 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1112 vpmuludq $T4,$H4,$H1 # h4*s3
1113 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1114 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1115 vpmuludq $T3,$H4,$H0 # h3*s3
1116 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1117 vpmuludq $T2,$H4,$H4 # h2*s3
1118 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1120 vpmuludq $T0,$H2,$H2 # h0*r4
1121 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1122 vpmuludq $T4,$H3,$H1 # h4*s4
1123 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1124 vpmuludq $T3,$H3,$H0 # h3*s4
1125 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1126 vpmuludq $T2,$H3,$H1 # h2*s4
1127 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1128 vpmuludq $T1,$H3,$H3 # h1*s4
1129 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1133 vmovdqu 16*0($inp),$H0 # load input
1134 vmovdqu 16*1($inp),$H1
1136 vpsrldq \$6,$H0,$H2 # splat input
1138 vpunpckhqdq $H1,$H0,$H4 # 4
1139 vpunpcklqdq $H1,$H0,$H0 # 0:1
1140 vpunpcklqdq $H3,$H2,$H3 # 2:3
1142 vpsrlq \$40,$H4,$H4 # 4
1144 vpand $MASK,$H0,$H0 # 0
1146 vpand $MASK,$H1,$H1 # 1
1148 vpand $MASK,$H2,$H2 # 2
1149 vpand $MASK,$H3,$H3 # 3
1150 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1152 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1153 vpaddq 0x00(%r11),$H0,$H0
1154 vpaddq 0x10(%r11),$H1,$H1
1155 vpaddq 0x20(%r11),$H2,$H2
1156 vpaddq 0x30(%r11),$H3,$H3
1157 vpaddq 0x40(%r11),$H4,$H4
1159 ################################################################
1160 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1162 vpmuludq $H0,$T4,$T0 # h0*r0
1163 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1164 vpmuludq $H1,$T4,$T1 # h1*r0
1165 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1166 vpmuludq $H2,$T4,$T0 # h2*r0
1167 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1168 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1169 vpmuludq $H3,$T4,$T1 # h3*r0
1170 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1171 vpmuludq $H4,$T4,$T4 # h4*r0
1172 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1174 vpmuludq $H3,$T2,$T0 # h3*r1
1175 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1176 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1177 vpmuludq $H2,$T2,$T1 # h2*r1
1178 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1179 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1180 vpmuludq $H1,$T2,$T0 # h1*r1
1181 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1182 vpmuludq $H0,$T2,$T2 # h0*r1
1183 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1184 vpmuludq $H4,$T3,$T3 # h4*s1
1185 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1187 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1188 vpmuludq $H2,$T4,$T1 # h2*r2
1189 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1190 vpmuludq $H1,$T4,$T0 # h1*r2
1191 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1192 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1193 vpmuludq $H0,$T4,$T4 # h0*r2
1194 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1195 vpmuludq $H4,$T2,$T1 # h4*s2
1196 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1197 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1198 vpmuludq $H3,$T2,$T2 # h3*s2
1199 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1201 vpmuludq $H1,$T3,$T0 # h1*r3
1202 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1203 vpmuludq $H0,$T3,$T3 # h0*r3
1204 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1205 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1206 vpmuludq $H4,$T4,$T1 # h4*s3
1207 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1208 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1209 vpmuludq $H3,$T4,$T0 # h3*s3
1210 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1211 vpmuludq $H2,$T4,$T4 # h2*s3
1212 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1214 vpmuludq $H0,$T2,$T2 # h0*r4
1215 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1216 vpmuludq $H4,$T3,$T1 # h4*s4
1217 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1218 vpmuludq $H3,$T3,$T0 # h3*s4
1219 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1220 vpmuludq $H2,$T3,$T1 # h2*s4
1221 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1222 vpmuludq $H1,$T3,$T3 # h1*s4
1223 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1226 ################################################################
1227 # horizontal addition
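#
# (informally: each qword lane of the accumulators holds a partial hash
#  already weighted by its proper power of r, so the lanes are simply
#  summed into a single base 2^26 value here)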
1240 ################################################################
1245 vpaddq $H3,$D4,$D4 # h3 -> h4
1249 vpaddq $H0,$D1,$D1 # h0 -> h1
1256 vpaddq $H1,$D2,$D2 # h1 -> h2
1260 vpaddq $H4,$D0,$D0 # h4 -> h0
1264 vpaddq $H2,$D3,$D3 # h2 -> h3
1268 vpaddq $H0,$D1,$D1 # h0 -> h1
1272 vpaddq $H3,$D4,$D4 # h3 -> h4
1274 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1275 vmovd $D1,`4*1-48-64`($ctx)
1276 vmovd $D2,`4*2-48-64`($ctx)
1277 vmovd $D3,`4*3-48-64`($ctx)
1278 vmovd $D4,`4*4-48-64`($ctx)
1280 $code.=<<___ if ($win64);
1281 vmovdqa 0x50(%r11),%xmm6
1282 vmovdqa 0x60(%r11),%xmm7
1283 vmovdqa 0x70(%r11),%xmm8
1284 vmovdqa 0x80(%r11),%xmm9
1285 vmovdqa 0x90(%r11),%xmm10
1286 vmovdqa 0xa0(%r11),%xmm11
1287 vmovdqa 0xb0(%r11),%xmm12
1288 vmovdqa 0xc0(%r11),%xmm13
1289 vmovdqa 0xd0(%r11),%xmm14
1290 vmovdqa 0xe0(%r11),%xmm15
1294 $code.=<<___ if (!$win64);
1300 .size poly1305_blocks_avx,.-poly1305_blocks_avx
1302 .type poly1305_emit_avx,\@function,3
1305 cmpl \$0,20($ctx) # is_base2_26?
1308 mov 0($ctx),%eax # load hash value base 2^26
1314 shl \$26,%rcx # base 2^26 -> base 2^64
1330 mov %r10,%rax # could be partially reduced, so reduce
1341 add \$5,%r8 # compare to modulus
1345 shr \$2,%r10 # did 130-bit value overflow?
1349 add 0($nonce),%rax # accumulate nonce
1351 mov %rax,0($mac) # write result
1355 .size poly1305_emit_avx,.-poly1305_emit_avx
1359 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1360 map("%ymm$_",(0..15));
1364 .type poly1305_blocks_avx2,\@function,4
1366 poly1305_blocks_avx2:
1367 mov 20($ctx),%r8d # is_base2_26
1393 mov $len,%r15 # reassign $len
1395 mov 0($ctx),$d1 # load hash value
1399 mov 24($ctx),$r0 # load r
1402 ################################# base 2^26 -> base 2^64
1404 and \$`-1*(1<<31)`,$d1
1405 mov $d2,$r1 # borrow $r1
1407 and \$`-1*(1<<31)`,$d2
1421 adc \$0,$h2 # can be partially reduced...
1423 mov \$-4,$d2 # ... so reduce
1436 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1438 .Lbase2_26_pre_avx2:
1439 add 0($inp),$h0 # accumulate input
1445 call __poly1305_block
1449 jnz .Lbase2_26_pre_avx2
1451 test $padbit,$padbit # if $padbit is zero,
1452 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1454 ################################# base 2^64 -> base 2^26
1461 and \$0x3ffffff,%rax # h[0]
1463 and \$0x3ffffff,%rdx # h[1]
1467 and \$0x3ffffff,$h0 # h[2]
1469 and \$0x3ffffff,$h1 # h[3]
1473 jz .Lstore_base2_26_avx2
1483 .Lstore_base2_64_avx2:
1486 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1490 .Lstore_base2_26_avx2:
1491 mov %rax#d,0($ctx) # store hash value base 2^26
1506 .Lblocks_avx2_epilogue:
1517 .Lbase2_64_avx2_body:
1519 mov $len,%r15 # reassign $len
1521 mov 24($ctx),$r0 # load r
1524 mov 0($ctx),$h0 # load hash value
1531 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1536 .Lbase2_64_pre_avx2:
1537 add 0($inp),$h0 # accumulate input
1543 call __poly1305_block
1547 jnz .Lbase2_64_pre_avx2
1550 ################################# base 2^64 -> base 2^26
1557 and \$0x3ffffff,%rax # h[0]
1559 and \$0x3ffffff,%rdx # h[1]
1563 and \$0x3ffffff,$h0 # h[2]
1565 and \$0x3ffffff,$h1 # h[3]
1573 movl \$1,20($ctx) # set is_base2_26
1575 call __poly1305_init_avx
1578 mov %r15,$len # restore $len
1579 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1580 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1590 .Lbase2_64_avx2_epilogue:
1595 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1596 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1597 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1598 vmovd 4*1($ctx),%x#$H1
1599 vmovd 4*2($ctx),%x#$H2
1600 vmovd 4*3($ctx),%x#$H3
1601 vmovd 4*4($ctx),%x#$H4
1605 $code.=<<___ if ($avx>2);
1609 cmp %r11d,%r10d # check for AVX512F+BW+VL
1613 $code.=<<___ if (!$win64);
1617 $code.=<<___ if ($win64);
1618 lea -0xf8(%rsp),%r11
1620 vmovdqa %xmm6,0x50(%r11)
1621 vmovdqa %xmm7,0x60(%r11)
1622 vmovdqa %xmm8,0x70(%r11)
1623 vmovdqa %xmm9,0x80(%r11)
1624 vmovdqa %xmm10,0x90(%r11)
1625 vmovdqa %xmm11,0xa0(%r11)
1626 vmovdqa %xmm12,0xb0(%r11)
1627 vmovdqa %xmm13,0xc0(%r11)
1628 vmovdqa %xmm14,0xd0(%r11)
1629 vmovdqa %xmm15,0xe0(%r11)
1633 lea 48+64($ctx),$ctx # size optimization
1634 lea .Lconst(%rip),%rcx
1636 # expand and copy pre-calculated table to stack
1637 vmovdqu `16*0-64`($ctx),%x#$T2
1639 vmovdqu `16*1-64`($ctx),%x#$T3
1640 vmovdqu `16*2-64`($ctx),%x#$T4
1641 vmovdqu `16*3-64`($ctx),%x#$D0
1642 vmovdqu `16*4-64`($ctx),%x#$D1
1643 vmovdqu `16*5-64`($ctx),%x#$D2
1644 vmovdqu `16*6-64`($ctx),%x#$D3
1645 vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
1646 vmovdqu `16*7-64`($ctx),%x#$D4
1647 vpermq \$0x15,$T3,$T3
1648 vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
1649 vmovdqu `16*8-64`($ctx),%x#$MASK
1650 vpermq \$0x15,$T4,$T4
1651 vpshufd \$0xc8,$T3,$T3
1652 vmovdqa $T2,0x00(%rsp)
1653 vpermq \$0x15,$D0,$D0
1654 vpshufd \$0xc8,$T4,$T4
1655 vmovdqa $T3,0x20(%rsp)
1656 vpermq \$0x15,$D1,$D1
1657 vpshufd \$0xc8,$D0,$D0
1658 vmovdqa $T4,0x40(%rsp)
1659 vpermq \$0x15,$D2,$D2
1660 vpshufd \$0xc8,$D1,$D1
1661 vmovdqa $D0,0x60(%rsp)
1662 vpermq \$0x15,$D3,$D3
1663 vpshufd \$0xc8,$D2,$D2
1664 vmovdqa $D1,0x80(%rsp)
1665 vpermq \$0x15,$D4,$D4
1666 vpshufd \$0xc8,$D3,$D3
1667 vmovdqa $D2,0xa0(%rsp)
1668 vpermq \$0x15,$MASK,$MASK
1669 vpshufd \$0xc8,$D4,$D4
1670 vmovdqa $D3,0xc0(%rsp)
1671 vpshufd \$0xc8,$MASK,$MASK
1672 vmovdqa $D4,0xe0(%rsp)
1673 vmovdqa $MASK,0x100(%rsp)
1674 vmovdqa 64(%rcx),$MASK # .Lmask26
1676 ################################################################
1678 vmovdqu 16*0($inp),%x#$T0
1679 vmovdqu 16*1($inp),%x#$T1
1680 vinserti128 \$1,16*2($inp),$T0,$T0
1681 vinserti128 \$1,16*3($inp),$T1,$T1
1684 vpsrldq \$6,$T0,$T2 # splat input
1686 vpunpckhqdq $T1,$T0,$T4 # 4
1687 vpunpcklqdq $T3,$T2,$T2 # 2:3
1688 vpunpcklqdq $T1,$T0,$T0 # 0:1
1693 vpsrlq \$40,$T4,$T4 # 4
1694 vpand $MASK,$T2,$T2 # 2
1695 vpand $MASK,$T0,$T0 # 0
1696 vpand $MASK,$T1,$T1 # 1
1697 vpand $MASK,$T3,$T3 # 3
1698 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1700 lea 0x90(%rsp),%rax # size optimization
1701 vpaddq $H2,$T2,$H2 # accumulate input
1708 ################################################################
1709 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1710 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1711 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1712 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1713 # \________/\__________/
1714 ################################################################
1715 #vpaddq $H2,$T2,$H2 # accumulate input
1717 vmovdqa `32*0`(%rsp),$T0 # r0^4
1719 vmovdqa `32*1`(%rsp),$T1 # r1^4
1721 vmovdqa `32*3`(%rsp),$T2 # r2^4
1723 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1724 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1726 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1727 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1728 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1729 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1730 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1732 # however, as h2 is "chronologically" the first one available, pull the
1733 # corresponding operations up, so it's
1735 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1736 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1737 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1738 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1739 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1741 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1742 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1743 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1744 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1745 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1747 vpmuludq $H0,$T1,$T4 # h0*r1
1748 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1749 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1750 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1751 vpmuludq $H3,$T1,$T4 # h3*r1
1752 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1753 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1754 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1755 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1757 vpmuludq $H0,$T0,$T4 # h0*r0
1758 vpmuludq $H1,$T0,$H2 # h1*r0
1759 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1760 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1761 vpmuludq $H3,$T0,$T4 # h3*r0
1762 vpmuludq $H4,$T0,$H2 # h4*r0
1763 vmovdqu 16*0($inp),%x#$T0 # load input
1764 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1765 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1766 vinserti128 \$1,16*2($inp),$T0,$T0
1768 vpmuludq $H3,$T1,$T4 # h3*s2
1769 vpmuludq $H4,$T1,$H2 # h4*s2
1770 vmovdqu 16*1($inp),%x#$T1
1771 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1772 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1773 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1774 vpmuludq $H1,$T2,$T4 # h1*r2
1775 vpmuludq $H0,$T2,$T2 # h0*r2
1776 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1777 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1778 vinserti128 \$1,16*3($inp),$T1,$T1
1781 vpmuludq $H1,$H2,$T4 # h1*r3
1782 vpmuludq $H0,$H2,$H2 # h0*r3
1783 vpsrldq \$6,$T0,$T2 # splat input
1784 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1785 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1786 vpmuludq $H3,$T3,$T4 # h3*s3
1787 vpmuludq $H4,$T3,$H2 # h4*s3
1789 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1790 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1791 vpunpckhqdq $T1,$T0,$T4 # 4
1793 vpmuludq $H3,$S4,$H3 # h3*s4
1794 vpmuludq $H4,$S4,$H4 # h4*s4
1795 vpunpcklqdq $T1,$T0,$T0 # 0:1
1796 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1797 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1798 vpunpcklqdq $T3,$T2,$T3 # 2:3
1799 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1800 vpmuludq $H1,$S4,$H0 # h1*s4
1801 vmovdqa 64(%rcx),$MASK # .Lmask26
1802 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1803 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1805 ################################################################
1806 # lazy reduction (interleaved with tail of input splat)
1810 vpaddq $D3,$H4,$H4 # h3 -> h4
1814 vpaddq $D0,$D1,$H1 # h0 -> h1
1823 vpaddq $D1,$H2,$H2 # h1 -> h2
1827 vpaddq $D4,$H0,$H0 # h4 -> h0
1829 vpand $MASK,$T2,$T2 # 2
1834 vpaddq $D2,$H3,$H3 # h2 -> h3
1836 vpaddq $T2,$H2,$H2 # modulo-scheduled
1841 vpaddq $D0,$H1,$H1 # h0 -> h1
1843 vpsrlq \$40,$T4,$T4 # 4
1847 vpaddq $D3,$H4,$H4 # h3 -> h4
1849 vpand $MASK,$T0,$T0 # 0
1850 vpand $MASK,$T1,$T1 # 1
1851 vpand $MASK,$T3,$T3 # 3
1852 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1859 ################################################################
1860 # while the above multiplications were by r^4 in all lanes, in the last
1861 # iteration we multiply the least significant lane by r^4 and the most
1862 # significant one by r, so this is a copy of the above except that
1863 # references to the precomputed table are displaced by 4...
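#
# Informally, if m0..m3 are the last four blocks and h is the hash
# accumulated so far (folded into m0's lane), the per-lane multipliers
# r^4,r^3,r^2,r make the sum of the lanes equal the sequential definition:
#
#	(h+m0)*r^4 + m1*r^3 + m2*r^2 + m3*r
#	   = ((((h+m0)*r + m1)*r + m2)*r + m3)*r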
1865 #vpaddq $H2,$T2,$H2 # accumulate input
1867 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1869 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1871 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1873 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1874 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1876 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1877 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1878 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1879 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1880 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1882 vpmuludq $H0,$T1,$T4 # h0*r1
1883 vpmuludq $H1,$T1,$H2 # h1*r1
1884 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1885 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1886 vpmuludq $H3,$T1,$T4 # h3*r1
1887 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1888 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1889 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1891 vpmuludq $H0,$T0,$T4 # h0*r0
1892 vpmuludq $H1,$T0,$H2 # h1*r0
1893 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1894 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
1895 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1896 vpmuludq $H3,$T0,$T4 # h3*r0
1897 vpmuludq $H4,$T0,$H2 # h4*r0
1898 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1899 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1901 vpmuludq $H3,$T1,$T4 # h3*s2
1902 vpmuludq $H4,$T1,$H2 # h4*s2
1903 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1904 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1905 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
1906 vpmuludq $H1,$T2,$T4 # h1*r2
1907 vpmuludq $H0,$T2,$T2 # h0*r2
1908 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1909 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1911 vpmuludq $H1,$H2,$T4 # h1*r3
1912 vpmuludq $H0,$H2,$H2 # h0*r3
1913 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1914 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1915 vpmuludq $H3,$T3,$T4 # h3*s3
1916 vpmuludq $H4,$T3,$H2 # h4*s3
1917 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1918 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1920 vpmuludq $H3,$S4,$H3 # h3*s4
1921 vpmuludq $H4,$S4,$H4 # h4*s4
1922 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1923 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1924 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
1925 vpmuludq $H1,$S4,$H0 # h1*s4
1926 vmovdqa 64(%rcx),$MASK # .Lmask26
1927 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1928 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1930 ################################################################
1931 # horizontal addition
1944 vpermq \$0x2,$H3,$T3
1945 vpermq \$0x2,$H4,$T4
1946 vpermq \$0x2,$H0,$T0
1947 vpermq \$0x2,$D1,$T1
1948 vpermq \$0x2,$H2,$T2
1955 ################################################################
1960 vpaddq $D3,$H4,$H4 # h3 -> h4
1964 vpaddq $D0,$D1,$H1 # h0 -> h1
1971 vpaddq $D1,$H2,$H2 # h1 -> h2
1975 vpaddq $D4,$H0,$H0 # h4 -> h0
1979 vpaddq $D2,$H3,$H3 # h2 -> h3
1983 vpaddq $D0,$H1,$H1 # h0 -> h1
1987 vpaddq $D3,$H4,$H4 # h3 -> h4
1989 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
1990 vmovd %x#$H1,`4*1-48-64`($ctx)
1991 vmovd %x#$H2,`4*2-48-64`($ctx)
1992 vmovd %x#$H3,`4*3-48-64`($ctx)
1993 vmovd %x#$H4,`4*4-48-64`($ctx)
1995 $code.=<<___ if ($win64);
1996 vmovdqa 0x50(%r11),%xmm6
1997 vmovdqa 0x60(%r11),%xmm7
1998 vmovdqa 0x70(%r11),%xmm8
1999 vmovdqa 0x80(%r11),%xmm9
2000 vmovdqa 0x90(%r11),%xmm10
2001 vmovdqa 0xa0(%r11),%xmm11
2002 vmovdqa 0xb0(%r11),%xmm12
2003 vmovdqa 0xc0(%r11),%xmm13
2004 vmovdqa 0xd0(%r11),%xmm14
2005 vmovdqa 0xe0(%r11),%xmm15
2009 $code.=<<___ if (!$win64);
2015 .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2017 #######################################################################
2019 # On entry the input length is divisible by 64. But since the inner loop
2020 # processes 128 bytes per iteration, cases where the length is not divisible
2021 # by 128 are handled by passing the tail 64 bytes to .Ltail_avx2. For this
2022 # reason the stack layout is kept identical to poly1305_blocks_avx2. If not
2023 # for this tail, we wouldn't even have to allocate a stack frame...
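# For example (informal): an input that still has 192 bytes left at this
# point runs the 128-byte inner loop once and hands the remaining 64 bytes
# to .Ltail_avx2, while 256 remaining bytes need no tail at all.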
2025 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));
2026 my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));
2027 my $PADBIT="%zmm30";
2028 my $GATHER="%ymm31";
2031 .type poly1305_blocks_avx512,\@function,4
2033 poly1305_blocks_avx512:
2037 $code.=<<___ if (!$win64);
2041 $code.=<<___ if ($win64);
2042 lea -0xf8(%rsp),%r11
2044 vmovdqa %xmm6,0x50(%r11)
2045 vmovdqa %xmm7,0x60(%r11)
2046 vmovdqa %xmm8,0x70(%r11)
2047 vmovdqa %xmm9,0x80(%r11)
2048 vmovdqa %xmm10,0x90(%r11)
2049 vmovdqa %xmm11,0xa0(%r11)
2050 vmovdqa %xmm12,0xb0(%r11)
2051 vmovdqa %xmm13,0xc0(%r11)
2052 vmovdqa %xmm14,0xd0(%r11)
2053 vmovdqa %xmm15,0xe0(%r11)
2057 lea 48+64($ctx),$ctx # size optimization
2058 lea .Lconst(%rip),%rcx
2060 # expand pre-calculated table
2061 vmovdqu32 `16*0-64`($ctx),%x#$R0
2063 vmovdqu32 `16*1-64`($ctx),%x#$R1
2064 vmovdqu32 `16*2-64`($ctx),%x#$S1
2065 vmovdqu32 `16*3-64`($ctx),%x#$R2
2066 vmovdqu32 `16*4-64`($ctx),%x#$S2
2067 vmovdqu32 `16*5-64`($ctx),%x#$R3
2068 vmovdqu32 `16*6-64`($ctx),%x#$S3
2069 vmovdqu32 `16*7-64`($ctx),%x#$R4
2070 vmovdqu32 `16*8-64`($ctx),%x#$S4
2071 vpermq \$0x15,$R0,$R0 # 00003412 -> 12343434
2072 vmovdqa64 64(%rcx),$MASK # .Lmask26
2073 vpermq \$0x15,$R1,$R1
2074 vmovdqa32 128(%rcx),$GATHER # .Lgather
2075 vpermq \$0x15,$S1,$S1
2076 vpshufd \$0xc8,$R0,$R0 # 12343434 -> 14243444
2077 vpermq \$0x15,$R2,$R2
2078 vpshufd \$0xc8,$R1,$R1
2079 vmovdqa32 $R0,0x00(%rsp) # save in case $len%128 != 0
2080 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2081 vpermq \$0x15,$S2,$S2
2082 vpshufd \$0xc8,$S1,$S1
2083 vmovdqa32 $R1,0x20(%rsp)
2085 vpermq \$0x15,$R3,$R3
2086 vpshufd \$0xc8,$R2,$R2
2087 vmovdqa32 $S1,0x40(%rsp)
2088 vpermq \$0x15,$S3,$S3
2089 vpshufd \$0xc8,$S2,$S2
2090 vpermq \$0x15,$R4,$R4
2091 vpshufd \$0xc8,$R3,$R3
2092 vmovdqa32 $R2,0x60(%rsp)
2093 vpermq \$0x15,$S4,$S4
2094 vpshufd \$0xc8,$S3,$S3
2095 vmovdqa32 $S2,0x80(%rsp)
2096 vpshufd \$0xc8,$R4,$R4
2097 vpshufd \$0xc8,$S4,$S4
2098 vmovdqa32 $R3,0xa0(%rsp)
2099 vmovdqa32 $S3,0xc0(%rsp)
2100 vmovdqa32 $R4,0xe0(%rsp)
2101 vmovdqa32 $S4,0x100(%rsp)
2103 ################################################################
2104 # calculate 5th through 8th powers of the key
2106 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2107 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2108 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2109 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2110 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2112 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2113 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2114 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2115 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2116 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2119 vpmuludq $T1,$S4,$M0
2120 vpmuludq $T1,$R0,$M1
2121 vpmuludq $T1,$R1,$M2
2122 vpmuludq $T1,$R2,$M3
2123 vpmuludq $T1,$R3,$M4
2125 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2126 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2127 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2128 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2129 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2131 vpmuludq $T2,$S3,$M0
2132 vpmuludq $T2,$S4,$M1
2133 vpmuludq $T2,$R1,$M3
2134 vpmuludq $T2,$R2,$M4
2135 vpmuludq $T2,$R0,$M2
2137 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2138 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2139 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2140 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2141 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2143 vpmuludq $T3,$S2,$M0
2144 vpmuludq $T3,$R0,$M3
2145 vpmuludq $T3,$R1,$M4
2146 vpmuludq $T3,$S3,$M1
2147 vpmuludq $T3,$S4,$M2
2148 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2149 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2150 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2151 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2152 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2154 vpmuludq $T4,$S4,$M3
2155 vpmuludq $T4,$R0,$M4
2156 vpmuludq $T4,$S1,$M0
2157 vpmuludq $T4,$S2,$M1
2158 vpmuludq $T4,$S3,$M2
2159 vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4
2160 vpaddq $M4,$D4,$D4 # d4 += r4'*r0
2161 vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1
2162 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2163 vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3
2165 ################################################################
2167 vmovdqu64 16*0($inp),%x#$T0
2168 vmovdqu64 16*1($inp),%x#$T1
2169 vinserti64x2 \$1,16*2($inp),$T0,$T0
2170 vinserti64x2 \$1,16*3($inp),$T1,$T1
2172 ################################################################
2176 vpandq $MASK,$D3,$D3
2177 vpaddq $M3,$D4,$D4 # d3 -> d4
2180 vpandq $MASK,$D0,$D0
2181 vpaddq $M0,$D1,$D1 # d0 -> d1
2184 vpandq $MASK,$D4,$D4
2187 vpandq $MASK,$D1,$D1
2188 vpaddq $M1,$D2,$D2 # d1 -> d2
2192 vpaddq $M4,$D0,$D0 # d4 -> d0
2195 vpandq $MASK,$D2,$D2
2196 vpaddq $M2,$D3,$D3 # d2 -> d3
2199 vpandq $MASK,$D0,$D0
2200 vpaddq $M0,$D1,$D1 # d0 -> d1
2203 vpandq $MASK,$D3,$D3
2204 vpaddq $M3,$D4,$D4 # d3 -> d4
2207 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));
2208 map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
2209 map(s/%y/%z/,($MASK));
2211 ################################################################
2213 vinserti64x2 \$2,16*4($inp),$T0,$T0
2214 vinserti64x2 \$2,16*5($inp),$T1,$T1
2215 vinserti64x2 \$3,16*6($inp),$T0,$T0
2216 vinserti64x2 \$3,16*7($inp),$T1,$T1
2219 vpbroadcastq %x#$MASK,$MASK
2220 vpbroadcastq 32(%rcx),$PADBIT
2222 ################################################################
2223 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2224 # $D0-$D4, and the goal is 1828384858687888 in $R0-$S4
2227 vpbroadcastq %x#$D0,$M0 # 0808080808080808
2228 vpbroadcastq %x#$D1,$M1
2229 vpbroadcastq %x#$D2,$M2
2230 vpbroadcastq %x#$D3,$M3
2231 vpbroadcastq %x#$D4,$M4
2233 vpsllq \$32,$D0,$D0 # 05060708 -> 50607080
2239 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2241 vinserti64x4 \$1,$R0,$D0,$D0 # 1424344450607080
2242 vinserti64x4 \$1,$R1,$D1,$D1
2243 vinserti64x4 \$1,$R2,$D2,$D2
2244 vinserti64x4 \$1,$R3,$D3,$D3
2245 vinserti64x4 \$1,$R4,$D4,$D4
2247 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2248 map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
2250 vpblendmd $M0,$D0,${R0}{%k3} # 1828384858687888
2251 vpblendmd $M1,$D1,${R1}{%k3}
2252 vpblendmd $M2,$D2,${R2}{%k3}
2253 vpblendmd $M3,$D3,${R3}{%k3}
2254 vpblendmd $M4,$D4,${R4}{%k3}
2256 vpslld \$2,$R1,$S1 # *5
2265 vpsrldq \$6,$T0,$T2 # splat input
2267 vpunpckhqdq $T1,$T0,$T4 # 4
2268 vpunpcklqdq $T3,$T2,$T2 # 2:3
2269 vpunpcklqdq $T1,$T0,$T0 # 0:1
2274 vpsrlq \$40,$T4,$T4 # 4
2275 vpandq $MASK,$T2,$T2 # 2
2276 vpandq $MASK,$T0,$T0 # 0
2277 #vpandq $MASK,$T1,$T1 # 1
2278 #vpandq $MASK,$T3,$T3 # 3
2279 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2281 vpaddq $H2,$T2,$H2 # accumulate input
2287 ################################################################
2288 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2289 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2290 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2291 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2292 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2293 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2294 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2295 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2296 # \________/\___________/
2297 ################################################################
2298 #vpaddq $H2,$T2,$H2 # accumulate input
2300 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2301 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2302 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2303 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2304 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2306 # however, as h2 is "chronologically" the first one available, pull the
2307 # corresponding operations up, so it's
2309 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2310 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2311 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2312 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2313 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
2315 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2317 vmovdqu64 16*0($inp),%x#$M0 # load input
2318 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2319 vpandq $MASK,$T1,$T1 # 1, modulo-scheduled
2320 vmovdqu64 16*1($inp),%x#$M1
2321 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2322 vpandq $MASK,$T3,$T3 # 3
2323 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2324 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2325 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2326 vpaddq $H1,$T1,$H1 # accumulate input
2330 vinserti64x2 \$1,16*2($inp),$M0,$T0
2331 vinserti64x2 \$1,16*3($inp),$M1,$T1
2332 vpmuludq $H0,$R3,$M3
2333 vpmuludq $H0,$R4,$M4
2334 vpmuludq $H0,$R0,$M0
2335 vpmuludq $H0,$R1,$M1
2336 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2337 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2338 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2339 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2341 vinserti64x2 \$2,16*4($inp),$T0,$T0
2342 vinserti64x2 \$2,16*5($inp),$T1,$T1
2343 vpmuludq $H1,$R2,$M3
2344 vpmuludq $H1,$R3,$M4
2345 vpmuludq $H1,$S4,$M0
2346 vpmuludq $H0,$R2,$M2
2347 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2348 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2349 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2350 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2352 vinserti64x2 \$3,16*6($inp),$T0,$T0
2353 vinserti64x2 \$3,16*7($inp),$T1,$T1
2354 vpmuludq $H3,$R0,$M3
2355 vpmuludq $H3,$R1,$M4
2356 vpmuludq $H1,$R0,$M1
2357 vpmuludq $H1,$R1,$M2
2358 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2359 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2360 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2361 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2363 vpsrldq \$6,$T0,$T2 # splat input
2365 vpunpckhqdq $T1,$T0,$T4 # 4
2366 vpmuludq $H4,$S4,$M3
2367 vpmuludq $H4,$R0,$M4
2368 vpmuludq $H3,$S2,$M0
2369 vpmuludq $H3,$S3,$M1
2370 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2371 vpmuludq $H3,$S4,$M2
2372 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2373 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2374 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2375 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2377 vpunpcklqdq $T1,$T0,$T0 # 0:1
2378 vpunpcklqdq $T3,$T2,$T3 # 2:3
2380 vpmuludq $H4,$S1,$M0
2381 vpmuludq $H4,$S2,$M1
2382 vpmuludq $H4,$S3,$M2
2383 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2384 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2385 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2387 ################################################################
2388 # lazy reduction (interleaved with tail of input splat)
2391 vpandq $MASK,$D3,$D3
2392 vpaddq $H3,$D4,$H4 # h3 -> h4
2395 vpandq $MASK,$H0,$H0
2396 vpaddq $D0,$H1,$H1 # h0 -> h1
2399 vpandq $MASK,$H4,$H4
2404 vpandq $MASK,$H1,$H1
2405 vpaddq $D1,$H2,$H2 # h1 -> h2
2409 vpaddq $D4,$H0,$H0 # h4 -> h0
2411 vpandq $MASK,$T2,$T2 # 2
2415 vpandq $MASK,$H2,$H2
2416 vpaddq $D2,$D3,$H3 # h2 -> h3
2418 vpaddq $T2,$H2,$H2 # modulo-scheduled
2422 vpandq $MASK,$H0,$H0
2423 vpaddq $D0,$H1,$H1 # h0 -> h1
2425 vpsrlq \$40,$T4,$T4 # 4
2428 vpandq $MASK,$H3,$H3
2429 vpaddq $D3,$H4,$H4 # h3 -> h4
2431 vpandq $MASK,$T0,$T0 # 0
2432 #vpandq $MASK,$T1,$T1 # 1
2433 #vpandq $MASK,$T3,$T3 # 3
2434 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2440 ################################################################
2441 # while the above multiplications were by r^8 in all lanes, in the last
2442 # iteration we multiply the least significant lane by r^8 and the most
2443 # significant one by r, which is why the table gets shifted...
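# (same Horner argument as in the AVX2 tail above, now over eight lanes
#  with per-lane multipliers running from r^8 down to r)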
2445 vpsrlq \$32,$R0,$R0 # 0102030405060708
2455 ################################################################
2456 # load either next or last 64 byte of input
2457 lea ($inp,$len),$inp
2459 #vpaddq $H2,$T2,$H2 # accumulate input
2462 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2463 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2464 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2465 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2466 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2467 vpandq $MASK,$T1,$T1 # 1, modulo-scheduled
2468 vpandq $MASK,$T3,$T3 # 3
2469 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2470 vpaddq $H1,$T1,$H1 # accumulate input
2474 vmovdqu64 16*0($inp),%x#$T0
2475 vpmuludq $H0,$R3,$M3
2476 vpmuludq $H0,$R4,$M4
2477 vpmuludq $H0,$R0,$M0
2478 vpmuludq $H0,$R1,$M1
2479 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2480 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2481 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2482 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2484 vmovdqu64 16*1($inp),%x#$T1
2485 vpmuludq $H1,$R2,$M3
2486 vpmuludq $H1,$R3,$M4
2487 vpmuludq $H1,$S4,$M0
2488 vpmuludq $H0,$R2,$M2
2489 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2490 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2491 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2492 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2494 vinserti64x2 \$1,16*2($inp),$T0,$T0
2495 vpmuludq $H3,$R0,$M3
2496 vpmuludq $H3,$R1,$M4
2497 vpmuludq $H1,$R0,$M1
2498 vpmuludq $H1,$R1,$M2
2499 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2500 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2501 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2502 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2504 vinserti64x2 \$1,16*3($inp),$T1,$T1
2505 vpmuludq $H4,$S4,$M3
2506 vpmuludq $H4,$R0,$M4
2507 vpmuludq $H3,$S2,$M0
2508 vpmuludq $H3,$S3,$M1
2509 vpmuludq $H3,$S4,$M2
2510 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2511 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2512 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2513 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2514 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2516 vpmuludq $H4,$S1,$M0
2517 vpmuludq $H4,$S2,$M1
2518 vpmuludq $H4,$S3,$M2
2519 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2520 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2521 vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3
2523 ################################################################
2524 # horizontal addition
2539 vpermq \$0x2,$H3,$D3
2540 vpermq \$0x2,$H4,$D4
2541 vpermq \$0x2,$H0,$D0
2542 vpermq \$0x2,$H1,$D1
2543 vpermq \$0x2,$H2,$D2
2550 vextracti64x4 \$0x1,$H3,%y#$D3
2551 vextracti64x4 \$0x1,$H4,%y#$D4
2552 vextracti64x4 \$0x1,$H0,%y#$D0
2553 vextracti64x4 \$0x1,$H1,%y#$D1
2554 vextracti64x4 \$0x1,$H2,%y#$D2
2555 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2556 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2557 vpaddq $D0,$H0,${H0}{%k3}{z}
2558 vpaddq $D1,$H1,${H1}{%k3}{z}
2559 vpaddq $D2,$H2,${H2}{%k3}{z}
2561 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2562 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2564 ################################################################
2565 # lazy reduction (interleaved with input splat)
2568 vpandq $MASK,$H3,$H3
2569 vpsrldq \$6,$T0,$T2 # splat input
2571 vpunpckhqdq $T1,$T0,$T4 # 4
2572 vpaddq $D3,$H4,$H4 # h3 -> h4
2575 vpandq $MASK,$H0,$H0
2576 vpunpcklqdq $T3,$T2,$T2 # 2:3
2577 vpunpcklqdq $T1,$T0,$T0 # 0:1
2578 vpaddq $D0,$H1,$H1 # h0 -> h1
2581 vpandq $MASK,$H4,$H4
2584 vpandq $MASK,$H1,$H1
2587 vpaddq $D1,$H2,$H2 # h1 -> h2
2592 vpsrlq \$40,$T4,$T4 # 4
2593 vpaddq $D4,$H0,$H0 # h4 -> h0
2596 vpandq $MASK,$H2,$H2
2597 vpandq $MASK,$T2,$T2 # 2
2598 vpandq $MASK,$T0,$T0 # 0
2599 vpaddq $D2,$H3,$H3 # h2 -> h3
2602 vpandq $MASK,$H0,$H0
2603 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2604 vpandq $MASK,$T1,$T1 # 1
2605 vpaddq $D0,$H1,$H1 # h0 -> h1
2608 vpandq $MASK,$H3,$H3
2609 vpandq $MASK,$T3,$T3 # 3
2610 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2611 vpaddq $D3,$H4,$H4 # h3 -> h4
2613 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2617 vpsubq $T2,$H2,$H2 # undo input accumulation
2618 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2619 vmovd %x#$H1,`4*1-48-64`($ctx)
2620 vmovd %x#$H2,`4*2-48-64`($ctx)
2621 vmovd %x#$H3,`4*3-48-64`($ctx)
2622 vmovd %x#$H4,`4*4-48-64`($ctx)
2624 $code.=<<___ if ($win64);
2625 vmovdqa 0x50(%r11),%xmm6
2626 vmovdqa 0x60(%r11),%xmm7
2627 vmovdqa 0x70(%r11),%xmm8
2628 vmovdqa 0x80(%r11),%xmm9
2629 vmovdqa 0x90(%r11),%xmm10
2630 vmovdqa 0xa0(%r11),%xmm11
2631 vmovdqa 0xb0(%r11),%xmm12
2632 vmovdqa 0xc0(%r11),%xmm13
2633 vmovdqa 0xd0(%r11),%xmm14
2634 vmovdqa 0xe0(%r11),%xmm15
2636 .Ldo_avx512_epilogue:
2638 $code.=<<___ if (!$win64);
2644 .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2651 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
2653 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
2655 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
2657 .long 5,0,5,0,5,0,5,0
2659 .long 0,8, 32,40, 64,72, 96,104
2664 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2668 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2669 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2677 .extern __imp_RtlVirtualUnwind
2678 .type se_handler,\@abi-omnipotent
2692 mov 120($context),%rax # pull context->Rax
2693 mov 248($context),%rbx # pull context->Rip
2695 mov 8($disp),%rsi # disp->ImageBase
2696 mov 56($disp),%r11 # disp->HandlerData
2698 mov 0(%r11),%r10d # HandlerData[0]
2699 lea (%rsi,%r10),%r10 # prologue label
2700 cmp %r10,%rbx # context->Rip<.Lprologue
2701 jb .Lcommon_seh_tail
2703 mov 152($context),%rax # pull context->Rsp
2705 mov 4(%r11),%r10d # HandlerData[1]
2706 lea (%rsi,%r10),%r10 # epilogue label
2707 cmp %r10,%rbx # context->Rip>=.Lepilogue
2708 jae .Lcommon_seh_tail
2718 mov %rbx,144($context) # restore context->Rbx
2719 mov %rbp,160($context) # restore context->Rbp
2720 mov %r12,216($context) # restore context->R12
2721 mov %r13,224($context) # restore context->R13
2722 mov %r14,232($context) # restore context->R14
2723 mov %r15,240($context) # restore context->R15
2725 jmp .Lcommon_seh_tail
2726 .size se_handler,.-se_handler
2728 .type avx_handler,\@abi-omnipotent
2742 mov 120($context),%rax # pull context->Rax
2743 mov 248($context),%rbx # pull context->Rip
2745 mov 8($disp),%rsi # disp->ImageBase
2746 mov 56($disp),%r11 # disp->HandlerData
2748 mov 0(%r11),%r10d # HandlerData[0]
2749 lea (%rsi,%r10),%r10 # prologue label
2750 cmp %r10,%rbx # context->Rip<prologue label
2751 jb .Lcommon_seh_tail
2753 mov 152($context),%rax # pull context->Rsp
2755 mov 4(%r11),%r10d # HandlerData[1]
2756 lea (%rsi,%r10),%r10 # epilogue label
2757 cmp %r10,%rbx # context->Rip>=epilogue label
2758 jae .Lcommon_seh_tail
2760 mov 208($context),%rax # pull context->R11
2764 lea 512($context),%rdi # &context.Xmm6
2766 .long 0xa548f3fc # cld; rep movsq
2771 mov %rax,152($context) # restore context->Rsp
2772 mov %rsi,168($context) # restore context->Rsi
2773 mov %rdi,176($context) # restore context->Rdi
2775 mov 40($disp),%rdi # disp->ContextRecord
2776 mov $context,%rsi # context
2777 mov \$154,%ecx # sizeof(CONTEXT)
2778 .long 0xa548f3fc # cld; rep movsq
2781 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2782 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2783 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2784 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2785 mov 40(%rsi),%r10 # disp->ContextRecord
2786 lea 56(%rsi),%r11 # &disp->HandlerData
2787 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2788 mov %r10,32(%rsp) # arg5
2789 mov %r11,40(%rsp) # arg6
2790 mov %r12,48(%rsp) # arg7
2791 mov %rcx,56(%rsp) # arg8, (NULL)
2792 call *__imp_RtlVirtualUnwind(%rip)
2794 mov \$1,%eax # ExceptionContinueSearch
2806 .size avx_handler,.-avx_handler
2810 .rva .LSEH_begin_poly1305_init
2811 .rva .LSEH_end_poly1305_init
2812 .rva .LSEH_info_poly1305_init
2814 .rva .LSEH_begin_poly1305_blocks
2815 .rva .LSEH_end_poly1305_blocks
2816 .rva .LSEH_info_poly1305_blocks
2818 .rva .LSEH_begin_poly1305_emit
2819 .rva .LSEH_end_poly1305_emit
2820 .rva .LSEH_info_poly1305_emit
2822 $code.=<<___ if ($avx);
2823 .rva .LSEH_begin_poly1305_blocks_avx
2825 .rva .LSEH_info_poly1305_blocks_avx_1
2829 .rva .LSEH_info_poly1305_blocks_avx_2
2832 .rva .LSEH_end_poly1305_blocks_avx
2833 .rva .LSEH_info_poly1305_blocks_avx_3
2835 .rva .LSEH_begin_poly1305_emit_avx
2836 .rva .LSEH_end_poly1305_emit_avx
2837 .rva .LSEH_info_poly1305_emit_avx
2839 $code.=<<___ if ($avx>1);
2840 .rva .LSEH_begin_poly1305_blocks_avx2
2841 .rva .Lbase2_64_avx2
2842 .rva .LSEH_info_poly1305_blocks_avx2_1
2844 .rva .Lbase2_64_avx2
2846 .rva .LSEH_info_poly1305_blocks_avx2_2
2849 .rva .LSEH_end_poly1305_blocks_avx2
2850 .rva .LSEH_info_poly1305_blocks_avx2_3
2852 $code.=<<___ if ($avx>2);
2853 .rva .LSEH_begin_poly1305_blocks_avx512
2854 .rva .LSEH_end_poly1305_blocks_avx512
2855 .rva .LSEH_info_poly1305_blocks_avx512
2860 .LSEH_info_poly1305_init:
2863 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2865 .LSEH_info_poly1305_blocks:
2868 .rva .Lblocks_body,.Lblocks_epilogue
2870 .LSEH_info_poly1305_emit:
2873 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2875 $code.=<<___ if ($avx);
2876 .LSEH_info_poly1305_blocks_avx_1:
2879 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
2881 .LSEH_info_poly1305_blocks_avx_2:
2884 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
2886 .LSEH_info_poly1305_blocks_avx_3:
2889 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
2891 .LSEH_info_poly1305_emit_avx:
2894 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
2896 $code.=<<___ if ($avx>1);
2897 .LSEH_info_poly1305_blocks_avx2_1:
2900 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
2902 .LSEH_info_poly1305_blocks_avx2_2:
2905 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
2907 .LSEH_info_poly1305_blocks_avx2_3:
2910 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
2912 $code.=<<___ if ($avx>2);
2913 .LSEH_info_poly1305_blocks_avx512:
2916 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
2920 foreach (split('\n',$code)) {
2921 s/\`([^\`]*)\`/eval($1)/ge;
2922 s/%r([a-z]+)#d/%e$1/g;
2923 s/%r([0-9]+)#d/%r$1d/g;
2924 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;