2 # Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements Poly1305 hash for x86_64.
25 # Add AVX512F+VL+BW code path.
29 # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
30 # executed even on Knights Landing. Trigger for modification was
31 # observation that AVX512 code paths can negatively affect overall
32 # Skylake-X system performance. Since we are likely to suppress
33 # AVX512F capability flag [at least on Skylake-X], conversion serves
34 # as kind of "investment protection". Note that next *lake processor,
35 # Cannonlake, has AVX512IFMA code path to execute...
37 # Numbers are cycles per processed byte with poly1305_blocks alone,
38 # measured with rdtsc at fixed clock frequency.
40 # IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
43 # Westmere 1.88/+120% -
44 # Sandy Bridge 1.39/+140% 1.10
45 # Haswell 1.14/+175% 1.11 0.65
46 # Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
47 # Silvermont 2.83/+95% -
48 # Knights L 3.60/? 1.65 1.10 0.41(***)
49 # Goldmont 1.70/+180% -
50 # VIA Nano 1.82/+150% -
51 # Sledgehammer 1.38/+160% -
52 # Bulldozer 2.30/+130% 0.97
53 # Ryzen 1.15/+200% 1.08 1.18
55 # (*) improvement coefficients relative to clang are more modest and
56 # are ~50% on most processors, in both cases we are comparing to
58 # (**) SSE2 implementation was attempted, but among non-AVX processors
59 # it was faster than integer-only code only on older Intel P4 and
60 # Core processors, by 50-30% (the older the processor, the larger the gain), but slower on
61 # contemporary ones, for example almost 2x slower on Atom, and as
62 # former are naturally disappearing, SSE2 is deemed unnecessary;
63 # (***) strangely enough performance seems to vary from core to core,
64 # listed result is best case;
66 # $output is the last argument if it looks like a file (it has an extension)
67 # $flavour is the first argument if it doesn't look like a file
68 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
69 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows ABI is assumed when the flavour targets nasm/masm or MinGW,
# or when the output file carries a .asm extension.
71 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64 perlasm translator: first next to this script, then
# under ../../perlasm/ relative to it; abort if neither exists.
# NOTE(review): if $0 carries no directory component the match below fails
# and $dir ends up undefined — assumes the script is invoked with a path.
73 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
74 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
75 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
76 die "can't locate x86_64-xlate.pl";
78 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
79 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
80 $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
83 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
84 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
85 $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
86 $avx += 2 if ($1==2.11 && $2>=8);
89 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
90 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
91 $avx = ($1>=10) + ($1>=12);
94 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
95 $avx = ($2>=3.0) + ($2>3.0);
# Pipe everything written to OUT through the perlasm translator; $^X is
# the perl interpreter currently running this script. The translator turns
# the "perlasm" dialect into flavour-specific assembler syntax.
98 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
99 or die "can't call $xlate: $!";
# Symbolic names for the registers used throughout the generated code.
# $ctx/$inp/$len/$padbit sit in %rdi/%rsi/%rdx/%rcx — the first four
# System V AMD64 argument registers of poly1305_blocks(ctx,inp,len,padbit).
102 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
103 my ($mac,$nonce)=($inp,$len); # *_emit arguments
# $d1-$d3 are scratch/accumulator registers; $r0/$r1 hold the key limbs
# and $s1 the precomputed r1 + (r1 >> 2) used in modular reduction.
104 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
# $h0-$h2 hold the 130-bit hash value in base 2^64 limbs.
105 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
107 sub poly1305_iteration {
108 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
109 # output: $h0-$h2 *= $r0-$r1
117 mov %rax,$h0 # future $h0
127 mov $h2,$h1 # borrow $h1
131 imulq $s1,$h1 # h2*s1
136 imulq $r0,$h2 # h2*r0
138 mov \$-4,%rax # mask value
141 and $d3,%rax # last reduction step
152 ########################################################################
153 # Layout of opaque area is following.
155 # unsigned __int64 h[3]; # current hash value base 2^64
156 # unsigned __int64 r[2]; # key value base 2^64
161 .extern OPENSSL_ia32cap_P
164 .hidden poly1305_init
165 .globl poly1305_blocks
166 .hidden poly1305_blocks
168 .hidden poly1305_emit
170 .type poly1305_init,\@function,3
175 mov %rax,0($ctx) # initialize hash value
182 lea poly1305_blocks(%rip),%r10
183 lea poly1305_emit(%rip),%r11
185 $code.=<<___ if ($avx);
186 mov OPENSSL_ia32cap_P+4(%rip),%r9
187 lea poly1305_blocks_avx(%rip),%rax
188 lea poly1305_emit_avx(%rip),%rcx
189 bt \$`60-32`,%r9 # AVX?
193 $code.=<<___ if ($avx>1);
194 lea poly1305_blocks_avx2(%rip),%rax
195 bt \$`5+32`,%r9 # AVX2?
198 $code.=<<___ if ($avx>3);
199 mov \$`(1<<31|1<<21|1<<16)`,%rax
206 mov \$0x0ffffffc0fffffff,%rax
207 mov \$0x0ffffffc0ffffffc,%rcx
213 $code.=<<___ if ($flavour !~ /elf32/);
217 $code.=<<___ if ($flavour =~ /elf32/);
226 .size poly1305_init,.-poly1305_init
228 .type poly1305_blocks,\@function,4
234 jz .Lno_data # too short
250 mov $len,%r15 # reassign $len
252 mov 24($ctx),$r0 # load r
255 mov 0($ctx),$h0 # load hash value
262 add $r1,$s1 # s1 = r1 + (r1 >> 2)
267 add 0($inp),$h0 # accumulate input
272 &poly1305_iteration();
278 mov $h0,0($ctx) # store hash value
295 .cfi_adjust_cfa_offset -48
300 .size poly1305_blocks,.-poly1305_blocks
302 .type poly1305_emit,\@function,3
307 mov 0($ctx),%r8 # load hash value
312 add \$5,%r8 # compare to modulus
316 shr \$2,%r10 # did 130-bit value overflow?
320 add 0($nonce),%rax # accumulate nonce
322 mov %rax,0($mac) # write result
327 .size poly1305_emit,.-poly1305_emit
331 ########################################################################
332 # Layout of opaque area is following.
334 # unsigned __int32 h[5]; # current hash value base 2^26
335 # unsigned __int32 is_base2_26;
336 # unsigned __int64 r[2]; # key value base 2^64
337 # unsigned __int64 pad;
338 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
340 # where r^n are base 2^26 digits of degrees of multiplier key. There are
341 # 5 digits, but last four are interleaved with multiples of 5, totalling
342 # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
344 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
345 map("%xmm$_",(0..15));
348 .type __poly1305_block,\@abi-omnipotent
353 &poly1305_iteration();
357 .size __poly1305_block,.-__poly1305_block
359 .type __poly1305_init_avx,\@abi-omnipotent
367 lea 48+64($ctx),$ctx # size optimization
370 call __poly1305_block # r^2
372 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
378 mov %eax,`16*0+0-64`($ctx)
380 mov %edx,`16*0+4-64`($ctx)
387 mov %eax,`16*1+0-64`($ctx)
388 lea (%rax,%rax,4),%eax # *5
389 mov %edx,`16*1+4-64`($ctx)
390 lea (%rdx,%rdx,4),%edx # *5
391 mov %eax,`16*2+0-64`($ctx)
393 mov %edx,`16*2+4-64`($ctx)
404 mov %eax,`16*3+0-64`($ctx)
405 lea (%rax,%rax,4),%eax # *5
406 mov %edx,`16*3+4-64`($ctx)
407 lea (%rdx,%rdx,4),%edx # *5
408 mov %eax,`16*4+0-64`($ctx)
410 mov %edx,`16*4+4-64`($ctx)
419 mov %eax,`16*5+0-64`($ctx)
420 lea (%rax,%rax,4),%eax # *5
421 mov %edx,`16*5+4-64`($ctx)
422 lea (%rdx,%rdx,4),%edx # *5
423 mov %eax,`16*6+0-64`($ctx)
425 mov %edx,`16*6+4-64`($ctx)
431 mov $d1#d,`16*7+0-64`($ctx)
432 lea ($d1,$d1,4),$d1 # *5
433 mov $d2#d,`16*7+4-64`($ctx)
434 lea ($d2,$d2,4),$d2 # *5
435 mov $d1#d,`16*8+0-64`($ctx)
436 mov $d2#d,`16*8+4-64`($ctx)
439 call __poly1305_block # r^3
441 mov \$0x3ffffff,%eax # save r^3 base 2^26
445 mov %eax,`16*0+12-64`($ctx)
449 mov %edx,`16*1+12-64`($ctx)
450 lea (%rdx,%rdx,4),%edx # *5
452 mov %edx,`16*2+12-64`($ctx)
458 mov %eax,`16*3+12-64`($ctx)
459 lea (%rax,%rax,4),%eax # *5
461 mov %eax,`16*4+12-64`($ctx)
466 mov %edx,`16*5+12-64`($ctx)
467 lea (%rdx,%rdx,4),%edx # *5
469 mov %edx,`16*6+12-64`($ctx)
474 mov $d1#d,`16*7+12-64`($ctx)
475 lea ($d1,$d1,4),$d1 # *5
476 mov $d1#d,`16*8+12-64`($ctx)
479 call __poly1305_block # r^4
481 mov \$0x3ffffff,%eax # save r^4 base 2^26
485 mov %eax,`16*0+8-64`($ctx)
489 mov %edx,`16*1+8-64`($ctx)
490 lea (%rdx,%rdx,4),%edx # *5
492 mov %edx,`16*2+8-64`($ctx)
498 mov %eax,`16*3+8-64`($ctx)
499 lea (%rax,%rax,4),%eax # *5
501 mov %eax,`16*4+8-64`($ctx)
506 mov %edx,`16*5+8-64`($ctx)
507 lea (%rdx,%rdx,4),%edx # *5
509 mov %edx,`16*6+8-64`($ctx)
514 mov $d1#d,`16*7+8-64`($ctx)
515 lea ($d1,$d1,4),$d1 # *5
516 mov $d1#d,`16*8+8-64`($ctx)
518 lea -48-64($ctx),$ctx # size [de-]optimization
521 .size __poly1305_init_avx,.-__poly1305_init_avx
523 .type poly1305_blocks_avx,\@function,4
527 mov 20($ctx),%r8d # is_base2_26
559 mov $len,%r15 # reassign $len
561 mov 0($ctx),$d1 # load hash value
565 mov 24($ctx),$r0 # load r
568 ################################# base 2^26 -> base 2^64
570 and \$`-1*(1<<31)`,$d1
571 mov $d2,$r1 # borrow $r1
573 and \$`-1*(1<<31)`,$d2
587 adc \$0,$h2 # can be partially reduced...
589 mov \$-4,$d2 # ... so reduce
602 add $r1,$s1 # s1 = r1 + (r1 >> 2)
604 add 0($inp),$h0 # accumulate input
609 call __poly1305_block
611 test $padbit,$padbit # if $padbit is zero,
612 jz .Lstore_base2_64_avx # store hash in base 2^64 format
614 ################################# base 2^64 -> base 2^26
621 and \$0x3ffffff,%rax # h[0]
623 and \$0x3ffffff,%rdx # h[1]
627 and \$0x3ffffff,$h0 # h[2]
629 and \$0x3ffffff,$h1 # h[3]
633 jz .Lstore_base2_26_avx
643 .Lstore_base2_64_avx:
646 mov $h2,16($ctx) # note that is_base2_26 is zeroed
650 .Lstore_base2_26_avx:
651 mov %rax#d,0($ctx) # store hash value base 2^26
671 .cfi_adjust_cfa_offset -48
673 .Lblocks_avx_epilogue:
694 mov $len,%r15 # reassign $len
696 mov 24($ctx),$r0 # load r
699 mov 0($ctx),$h0 # load hash value
706 add $r1,$s1 # s1 = r1 + (r1 >> 2)
711 add 0($inp),$h0 # accumulate input
717 call __poly1305_block
720 ################################# base 2^64 -> base 2^26
727 and \$0x3ffffff,%rax # h[0]
729 and \$0x3ffffff,%rdx # h[1]
733 and \$0x3ffffff,$h0 # h[2]
735 and \$0x3ffffff,$h1 # h[3]
743 movl \$1,20($ctx) # set is_base2_26
745 call __poly1305_init_avx
764 .cfi_adjust_cfa_offset -48
765 .Lbase2_64_avx_epilogue:
772 vmovd 4*0($ctx),$H0 # load hash value
780 $code.=<<___ if (!$win64);
782 .cfi_def_cfa %r11,0x60
785 $code.=<<___ if ($win64);
788 vmovdqa %xmm6,0x50(%r11)
789 vmovdqa %xmm7,0x60(%r11)
790 vmovdqa %xmm8,0x70(%r11)
791 vmovdqa %xmm9,0x80(%r11)
792 vmovdqa %xmm10,0x90(%r11)
793 vmovdqa %xmm11,0xa0(%r11)
794 vmovdqa %xmm12,0xb0(%r11)
795 vmovdqa %xmm13,0xc0(%r11)
796 vmovdqa %xmm14,0xd0(%r11)
797 vmovdqa %xmm15,0xe0(%r11)
805 vmovdqu `16*3`($ctx),$D4 # preload r0^2
806 lea `16*3+64`($ctx),$ctx # size optimization
807 lea .Lconst(%rip),%rcx
809 ################################################################
811 vmovdqu 16*2($inp),$T0
812 vmovdqu 16*3($inp),$T1
813 vmovdqa 64(%rcx),$MASK # .Lmask26
815 vpsrldq \$6,$T0,$T2 # splat input
817 vpunpckhqdq $T1,$T0,$T4 # 4
818 vpunpcklqdq $T1,$T0,$T0 # 0:1
819 vpunpcklqdq $T3,$T2,$T3 # 2:3
821 vpsrlq \$40,$T4,$T4 # 4
823 vpand $MASK,$T0,$T0 # 0
825 vpand $MASK,$T1,$T1 # 1
827 vpand $MASK,$T2,$T2 # 2
828 vpand $MASK,$T3,$T3 # 3
829 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
833 # expand and copy pre-calculated table to stack
834 vmovdqu `16*1-64`($ctx),$D1
835 vmovdqu `16*2-64`($ctx),$D2
836 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
837 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
838 vmovdqa $D3,-0x90(%r11)
839 vmovdqa $D0,0x00(%rsp)
840 vpshufd \$0xEE,$D1,$D4
841 vmovdqu `16*3-64`($ctx),$D0
842 vpshufd \$0x44,$D1,$D1
843 vmovdqa $D4,-0x80(%r11)
844 vmovdqa $D1,0x10(%rsp)
845 vpshufd \$0xEE,$D2,$D3
846 vmovdqu `16*4-64`($ctx),$D1
847 vpshufd \$0x44,$D2,$D2
848 vmovdqa $D3,-0x70(%r11)
849 vmovdqa $D2,0x20(%rsp)
850 vpshufd \$0xEE,$D0,$D4
851 vmovdqu `16*5-64`($ctx),$D2
852 vpshufd \$0x44,$D0,$D0
853 vmovdqa $D4,-0x60(%r11)
854 vmovdqa $D0,0x30(%rsp)
855 vpshufd \$0xEE,$D1,$D3
856 vmovdqu `16*6-64`($ctx),$D0
857 vpshufd \$0x44,$D1,$D1
858 vmovdqa $D3,-0x50(%r11)
859 vmovdqa $D1,0x40(%rsp)
860 vpshufd \$0xEE,$D2,$D4
861 vmovdqu `16*7-64`($ctx),$D1
862 vpshufd \$0x44,$D2,$D2
863 vmovdqa $D4,-0x40(%r11)
864 vmovdqa $D2,0x50(%rsp)
865 vpshufd \$0xEE,$D0,$D3
866 vmovdqu `16*8-64`($ctx),$D2
867 vpshufd \$0x44,$D0,$D0
868 vmovdqa $D3,-0x30(%r11)
869 vmovdqa $D0,0x60(%rsp)
870 vpshufd \$0xEE,$D1,$D4
871 vpshufd \$0x44,$D1,$D1
872 vmovdqa $D4,-0x20(%r11)
873 vmovdqa $D1,0x70(%rsp)
874 vpshufd \$0xEE,$D2,$D3
875 vmovdqa 0x00(%rsp),$D4 # preload r0^2
876 vpshufd \$0x44,$D2,$D2
877 vmovdqa $D3,-0x10(%r11)
878 vmovdqa $D2,0x80(%rsp)
884 ################################################################
885 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
886 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
887 # \___________________/
888 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
889 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
890 # \___________________/ \____________________/
892 # Note that we start with inp[2:3]*r^2. This is because it
893 # doesn't depend on reduction in previous iteration.
894 ################################################################
895 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
896 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
897 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
898 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
899 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
901 # though note that $Tx and $Hx are "reversed" in this section,
902 # and $D4 is preloaded with r0^2...
904 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
905 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
906 vmovdqa $H2,0x20(%r11) # offload hash
907 vpmuludq $T2,$D4,$D2 # d3 = h2*r0
908 vmovdqa 0x10(%rsp),$H2 # r1^2
909 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
910 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
912 vmovdqa $H0,0x00(%r11) #
913 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
914 vmovdqa $H1,0x10(%r11) #
915 vpmuludq $T3,$H2,$H1 # h3*r1
916 vpaddq $H0,$D0,$D0 # d0 += h4*s1
917 vpaddq $H1,$D4,$D4 # d4 += h3*r1
918 vmovdqa $H3,0x30(%r11) #
919 vpmuludq $T2,$H2,$H0 # h2*r1
920 vpmuludq $T1,$H2,$H1 # h1*r1
921 vpaddq $H0,$D3,$D3 # d3 += h2*r1
922 vmovdqa 0x30(%rsp),$H3 # r2^2
923 vpaddq $H1,$D2,$D2 # d2 += h1*r1
924 vmovdqa $H4,0x40(%r11) #
925 vpmuludq $T0,$H2,$H2 # h0*r1
926 vpmuludq $T2,$H3,$H0 # h2*r2
927 vpaddq $H2,$D1,$D1 # d1 += h0*r1
929 vmovdqa 0x40(%rsp),$H4 # s2^2
930 vpaddq $H0,$D4,$D4 # d4 += h2*r2
931 vpmuludq $T1,$H3,$H1 # h1*r2
932 vpmuludq $T0,$H3,$H3 # h0*r2
933 vpaddq $H1,$D3,$D3 # d3 += h1*r2
934 vmovdqa 0x50(%rsp),$H2 # r3^2
935 vpaddq $H3,$D2,$D2 # d2 += h0*r2
936 vpmuludq $T4,$H4,$H0 # h4*s2
937 vpmuludq $T3,$H4,$H4 # h3*s2
938 vpaddq $H0,$D1,$D1 # d1 += h4*s2
939 vmovdqa 0x60(%rsp),$H3 # s3^2
940 vpaddq $H4,$D0,$D0 # d0 += h3*s2
942 vmovdqa 0x80(%rsp),$H4 # s4^2
943 vpmuludq $T1,$H2,$H1 # h1*r3
944 vpmuludq $T0,$H2,$H2 # h0*r3
945 vpaddq $H1,$D4,$D4 # d4 += h1*r3
946 vpaddq $H2,$D3,$D3 # d3 += h0*r3
947 vpmuludq $T4,$H3,$H0 # h4*s3
948 vpmuludq $T3,$H3,$H1 # h3*s3
949 vpaddq $H0,$D2,$D2 # d2 += h4*s3
950 vmovdqu 16*0($inp),$H0 # load input
951 vpaddq $H1,$D1,$D1 # d1 += h3*s3
952 vpmuludq $T2,$H3,$H3 # h2*s3
953 vpmuludq $T2,$H4,$T2 # h2*s4
954 vpaddq $H3,$D0,$D0 # d0 += h2*s3
956 vmovdqu 16*1($inp),$H1 #
957 vpaddq $T2,$D1,$D1 # d1 += h2*s4
958 vpmuludq $T3,$H4,$T3 # h3*s4
959 vpmuludq $T4,$H4,$T4 # h4*s4
960 vpsrldq \$6,$H0,$H2 # splat input
961 vpaddq $T3,$D2,$D2 # d2 += h3*s4
962 vpaddq $T4,$D3,$D3 # d3 += h4*s4
963 vpsrldq \$6,$H1,$H3 #
964 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
965 vpmuludq $T1,$H4,$T0 # h1*s4
966 vpunpckhqdq $H1,$H0,$H4 # 4
967 vpaddq $T4,$D4,$D4 # d4 += h0*r4
968 vmovdqa -0x90(%r11),$T4 # r0^4
969 vpaddq $T0,$D0,$D0 # d0 += h1*s4
971 vpunpcklqdq $H1,$H0,$H0 # 0:1
972 vpunpcklqdq $H3,$H2,$H3 # 2:3
974 #vpsrlq \$40,$H4,$H4 # 4
975 vpsrldq \$`40/8`,$H4,$H4 # 4
977 vpand $MASK,$H0,$H0 # 0
979 vpand $MASK,$H1,$H1 # 1
980 vpand 0(%rcx),$H4,$H4 # .Lmask24
982 vpand $MASK,$H2,$H2 # 2
983 vpand $MASK,$H3,$H3 # 3
984 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
986 vpaddq 0x00(%r11),$H0,$H0 # add hash value
987 vpaddq 0x10(%r11),$H1,$H1
988 vpaddq 0x20(%r11),$H2,$H2
989 vpaddq 0x30(%r11),$H3,$H3
990 vpaddq 0x40(%r11),$H4,$H4
997 ################################################################
998 # Now we accumulate (inp[0:1]+hash)*r^4
999 ################################################################
1000 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1001 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1002 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1003 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1004 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1006 vpmuludq $H0,$T4,$T0 # h0*r0
1007 vpmuludq $H1,$T4,$T1 # h1*r0
1010 vmovdqa -0x80(%r11),$T2 # r1^4
1011 vpmuludq $H2,$T4,$T0 # h2*r0
1012 vpmuludq $H3,$T4,$T1 # h3*r0
1015 vpmuludq $H4,$T4,$T4 # h4*r0
1016 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
1019 vpaddq $T0,$D0,$D0 # d0 += h4*s1
1020 vpmuludq $H2,$T2,$T1 # h2*r1
1021 vpmuludq $H3,$T2,$T0 # h3*r1
1022 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1023 vmovdqa -0x60(%r11),$T3 # r2^4
1024 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1025 vpmuludq $H1,$T2,$T1 # h1*r1
1026 vpmuludq $H0,$T2,$T2 # h0*r1
1027 vpaddq $T1,$D2,$D2 # d2 += h1*r1
1028 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1030 vmovdqa -0x50(%r11),$T4 # s2^4
1031 vpmuludq $H2,$T3,$T0 # h2*r2
1032 vpmuludq $H1,$T3,$T1 # h1*r2
1033 vpaddq $T0,$D4,$D4 # d4 += h2*r2
1034 vpaddq $T1,$D3,$D3 # d3 += h1*r2
1035 vmovdqa -0x40(%r11),$T2 # r3^4
1036 vpmuludq $H0,$T3,$T3 # h0*r2
1037 vpmuludq $H4,$T4,$T0 # h4*s2
1038 vpaddq $T3,$D2,$D2 # d2 += h0*r2
1039 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1040 vmovdqa -0x30(%r11),$T3 # s3^4
1041 vpmuludq $H3,$T4,$T4 # h3*s2
1042 vpmuludq $H1,$T2,$T1 # h1*r3
1043 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1045 vmovdqa -0x10(%r11),$T4 # s4^4
1046 vpaddq $T1,$D4,$D4 # d4 += h1*r3
1047 vpmuludq $H0,$T2,$T2 # h0*r3
1048 vpmuludq $H4,$T3,$T0 # h4*s3
1049 vpaddq $T2,$D3,$D3 # d3 += h0*r3
1050 vpaddq $T0,$D2,$D2 # d2 += h4*s3
1051 vmovdqu 16*2($inp),$T0 # load input
1052 vpmuludq $H3,$T3,$T2 # h3*s3
1053 vpmuludq $H2,$T3,$T3 # h2*s3
1054 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1055 vmovdqu 16*3($inp),$T1 #
1056 vpaddq $T3,$D0,$D0 # d0 += h2*s3
1058 vpmuludq $H2,$T4,$H2 # h2*s4
1059 vpmuludq $H3,$T4,$H3 # h3*s4
1060 vpsrldq \$6,$T0,$T2 # splat input
1061 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1062 vpmuludq $H4,$T4,$H4 # h4*s4
1063 vpsrldq \$6,$T1,$T3 #
1064 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1065 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1066 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
1067 vpmuludq $H1,$T4,$H0
1068 vpunpckhqdq $T1,$T0,$T4 # 4
1069 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1070 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1072 vpunpcklqdq $T1,$T0,$T0 # 0:1
1073 vpunpcklqdq $T3,$T2,$T3 # 2:3
1075 #vpsrlq \$40,$T4,$T4 # 4
1076 vpsrldq \$`40/8`,$T4,$T4 # 4
1078 vmovdqa 0x00(%rsp),$D4 # preload r0^2
1079 vpand $MASK,$T0,$T0 # 0
1081 vpand $MASK,$T1,$T1 # 1
1082 vpand 0(%rcx),$T4,$T4 # .Lmask24
1084 vpand $MASK,$T2,$T2 # 2
1085 vpand $MASK,$T3,$T3 # 3
1086 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1088 ################################################################
1089 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1094 vpaddq $D3,$H4,$H4 # h3 -> h4
1098 vpaddq $D0,$D1,$H1 # h0 -> h1
1105 vpaddq $D1,$H2,$H2 # h1 -> h2
1109 vpaddq $D0,$H0,$H0 # h4 -> h0
1113 vpaddq $D2,$H3,$H3 # h2 -> h3
1117 vpaddq $D0,$H1,$H1 # h0 -> h1
1121 vpaddq $D3,$H4,$H4 # h3 -> h4
1126 ################################################################
1127 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1129 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1140 vmovdqa $H2,0x20(%r11)
1141 vmovdqa $H0,0x00(%r11)
1142 vmovdqa $H1,0x10(%r11)
1143 vmovdqa $H3,0x30(%r11)
1144 vmovdqa $H4,0x40(%r11)
1146 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1147 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1148 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1149 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1150 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1152 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1153 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1154 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1155 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1156 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1157 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1159 vpmuludq $T3,$H2,$H0 # h3*r1
1160 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1161 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1162 vpmuludq $T2,$H2,$H1 # h2*r1
1163 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1164 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1165 vpmuludq $T1,$H2,$H0 # h1*r1
1166 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1167 vpmuludq $T0,$H2,$H2 # h0*r1
1168 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1169 vpmuludq $T4,$H3,$H3 # h4*s1
1170 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1172 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1173 vpmuludq $T2,$H4,$H1 # h2*r2
1174 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1175 vpmuludq $T1,$H4,$H0 # h1*r2
1176 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1177 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1178 vpmuludq $T0,$H4,$H4 # h0*r2
1179 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1180 vpmuludq $T4,$H2,$H1 # h4*s2
1181 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1182 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1183 vpmuludq $T3,$H2,$H2 # h3*s2
1184 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1186 vpmuludq $T1,$H3,$H0 # h1*r3
1187 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1188 vpmuludq $T0,$H3,$H3 # h0*r3
1189 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1190 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1191 vpmuludq $T4,$H4,$H1 # h4*s3
1192 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1193 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1194 vpmuludq $T3,$H4,$H0 # h3*s3
1195 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1196 vpmuludq $T2,$H4,$H4 # h2*s3
1197 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1199 vpmuludq $T0,$H2,$H2 # h0*r4
1200 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1201 vpmuludq $T4,$H3,$H1 # h4*s4
1202 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1203 vpmuludq $T3,$H3,$H0 # h3*s4
1204 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1205 vpmuludq $T2,$H3,$H1 # h2*s4
1206 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1207 vpmuludq $T1,$H3,$H3 # h1*s4
1208 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1212 vmovdqu 16*0($inp),$H0 # load input
1213 vmovdqu 16*1($inp),$H1
1215 vpsrldq \$6,$H0,$H2 # splat input
1217 vpunpckhqdq $H1,$H0,$H4 # 4
1218 vpunpcklqdq $H1,$H0,$H0 # 0:1
1219 vpunpcklqdq $H3,$H2,$H3 # 2:3
1221 vpsrlq \$40,$H4,$H4 # 4
1223 vpand $MASK,$H0,$H0 # 0
1225 vpand $MASK,$H1,$H1 # 1
1227 vpand $MASK,$H2,$H2 # 2
1228 vpand $MASK,$H3,$H3 # 3
1229 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1231 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1232 vpaddq 0x00(%r11),$H0,$H0
1233 vpaddq 0x10(%r11),$H1,$H1
1234 vpaddq 0x20(%r11),$H2,$H2
1235 vpaddq 0x30(%r11),$H3,$H3
1236 vpaddq 0x40(%r11),$H4,$H4
1238 ################################################################
1239 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1241 vpmuludq $H0,$T4,$T0 # h0*r0
1242 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1243 vpmuludq $H1,$T4,$T1 # h1*r0
1244 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1245 vpmuludq $H2,$T4,$T0 # h2*r0
1246 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1247 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1248 vpmuludq $H3,$T4,$T1 # h3*r0
1249 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1250 vpmuludq $H4,$T4,$T4 # h4*r0
1251 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1253 vpmuludq $H3,$T2,$T0 # h3*r1
1254 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1255 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1256 vpmuludq $H2,$T2,$T1 # h2*r1
1257 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1258 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1259 vpmuludq $H1,$T2,$T0 # h1*r1
1260 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1261 vpmuludq $H0,$T2,$T2 # h0*r1
1262 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1263 vpmuludq $H4,$T3,$T3 # h4*s1
1264 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1266 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1267 vpmuludq $H2,$T4,$T1 # h2*r2
1268 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1269 vpmuludq $H1,$T4,$T0 # h1*r2
1270 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1271 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1272 vpmuludq $H0,$T4,$T4 # h0*r2
1273 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1274 vpmuludq $H4,$T2,$T1 # h4*s2
1275 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1276 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1277 vpmuludq $H3,$T2,$T2 # h3*s2
1278 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1280 vpmuludq $H1,$T3,$T0 # h1*r3
1281 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1282 vpmuludq $H0,$T3,$T3 # h0*r3
1283 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1284 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1285 vpmuludq $H4,$T4,$T1 # h4*s3
1286 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1287 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1288 vpmuludq $H3,$T4,$T0 # h3*s3
1289 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1290 vpmuludq $H2,$T4,$T4 # h2*s3
1291 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1293 vpmuludq $H0,$T2,$T2 # h0*r4
1294 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1295 vpmuludq $H4,$T3,$T1 # h4*s4
1296 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1297 vpmuludq $H3,$T3,$T0 # h3*s4
1298 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1299 vpmuludq $H2,$T3,$T1 # h2*s4
1300 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1301 vpmuludq $H1,$T3,$T3 # h1*s4
1302 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1305 ################################################################
1306 # horizontal addition
1319 ################################################################
1324 vpaddq $H3,$D4,$D4 # h3 -> h4
1328 vpaddq $H0,$D1,$D1 # h0 -> h1
1335 vpaddq $H1,$D2,$D2 # h1 -> h2
1339 vpaddq $H4,$D0,$D0 # h4 -> h0
1343 vpaddq $H2,$D3,$D3 # h2 -> h3
1347 vpaddq $H0,$D1,$D1 # h0 -> h1
1351 vpaddq $H3,$D4,$D4 # h3 -> h4
1353 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1354 vmovd $D1,`4*1-48-64`($ctx)
1355 vmovd $D2,`4*2-48-64`($ctx)
1356 vmovd $D3,`4*3-48-64`($ctx)
1357 vmovd $D4,`4*4-48-64`($ctx)
1359 $code.=<<___ if ($win64);
1360 vmovdqa 0x50(%r11),%xmm6
1361 vmovdqa 0x60(%r11),%xmm7
1362 vmovdqa 0x70(%r11),%xmm8
1363 vmovdqa 0x80(%r11),%xmm9
1364 vmovdqa 0x90(%r11),%xmm10
1365 vmovdqa 0xa0(%r11),%xmm11
1366 vmovdqa 0xb0(%r11),%xmm12
1367 vmovdqa 0xc0(%r11),%xmm13
1368 vmovdqa 0xd0(%r11),%xmm14
1369 vmovdqa 0xe0(%r11),%xmm15
1373 $code.=<<___ if (!$win64);
1381 .size poly1305_blocks_avx,.-poly1305_blocks_avx
1383 .type poly1305_emit_avx,\@function,3
1387 cmpl \$0,20($ctx) # is_base2_26?
1390 mov 0($ctx),%eax # load hash value base 2^26
1396 shl \$26,%rcx # base 2^26 -> base 2^64
1412 mov %r10,%rax # could be partially reduced, so reduce
1423 add \$5,%r8 # compare to modulus
1427 shr \$2,%r10 # did 130-bit value overflow?
1431 add 0($nonce),%rax # accumulate nonce
1433 mov %rax,0($mac) # write result
1438 .size poly1305_emit_avx,.-poly1305_emit_avx
1442 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1443 map("%ymm$_",(0..15));
1447 .type poly1305_blocks_avx2,\@function,4
1449 poly1305_blocks_avx2:
1451 mov 20($ctx),%r8d # is_base2_26
1483 mov $len,%r15 # reassign $len
1485 mov 0($ctx),$d1 # load hash value
1489 mov 24($ctx),$r0 # load r
1492 ################################# base 2^26 -> base 2^64
1494 and \$`-1*(1<<31)`,$d1
1495 mov $d2,$r1 # borrow $r1
1497 and \$`-1*(1<<31)`,$d2
1511 adc \$0,$h2 # can be partially reduced...
1513 mov \$-4,$d2 # ... so reduce
1526 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1528 .Lbase2_26_pre_avx2:
1529 add 0($inp),$h0 # accumulate input
1535 call __poly1305_block
1539 jnz .Lbase2_26_pre_avx2
1541 test $padbit,$padbit # if $padbit is zero,
1542 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1544 ################################# base 2^64 -> base 2^26
1551 and \$0x3ffffff,%rax # h[0]
1553 and \$0x3ffffff,%rdx # h[1]
1557 and \$0x3ffffff,$h0 # h[2]
1559 and \$0x3ffffff,$h1 # h[3]
1563 jz .Lstore_base2_26_avx2
1573 .Lstore_base2_64_avx2:
1576 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1580 .Lstore_base2_26_avx2:
1581 mov %rax#d,0($ctx) # store hash value base 2^26
1601 .cfi_adjust_cfa_offset -48
1603 .Lblocks_avx2_epilogue:
1622 .Lbase2_64_avx2_body:
1624 mov $len,%r15 # reassign $len
1626 mov 24($ctx),$r0 # load r
1629 mov 0($ctx),$h0 # load hash value
1636 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1641 .Lbase2_64_pre_avx2:
1642 add 0($inp),$h0 # accumulate input
1648 call __poly1305_block
1652 jnz .Lbase2_64_pre_avx2
1655 ################################# base 2^64 -> base 2^26
1662 and \$0x3ffffff,%rax # h[0]
1664 and \$0x3ffffff,%rdx # h[1]
1668 and \$0x3ffffff,$h0 # h[2]
1670 and \$0x3ffffff,$h1 # h[3]
1678 movl \$1,20($ctx) # set is_base2_26
1680 call __poly1305_init_avx
1683 mov %r15,$len # restore $len
1684 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1685 mov \$`(1<<31|1<<30|1<<16)`,%r11d
1701 .cfi_adjust_cfa_offset -48
1702 .Lbase2_64_avx2_epilogue:
1709 mov OPENSSL_ia32cap_P+8(%rip),%r10d
1710 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1711 vmovd 4*1($ctx),%x#$H1
1712 vmovd 4*2($ctx),%x#$H2
1713 vmovd 4*3($ctx),%x#$H3
1714 vmovd 4*4($ctx),%x#$H4
1718 $code.=<<___ if ($avx>2);
1722 test \$`1<<16`,%r10d # check for AVX512F
1726 $code.=<<___ if (!$win64);
1728 .cfi_def_cfa %r11,16
1731 $code.=<<___ if ($win64);
1732 lea -0xf8(%rsp),%r11
1734 vmovdqa %xmm6,0x50(%r11)
1735 vmovdqa %xmm7,0x60(%r11)
1736 vmovdqa %xmm8,0x70(%r11)
1737 vmovdqa %xmm9,0x80(%r11)
1738 vmovdqa %xmm10,0x90(%r11)
1739 vmovdqa %xmm11,0xa0(%r11)
1740 vmovdqa %xmm12,0xb0(%r11)
1741 vmovdqa %xmm13,0xc0(%r11)
1742 vmovdqa %xmm14,0xd0(%r11)
1743 vmovdqa %xmm15,0xe0(%r11)
1747 lea .Lconst(%rip),%rcx
1748 lea 48+64($ctx),$ctx # size optimization
1749 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
1751 # expand and copy pre-calculated table to stack
1752 vmovdqu `16*0-64`($ctx),%x#$T2
1754 vmovdqu `16*1-64`($ctx),%x#$T3
1755 vmovdqu `16*2-64`($ctx),%x#$T4
1756 vmovdqu `16*3-64`($ctx),%x#$D0
1757 vmovdqu `16*4-64`($ctx),%x#$D1
1758 vmovdqu `16*5-64`($ctx),%x#$D2
1759 lea 0x90(%rsp),%rax # size optimization
1760 vmovdqu `16*6-64`($ctx),%x#$D3
1761 vpermd $T2,$T0,$T2 # 00003412 -> 14243444
1762 vmovdqu `16*7-64`($ctx),%x#$D4
1764 vmovdqu `16*8-64`($ctx),%x#$MASK
1766 vmovdqa $T2,0x00(%rsp)
1768 vmovdqa $T3,0x20-0x90(%rax)
1770 vmovdqa $T4,0x40-0x90(%rax)
1772 vmovdqa $D0,0x60-0x90(%rax)
1774 vmovdqa $D1,0x80-0x90(%rax)
1776 vmovdqa $D2,0xa0-0x90(%rax)
1777 vpermd $MASK,$T0,$MASK
1778 vmovdqa $D3,0xc0-0x90(%rax)
1779 vmovdqa $D4,0xe0-0x90(%rax)
1780 vmovdqa $MASK,0x100-0x90(%rax)
1781 vmovdqa 64(%rcx),$MASK # .Lmask26
1783 ################################################################
1785 vmovdqu 16*0($inp),%x#$T0
1786 vmovdqu 16*1($inp),%x#$T1
1787 vinserti128 \$1,16*2($inp),$T0,$T0
1788 vinserti128 \$1,16*3($inp),$T1,$T1
1791 vpsrldq \$6,$T0,$T2 # splat input
1793 vpunpckhqdq $T1,$T0,$T4 # 4
1794 vpunpcklqdq $T3,$T2,$T2 # 2:3
1795 vpunpcklqdq $T1,$T0,$T0 # 0:1
1800 vpsrlq \$40,$T4,$T4 # 4
1801 vpand $MASK,$T2,$T2 # 2
1802 vpand $MASK,$T0,$T0 # 0
1803 vpand $MASK,$T1,$T1 # 1
1804 vpand $MASK,$T3,$T3 # 3
1805 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1807 vpaddq $H2,$T2,$H2 # accumulate input
1814 ################################################################
1815 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
1816 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1817 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1818 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1819 # \________/\__________/
1820 ################################################################
1821 #vpaddq $H2,$T2,$H2 # accumulate input
1823 vmovdqa `32*0`(%rsp),$T0 # r0^4
1825 vmovdqa `32*1`(%rsp),$T1 # r1^4
1827 vmovdqa `32*3`(%rsp),$T2 # r2^4
1829 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1830 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1832 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1833 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1834 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1835 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1836 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1838 # however, as h2 is "chronologically" first one available pull
1839 # corresponding operations up, so it's
1841 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1842 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1843 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1844 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1845 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1847 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1848 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1849 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1850 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1851 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1853 vpmuludq $H0,$T1,$T4 # h0*r1
1854 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1855 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1856 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1857 vpmuludq $H3,$T1,$T4 # h3*r1
1858 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1859 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1860 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1861 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1863 vpmuludq $H0,$T0,$T4 # h0*r0
1864 vpmuludq $H1,$T0,$H2 # h1*r0
1865 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1866 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1867 vpmuludq $H3,$T0,$T4 # h3*r0
1868 vpmuludq $H4,$T0,$H2 # h4*r0
1869 vmovdqu 16*0($inp),%x#$T0 # load input
1870 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1871 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1872 vinserti128 \$1,16*2($inp),$T0,$T0
1874 vpmuludq $H3,$T1,$T4 # h3*s2
1875 vpmuludq $H4,$T1,$H2 # h4*s2
1876 vmovdqu 16*1($inp),%x#$T1
1877 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1878 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1879 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1880 vpmuludq $H1,$T2,$T4 # h1*r2
1881 vpmuludq $H0,$T2,$T2 # h0*r2
1882 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1883 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1884 vinserti128 \$1,16*3($inp),$T1,$T1
1887 vpmuludq $H1,$H2,$T4 # h1*r3
1888 vpmuludq $H0,$H2,$H2 # h0*r3
1889 vpsrldq \$6,$T0,$T2 # splat input
1890 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1891 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1892 vpmuludq $H3,$T3,$T4 # h3*s3
1893 vpmuludq $H4,$T3,$H2 # h4*s3
1895 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1896 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1897 vpunpckhqdq $T1,$T0,$T4 # 4
1899 vpmuludq $H3,$S4,$H3 # h3*s4
1900 vpmuludq $H4,$S4,$H4 # h4*s4
1901 vpunpcklqdq $T1,$T0,$T0 # 0:1
1902 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
1903 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
1904 vpunpcklqdq $T3,$T2,$T3 # 2:3
1905 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1906 vpmuludq $H1,$S4,$H0 # h1*s4
1907 vmovdqa 64(%rcx),$MASK # .Lmask26
1908 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1909 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1911 ################################################################
1912 # lazy reduction (interleaved with tail of input splat)
1916 vpaddq $D3,$H4,$H4 # h3 -> h4
1920 vpaddq $D0,$D1,$H1 # h0 -> h1
1929 vpaddq $D1,$H2,$H2 # h1 -> h2
1933 vpaddq $D4,$H0,$H0 # h4 -> h0
1935 vpand $MASK,$T2,$T2 # 2
1940 vpaddq $D2,$H3,$H3 # h2 -> h3
1942 vpaddq $T2,$H2,$H2 # modulo-scheduled
1947 vpaddq $D0,$H1,$H1 # h0 -> h1
1949 vpsrlq \$40,$T4,$T4 # 4
1953 vpaddq $D3,$H4,$H4 # h3 -> h4
1955 vpand $MASK,$T0,$T0 # 0
1956 vpand $MASK,$T1,$T1 # 1
1957 vpand $MASK,$T3,$T3 # 3
1958 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1965 ################################################################
1966 # while above multiplications were by r^4 in all lanes, in last
1967 # iteration we multiply least significant lane by r^4 and most
1968 # significant one by r, so copy of above except that references
1969 # to the precomputed table are displaced by 4...
1971 #vpaddq $H2,$T2,$H2 # accumulate input
1973 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1975 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1977 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1979 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1980 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1982 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1983 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1984 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1985 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1986 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1988 vpmuludq $H0,$T1,$T4 # h0*r1
1989 vpmuludq $H1,$T1,$H2 # h1*r1
1990 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1991 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1992 vpmuludq $H3,$T1,$T4 # h3*r1
1993 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1994 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1995 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1997 vpmuludq $H0,$T0,$T4 # h0*r0
1998 vpmuludq $H1,$T0,$H2 # h1*r0
1999 vpaddq $T4,$D0,$D0 # d0 += h0*r0
2000 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
2001 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2002 vpmuludq $H3,$T0,$T4 # h3*r0
2003 vpmuludq $H4,$T0,$H2 # h4*r0
2004 vpaddq $T4,$D3,$D3 # d3 += h3*r0
2005 vpaddq $H2,$D4,$D4 # d4 += h4*r0
2007 vpmuludq $H3,$T1,$T4 # h3*s2
2008 vpmuludq $H4,$T1,$H2 # h4*s2
2009 vpaddq $T4,$D0,$D0 # d0 += h3*s2
2010 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2011 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
2012 vpmuludq $H1,$T2,$T4 # h1*r2
2013 vpmuludq $H0,$T2,$T2 # h0*r2
2014 vpaddq $T4,$D3,$D3 # d3 += h1*r2
2015 vpaddq $T2,$D2,$D2 # d2 += h0*r2
2017 vpmuludq $H1,$H2,$T4 # h1*r3
2018 vpmuludq $H0,$H2,$H2 # h0*r3
2019 vpaddq $T4,$D4,$D4 # d4 += h1*r3
2020 vpaddq $H2,$D3,$D3 # d3 += h0*r3
2021 vpmuludq $H3,$T3,$T4 # h3*s3
2022 vpmuludq $H4,$T3,$H2 # h4*s3
2023 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2024 vpaddq $H2,$D2,$D2 # d2 += h4*s3
2026 vpmuludq $H3,$S4,$H3 # h3*s4
2027 vpmuludq $H4,$S4,$H4 # h4*s4
2028 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
2029 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
2030 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
2031 vpmuludq $H1,$S4,$H0 # h1*s4
2032 vmovdqa 64(%rcx),$MASK # .Lmask26
2033 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
2034 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
2036 ################################################################
2037 # horizontal addition
2050 vpermq \$0x2,$H3,$T3
2051 vpermq \$0x2,$H4,$T4
2052 vpermq \$0x2,$H0,$T0
2053 vpermq \$0x2,$D1,$T1
2054 vpermq \$0x2,$H2,$T2
2061 ################################################################
2066 vpaddq $D3,$H4,$H4 # h3 -> h4
2070 vpaddq $D0,$D1,$H1 # h0 -> h1
2077 vpaddq $D1,$H2,$H2 # h1 -> h2
2081 vpaddq $D4,$H0,$H0 # h4 -> h0
2085 vpaddq $D2,$H3,$H3 # h2 -> h3
2089 vpaddq $D0,$H1,$H1 # h0 -> h1
2093 vpaddq $D3,$H4,$H4 # h3 -> h4
2095 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2096 vmovd %x#$H1,`4*1-48-64`($ctx)
2097 vmovd %x#$H2,`4*2-48-64`($ctx)
2098 vmovd %x#$H3,`4*3-48-64`($ctx)
2099 vmovd %x#$H4,`4*4-48-64`($ctx)
2101 $code.=<<___ if ($win64);
2102 vmovdqa 0x50(%r11),%xmm6
2103 vmovdqa 0x60(%r11),%xmm7
2104 vmovdqa 0x70(%r11),%xmm8
2105 vmovdqa 0x80(%r11),%xmm9
2106 vmovdqa 0x90(%r11),%xmm10
2107 vmovdqa 0xa0(%r11),%xmm11
2108 vmovdqa 0xb0(%r11),%xmm12
2109 vmovdqa 0xc0(%r11),%xmm13
2110 vmovdqa 0xd0(%r11),%xmm14
2111 vmovdqa 0xe0(%r11),%xmm15
2115 $code.=<<___ if (!$win64);
2123 .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
2125 #######################################################################
2127 # On entry we have input length divisible by 64. But since inner loop
2128 # processes 128 bytes per iteration, cases when length is not divisible
2129 # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
2130 # reason stack layout is kept identical to poly1305_blocks_avx2. If not
2131 # for this tail, we wouldn't have to even allocate stack frame...
# Register assignments for the AVX-512 code path: key powers ($R0..$R4),
# their *5 companions ($S1..$S4), multiplication scratch ($M0..$M4) and the
# padbit broadcast occupy %zmm16-%zmm30 (the EVEX-only registers, which do
# not need saving on Win64).  The register names inherited from the AVX2
# path are strings like "%ymm0"; the in-place s/%y/%z/ below retargets the
# same variables to the 512-bit %zmm domain for the code emitted next.
2133 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
2134 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
2135 my $PADBIT="%zmm30";
2137 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
2138 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2139 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
2140 map(s/%y/%z/,($MASK));
# poly1305_blocks_avx512(ctx, inp, len, padbit) — AVX-512F Poly1305 block
# function.  Hash value and key powers are held as five 26-bit limbs per
# 64-bit lane; eight 16-byte message blocks are folded per loop iteration
# (all lanes multiplied by r^8), and the last iteration multiplies lane i
# by a descending power via a shifted copy of the power table.
# NOTE(review): this excerpt is a non-contiguous listing (gaps in the
# embedded line numbering), so instructions between the visible ones are
# not shown here — comments below describe only what is visible.
2143 .type poly1305_blocks_avx512,\@function,4
2145 poly1305_blocks_avx512:
2151 $code.=<<___ if (!$win64);
2153 .cfi_def_cfa %r11,16
2156 $code.=<<___ if ($win64);
# Win64 ABI: xmm6-xmm15 are non-volatile and must be preserved.
2157 lea -0xf8(%rsp),%r11
2159 vmovdqa %xmm6,0x50(%r11)
2160 vmovdqa %xmm7,0x60(%r11)
2161 vmovdqa %xmm8,0x70(%r11)
2162 vmovdqa %xmm9,0x80(%r11)
2163 vmovdqa %xmm10,0x90(%r11)
2164 vmovdqa %xmm11,0xa0(%r11)
2165 vmovdqa %xmm12,0xb0(%r11)
2166 vmovdqa %xmm13,0xc0(%r11)
2167 vmovdqa %xmm14,0xd0(%r11)
2168 vmovdqa %xmm15,0xe0(%r11)
2172 lea .Lconst(%rip),%rcx
2173 lea 48+64($ctx),$ctx # size optimization
2174 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
2176 # expand pre-calculated table
2177 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
2179 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2181 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
2182 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2183 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
2184 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
2185 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
2186 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
2187 vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
2188 vpermd $D0,$T2,$R0 # 00003412 -> 14243444
2189 vpbroadcastq 64(%rcx),$MASK # .Lmask26
# Spill the expanded powers to the stack frame under mask %k2 so the
# AVX2 tail (.Ltail_avx2) can pick them up when $len%128 != 0.
2193 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
2194 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
2196 vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
2199 vmovdqa64 $S1,0x40(%rsp){%k2}
2202 vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
2204 vmovdqa64 $S2,0x80(%rsp){%k2}
2205 vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
2206 vmovdqa64 $S3,0xc0(%rsp){%k2}
2207 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
2208 vmovdqa64 $S4,0x100(%rsp){%k2}
2210 ################################################################
2211 # calculate 5th through 8th powers of the key
2213 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
2214 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2215 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
2216 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
2217 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
2219 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
2220 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2221 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
2222 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
2223 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
2226 vpmuludq $T1,$S4,$M0
2227 vpmuludq $T1,$R0,$M1
2228 vpmuludq $T1,$R1,$M2
2229 vpmuludq $T1,$R2,$M3
2230 vpmuludq $T1,$R3,$M4
2232 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
2233 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2234 vpaddq $M2,$D2,$D2 # d2 += r1'*r1
2235 vpaddq $M3,$D3,$D3 # d3 += r1'*r2
2236 vpaddq $M4,$D4,$D4 # d4 += r1'*r3
2238 vpmuludq $T2,$S3,$M0
2239 vpmuludq $T2,$S4,$M1
2240 vpmuludq $T2,$R1,$M3
2241 vpmuludq $T2,$R2,$M4
2242 vpmuludq $T2,$R0,$M2
2244 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
2245 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2246 vpaddq $M3,$D3,$D3 # d3 += r2'*r1
2247 vpaddq $M4,$D4,$D4 # d4 += r2'*r2
2248 vpaddq $M2,$D2,$D2 # d2 += r2'*r0
2250 vpmuludq $T3,$S2,$M0
2251 vpmuludq $T3,$R0,$M3
2252 vpmuludq $T3,$R1,$M4
2253 vpmuludq $T3,$S3,$M1
2254 vpmuludq $T3,$S4,$M2
2255 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
2256 vpaddq $M3,$D3,$D3 # d3 += r3'*r0
2257 vpaddq $M4,$D4,$D4 # d4 += r3'*r1
2258 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2259 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
2261 vpmuludq $T4,$S4,$M3
2262 vpmuludq $T4,$R0,$M4
2263 vpmuludq $T4,$S1,$M0
2264 vpmuludq $T4,$S2,$M1
2265 vpmuludq $T4,$S3,$M2
2266 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
2267 vpaddq $M4,$D4,$D4 # d4 += r2'*r0
2268 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
2269 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
2270 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
2272 ################################################################
# First 64+64 bytes of input are loaded up front so the splat can be
# interleaved with the reduction of the freshly computed powers.
2274 vmovdqu64 16*0($inp),%z#$T3
2275 vmovdqu64 16*4($inp),%z#$T4
2278 ################################################################
# Partial (lazy) reduction of d0..d4 back to 26-bit limbs.
2282 vpandq $MASK,$D3,$D3
2283 vpaddq $M3,$D4,$D4 # d3 -> d4
2286 vpandq $MASK,$D0,$D0
2287 vpaddq $M0,$D1,$D1 # d0 -> d1
2290 vpandq $MASK,$D4,$D4
2293 vpandq $MASK,$D1,$D1
2294 vpaddq $M1,$D2,$D2 # d1 -> d2
2298 vpaddq $M4,$D0,$D0 # d4 -> d0
2301 vpandq $MASK,$D2,$D2
2302 vpaddq $M2,$D3,$D3 # d2 -> d3
2305 vpandq $MASK,$D0,$D0
2306 vpaddq $M0,$D1,$D1 # d0 -> d1
2309 vpandq $MASK,$D3,$D3
2310 vpaddq $M3,$D4,$D4 # d3 -> d4
2312 ################################################################
2313 # at this point we have 14243444 in $R0-$S4 and 05060708 in
2316 vpunpcklqdq $T4,$T3,$T0 # transpose input
2317 vpunpckhqdq $T4,$T3,$T4
2319 # ... since input 64-bit lanes are ordered as 73625140, we could
2320 # "vperm" it to 76543210 (here and in each loop iteration), *or*
2321 # we could just flow along, hence the goal for $R0-$S4 is
2322 # 1858286838784888 ...
2324 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
2328 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2334 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
2335 vpermd $D1,$M0,${R1}{%k1}
2336 vpermd $D2,$M0,${R2}{%k1}
2337 vpermd $D3,$M0,${R3}{%k1}
2338 vpermd $D4,$M0,${R4}{%k1}
2340 vpslld \$2,$R1,$S1 # *5
2349 vpbroadcastq 32(%rcx),$PADBIT # .L129
# Split the transposed input into 26-bit limbs; the commented-out masks
# below are performed inside the loop body instead (modulo scheduling).
2351 vpsrlq \$52,$T0,$T2 # splat input
2356 vpsrlq \$40,$T4,$T4 # 4
2357 vpandq $MASK,$T2,$T2 # 2
2358 vpandq $MASK,$T0,$T0 # 0
2359 #vpandq $MASK,$T1,$T1 # 1
2360 #vpandq $MASK,$T3,$T3 # 3
2361 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2363 vpaddq $H2,$T2,$H2 # accumulate input
2370 ################################################################
2371 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
2372 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
2373 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
2374 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2375 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
2376 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2377 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
2378 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
2379 # \________/\___________/
2380 ################################################################
2381 #vpaddq $H2,$T2,$H2 # accumulate input
2383 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
2384 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
2385 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
2386 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2387 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
2389 # however, as h2 is "chronologically" first one available pull
2390 # corresponding operations up, so it's
2392 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
2393 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
2394 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
2395 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2396 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
# Main loop body: the h2-first multiply schedule interleaved with the
# deferred input masking and the accumulation of the next block.
2398 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2400 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2401 vpandq $MASK,$T1,$T1 # 1
2402 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2403 vpandq $MASK,$T3,$T3 # 3
2404 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2405 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2406 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2407 vpaddq $H1,$T1,$H1 # accumulate input
2411 vmovdqu64 16*0($inp),$T3 # load input
2412 vmovdqu64 16*4($inp),$T4
2414 vpmuludq $H0,$R3,$M3
2415 vpmuludq $H0,$R4,$M4
2416 vpmuludq $H0,$R0,$M0
2417 vpmuludq $H0,$R1,$M1
2418 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2419 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2420 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2421 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2423 vpmuludq $H1,$R2,$M3
2424 vpmuludq $H1,$R3,$M4
2425 vpmuludq $H1,$S4,$M0
2426 vpmuludq $H0,$R2,$M2
2427 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2428 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2429 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2430 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2432 vpunpcklqdq $T4,$T3,$T0 # transpose input
2433 vpunpckhqdq $T4,$T3,$T4
2435 vpmuludq $H3,$R0,$M3
2436 vpmuludq $H3,$R1,$M4
2437 vpmuludq $H1,$R0,$M1
2438 vpmuludq $H1,$R1,$M2
2439 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2440 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2441 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2442 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2444 vpmuludq $H4,$S4,$M3
2445 vpmuludq $H4,$R0,$M4
2446 vpmuludq $H3,$S2,$M0
2447 vpmuludq $H3,$S3,$M1
2448 vpaddq $M3,$D3,$D3 # d3 += h4*s4
2449 vpmuludq $H3,$S4,$M2
2450 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2451 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2452 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2453 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2455 vpmuludq $H4,$S1,$M0
2456 vpmuludq $H4,$S2,$M1
2457 vpmuludq $H4,$S3,$M2
2458 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2459 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
2460 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
2462 ################################################################
2463 # lazy reduction (interleaved with input splat)
2465 vpsrlq \$52,$T0,$T2 # splat input
2469 vpandq $MASK,$D3,$D3
2470 vpaddq $H3,$D4,$H4 # h3 -> h4
2475 vpandq $MASK,$H0,$H0
2476 vpaddq $D0,$H1,$H1 # h0 -> h1
2478 vpandq $MASK,$T2,$T2 # 2
2481 vpandq $MASK,$H4,$H4
2484 vpandq $MASK,$H1,$H1
2485 vpaddq $D1,$H2,$H2 # h1 -> h2
2489 vpaddq $D4,$H0,$H0 # h4 -> h0
2491 vpaddq $T2,$H2,$H2 # modulo-scheduled
2495 vpandq $MASK,$H2,$H2
2496 vpaddq $D2,$D3,$H3 # h2 -> h3
2501 vpandq $MASK,$H0,$H0
2502 vpaddq $D0,$H1,$H1 # h0 -> h1
2504 vpsrlq \$40,$T4,$T4 # 4
2507 vpandq $MASK,$H3,$H3
2508 vpaddq $D3,$H4,$H4 # h3 -> h4
2510 vpandq $MASK,$T0,$T0 # 0
2511 #vpandq $MASK,$T1,$T1 # 1
2512 #vpandq $MASK,$T3,$T3 # 3
2513 #vporq $PADBIT,$T4,$T4 # padbit, yes, always
2519 ################################################################
2520 # while above multiplications were by r^8 in all lanes, in last
2521 # iteration we multiply least significant lane by r^8 and most
2522 # significant one by r, that's why table gets shifted...
2524 vpsrlq \$32,$R0,$R0 # 0105020603070408
2534 ################################################################
2535 # load either next or last 64 byte of input
2536 lea ($inp,$len),$inp
2538 #vpaddq $H2,$T2,$H2 # accumulate input
# Tail iteration: same multiply schedule as the loop, interleaved with
# loading the (possibly final) 64 bytes through 128-bit halves.
2541 vpmuludq $H2,$R1,$D3 # d3 = h2*r1
2542 vpmuludq $H2,$R2,$D4 # d4 = h2*r2
2543 vpmuludq $H2,$S3,$D0 # d0 = h2*s3
2544 vpandq $MASK,$T1,$T1 # 1
2545 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2546 vpandq $MASK,$T3,$T3 # 3
2547 vpmuludq $H2,$R0,$D2 # d2 = h2*r0
2548 vporq $PADBIT,$T4,$T4 # padbit, yes, always
2549 vpaddq $H1,$T1,$H1 # accumulate input
2553 vmovdqu 16*0($inp),%x#$T0
2554 vpmuludq $H0,$R3,$M3
2555 vpmuludq $H0,$R4,$M4
2556 vpmuludq $H0,$R0,$M0
2557 vpmuludq $H0,$R1,$M1
2558 vpaddq $M3,$D3,$D3 # d3 += h0*r3
2559 vpaddq $M4,$D4,$D4 # d4 += h0*r4
2560 vpaddq $M0,$D0,$D0 # d0 += h0*r0
2561 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2563 vmovdqu 16*1($inp),%x#$T1
2564 vpmuludq $H1,$R2,$M3
2565 vpmuludq $H1,$R3,$M4
2566 vpmuludq $H1,$S4,$M0
2567 vpmuludq $H0,$R2,$M2
2568 vpaddq $M3,$D3,$D3 # d3 += h1*r2
2569 vpaddq $M4,$D4,$D4 # d4 += h1*r3
2570 vpaddq $M0,$D0,$D0 # d0 += h1*s4
2571 vpaddq $M2,$D2,$D2 # d2 += h0*r2
2573 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
2574 vpmuludq $H3,$R0,$M3
2575 vpmuludq $H3,$R1,$M4
2576 vpmuludq $H1,$R0,$M1
2577 vpmuludq $H1,$R1,$M2
2578 vpaddq $M3,$D3,$D3 # d3 += h3*r0
2579 vpaddq $M4,$D4,$D4 # d4 += h3*r1
2580 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2581 vpaddq $M2,$D2,$D2 # d2 += h1*r1
2583 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2584 vpmuludq $H4,$S4,$M3
2585 vpmuludq $H4,$R0,$M4
2586 vpmuludq $H3,$S2,$M0
2587 vpmuludq $H3,$S3,$M1
2588 vpmuludq $H3,$S4,$M2
2589 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
2590 vpaddq $M4,$D4,$D4 # d4 += h4*r0
2591 vpaddq $M0,$D0,$D0 # d0 += h3*s2
2592 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2593 vpaddq $M2,$D2,$D2 # d2 += h3*s4
2595 vpmuludq $H4,$S1,$M0
2596 vpmuludq $H4,$S2,$M1
2597 vpmuludq $H4,$S3,$M2
2598 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
2599 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
2600 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
2602 ################################################################
2603 # horizontal addition
# Fold the eight per-lane sums down: swap/add within 128-bit halves,
# then across 256-bit halves, keeping a single qword per limb (%k3).
2606 vpermq \$0xb1,$H3,$D3
2607 vpermq \$0xb1,$D4,$H4
2608 vpermq \$0xb1,$H0,$D0
2609 vpermq \$0xb1,$H1,$D1
2610 vpermq \$0xb1,$H2,$D2
2618 vpermq \$0x2,$H3,$D3
2619 vpermq \$0x2,$H4,$D4
2620 vpermq \$0x2,$H0,$D0
2621 vpermq \$0x2,$H1,$D1
2622 vpermq \$0x2,$H2,$D2
2629 vextracti64x4 \$0x1,$H3,%y#$D3
2630 vextracti64x4 \$0x1,$H4,%y#$D4
2631 vextracti64x4 \$0x1,$H0,%y#$D0
2632 vextracti64x4 \$0x1,$H1,%y#$D1
2633 vextracti64x4 \$0x1,$H2,%y#$D2
2634 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
2635 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
2636 vpaddq $D0,$H0,${H0}{%k3}{z}
2637 vpaddq $D1,$H1,${H1}{%k3}{z}
2638 vpaddq $D2,$H2,${H2}{%k3}{z}
# Back to the 256-bit domain for the AVX2-compatible tail handling.
2640 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
2641 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2643 ################################################################
2644 # lazy reduction (interleaved with input splat)
2648 vpsrldq \$6,$T0,$T2 # splat input
2650 vpunpckhqdq $T1,$T0,$T4 # 4
2651 vpaddq $D3,$H4,$H4 # h3 -> h4
2655 vpunpcklqdq $T3,$T2,$T2 # 2:3
2656 vpunpcklqdq $T1,$T0,$T0 # 0:1
2657 vpaddq $D0,$H1,$H1 # h0 -> h1
2666 vpaddq $D1,$H2,$H2 # h1 -> h2
2671 vpsrlq \$40,$T4,$T4 # 4
2672 vpaddq $D4,$H0,$H0 # h4 -> h0
2676 vpand $MASK,$T2,$T2 # 2
2677 vpand $MASK,$T0,$T0 # 0
2678 vpaddq $D2,$H3,$H3 # h2 -> h3
2682 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
2683 vpand $MASK,$T1,$T1 # 1
2684 vpaddq $D0,$H1,$H1 # h0 -> h1
2688 vpand $MASK,$T3,$T3 # 3
2689 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
2690 vpaddq $D3,$H4,$H4 # h3 -> h4
2692 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
2696 vpsubq $T2,$H2,$H2 # undo input accumulation
2697 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
2698 vmovd %x#$H1,`4*1-48-64`($ctx)
2699 vmovd %x#$H2,`4*2-48-64`($ctx)
2700 vmovd %x#$H3,`4*3-48-64`($ctx)
2701 vmovd %x#$H4,`4*4-48-64`($ctx)
2704 $code.=<<___ if ($win64);
# Restore the non-volatile XMM registers saved in the prologue (Win64).
2705 movdqa 0x50(%r11),%xmm6
2706 movdqa 0x60(%r11),%xmm7
2707 movdqa 0x70(%r11),%xmm8
2708 movdqa 0x80(%r11),%xmm9
2709 movdqa 0x90(%r11),%xmm10
2710 movdqa 0xa0(%r11),%xmm11
2711 movdqa 0xb0(%r11),%xmm12
2712 movdqa 0xc0(%r11),%xmm13
2713 movdqa 0xd0(%r11),%xmm14
2714 movdqa 0xe0(%r11),%xmm15
2716 .Ldo_avx512_epilogue:
2718 $code.=<<___ if (!$win64);
2725 .size poly1305_blocks_avx512,.-poly1305_blocks_avx512
2728 ########################################################################
2729 # VPMADD52 version using 2^44 radix.
2731 # One can argue that base 2^52 would be more natural. Well, even though
2732 # some operations would be more natural, one has to recognize a couple of
2733 # things. Base 2^52 doesn't provide advantage over base 2^44 if you look
2734 # at amount of multiply-n-accumulate operations. Secondly, it makes it
2735 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in
2736 # reference implementations], which means that more such operations
2737 # would have to be performed in inner loop, which in turn makes critical
2738 # path longer. In other words, even though base 2^44 reduction might
2739 # look less elegant, overall critical path is actually shorter...
2741 ########################################################################
2742 # Layout of opaque area is following.
2744 # unsigned __int64 h[3]; # current hash value base 2^44
2745 # unsigned __int64 s[2]; # key value*20 base 2^44
2746 # unsigned __int64 r[3]; # key value base 2^44
2747 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2748 # # r^n positions reflect
2749 # # placement in register, not
2750 # # memory, R[3] is R[1]*20
# poly1305_init_base2_44(ctx, key, funcs) — context setup for the
# VPMADD52 (base 2^44) code path.  Clamps the key per the Poly1305 spec
# (masks 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc), splits it into three
# 44/44/42-bit limbs (the 0x00000fffffffffff masks are 2^44-1), stores
# r0..r2 plus the pre-scaled s1 = 20*r1 and s2 = 20*r2, and writes -1 at
# offset 64 as an "impossible power" sentinel that the blocks routine
# tests to decide whether key powers still need computing.
2753 .type poly1305_init_base2_44,\@function,3
2755 poly1305_init_base2_44:
2758 mov %rax,0($ctx) # initialize hash value
2763 lea poly1305_blocks_vpmadd52(%rip),%r10
2764 lea poly1305_emit_base2_44(%rip),%r11
2766 mov \$0x0ffffffc0fffffff,%rax
2767 mov \$0x0ffffffc0ffffffc,%rcx
2769 mov \$0x00000fffffffffff,%r8
2771 mov \$0x00000fffffffffff,%r9
2774 mov %r8,40($ctx) # r0
2777 mov %rax,48($ctx) # r1
2778 lea (%rax,%rax,4),%rax # *5
2779 mov %rcx,56($ctx) # r2
2780 shl \$2,%rax # magic <<2
2781 lea (%rcx,%rcx,4),%rcx # *5
2782 shl \$2,%rcx # magic <<2
2783 mov %rax,24($ctx) # s1
2784 mov %rcx,32($ctx) # s2
2785 movq \$-1,64($ctx) # write impossible value
2787 $code.=<<___ if ($flavour !~ /elf32/);
2791 $code.=<<___ if ($flavour =~ /elf32/);
2799 .size poly1305_init_base2_44,.-poly1305_init_base2_44
# Register assignments for the single-block VPMADD52 routine: three
# broadcast hash limbs, three rotated key-limb vectors (names spell the
# lane contents, e.g. $r2r1r0), the 52x52->104-bit accumulator halves
# ($Dlo/$Dhi), and the input-permute/shift/reduction constants.
2802 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
2803 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
2804 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
# poly1305_blocks_vpmadd52(ctx, inp, len, padbit) — single-block Poly1305
# using AVX512-IFMA vpmadd52 instructions in base 2^44 (limbs 44/44/42).
# Each 16-byte block is permuted/shifted into three 44-bit limbs, folded
# into the hash, multiplied against the rotated key vectors, and reduced.
# NOTE(review): this excerpt is a non-contiguous listing (gaps in the
# embedded numbering) — the dispatch arithmetic between the visible
# cmp/test/and lines is partially elided.
2807 .type poly1305_blocks_vpmadd52,\@function,4
2809 poly1305_blocks_vpmadd52:
2812 jz .Lno_data_vpmadd52 # too short
2815 mov 64($ctx),%r8 # peek on power of the key
2817 # if powers of the key are not calculated yet, process up to 3
2818 # blocks with this single-block subroutine, otherwise ensure that
2819 # length is divisible by 2 blocks and pass the rest down to next
2824 cmp \$4,$len # is input long
2826 test %r8,%r8 # is power value impossible?
2829 and $len,%rax # is input of favourable length?
2830 jz .Lblocks_vpmadd52_4x
2836 lea .L2_44_inp_permd(%rip),%r10
2839 vmovq $padbit,%x#$PAD
2840 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
2841 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
2842 vpermq \$0xcf,$PAD,$PAD
2843 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
2845 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
2846 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
2847 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
2848 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
2850 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
2851 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
# Per-block loop body: convert 16 input bytes to 44-bit limbs, add to
# hash, broadcast each hash limb, and accumulate h*key with vpmadd52.
2857 vmovdqu32 0($inp),%x#$T0 # load input as ----3210
2860 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
2861 vpsrlvq $inp_shift,$T0,$T0
2862 vpandq $reduc_mask,$T0,$T0
2865 vpaddq $T0,$Dlo,$Dlo # accumulate input
2867 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
2868 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
2869 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
2871 vpxord $Dlo,$Dlo,$Dlo
2872 vpxord $Dhi,$Dhi,$Dhi
2874 vpmadd52luq $r2r1r0,$H0,$Dlo
2875 vpmadd52huq $r2r1r0,$H0,$Dhi
2877 vpmadd52luq $r1r0s2,$H1,$Dlo
2878 vpmadd52huq $r1r0s2,$H1,$Dhi
2880 vpmadd52luq $r0s2s1,$H2,$Dlo
2881 vpmadd52huq $r0s2s1,$H2,$Dhi
# Partial reduction: carry the 104-bit products back to 44/44/42 limbs
# lane-wise, rotating the carry vector so each lane feeds its neighbour.
2883 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
2884 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
2885 vpandq $reduc_mask,$Dlo,$Dlo
2887 vpaddq $T0,$Dhi,$Dhi
2889 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
2891 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
2893 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
2894 vpandq $reduc_mask,$Dlo,$Dlo
2896 vpermq \$0b10010011,$T0,$T0
2898 vpaddq $T0,$Dlo,$Dlo
2900 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
2902 vpaddq $T0,$Dlo,$Dlo
2905 vpaddq $T0,$Dlo,$Dlo
2910 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
2913 jnz .Lblocks_vpmadd52_4x
2918 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
2922 ########################################################################
2923 # As implied by its name, the 4x subroutine processes 4 blocks in
2924 # parallel (but handles even 4*n+2 block lengths). It takes up to the
2925 # 4th key power and is handled in 256-bit %ymm registers.
# Register assignments for the 4x path: three hash limbs, key powers
# R0..R2 with pre-scaled S1/S2 (=R*20), lo/hi accumulator pairs for each
# product column, transposition temporaries, and the 2^44/2^42 masks.
2927 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
2928 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
2929 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
2932 .type poly1305_blocks_vpmadd52_4x,\@function,4
2934 poly1305_blocks_vpmadd52_4x:
2937 jz .Lno_data_vpmadd52_4x # too short
2940 mov 64($ctx),%r8 # peek on power of the key
2942 .Lblocks_vpmadd52_4x:
2943 vpbroadcastq $padbit,$PAD
2945 vmovdqa64 .Lx_mask44(%rip),$mask44
2947 vmovdqa64 .Lx_mask42(%rip),$mask42
2948 kmovw %eax,%k1 # used in 2x path
2950 test %r8,%r8 # is power value impossible?
2951 js .Linit_vpmadd52 # if it is, then init R[4]
2953 vmovq 0($ctx),%x#$H0 # load current hash value
2954 vmovq 8($ctx),%x#$H1
2955 vmovq 16($ctx),%x#$H2
2957 test \$3,$len # is length 4*n+2?
2958 jnz .Lblocks_vpmadd52_2x_do
2960 .Lblocks_vpmadd52_4x_do:
2961 vpbroadcastq 64($ctx),$R0 # load 4th power of the key
2962 vpbroadcastq 96($ctx),$R1
2963 vpbroadcastq 128($ctx),$R2
2964 vpbroadcastq 160($ctx),$S1
2966 .Lblocks_vpmadd52_4x_key_loaded:
2967 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
2971 test \$7,$len # is len 8*n?
2972 jz .Lblocks_vpmadd52_8x
2974 vmovdqu64 16*0($inp),$T2 # load data
2975 vmovdqu64 16*2($inp),$T3
2978 vpunpcklqdq $T3,$T2,$T1 # transpose data
2979 vpunpckhqdq $T3,$T2,$T3
2981 # at this point 64-bit lanes are ordered as 3-1-2-0
2983 vpsrlq \$24,$T3,$T2 # splat the data
2985 vpaddq $T2,$H2,$H2 # accumulate input
2986 vpandq $mask44,$T1,$T0
2990 vpandq $mask44,$T1,$T1
2993 jz .Ltail_vpmadd52_4x
2994 jmp .Loop_vpmadd52_4x
2999 vmovq 24($ctx),%x#$S1 # load key
3000 vmovq 56($ctx),%x#$H2
3001 vmovq 32($ctx),%x#$S2
3002 vmovq 40($ctx),%x#$R0
3003 vmovq 48($ctx),%x#$R1
3011 .Lmul_init_vpmadd52:
3012 vpxorq $D0lo,$D0lo,$D0lo
3013 vpmadd52luq $H2,$S1,$D0lo
3014 vpxorq $D0hi,$D0hi,$D0hi
3015 vpmadd52huq $H2,$S1,$D0hi
3016 vpxorq $D1lo,$D1lo,$D1lo
3017 vpmadd52luq $H2,$S2,$D1lo
3018 vpxorq $D1hi,$D1hi,$D1hi
3019 vpmadd52huq $H2,$S2,$D1hi
3020 vpxorq $D2lo,$D2lo,$D2lo
3021 vpmadd52luq $H2,$R0,$D2lo
3022 vpxorq $D2hi,$D2hi,$D2hi
3023 vpmadd52huq $H2,$R0,$D2hi
3025 vpmadd52luq $H0,$R0,$D0lo
3026 vpmadd52huq $H0,$R0,$D0hi
3027 vpmadd52luq $H0,$R1,$D1lo
3028 vpmadd52huq $H0,$R1,$D1hi
3029 vpmadd52luq $H0,$R2,$D2lo
3030 vpmadd52huq $H0,$R2,$D2hi
3032 vpmadd52luq $H1,$S2,$D0lo
3033 vpmadd52huq $H1,$S2,$D0hi
3034 vpmadd52luq $H1,$R0,$D1lo
3035 vpmadd52huq $H1,$R0,$D1hi
3036 vpmadd52luq $H1,$R1,$D2lo
3037 vpmadd52huq $H1,$R1,$D2hi
3039 ################################################################
3041 vpsrlq \$44,$D0lo,$tmp
3042 vpsllq \$8,$D0hi,$D0hi
3043 vpandq $mask44,$D0lo,$H0
3044 vpaddq $tmp,$D0hi,$D0hi
3046 vpaddq $D0hi,$D1lo,$D1lo
3048 vpsrlq \$44,$D1lo,$tmp
3049 vpsllq \$8,$D1hi,$D1hi
3050 vpandq $mask44,$D1lo,$H1
3051 vpaddq $tmp,$D1hi,$D1hi
3053 vpaddq $D1hi,$D2lo,$D2lo
3055 vpsrlq \$42,$D2lo,$tmp
3056 vpsllq \$10,$D2hi,$D2hi
3057 vpandq $mask42,$D2lo,$H2
3058 vpaddq $tmp,$D2hi,$D2hi
3060 vpaddq $D2hi,$H0,$H0
3061 vpsllq \$2,$D2hi,$D2hi
3063 vpaddq $D2hi,$H0,$H0
3065 vpsrlq \$44,$H0,$tmp # additional step
3066 vpandq $mask44,$H0,$H0
3071 jz .Ldone_init_vpmadd52
3073 vpunpcklqdq $R1,$H1,$R1 # 1,2
3074 vpbroadcastq %x#$H1,%x#$H1 # 2,2
3075 vpunpcklqdq $R2,$H2,$R2
3076 vpbroadcastq %x#$H2,%x#$H2
3077 vpunpcklqdq $R0,$H0,$R0
3078 vpbroadcastq %x#$H0,%x#$H0
3080 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3081 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3087 jmp .Lmul_init_vpmadd52
3091 .Ldone_init_vpmadd52:
3092 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3093 vinserti128 \$1,%x#$R2,$H2,$R2
3094 vinserti128 \$1,%x#$R0,$H0,$R0
3096 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3097 vpermq \$0b11011000,$R2,$R2
3098 vpermq \$0b11011000,$R0,$R0
3100 vpsllq \$2,$R1,$S1 # S1 = R1*5*4
3104 vmovq 0($ctx),%x#$H0 # load current hash value
3105 vmovq 8($ctx),%x#$H1
3106 vmovq 16($ctx),%x#$H2
3108 test \$3,$len # is length 4*n+2?
3109 jnz .Ldone_init_vpmadd52_2x
3111 vmovdqu64 $R0,64($ctx) # save key powers
3112 vpbroadcastq %x#$R0,$R0 # broadcast 4th power
3113 vmovdqu64 $R1,96($ctx)
3114 vpbroadcastq %x#$R1,$R1
3115 vmovdqu64 $R2,128($ctx)
3116 vpbroadcastq %x#$R2,$R2
3117 vmovdqu64 $S1,160($ctx)
3118 vpbroadcastq %x#$S1,$S1
3120 jmp .Lblocks_vpmadd52_4x_key_loaded
3124 .Ldone_init_vpmadd52_2x:
3125 vmovdqu64 $R0,64($ctx) # save key powers
3126 vpsrldq \$8,$R0,$R0 # 0-1-0-2
3127 vmovdqu64 $R1,96($ctx)
3129 vmovdqu64 $R2,128($ctx)
3131 vmovdqu64 $S1,160($ctx)
3133 jmp .Lblocks_vpmadd52_2x_key_loaded
3137 .Lblocks_vpmadd52_2x_do:
3138 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
3139 vmovdqu64 160+8($ctx),${S1}{%k1}{z}
3140 vmovdqu64 64+8($ctx),${R0}{%k1}{z}
3141 vmovdqu64 96+8($ctx),${R1}{%k1}{z}
3143 .Lblocks_vpmadd52_2x_key_loaded:
3144 vmovdqu64 16*0($inp),$T2 # load data
3148 vpunpcklqdq $T3,$T2,$T1 # transpose data
3149 vpunpckhqdq $T3,$T2,$T3
3151 # at this point 64-bit lanes are ordered as x-1-x-0
3153 vpsrlq \$24,$T3,$T2 # splat the data
3155 vpaddq $T2,$H2,$H2 # accumulate input
3156 vpandq $mask44,$T1,$T0
3160 vpandq $mask44,$T1,$T1
3162 jmp .Ltail_vpmadd52_2x
3167 #vpaddq $T2,$H2,$H2 # accumulate input
3171 vpxorq $D0lo,$D0lo,$D0lo
3172 vpmadd52luq $H2,$S1,$D0lo
3173 vpxorq $D0hi,$D0hi,$D0hi
3174 vpmadd52huq $H2,$S1,$D0hi
3175 vpxorq $D1lo,$D1lo,$D1lo
3176 vpmadd52luq $H2,$S2,$D1lo
3177 vpxorq $D1hi,$D1hi,$D1hi
3178 vpmadd52huq $H2,$S2,$D1hi
3179 vpxorq $D2lo,$D2lo,$D2lo
3180 vpmadd52luq $H2,$R0,$D2lo
3181 vpxorq $D2hi,$D2hi,$D2hi
3182 vpmadd52huq $H2,$R0,$D2hi
3184 vmovdqu64 16*0($inp),$T2 # load data
3185 vmovdqu64 16*2($inp),$T3
3187 vpmadd52luq $H0,$R0,$D0lo
3188 vpmadd52huq $H0,$R0,$D0hi
3189 vpmadd52luq $H0,$R1,$D1lo
3190 vpmadd52huq $H0,$R1,$D1hi
3191 vpmadd52luq $H0,$R2,$D2lo
3192 vpmadd52huq $H0,$R2,$D2hi
3194 vpunpcklqdq $T3,$T2,$T1 # transpose data
3195 vpunpckhqdq $T3,$T2,$T3
3196 vpmadd52luq $H1,$S2,$D0lo
3197 vpmadd52huq $H1,$S2,$D0hi
3198 vpmadd52luq $H1,$R0,$D1lo
3199 vpmadd52huq $H1,$R0,$D1hi
3200 vpmadd52luq $H1,$R1,$D2lo
3201 vpmadd52huq $H1,$R1,$D2hi
3203 ################################################################
3204 # partial reduction (interleaved with data splat)
3205 vpsrlq \$44,$D0lo,$tmp
3206 vpsllq \$8,$D0hi,$D0hi
3207 vpandq $mask44,$D0lo,$H0
3208 vpaddq $tmp,$D0hi,$D0hi
3212 vpaddq $D0hi,$D1lo,$D1lo
3214 vpsrlq \$44,$D1lo,$tmp
3215 vpsllq \$8,$D1hi,$D1hi
3216 vpandq $mask44,$D1lo,$H1
3217 vpaddq $tmp,$D1hi,$D1hi
3219 vpandq $mask44,$T1,$T0
3222 vpaddq $D1hi,$D2lo,$D2lo
3224 vpsrlq \$42,$D2lo,$tmp
3225 vpsllq \$10,$D2hi,$D2hi
3226 vpandq $mask42,$D2lo,$H2
3227 vpaddq $tmp,$D2hi,$D2hi
3229 vpaddq $T2,$H2,$H2 # accumulate input
3230 vpaddq $D2hi,$H0,$H0
3231 vpsllq \$2,$D2hi,$D2hi
3233 vpaddq $D2hi,$H0,$H0
3235 vpandq $mask44,$T1,$T1
3237 vpsrlq \$44,$H0,$tmp # additional step
3238 vpandq $mask44,$H0,$H0
3242 sub \$4,$len # len-=64
3243 jnz .Loop_vpmadd52_4x
3246 vmovdqu64 128($ctx),$R2 # load all key powers
3247 vmovdqu64 160($ctx),$S1
3248 vmovdqu64 64($ctx),$R0
3249 vmovdqu64 96($ctx),$R1
3252 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3256 #vpaddq $T2,$H2,$H2 # accumulate input
3260 vpxorq $D0lo,$D0lo,$D0lo
3261 vpmadd52luq $H2,$S1,$D0lo
3262 vpxorq $D0hi,$D0hi,$D0hi
3263 vpmadd52huq $H2,$S1,$D0hi
3264 vpxorq $D1lo,$D1lo,$D1lo
3265 vpmadd52luq $H2,$S2,$D1lo
3266 vpxorq $D1hi,$D1hi,$D1hi
3267 vpmadd52huq $H2,$S2,$D1hi
3268 vpxorq $D2lo,$D2lo,$D2lo
3269 vpmadd52luq $H2,$R0,$D2lo
3270 vpxorq $D2hi,$D2hi,$D2hi
3271 vpmadd52huq $H2,$R0,$D2hi
3273 vpmadd52luq $H0,$R0,$D0lo
3274 vpmadd52huq $H0,$R0,$D0hi
3275 vpmadd52luq $H0,$R1,$D1lo
3276 vpmadd52huq $H0,$R1,$D1hi
3277 vpmadd52luq $H0,$R2,$D2lo
3278 vpmadd52huq $H0,$R2,$D2hi
3280 vpmadd52luq $H1,$S2,$D0lo
3281 vpmadd52huq $H1,$S2,$D0hi
3282 vpmadd52luq $H1,$R0,$D1lo
3283 vpmadd52huq $H1,$R0,$D1hi
3284 vpmadd52luq $H1,$R1,$D2lo
3285 vpmadd52huq $H1,$R1,$D2hi
3287 ################################################################
3288 # horizontal addition
3292 vpsrldq \$8,$D0lo,$T0
3293 vpsrldq \$8,$D0hi,$H0
3294 vpsrldq \$8,$D1lo,$T1
3295 vpsrldq \$8,$D1hi,$H1
3296 vpaddq $T0,$D0lo,$D0lo
3297 vpaddq $H0,$D0hi,$D0hi
3298 vpsrldq \$8,$D2lo,$T2
3299 vpsrldq \$8,$D2hi,$H2
3300 vpaddq $T1,$D1lo,$D1lo
3301 vpaddq $H1,$D1hi,$D1hi
3302 vpermq \$0x2,$D0lo,$T0
3303 vpermq \$0x2,$D0hi,$H0
3304 vpaddq $T2,$D2lo,$D2lo
3305 vpaddq $H2,$D2hi,$D2hi
3307 vpermq \$0x2,$D1lo,$T1
3308 vpermq \$0x2,$D1hi,$H1
3309 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3310 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3311 vpermq \$0x2,$D2lo,$T2
3312 vpermq \$0x2,$D2hi,$H2
3313 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3314 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3315 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3316 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3318 ################################################################
3320 vpsrlq \$44,$D0lo,$tmp
3321 vpsllq \$8,$D0hi,$D0hi
3322 vpandq $mask44,$D0lo,$H0
3323 vpaddq $tmp,$D0hi,$D0hi
3325 vpaddq $D0hi,$D1lo,$D1lo
3327 vpsrlq \$44,$D1lo,$tmp
3328 vpsllq \$8,$D1hi,$D1hi
3329 vpandq $mask44,$D1lo,$H1
3330 vpaddq $tmp,$D1hi,$D1hi
3332 vpaddq $D1hi,$D2lo,$D2lo
3334 vpsrlq \$42,$D2lo,$tmp
3335 vpsllq \$10,$D2hi,$D2hi
3336 vpandq $mask42,$D2lo,$H2
3337 vpaddq $tmp,$D2hi,$D2hi
3339 vpaddq $D2hi,$H0,$H0
3340 vpsllq \$2,$D2hi,$D2hi
3342 vpaddq $D2hi,$H0,$H0
3344 vpsrlq \$44,$H0,$tmp # additional step
3345 vpandq $mask44,$H0,$H0
3348 # at this point $len is
3349 # either 4*n+2 or 0...
3350 sub \$2,$len # len-=32
3351 ja .Lblocks_vpmadd52_4x_do
3353 vmovq %x#$H0,0($ctx)
3354 vmovq %x#$H1,8($ctx)
3355 vmovq %x#$H2,16($ctx)
3358 .Lno_data_vpmadd52_4x:
3361 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
3365 ########################################################################
3366 # As implied by its name 8x subroutine processes 8 blocks in parallel...
3367 # This is intermediate version, as it's used only in cases when input
3368 # length is either 8*n, 8*n+1 or 8*n+2...
# Register allocation for the 8x code path (starts as %ymm, later widened to
# %zmm via the map() substitutions below): H0-H2 hold the hash limbs in
# base 2^44, R0-R2/S1-S2 the key powers (S = R*5*4 pre-multiplied for the
# modular reduction), D*lo/D*hi the 104-bit column sums produced by
# VPMADD52, and RR*/SS* the additional key powers computed on entry.
3370 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
3371 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
3372 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
3373 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
3376 .type poly1305_blocks_vpmadd52_8x,\@function,4
3378 poly1305_blocks_vpmadd52_8x:
# Takes 4 arguments ($ctx,$inp,$len,$padbit per the code below) — presumably
# the same signature as the other poly1305_blocks_* variants; confirm against
# the function prologue (elided from this view).
3381 jz .Lno_data_vpmadd52_8x # too short
3384 mov 64($ctx),%r8 # peek on power of the key
3386 vmovdqa64 .Lx_mask44(%rip),$mask44
3387 vmovdqa64 .Lx_mask42(%rip),$mask42
3389 test %r8,%r8 # is power value impossible?
3390 js .Linit_vpmadd52 # if it is, then init R[4]
3392 vmovq 0($ctx),%x#$H0 # load current hash value
3393 vmovq 8($ctx),%x#$H1
3394 vmovq 16($ctx),%x#$H2
3396 .Lblocks_vpmadd52_8x:
3397 ################################################################
3398 # first we calculate more key powers
3400 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3401 vmovdqu64 160($ctx),$S1
3402 vmovdqu64 64($ctx),$R0
3403 vmovdqu64 96($ctx),$R1
3405 vpsllq \$2,$R2,$S2 # S2 = R2*5*4
3409 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
3410 vpbroadcastq %x#$R0,$RR0
3411 vpbroadcastq %x#$R1,$RR1
# Multiply the 4th power by the 1-3-2-4 powers to obtain powers 5-7-6-8
# (RRx = RRx * Rx mod p), using the usual lo/hi 52x52-bit column products.
3413 vpxorq $D0lo,$D0lo,$D0lo
3414 vpmadd52luq $RR2,$S1,$D0lo
3415 vpxorq $D0hi,$D0hi,$D0hi
3416 vpmadd52huq $RR2,$S1,$D0hi
3417 vpxorq $D1lo,$D1lo,$D1lo
3418 vpmadd52luq $RR2,$S2,$D1lo
3419 vpxorq $D1hi,$D1hi,$D1hi
3420 vpmadd52huq $RR2,$S2,$D1hi
3421 vpxorq $D2lo,$D2lo,$D2lo
3422 vpmadd52luq $RR2,$R0,$D2lo
3423 vpxorq $D2hi,$D2hi,$D2hi
3424 vpmadd52huq $RR2,$R0,$D2hi
3426 vpmadd52luq $RR0,$R0,$D0lo
3427 vpmadd52huq $RR0,$R0,$D0hi
3428 vpmadd52luq $RR0,$R1,$D1lo
3429 vpmadd52huq $RR0,$R1,$D1hi
3430 vpmadd52luq $RR0,$R2,$D2lo
3431 vpmadd52huq $RR0,$R2,$D2hi
3433 vpmadd52luq $RR1,$S2,$D0lo
3434 vpmadd52huq $RR1,$S2,$D0hi
3435 vpmadd52luq $RR1,$R0,$D1lo
3436 vpmadd52huq $RR1,$R0,$D1hi
3437 vpmadd52luq $RR1,$R1,$D2lo
3438 vpmadd52huq $RR1,$R1,$D2hi
3440 ################################################################
# Partial reduction of the new key powers: carry each 44-bit limb up,
# fold the top (42-bit) limb back in multiplied by 5 (the +D2hi and
# +D2hi<<2 additions together realize *5).
3442 vpsrlq \$44,$D0lo,$tmp
3443 vpsllq \$8,$D0hi,$D0hi
3444 vpandq $mask44,$D0lo,$RR0
3445 vpaddq $tmp,$D0hi,$D0hi
3447 vpaddq $D0hi,$D1lo,$D1lo
3449 vpsrlq \$44,$D1lo,$tmp
3450 vpsllq \$8,$D1hi,$D1hi
3451 vpandq $mask44,$D1lo,$RR1
3452 vpaddq $tmp,$D1hi,$D1hi
3454 vpaddq $D1hi,$D2lo,$D2lo
3456 vpsrlq \$42,$D2lo,$tmp
3457 vpsllq \$10,$D2hi,$D2hi
3458 vpandq $mask42,$D2lo,$RR2
3459 vpaddq $tmp,$D2hi,$D2hi
3461 vpaddq $D2hi,$RR0,$RR0
3462 vpsllq \$2,$D2hi,$D2hi
3464 vpaddq $D2hi,$RR0,$RR0
3466 vpsrlq \$44,$RR0,$tmp # additional step
3467 vpandq $mask44,$RR0,$RR0
3469 vpaddq $tmp,$RR1,$RR1
3471 ################################################################
3472 # At this point Rx holds 1324 powers, RRx - 5768, and the goal
3473 # is 15263748, which reflects how data is loaded...
3475 vpunpcklqdq $R2,$RR2,$T2 # 3748
3476 vpunpckhqdq $R2,$RR2,$R2 # 1526
3477 vpunpcklqdq $R0,$RR0,$T0
3478 vpunpckhqdq $R0,$RR0,$R0
3479 vpunpcklqdq $R1,$RR1,$T1
3480 vpunpckhqdq $R1,$RR1,$R1
3482 ######## switch to %zmm
# Note: these map() calls rewrite the Perl variable strings in place, so
# every reference emitted below this point names 512-bit registers.
3483 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3484 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3485 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
3486 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
3489 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
3490 vshufi64x2 \$0x44,$R0,$T0,$RR0
3491 vshufi64x2 \$0x44,$R1,$T1,$RR1
3493 vmovdqu64 16*0($inp),$T2 # load data
3494 vmovdqu64 16*4($inp),$T3
3497 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
3498 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
3499 vpaddq $RR2,$SS2,$SS2
3500 vpaddq $RR1,$SS1,$SS1
3501 vpsllq \$2,$SS2,$SS2
3502 vpsllq \$2,$SS1,$SS1
3504 vpbroadcastq $padbit,$PAD
3505 vpbroadcastq %x#$mask44,$mask44
3506 vpbroadcastq %x#$mask42,$mask42
3508 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
3509 vpbroadcastq %x#$SS2,$S2
3510 vpbroadcastq %x#$RR0,$R0
3511 vpbroadcastq %x#$RR1,$R1
3512 vpbroadcastq %x#$RR2,$R2
3514 vpunpcklqdq $T3,$T2,$T1 # transpose data
3515 vpunpckhqdq $T3,$T2,$T3
3517 # at this point 64-bit lanes are ordered as 73625140
3519 vpsrlq \$24,$T3,$T2 # splat the data
3521 vpaddq $T2,$H2,$H2 # accumulate input
3522 vpandq $mask44,$T1,$T0
3526 vpandq $mask44,$T1,$T1
3529 jz .Ltail_vpmadd52_8x
3530 jmp .Loop_vpmadd52_8x
# Main loop: one 8-block multiply-and-reduce per iteration, with the next
# 128 bytes of input loaded and split into 44/44/42-bit limbs in between
# the multiply columns to hide latency.
3534 #vpaddq $T2,$H2,$H2 # accumulate input
3538 vpxorq $D0lo,$D0lo,$D0lo
3539 vpmadd52luq $H2,$S1,$D0lo
3540 vpxorq $D0hi,$D0hi,$D0hi
3541 vpmadd52huq $H2,$S1,$D0hi
3542 vpxorq $D1lo,$D1lo,$D1lo
3543 vpmadd52luq $H2,$S2,$D1lo
3544 vpxorq $D1hi,$D1hi,$D1hi
3545 vpmadd52huq $H2,$S2,$D1hi
3546 vpxorq $D2lo,$D2lo,$D2lo
3547 vpmadd52luq $H2,$R0,$D2lo
3548 vpxorq $D2hi,$D2hi,$D2hi
3549 vpmadd52huq $H2,$R0,$D2hi
3551 vmovdqu64 16*0($inp),$T2 # load data
3552 vmovdqu64 16*4($inp),$T3
3554 vpmadd52luq $H0,$R0,$D0lo
3555 vpmadd52huq $H0,$R0,$D0hi
3556 vpmadd52luq $H0,$R1,$D1lo
3557 vpmadd52huq $H0,$R1,$D1hi
3558 vpmadd52luq $H0,$R2,$D2lo
3559 vpmadd52huq $H0,$R2,$D2hi
3561 vpunpcklqdq $T3,$T2,$T1 # transpose data
3562 vpunpckhqdq $T3,$T2,$T3
3563 vpmadd52luq $H1,$S2,$D0lo
3564 vpmadd52huq $H1,$S2,$D0hi
3565 vpmadd52luq $H1,$R0,$D1lo
3566 vpmadd52huq $H1,$R0,$D1hi
3567 vpmadd52luq $H1,$R1,$D2lo
3568 vpmadd52huq $H1,$R1,$D2hi
3570 ################################################################
3571 # partial reduction (interleaved with data splat)
3572 vpsrlq \$44,$D0lo,$tmp
3573 vpsllq \$8,$D0hi,$D0hi
3574 vpandq $mask44,$D0lo,$H0
3575 vpaddq $tmp,$D0hi,$D0hi
3579 vpaddq $D0hi,$D1lo,$D1lo
3581 vpsrlq \$44,$D1lo,$tmp
3582 vpsllq \$8,$D1hi,$D1hi
3583 vpandq $mask44,$D1lo,$H1
3584 vpaddq $tmp,$D1hi,$D1hi
3586 vpandq $mask44,$T1,$T0
3589 vpaddq $D1hi,$D2lo,$D2lo
3591 vpsrlq \$42,$D2lo,$tmp
3592 vpsllq \$10,$D2hi,$D2hi
3593 vpandq $mask42,$D2lo,$H2
3594 vpaddq $tmp,$D2hi,$D2hi
3596 vpaddq $T2,$H2,$H2 # accumulate input
3597 vpaddq $D2hi,$H0,$H0
3598 vpsllq \$2,$D2hi,$D2hi
3600 vpaddq $D2hi,$H0,$H0
3602 vpandq $mask44,$T1,$T1
3604 vpsrlq \$44,$H0,$tmp # additional step
3605 vpandq $mask44,$H0,$H0
3609 sub \$8,$len # len-=128
3610 jnz .Loop_vpmadd52_8x
# Tail: last multiply uses the wide (RRx/SSx, 1..8) key powers so that
# after the horizontal addition all eight lanes collapse to one result.
3613 #vpaddq $T2,$H2,$H2 # accumulate input
3617 vpxorq $D0lo,$D0lo,$D0lo
3618 vpmadd52luq $H2,$SS1,$D0lo
3619 vpxorq $D0hi,$D0hi,$D0hi
3620 vpmadd52huq $H2,$SS1,$D0hi
3621 vpxorq $D1lo,$D1lo,$D1lo
3622 vpmadd52luq $H2,$SS2,$D1lo
3623 vpxorq $D1hi,$D1hi,$D1hi
3624 vpmadd52huq $H2,$SS2,$D1hi
3625 vpxorq $D2lo,$D2lo,$D2lo
3626 vpmadd52luq $H2,$RR0,$D2lo
3627 vpxorq $D2hi,$D2hi,$D2hi
3628 vpmadd52huq $H2,$RR0,$D2hi
3630 vpmadd52luq $H0,$RR0,$D0lo
3631 vpmadd52huq $H0,$RR0,$D0hi
3632 vpmadd52luq $H0,$RR1,$D1lo
3633 vpmadd52huq $H0,$RR1,$D1hi
3634 vpmadd52luq $H0,$RR2,$D2lo
3635 vpmadd52huq $H0,$RR2,$D2hi
3637 vpmadd52luq $H1,$SS2,$D0lo
3638 vpmadd52huq $H1,$SS2,$D0hi
3639 vpmadd52luq $H1,$RR0,$D1lo
3640 vpmadd52huq $H1,$RR0,$D1hi
3641 vpmadd52luq $H1,$RR1,$D2lo
3642 vpmadd52huq $H1,$RR1,$D2hi
3644 ################################################################
3645 # horizontal addition
3649 vpsrldq \$8,$D0lo,$T0
3650 vpsrldq \$8,$D0hi,$H0
3651 vpsrldq \$8,$D1lo,$T1
3652 vpsrldq \$8,$D1hi,$H1
3653 vpaddq $T0,$D0lo,$D0lo
3654 vpaddq $H0,$D0hi,$D0hi
3655 vpsrldq \$8,$D2lo,$T2
3656 vpsrldq \$8,$D2hi,$H2
3657 vpaddq $T1,$D1lo,$D1lo
3658 vpaddq $H1,$D1hi,$D1hi
3659 vpermq \$0x2,$D0lo,$T0
3660 vpermq \$0x2,$D0hi,$H0
3661 vpaddq $T2,$D2lo,$D2lo
3662 vpaddq $H2,$D2hi,$D2hi
3664 vpermq \$0x2,$D1lo,$T1
3665 vpermq \$0x2,$D1hi,$H1
3666 vpaddq $T0,$D0lo,$D0lo
3667 vpaddq $H0,$D0hi,$D0hi
3668 vpermq \$0x2,$D2lo,$T2
3669 vpermq \$0x2,$D2hi,$H2
3670 vpaddq $T1,$D1lo,$D1lo
3671 vpaddq $H1,$D1hi,$D1hi
3672 vextracti64x4 \$1,$D0lo,%y#$T0
3673 vextracti64x4 \$1,$D0hi,%y#$H0
3674 vpaddq $T2,$D2lo,$D2lo
3675 vpaddq $H2,$D2hi,$D2hi
3677 vextracti64x4 \$1,$D1lo,%y#$T1
3678 vextracti64x4 \$1,$D1hi,%y#$H1
3679 vextracti64x4 \$1,$D2lo,%y#$T2
3680 vextracti64x4 \$1,$D2hi,%y#$H2
3682 ######## switch back to %ymm
3683 map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
3684 map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
3685 map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
# {%k1}{z} keeps only the bottom lane (zeroing the rest) for the final
# scalar result.
3688 vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
3689 vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
3690 vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
3691 vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
3692 vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
3693 vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
3695 ################################################################
# final partial reduction of the horizontally-summed limbs
3697 vpsrlq \$44,$D0lo,$tmp
3698 vpsllq \$8,$D0hi,$D0hi
3699 vpandq $mask44,$D0lo,$H0
3700 vpaddq $tmp,$D0hi,$D0hi
3702 vpaddq $D0hi,$D1lo,$D1lo
3704 vpsrlq \$44,$D1lo,$tmp
3705 vpsllq \$8,$D1hi,$D1hi
3706 vpandq $mask44,$D1lo,$H1
3707 vpaddq $tmp,$D1hi,$D1hi
3709 vpaddq $D1hi,$D2lo,$D2lo
3711 vpsrlq \$42,$D2lo,$tmp
3712 vpsllq \$10,$D2hi,$D2hi
3713 vpandq $mask42,$D2lo,$H2
3714 vpaddq $tmp,$D2hi,$D2hi
3716 vpaddq $D2hi,$H0,$H0
3717 vpsllq \$2,$D2hi,$D2hi
3719 vpaddq $D2hi,$H0,$H0
3721 vpsrlq \$44,$H0,$tmp # additional step
3722 vpandq $mask44,$H0,$H0
3726 ################################################################
# store the (partially reduced) base 2^44 hash value back to the context
3728 vmovq %x#$H0,0($ctx)
3729 vmovq %x#$H1,8($ctx)
3730 vmovq %x#$H2,16($ctx)
3733 .Lno_data_vpmadd52_8x:
3736 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
# Produce the final 16-byte MAC from a base 2^44 hash value: fully reduce
# mod 2^130-5 (add 5, select the reduced value if it carried out of bit
# 130), then add the 128-bit nonce and store the tag.
3740 .type poly1305_emit_base2_44,\@function,3
3742 poly1305_emit_base2_44:
3744 mov 0($ctx),%r8 # load hash value
3760 add \$5,%r8 # compare to modulus
3764 shr \$2,%r10 # did 130-bit value overflow?
3768 add 0($nonce),%rax # accumulate nonce
3770 mov %rax,0($mac) # write result
3775 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44
# Constant tables (section labels elided from this view — presumably the
# .Lconst block; verify against the full file). The 0x?ffffff longs are
# 24/26-bit masks for the base 2^26 AVX/AVX2 paths; the permutation index
# vectors drive vpermd/vpermq lane shuffles; the 0xfffffffffff (2^44-1)
# and 0x3ffffffffff (2^42-1) quads are the limb masks for the base 2^44
# VPMADD52 paths.
3782 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
3784 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
3786 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
3788 .long 2,2,2,3,2,0,2,1
3790 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
3793 .long 0,1,1,2,2,3,7,7
3797 .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
3805 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3806 .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
3808 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3809 .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
3813 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3817 { # chacha20-poly1305 helpers
3818 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
3819 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
# XOR $len bytes of input with the one-time pad at $otp in 16-byte chunks,
# write the result to $out, handling the sub-16-byte tail separately
# (tail handling elided from this view).
3821 .globl xor128_encrypt_n_pad
3822 .type xor128_encrypt_n_pad,\@abi-omnipotent
3824 xor128_encrypt_n_pad:
3828 mov $len,%r10 # put len aside
3829 shr \$4,$len # len / 16
3833 movdqu ($inp,$otp),%xmm0
3835 movdqu %xmm0,($out,$otp)
3841 and \$15,%r10 # len % 16
3868 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
# Decrypt counterpart of xor128_encrypt_n_pad: same XOR-with-pad structure,
# 16-byte chunks first, then a byte-granular tail (mostly elided from this
# view).
3870 .globl xor128_decrypt_n_pad
3871 .type xor128_decrypt_n_pad,\@abi-omnipotent
3873 xor128_decrypt_n_pad:
3877 mov $len,%r10 # put len aside
3878 shr \$4,$len # len / 16
3882 movdqu ($inp,$otp),%xmm0
3885 movdqu %xmm1,($out,$otp)
3892 and \$15,%r10 # len % 16
3901 mov ($inp,$otp),%r11b
3921 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
3925 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3926 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler for the general-purpose-register
# code paths: if RIP is inside the function body (between the prologue
# and epilogue labels recorded in HandlerData), recover the saved
# non-volatile registers and patch them back into the CONTEXT record.
3934 .extern __imp_RtlVirtualUnwind
3935 .type se_handler,\@abi-omnipotent
3949 mov 120($context),%rax # pull context->Rax
3950 mov 248($context),%rbx # pull context->Rip
3952 mov 8($disp),%rsi # disp->ImageBase
3953 mov 56($disp),%r11 # disp->HandlerData
3955 mov 0(%r11),%r10d # HandlerData[0]
3956 lea (%rsi,%r10),%r10 # prologue label
3957 cmp %r10,%rbx # context->Rip<.Lprologue
3958 jb .Lcommon_seh_tail
3960 mov 152($context),%rax # pull context->Rsp
3962 mov 4(%r11),%r10d # HandlerData[1]
3963 lea (%rsi,%r10),%r10 # epilogue label
3964 cmp %r10,%rbx # context->Rip>=.Lepilogue
3965 jae .Lcommon_seh_tail
3975 mov %rbx,144($context) # restore context->Rbx
3976 mov %rbp,160($context) # restore context->Rbp
3977 mov %r12,216($context) # restore context->R12
3978 mov %r13,224($context) # restore context->R13
3979 mov %r14,232($context) # restore context->R14
3980 mov %r15,240($context) # restore context->R15
3982 jmp .Lcommon_seh_tail
3983 .size se_handler,.-se_handler
# Win64 SEH handler for the AVX code paths: in addition to the
# prologue/epilogue bracketing done by se_handler, it copies the ten
# non-volatile XMM registers (xmm6-xmm15) saved on the stack frame back
# into the CONTEXT record, then hands off to RtlVirtualUnwind to finish
# the unwind.
3985 .type avx_handler,\@abi-omnipotent
3999 mov 120($context),%rax # pull context->Rax
4000 mov 248($context),%rbx # pull context->Rip
4002 mov 8($disp),%rsi # disp->ImageBase
4003 mov 56($disp),%r11 # disp->HandlerData
4005 mov 0(%r11),%r10d # HandlerData[0]
4006 lea (%rsi,%r10),%r10 # prologue label
4007 cmp %r10,%rbx # context->Rip<prologue label
4008 jb .Lcommon_seh_tail
4010 mov 152($context),%rax # pull context->Rsp
4012 mov 4(%r11),%r10d # HandlerData[1]
4013 lea (%rsi,%r10),%r10 # epilogue label
4014 cmp %r10,%rbx # context->Rip>=epilogue label
4015 jae .Lcommon_seh_tail
4017 mov 208($context),%rax # pull context->R11
4021 lea 512($context),%rdi # &context.Xmm6
4023 .long 0xa548f3fc # cld; rep movsq
4028 mov %rax,152($context) # restore context->Rsp
4029 mov %rsi,168($context) # restore context->Rsi
4030 mov %rdi,176($context) # restore context->Rdi
4032 mov 40($disp),%rdi # disp->ContextRecord
4033 mov $context,%rsi # context
4034 mov \$154,%ecx # sizeof(CONTEXT) in quadwords (154*8=1232)
4035 .long 0xa548f3fc # cld; rep movsq
4038 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4039 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4040 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4041 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4042 mov 40(%rsi),%r10 # disp->ContextRecord
4043 lea 56(%rsi),%r11 # &disp->HandlerData
4044 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4045 mov %r10,32(%rsp) # arg5
4046 mov %r11,40(%rsp) # arg6
4047 mov %r12,48(%rsp) # arg7
4048 mov %rcx,56(%rsp) # arg8, (NULL)
4049 call *__imp_RtlVirtualUnwind(%rip)
4051 mov \$1,%eax # ExceptionContinueSearch
4063 .size avx_handler,.-avx_handler
# Win64 SEH tables: .pdata-style triplets (function begin RVA, end RVA,
# unwind-info RVA) followed by the .LSEH_info_* records whose HandlerData
# holds the prologue/epilogue label pair each handler brackets on.
# Entries for the AVX/AVX2/AVX512 paths are emitted conditionally on the
# detected assembler capability level ($avx).
4067 .rva .LSEH_begin_poly1305_init
4068 .rva .LSEH_end_poly1305_init
4069 .rva .LSEH_info_poly1305_init
4071 .rva .LSEH_begin_poly1305_blocks
4072 .rva .LSEH_end_poly1305_blocks
4073 .rva .LSEH_info_poly1305_blocks
4075 .rva .LSEH_begin_poly1305_emit
4076 .rva .LSEH_end_poly1305_emit
4077 .rva .LSEH_info_poly1305_emit
4079 $code.=<<___ if ($avx);
4080 .rva .LSEH_begin_poly1305_blocks_avx
4082 .rva .LSEH_info_poly1305_blocks_avx_1
4086 .rva .LSEH_info_poly1305_blocks_avx_2
4089 .rva .LSEH_end_poly1305_blocks_avx
4090 .rva .LSEH_info_poly1305_blocks_avx_3
4092 .rva .LSEH_begin_poly1305_emit_avx
4093 .rva .LSEH_end_poly1305_emit_avx
4094 .rva .LSEH_info_poly1305_emit_avx
4096 $code.=<<___ if ($avx>1);
4097 .rva .LSEH_begin_poly1305_blocks_avx2
4098 .rva .Lbase2_64_avx2
4099 .rva .LSEH_info_poly1305_blocks_avx2_1
4101 .rva .Lbase2_64_avx2
4103 .rva .LSEH_info_poly1305_blocks_avx2_2
4106 .rva .LSEH_end_poly1305_blocks_avx2
4107 .rva .LSEH_info_poly1305_blocks_avx2_3
4109 $code.=<<___ if ($avx>2);
4110 .rva .LSEH_begin_poly1305_blocks_avx512
4111 .rva .LSEH_end_poly1305_blocks_avx512
4112 .rva .LSEH_info_poly1305_blocks_avx512
4117 .LSEH_info_poly1305_init:
4120 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
4122 .LSEH_info_poly1305_blocks:
4125 .rva .Lblocks_body,.Lblocks_epilogue
4127 .LSEH_info_poly1305_emit:
4130 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
4132 $code.=<<___ if ($avx);
4133 .LSEH_info_poly1305_blocks_avx_1:
4136 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
4138 .LSEH_info_poly1305_blocks_avx_2:
4141 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
4143 .LSEH_info_poly1305_blocks_avx_3:
4146 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
4148 .LSEH_info_poly1305_emit_avx:
4151 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
4153 $code.=<<___ if ($avx>1);
4154 .LSEH_info_poly1305_blocks_avx2_1:
4157 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
4159 .LSEH_info_poly1305_blocks_avx2_2:
4162 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
4164 .LSEH_info_poly1305_blocks_avx2_3:
4167 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
4169 $code.=<<___ if ($avx>2);
4170 .LSEH_info_poly1305_blocks_avx512:
4173 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
# Final perlasm pass over the accumulated $code: evaluate `...` arithmetic
# at generation time and expand the "#d" / "%x#" register-size shorthands
# used throughout the assembly above.
4177 foreach (split('\n',$code)) {
4178 s/\`([^\`]*)\`/eval($1)/ge;           # evaluate backticked Perl expressions
4179 s/%r([a-z]+)#d/%e$1/g;                # %rax#d -> %eax (32-bit legacy regs)
4180 s/%r([0-9]+)#d/%r$1d/g;               # %r8#d -> %r8d (32-bit r8-r15)
4181 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;  # collapse %x#/%y#/%z# width overrides
4185 close STDOUT or die "error closing STDOUT";