# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module implements Poly1305 hash for x86_64.
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2
#
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake	1.13/+120%	0.96		0.51
# Silvermont	2.83/+95%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
#
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, by 50-30% (the newer the processor, the smaller
#	the gain), and slower on contemporary ones, for example almost 2x
#	slower on Atom; as the former are naturally disappearing, SSE2 is
#	deemed unnecessary;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
	mov	%rax,$h0		# future $h0
	mov	$h2,$h1			# borrow $h1
	imulq	$s1,$h1			# h2*s1
	imulq	$r0,$h2			# h2*r0
	mov	\$-4,%rax		# mask value
	and	$d3,%rax		# last reduction step
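################################################################
# A rough C sketch of this last reduction step (an illustration,
# not generated code; it assumes $d3 ends up holding the limb
# with bit 128 and up): everything at and above bit 130 is folded
# back in multiplied by 5, because 2^130 == 5 (mod p = 2^130-5).
#
#	t   = d3 & ~(uint64_t)3;	/* mov \$-4,%rax; and $d3,%rax */
#	h2  = d3 & 3;			/* keep the low 2 bits in h2   */
#	h0 += t + (d3 >> 2);		/* t + d3/4 == 5*(d3 >> 2)     */
#					/* carries ripple into h1, h2  */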
########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64
.extern	OPENSSL_ia32cap_P

.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.hidden	poly1305_emit

.type	poly1305_init,\@function,3
	mov	%rax,0($ctx)		# initialize hash value
	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
$code.=<<___ if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
$code.=<<___ if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
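################################################################
# These two constants implement the standard Poly1305 "clamp" of
# the r half of the key: the top four bits of bytes 3, 7, 11, 15
# and the bottom two bits of bytes 4, 8, 12 are cleared. A C
# sketch (illustration only, little-endian loads assumed):
#
#	r0 = le64(key+0) & 0x0ffffffc0fffffff;
#	r1 = le64(key+8) & 0x0ffffffc0ffffffc;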
$code.=<<___ if ($flavour !~ /elf32/);
$code.=<<___ if ($flavour =~ /elf32/);
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
	jz	.Lno_data		# too short
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
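################################################################
# s1 = r1 + (r1 >> 2) == 5*r1/4, exact because clamping cleared
# the low two bits of r1. It folds the identity 2^130 == 5
# (mod 2^130-5) into a precomputed multiplier: the h2*r1 cross
# term, whose weight 2^192 == 5*2^62 (mod p), becomes a single
# h2*s1 multiplication at the 2^64 position.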
	add	0($inp),$h0		# accumulate input
	&poly1305_iteration();
	mov	$h0,0($ctx)		# store hash value
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
	mov	0($ctx),%r8		# load hash value
	add	\$5,%r8			# compare to modulus
	shr	\$2,%r10		# did 130-bit value overflow?
	add	0($nonce),%rax		# accumulate nonce
	mov	%rax,0($mac)		# write result
.size	poly1305_emit,.-poly1305_emit
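################################################################
# C sketch of the branch-free final reduction in *_emit
# (illustration only): the candidate h+5 is computed next to h
# and selected only if it overflows bit 130, i.e. iff h >= p.
#
#	t0 = h0 + 5; t1 = h1 + carry; t2 = h2 + carry;
#	if (t2 >> 2)		/* 130-bit overflow => h >= 2^130-5 */
#		h0 = t0, h1 = t1;
#	mac = (h1:h0) + nonce;	/* 128-bit add, truncated mod 2^128 */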
########################################################################
# The layout of the opaque area is as follows:
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int64 r[2];		# key value base 2^64
#	unsigned __int64 pad;
#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of powers of the multiplier key.
# There are 5 digits, but the last four are interleaved with
# multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3,
# 5*r3, r4, 5*r4.
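################################################################
# In other words (a sketch of how the rows are consumed, not a
# definitive map): row j of the r[9] table holds the same digit
# of several powers of r side by side, so a single aligned load
# feeds the same limb of two (or, with AVX2, four) lanes, and the
# 5*rN rows serve the h*5*rN terms of the per-limb formulas
# listed further down.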
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

.type	__poly1305_block,\@abi-omnipotent
	&poly1305_iteration();
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
	lea	48+64($ctx),$ctx	# size optimization
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	%eax,`16*0+0-64`($ctx)
	mov	%edx,`16*0+4-64`($ctx)
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	mov	%edx,`16*2+4-64`($ctx)
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	%edx,`16*4+4-64`($ctx)
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	mov	%edx,`16*6+4-64`($ctx)
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	call	__poly1305_block	# r^3

	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
	mov	%eax,`16*0+12-64`($ctx)
	mov	%edx,`16*1+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*2+12-64`($ctx)
	mov	%eax,`16*3+12-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%eax,`16*4+12-64`($ctx)
	mov	%edx,`16*5+12-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*6+12-64`($ctx)
	mov	$d1#d,`16*7+12-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+12-64`($ctx)

	call	__poly1305_block	# r^4

	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
	mov	%eax,`16*0+8-64`($ctx)
	mov	%edx,`16*1+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*2+8-64`($ctx)
	mov	%eax,`16*3+8-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%eax,`16*4+8-64`($ctx)
	mov	%edx,`16*5+8-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%edx,`16*6+8-64`($ctx)
	mov	$d1#d,`16*7+8-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d1#d,`16*8+8-64`($ctx)

	lea	-48-64($ctx),$ctx	# size [de-]optimization
.size	__poly1305_init_avx,.-__poly1305_init_avx

.type	poly1305_blocks_avx,\@function,4
	mov	20($ctx),%r8d		# is_base2_26
	mov	$len,%r15		# reassign $len
	mov	0($ctx),$d1		# load hash value
	mov	24($ctx),$r0		# load r

################################# base 2^26 -> base 2^64
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	and	\$`-1*(1<<31)`,$d2
	adc	\$0,$h2			# can be partially reduced...
	mov	\$-4,$d2		# ... so reduce
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format

################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	jz	.Lstore_base2_26_avx
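################################################################
# C sketch of the radix conversion above (illustration only):
# the 130-bit value h0:h1:h2 is cut into five 26-bit digits.
#
#	d0 =  h0        & 0x3ffffff;
#	d1 = (h0 >> 26) & 0x3ffffff;
#	d2 = (h0 >> 52 | h1 << 12) & 0x3ffffff;
#	d3 = (h1 >> 14) & 0x3ffffff;
#	d4 =  h1 >> 40  | h2 << 24;	/* top digit */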
.Lstore_base2_64_avx:
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed

.Lstore_base2_26_avx:
	mov	%rax#d,0($ctx)		# store hash value base 2^26

.Lblocks_avx_epilogue:
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block

################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	movl	\$1,20($ctx)		# set is_base2_26
	call	__poly1305_init_avx

.Lbase2_64_avx_epilogue:
	vmovd	4*0($ctx),$H0		# load hash value
$code.=<<___ if (!$win64);
$code.=<<___ if ($win64);
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
	vmovdqu	`16*3`($ctx),$D4	# preload r0^2
	lea	`16*3+64`($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx

################################################################
	vmovdqu	16*2($inp),$T0
	vmovdqu	16*3($inp),$T1
	vmovdqa	64(%rcx),$MASK		# .Lmask26

	vpsrldq	\$6,$T0,$T2		# splat input
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	vpsrlq	\$40,$T4,$T4		# 4
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	# expand and copy pre-calculated table to stack
	vmovdqu	`16*1-64`($ctx),$D1
	vmovdqu	`16*2-64`($ctx),$D2
	vpshufd	\$0xEE,$D4,$D3		# 34xx -> 3434
	vpshufd	\$0x44,$D4,$D0		# xx12 -> 1212
	vmovdqa	$D3,-0x90(%r11)
	vmovdqa	$D0,0x00(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vmovdqu	`16*3-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x80(%r11)
	vmovdqa	$D1,0x10(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqu	`16*4-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x70(%r11)
	vmovdqa	$D2,0x20(%rsp)
	vpshufd	\$0xEE,$D0,$D4
	vmovdqu	`16*5-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D4,-0x60(%r11)
	vmovdqa	$D0,0x30(%rsp)
	vpshufd	\$0xEE,$D1,$D3
	vmovdqu	`16*6-64`($ctx),$D0
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D3,-0x50(%r11)
	vmovdqa	$D1,0x40(%rsp)
	vpshufd	\$0xEE,$D2,$D4
	vmovdqu	`16*7-64`($ctx),$D1
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D4,-0x40(%r11)
	vmovdqa	$D2,0x50(%rsp)
	vpshufd	\$0xEE,$D0,$D3
	vmovdqu	`16*8-64`($ctx),$D2
	vpshufd	\$0x44,$D0,$D0
	vmovdqa	$D3,-0x30(%r11)
	vmovdqa	$D0,0x60(%rsp)
	vpshufd	\$0xEE,$D1,$D4
	vpshufd	\$0x44,$D1,$D1
	vmovdqa	$D4,-0x20(%r11)
	vmovdqa	$D1,0x70(%rsp)
	vpshufd	\$0xEE,$D2,$D3
	vmovdqa	0x00(%rsp),$D4		# preload r0^2
	vpshufd	\$0x44,$D2,$D2
	vmovdqa	$D3,-0x10(%r11)
	vmovdqa	$D2,0x80(%rsp)

################################################################
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
#   \___________________/
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
#   \___________________/ \____________________/
# Note that we start with inp[2:3]*r^2. This is because it
# doesn't depend on the reduction in the previous iteration.
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
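#
# One limb in C, as a sketch (illustration only; writing sN for
# the precomputed 5*rN, each product is an unsigned 32x32->64-bit
# multiply of 26-bit digits accumulated in a 64-bit lane, which
# is what the vpmuludq/vpaddq pairs below implement):
#
#	d0 = (uint64_t)h0*r0 + (uint64_t)h1*s4 + (uint64_t)h2*s3
#	   + (uint64_t)h3*s2 + (uint64_t)h4*s1;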
# though note that $Tx and $Hx are "reversed" in this section,
# and $D4 is preloaded with r0^2...

	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vmovdqa		$H2,0x20(%r11)	# offload hash
	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vmovdqa		0x10(%rsp),$H2	# r1^2
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0

	vmovdqa		$H0,0x00(%r11)
	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
	vmovdqa		$H1,0x10(%r11)
	vpmuludq	$T3,$H2,$H1	# h3*r1
	vpaddq		$H0,$D0,$D0	# d0 += h4*s1
	vpaddq		$H1,$D4,$D4	# d4 += h3*r1
	vmovdqa		$H3,0x30(%r11)
	vpmuludq	$T2,$H2,$H0	# h2*r1
	vpmuludq	$T1,$H2,$H1	# h1*r1
	vpaddq		$H0,$D3,$D3	# d3 += h2*r1
	vmovdqa		0x30(%rsp),$H3	# r2^2
	vpaddq		$H1,$D2,$D2	# d2 += h1*r1
	vmovdqa		$H4,0x40(%r11)
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpmuludq	$T2,$H3,$H0	# h2*r2
	vpaddq		$H2,$D1,$D1	# d1 += h0*r1

	vmovdqa		0x40(%rsp),$H4	# s2^2
	vpaddq		$H0,$D4,$D4	# d4 += h2*r2
	vpmuludq	$T1,$H3,$H1	# h1*r2
	vpmuludq	$T0,$H3,$H3	# h0*r2
	vpaddq		$H1,$D3,$D3	# d3 += h1*r2
	vmovdqa		0x50(%rsp),$H2	# r3^2
	vpaddq		$H3,$D2,$D2	# d2 += h0*r2
	vpmuludq	$T4,$H4,$H0	# h4*s2
	vpmuludq	$T3,$H4,$H4	# h3*s2
	vpaddq		$H0,$D1,$D1	# d1 += h4*s2
	vmovdqa		0x60(%rsp),$H3	# s3^2
	vpaddq		$H4,$D0,$D0	# d0 += h3*s2

	vmovdqa		0x80(%rsp),$H4	# s4^2
	vpmuludq	$T1,$H2,$H1	# h1*r3
	vpmuludq	$T0,$H2,$H2	# h0*r3
	vpaddq		$H1,$D4,$D4	# d4 += h1*r3
	vpaddq		$H2,$D3,$D3	# d3 += h0*r3
	vpmuludq	$T4,$H3,$H0	# h4*s3
	vpmuludq	$T3,$H3,$H1	# h3*s3
	vpaddq		$H0,$D2,$D2	# d2 += h4*s3
	vmovdqu		16*0($inp),$H0	# load input
	vpaddq		$H1,$D1,$D1	# d1 += h3*s3
	vpmuludq	$T2,$H3,$H3	# h2*s3
	vpmuludq	$T2,$H4,$T2	# h2*s4
	vpaddq		$H3,$D0,$D0	# d0 += h2*s3

	vmovdqu		16*1($inp),$H1
	vpaddq		$T2,$D1,$D1	# d1 += h2*s4
	vpmuludq	$T3,$H4,$T3	# h3*s4
	vpmuludq	$T4,$H4,$T4	# h4*s4
	vpsrldq		\$6,$H0,$H2	# splat input
	vpaddq		$T3,$D2,$D2	# d2 += h3*s4
	vpaddq		$T4,$D3,$D3	# d3 += h4*s4
	vpsrldq		\$6,$H1,$H3
	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
	vpmuludq	$T1,$H4,$T0	# h1*s4
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpaddq		$T4,$D4,$D4	# d4 += h0*r4
	vmovdqa		-0x90(%r11),$T4	# r0^4
	vpaddq		$T0,$D0,$D0	# d0 += h1*s4

	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3

	#vpsrlq		\$40,$H4,$H4	# 4
	vpsrldq		\$`40/8`,$H4,$H4	# 4
	vpand		$MASK,$H0,$H0	# 0
	vpand		$MASK,$H1,$H1	# 1
	vpand		0(%rcx),$H4,$H4	# .Lmask24
	vpand		$MASK,$H2,$H2	# 2
	vpand		$MASK,$H3,$H3	# 3
	vpor		32(%rcx),$H4,$H4	# padbit, yes, always

	vpaddq		0x00(%r11),$H0,$H0	# add hash value
	vpaddq		0x10(%r11),$H1,$H1
	vpaddq		0x20(%r11),$H2,$H2
	vpaddq		0x30(%r11),$H3,$H3
	vpaddq		0x40(%r11),$H4,$H4

################################################################
# Now we accumulate (inp[0:1]+hash)*r^4
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vmovdqa		-0x80(%r11),$T2	# r1^4
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpmuludq	$H4,$T4,$T4	# h4*r0

	vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
	vpaddq		$T0,$D0,$D0	# d0 += h4*s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq		$T1,$D3,$D3	# d3 += h2*r1
	vmovdqa		-0x60(%r11),$T3	# r2^4
	vpaddq		$T0,$D4,$D4	# d4 += h3*r1
	vpmuludq	$H1,$T2,$T1	# h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq		$T1,$D2,$D2	# d2 += h1*r1
	vpaddq		$T2,$D1,$D1	# d1 += h0*r1

	vmovdqa		-0x50(%r11),$T4	# s2^4
	vpmuludq	$H2,$T3,$T0	# h2*r2
	vpmuludq	$H1,$T3,$T1	# h1*r2
	vpaddq		$T0,$D4,$D4	# d4 += h2*r2
	vpaddq		$T1,$D3,$D3	# d3 += h1*r2
	vmovdqa		-0x40(%r11),$T2	# r3^4
	vpmuludq	$H0,$T3,$T3	# h0*r2
	vpmuludq	$H4,$T4,$T0	# h4*s2
	vpaddq		$T3,$D2,$D2	# d2 += h0*r2
	vpaddq		$T0,$D1,$D1	# d1 += h4*s2
	vmovdqa		-0x30(%r11),$T3	# s3^4
	vpmuludq	$H3,$T4,$T4	# h3*s2
	vpmuludq	$H1,$T2,$T1	# h1*r3
	vpaddq		$T4,$D0,$D0	# d0 += h3*s2

	vmovdqa		-0x10(%r11),$T4	# s4^4
	vpaddq		$T1,$D4,$D4	# d4 += h1*r3
	vpmuludq	$H0,$T2,$T2	# h0*r3
	vpmuludq	$H4,$T3,$T0	# h4*s3
	vpaddq		$T2,$D3,$D3	# d3 += h0*r3
	vpaddq		$T0,$D2,$D2	# d2 += h4*s3
	vmovdqu		16*2($inp),$T0	# load input
	vpmuludq	$H3,$T3,$T2	# h3*s3
	vpmuludq	$H2,$T3,$T3	# h2*s3
	vpaddq		$T2,$D1,$D1	# d1 += h3*s3
	vmovdqu		16*3($inp),$T1
	vpaddq		$T3,$D0,$D0	# d0 += h2*s3

	vpmuludq	$H2,$T4,$H2	# h2*s4
	vpmuludq	$H3,$T4,$H3	# h3*s4
	vpsrldq		\$6,$T0,$T2	# splat input
	vpaddq		$H2,$D1,$D1	# d1 += h2*s4
	vpmuludq	$H4,$T4,$H4	# h4*s4
	vpsrldq		\$6,$T1,$T3
	vpaddq		$H3,$D2,$H2	# h2 = d2 + h3*s4
	vpaddq		$H4,$D3,$H3	# h3 = d3 + h4*s4
	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4

	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpaddq		$H4,$D4,$H4	# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0	# h0 = d0 + h1*s4

	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpunpcklqdq	$T3,$T2,$T3	# 2:3

	#vpsrlq		\$40,$T4,$T4	# 4
	vpsrldq		\$`40/8`,$T4,$T4	# 4
	vmovdqa		0x00(%rsp),$D4	# preload r0^2
	vpand		$MASK,$T0,$T0	# 0
	vpand		$MASK,$T1,$T1	# 1
	vpand		0(%rcx),$T4,$T4	# .Lmask24
	vpand		$MASK,$T2,$T2	# 2
	vpand		$MASK,$T3,$T3	# 3
	vpor		32(%rcx),$T4,$T4	# padbit, yes, always

################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	vpaddq		$D3,$H4,$H4	# h3 -> h4
	vpaddq		$D0,$D1,$H1	# h0 -> h1
	vpaddq		$D1,$H2,$H2	# h1 -> h2
	vpaddq		$D0,$H0,$H0	# h4 -> h0
	vpaddq		$D2,$H3,$H3	# h2 -> h3
	vpaddq		$D0,$H1,$H1	# h0 -> h1
	vpaddq		$D3,$H4,$H4	# h3 -> h4
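################################################################
# Scalar equivalent of the carry chain above, as a sketch
# (illustration only): each limb passes its excess above 26 bits
# to the next one, and the h4 -> h0 hop multiplies the carry by
# 5 because it crosses 2^130. With M = 0x3ffffff:
#
#	c = h3 >> 26; h3 &= M; h4 += c;
#	c = h0 >> 26; h0 &= M; h1 += c;
#	c = h1 >> 26; h1 &= M; h2 += c;
#	c = h4 >> 26; h4 &= M; h0 += c*5;
#	c = h2 >> 26; h2 &= M; h3 += c;
#	c = h0 >> 26; h0 &= M; h1 += c;
#	c = h3 >> 26; h3 &= M; h4 += c;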
################################################################
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	vpshufd		\$0x10,$D4,$D4	# r0^n, xx12 -> x1x2
	vmovdqa		$H2,0x20(%r11)
	vmovdqa		$H0,0x00(%r11)
	vmovdqa		$H1,0x10(%r11)
	vmovdqa		$H3,0x30(%r11)
	vmovdqa		$H4,0x40(%r11)

# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	vpmuludq	$T2,$D4,$D2	# d2 = h2*r0
	vpmuludq	$T0,$D4,$D0	# d0 = h0*r0
	vpshufd		\$0x10,`16*1-64`($ctx),$H2	# r1^n
	vpmuludq	$T1,$D4,$D1	# d1 = h1*r0
	vpmuludq	$T3,$D4,$D3	# d3 = h3*r0
	vpmuludq	$T4,$D4,$D4	# d4 = h4*r0

	vpmuludq	$T3,$H2,$H0	# h3*r1
	vpaddq		$H0,$D4,$D4	# d4 += h3*r1
	vpshufd		\$0x10,`16*2-64`($ctx),$H3	# s1^n
	vpmuludq	$T2,$H2,$H1	# h2*r1
	vpaddq		$H1,$D3,$D3	# d3 += h2*r1
	vpshufd		\$0x10,`16*3-64`($ctx),$H4	# r2^n
	vpmuludq	$T1,$H2,$H0	# h1*r1
	vpaddq		$H0,$D2,$D2	# d2 += h1*r1
	vpmuludq	$T0,$H2,$H2	# h0*r1
	vpaddq		$H2,$D1,$D1	# d1 += h0*r1
	vpmuludq	$T4,$H3,$H3	# h4*s1
	vpaddq		$H3,$D0,$D0	# d0 += h4*s1

	vpshufd		\$0x10,`16*4-64`($ctx),$H2	# s2^n
	vpmuludq	$T2,$H4,$H1	# h2*r2
	vpaddq		$H1,$D4,$D4	# d4 += h2*r2
	vpmuludq	$T1,$H4,$H0	# h1*r2
	vpaddq		$H0,$D3,$D3	# d3 += h1*r2
	vpshufd		\$0x10,`16*5-64`($ctx),$H3	# r3^n
	vpmuludq	$T0,$H4,$H4	# h0*r2
	vpaddq		$H4,$D2,$D2	# d2 += h0*r2
	vpmuludq	$T4,$H2,$H1	# h4*s2
	vpaddq		$H1,$D1,$D1	# d1 += h4*s2
	vpshufd		\$0x10,`16*6-64`($ctx),$H4	# s3^n
	vpmuludq	$T3,$H2,$H2	# h3*s2
	vpaddq		$H2,$D0,$D0	# d0 += h3*s2

	vpmuludq	$T1,$H3,$H0	# h1*r3
	vpaddq		$H0,$D4,$D4	# d4 += h1*r3
	vpmuludq	$T0,$H3,$H3	# h0*r3
	vpaddq		$H3,$D3,$D3	# d3 += h0*r3
	vpshufd		\$0x10,`16*7-64`($ctx),$H2	# r4^n
	vpmuludq	$T4,$H4,$H1	# h4*s3
	vpaddq		$H1,$D2,$D2	# d2 += h4*s3
	vpshufd		\$0x10,`16*8-64`($ctx),$H3	# s4^n
	vpmuludq	$T3,$H4,$H0	# h3*s3
	vpaddq		$H0,$D1,$D1	# d1 += h3*s3
	vpmuludq	$T2,$H4,$H4	# h2*s3
	vpaddq		$H4,$D0,$D0	# d0 += h2*s3

	vpmuludq	$T0,$H2,$H2	# h0*r4
	vpaddq		$H2,$D4,$D4	# h4 = d4 + h0*r4
	vpmuludq	$T4,$H3,$H1	# h4*s4
	vpaddq		$H1,$D3,$D3	# h3 = d3 + h4*s4
	vpmuludq	$T3,$H3,$H0	# h3*s4
	vpaddq		$H0,$D2,$D2	# h2 = d2 + h3*s4
	vpmuludq	$T2,$H3,$H1	# h2*s4
	vpaddq		$H1,$D1,$D1	# h1 = d1 + h2*s4
	vpmuludq	$T1,$H3,$H3	# h1*s4
	vpaddq		$H3,$D0,$D0	# h0 = d0 + h1*s4

	vmovdqu		16*0($inp),$H0	# load input
	vmovdqu		16*1($inp),$H1

	vpsrldq		\$6,$H0,$H2	# splat input
	vpunpckhqdq	$H1,$H0,$H4	# 4
	vpunpcklqdq	$H1,$H0,$H0	# 0:1
	vpunpcklqdq	$H3,$H2,$H3	# 2:3

	vpsrlq		\$40,$H4,$H4	# 4
	vpand		$MASK,$H0,$H0	# 0
	vpand		$MASK,$H1,$H1	# 1
	vpand		$MASK,$H2,$H2	# 2
	vpand		$MASK,$H3,$H3	# 3
	vpor		32(%rcx),$H4,$H4	# padbit, yes, always

	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
	vpaddq		0x00(%r11),$H0,$H0
	vpaddq		0x10(%r11),$H1,$H1
	vpaddq		0x20(%r11),$H2,$H2
	vpaddq		0x30(%r11),$H3,$H3
	vpaddq		0x40(%r11),$H4,$H4

################################################################
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate

	vpmuludq	$H0,$T4,$T0	# h0*r0
	vpaddq		$T0,$D0,$D0	# d0 += h0*r0
	vpmuludq	$H1,$T4,$T1	# h1*r0
	vpaddq		$T1,$D1,$D1	# d1 += h1*r0
	vpmuludq	$H2,$T4,$T0	# h2*r0
	vpaddq		$T0,$D2,$D2	# d2 += h2*r0
	vpshufd		\$0x32,`16*1-64`($ctx),$T2	# r1^n
	vpmuludq	$H3,$T4,$T1	# h3*r0
	vpaddq		$T1,$D3,$D3	# d3 += h3*r0
	vpmuludq	$H4,$T4,$T4	# h4*r0
	vpaddq		$T4,$D4,$D4	# d4 += h4*r0

	vpmuludq	$H3,$T2,$T0	# h3*r1
	vpaddq		$T0,$D4,$D4	# d4 += h3*r1
	vpshufd		\$0x32,`16*2-64`($ctx),$T3	# s1
	vpmuludq	$H2,$T2,$T1	# h2*r1
	vpaddq		$T1,$D3,$D3	# d3 += h2*r1
	vpshufd		\$0x32,`16*3-64`($ctx),$T4	# r2
	vpmuludq	$H1,$T2,$T0	# h1*r1
	vpaddq		$T0,$D2,$D2	# d2 += h1*r1
	vpmuludq	$H0,$T2,$T2	# h0*r1
	vpaddq		$T2,$D1,$D1	# d1 += h0*r1
	vpmuludq	$H4,$T3,$T3	# h4*s1
	vpaddq		$T3,$D0,$D0	# d0 += h4*s1

	vpshufd		\$0x32,`16*4-64`($ctx),$T2	# s2
	vpmuludq	$H2,$T4,$T1	# h2*r2
	vpaddq		$T1,$D4,$D4	# d4 += h2*r2
	vpmuludq	$H1,$T4,$T0	# h1*r2
	vpaddq		$T0,$D3,$D3	# d3 += h1*r2
	vpshufd		\$0x32,`16*5-64`($ctx),$T3	# r3
	vpmuludq	$H0,$T4,$T4	# h0*r2
	vpaddq		$T4,$D2,$D2	# d2 += h0*r2
	vpmuludq	$H4,$T2,$T1	# h4*s2
	vpaddq		$T1,$D1,$D1	# d1 += h4*s2
	vpshufd		\$0x32,`16*6-64`($ctx),$T4	# s3
	vpmuludq	$H3,$T2,$T2	# h3*s2
	vpaddq		$T2,$D0,$D0	# d0 += h3*s2

	vpmuludq	$H1,$T3,$T0	# h1*r3
	vpaddq		$T0,$D4,$D4	# d4 += h1*r3
	vpmuludq	$H0,$T3,$T3	# h0*r3
	vpaddq		$T3,$D3,$D3	# d3 += h0*r3
	vpshufd		\$0x32,`16*7-64`($ctx),$T2	# r4
	vpmuludq	$H4,$T4,$T1	# h4*s3
	vpaddq		$T1,$D2,$D2	# d2 += h4*s3
	vpshufd		\$0x32,`16*8-64`($ctx),$T3	# s4
	vpmuludq	$H3,$T4,$T0	# h3*s3
	vpaddq		$T0,$D1,$D1	# d1 += h3*s3
	vpmuludq	$H2,$T4,$T4	# h2*s3
	vpaddq		$T4,$D0,$D0	# d0 += h2*s3

	vpmuludq	$H0,$T2,$T2	# h0*r4
	vpaddq		$T2,$D4,$D4	# d4 += h0*r4
	vpmuludq	$H4,$T3,$T1	# h4*s4
	vpaddq		$T1,$D3,$D3	# d3 += h4*s4
	vpmuludq	$H3,$T3,$T0	# h3*s4
	vpaddq		$T0,$D2,$D2	# d2 += h3*s4
	vpmuludq	$H2,$T3,$T1	# h2*s4
	vpaddq		$T1,$D1,$D1	# d1 += h2*s4
	vpmuludq	$H1,$T3,$T3	# h1*s4
	vpaddq		$T3,$D0,$D0	# d0 += h1*s4

################################################################
# horizontal addition
################################################################
	vpaddq		$H3,$D4,$D4	# h3 -> h4
	vpaddq		$H0,$D1,$D1	# h0 -> h1
	vpaddq		$H1,$D2,$D2	# h1 -> h2
	vpaddq		$H4,$D0,$D0	# h4 -> h0
	vpaddq		$H2,$D3,$D3	# h2 -> h3
	vpaddq		$H0,$D1,$D1	# h0 -> h1
	vpaddq		$H3,$D4,$D4	# h3 -> h4

	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd		$D1,`4*1-48-64`($ctx)
	vmovd		$D2,`4*2-48-64`($ctx)
	vmovd		$D3,`4*3-48-64`($ctx)
	vmovd		$D4,`4*4-48-64`($ctx)
$code.=<<___ if ($win64);
	vmovdqa		0x50(%r11),%xmm6
	vmovdqa		0x60(%r11),%xmm7
	vmovdqa		0x70(%r11),%xmm8
	vmovdqa		0x80(%r11),%xmm9
	vmovdqa		0x90(%r11),%xmm10
	vmovdqa		0xa0(%r11),%xmm11
	vmovdqa		0xb0(%r11),%xmm12
	vmovdqa		0xc0(%r11),%xmm13
	vmovdqa		0xd0(%r11),%xmm14
	vmovdqa		0xe0(%r11),%xmm15
$code.=<<___ if (!$win64);
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

.type	poly1305_emit_avx,\@function,3
	cmpl	\$0,20($ctx)		# is_base2_26?
	mov	0($ctx),%eax		# load hash value base 2^26
	shl	\$26,%rcx		# base 2^26 -> base 2^64
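################################################################
# C sketch of this conversion (illustration only): the five
# 26-bit digits are shifted back to their bit positions and
# summed; since the digits may not be fully reduced, the actual
# code propagates carries between the three 64-bit words.
#
#	h0 = d0 | d1 << 26 | d2 << 52;		/* bits   0..63  */
#	h1 = d2 >> 12 | d3 << 14 | d4 << 40;	/* bits  64..127 */
#	h2 = d4 >> 24;				/* bits 128..129 */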
	mov	%r10,%rax		# could be partially reduced, so reduce
	add	\$5,%r8			# compare to modulus
	shr	\$2,%r10		# did 130-bit value overflow?
	add	0($nonce),%rax		# accumulate nonce
	mov	%rax,0($mac)		# write result
.size	poly1305_emit_avx,.-poly1305_emit_avx
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
    map("%ymm$_",(0..15));

.type	poly1305_blocks_avx2,\@function,4
poly1305_blocks_avx2:
	mov	20($ctx),%r8d		# is_base2_26
	mov	$len,%r15		# reassign $len
	mov	0($ctx),$d1		# load hash value
	mov	24($ctx),$r0		# load r

################################# base 2^26 -> base 2^64
	and	\$`-1*(1<<31)`,$d1
	mov	$d2,$r1			# borrow $r1
	and	\$`-1*(1<<31)`,$d2
	adc	\$0,$h2			# can be partially reduced...
	mov	\$-4,$d2		# ... so reduce
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_26_pre_avx2:
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
	jnz	.Lbase2_26_pre_avx2

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format

################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	jz	.Lstore_base2_26_avx2

.Lstore_base2_64_avx2:
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed

.Lstore_base2_26_avx2:
	mov	%rax#d,0($ctx)		# store hash value base 2^26

.Lblocks_avx2_epilogue:

.Lbase2_64_avx2_body:
	mov	$len,%r15		# reassign $len
	mov	24($ctx),$r0		# load r
	mov	0($ctx),$h0		# load hash value
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_64_pre_avx2:
	add	0($inp),$h0		# accumulate input
	call	__poly1305_block
	jnz	.Lbase2_64_pre_avx2

################################# base 2^64 -> base 2^26
	and	\$0x3ffffff,%rax	# h[0]
	and	\$0x3ffffff,%rdx	# h[1]
	and	\$0x3ffffff,$h0		# h[2]
	and	\$0x3ffffff,$h1		# h[3]
	movl	\$1,20($ctx)		# set is_base2_26
	call	__poly1305_init_avx

.Lbase2_64_avx2_epilogue:
	vmovd	4*0($ctx),%x#$H0	# load hash value base 2^26
	vmovd	4*1($ctx),%x#$H1
	vmovd	4*2($ctx),%x#$H2
	vmovd	4*3($ctx),%x#$H3
	vmovd	4*4($ctx),%x#$H4
$code.=<<___ if (!$win64);
$code.=<<___ if ($win64);
	lea	-0xf8(%rsp),%r11
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
	lea	48+64($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx

	# expand and copy pre-calculated table to stack
	vmovdqu	`16*0-64`($ctx),%x#$T2
	vmovdqu	`16*1-64`($ctx),%x#$T3
	vmovdqu	`16*2-64`($ctx),%x#$T4
	vmovdqu	`16*3-64`($ctx),%x#$D0
	vmovdqu	`16*4-64`($ctx),%x#$D1
	vmovdqu	`16*5-64`($ctx),%x#$D2
	vmovdqu	`16*6-64`($ctx),%x#$D3
	vpermq	\$0x15,$T2,$T2		# 00003412 -> 12343434
	vmovdqu	`16*7-64`($ctx),%x#$D4
	vpermq	\$0x15,$T3,$T3
	vpshufd	\$0xc8,$T2,$T2		# 12343434 -> 14243444
	vmovdqu	`16*8-64`($ctx),%x#$MASK
	vpermq	\$0x15,$T4,$T4
	vpshufd	\$0xc8,$T3,$T3
	vmovdqa	$T2,0x00(%rsp)
	vpermq	\$0x15,$D0,$D0
	vpshufd	\$0xc8,$T4,$T4
	vmovdqa	$T3,0x20(%rsp)
	vpermq	\$0x15,$D1,$D1
	vpshufd	\$0xc8,$D0,$D0
	vmovdqa	$T4,0x40(%rsp)
	vpermq	\$0x15,$D2,$D2
	vpshufd	\$0xc8,$D1,$D1
	vmovdqa	$D0,0x60(%rsp)
	vpermq	\$0x15,$D3,$D3
	vpshufd	\$0xc8,$D2,$D2
	vmovdqa	$D1,0x80(%rsp)
	vpermq	\$0x15,$D4,$D4
	vpshufd	\$0xc8,$D3,$D3
	vmovdqa	$D2,0xa0(%rsp)
	vpermq	\$0x15,$MASK,$MASK
	vpshufd	\$0xc8,$D4,$D4
	vmovdqa	$D3,0xc0(%rsp)
	vpshufd	\$0xc8,$MASK,$MASK
	vmovdqa	$D4,0xe0(%rsp)
	vmovdqa	$MASK,0x100(%rsp)
	vmovdqa	64(%rcx),$MASK		# .Lmask26

################################################################
	vmovdqu	16*0($inp),%x#$T0
	vmovdqu	16*1($inp),%x#$T1
	vinserti128	\$1,16*2($inp),$T0,$T0
	vinserti128	\$1,16*3($inp),$T1,$T1

	vpsrldq	\$6,$T0,$T2		# splat input
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T3,$T2,$T2	# 2:3
	vpunpcklqdq	$T1,$T0,$T0	# 0:1

	vpsrlq	\$40,$T4,$T4		# 4
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	lea	0x90(%rsp),%rax		# size optimization
	vpaddq	$H2,$T2,$H2		# accumulate input

################################################################
# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
#   \________/\__________/
################################################################
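# In other words (a sketch, illustration only): the running hash
# is folded into the lane holding inp[0], and every iteration
# multiplies all four lanes by r^4 and adds the next four blocks;
# only the last iteration multiplies the lanes by r^4, r^3, r^2
# and r^1 respectively, after which they are summed horizontally.
################################################################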
#vpaddq		$H2,$T2,$H2		# accumulate input
	vmovdqa		`32*0`(%rsp),$T0	# r0^4
	vmovdqa		`32*1`(%rsp),$T1	# r1^4
	vmovdqa		`32*3`(%rsp),$T2	# r2^4
	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4

# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" the first one available, the
# corresponding operations are pulled up, so the order becomes:
#
# d4 = h2*r2   + h4*r0 + h3*r1 + h1*r3 + h0*r4
# d3 = h2*r1   + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0   + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4

	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1, borrow $H2 as temp
	vpaddq		$T4,$D1,$D1	# d1 += h0*r1
	vpaddq		$H2,$D2,$D2	# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
	vpaddq		$T4,$D4,$D4	# d4 += h3*r1
	vpaddq		$H2,$D0,$D0	# d0 += h4*s1
	vmovdqa		`32*4-0x90`(%rax),$T1	# s2

	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq		$T4,$D0,$D0	# d0 += h0*r0
	vpaddq		$H2,$D1,$D1	# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vmovdqu		16*0($inp),%x#$T0	# load input
	vpaddq		$T4,$D3,$D3	# d3 += h3*r0
	vpaddq		$H2,$D4,$D4	# d4 += h4*r0
	vinserti128	\$1,16*2($inp),$T0,$T0

	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vmovdqu		16*1($inp),%x#$T1
	vpaddq		$T4,$D0,$D0	# d0 += h3*s2
	vpaddq		$H2,$D1,$D1	# d1 += h4*s2
	vmovdqa		`32*5-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq		$T4,$D3,$D3	# d3 += h1*r2
	vpaddq		$T2,$D2,$D2	# d2 += h0*r2
	vinserti128	\$1,16*3($inp),$T1,$T1

	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpsrldq		\$6,$T0,$T2	# splat input
	vpaddq		$T4,$D4,$D4	# d4 += h1*r3
	vpaddq		$H2,$D3,$D3	# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpaddq		$T4,$D1,$D1	# d1 += h3*s3
	vpaddq		$H2,$D2,$D2	# d2 += h4*s3
	vpunpckhqdq	$T1,$T0,$T4	# 4

	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpaddq		$H3,$D2,$H2	# h2 = d2 + h3*r4
	vpaddq		$H4,$D3,$H3	# h3 = d3 + h4*r4
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa		64(%rcx),$MASK	# .Lmask26
	vpaddq		$H4,$D4,$H4	# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0	# h0 = d0 + h1*s4

################################################################
# lazy reduction (interleaved with tail of input splat)
	vpaddq		$D3,$H4,$H4	# h3 -> h4
	vpaddq		$D0,$D1,$H1	# h0 -> h1
	vpaddq		$D1,$H2,$H2	# h1 -> h2
	vpaddq		$D4,$H0,$H0	# h4 -> h0
	vpand		$MASK,$T2,$T2	# 2
	vpaddq		$D2,$H3,$H3	# h2 -> h3
	vpaddq		$T2,$H2,$H2	# modulo-scheduled
	vpaddq		$D0,$H1,$H1	# h0 -> h1
	vpsrlq		\$40,$T4,$T4	# 4
	vpaddq		$D3,$H4,$H4	# h3 -> h4
	vpand		$MASK,$T0,$T0	# 0
	vpand		$MASK,$T1,$T1	# 1
	vpand		$MASK,$T3,$T3	# 3
	vpor		32(%rcx),$T4,$T4	# padbit, yes, always

################################################################
# while the above multiplications were by r^4 in all lanes, in the
# last iteration we multiply the least significant lane by r^4 and
# the most significant one by r, so this is a copy of the above,
# except that references to the precomputed table are displaced by
# 4...
#vpaddq		$H2,$T2,$H2		# accumulate input
	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4

	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4

	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1
	vpaddq		$T4,$D1,$D1	# d1 += h0*r1
	vpaddq		$H2,$D2,$D2	# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
	vpaddq		$T4,$D4,$D4	# d4 += h3*r1
	vpaddq		$H2,$D0,$D0	# d0 += h4*s1

	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq		$T4,$D0,$D0	# d0 += h0*r0
	vmovdqu		`32*4+4-0x90`(%rax),$T1	# s2
	vpaddq		$H2,$D1,$D1	# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vpaddq		$T4,$D3,$D3	# d3 += h3*r0
	vpaddq		$H2,$D4,$D4	# d4 += h4*r0

	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vpaddq		$T4,$D0,$D0	# d0 += h3*s2
	vpaddq		$H2,$D1,$D1	# d1 += h4*s2
	vmovdqu		`32*5+4-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq		$T4,$D3,$D3	# d3 += h1*r2
	vpaddq		$T2,$D2,$D2	# d2 += h0*r2

	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpaddq		$T4,$D4,$D4	# d4 += h1*r3
	vpaddq		$H2,$D3,$D3	# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpaddq		$T4,$D1,$D1	# d1 += h3*s3
	vpaddq		$H2,$D2,$D2	# d2 += h4*s3

	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpaddq		$H3,$D2,$H2	# h2 = d2 + h3*r4
	vpaddq		$H4,$D3,$H3	# h3 = d3 + h4*r4
	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa		64(%rcx),$MASK	# .Lmask26
	vpaddq		$H4,$D4,$H4	# h4 = d4 + h0*r4
	vpaddq		$H0,$D0,$H0	# h0 = d0 + h1*s4

################################################################
# horizontal addition
	vpermq		\$0x2,$H3,$T3
	vpermq		\$0x2,$H4,$T4
	vpermq		\$0x2,$H0,$T0
	vpermq		\$0x2,$D1,$T1
	vpermq		\$0x2,$H2,$T2

################################################################
	vpaddq		$D3,$H4,$H4	# h3 -> h4
	vpaddq		$D0,$D1,$H1	# h0 -> h1
	vpaddq		$D1,$H2,$H2	# h1 -> h2
	vpaddq		$D4,$H0,$H0	# h4 -> h0
	vpaddq		$D2,$H3,$H3	# h2 -> h3
	vpaddq		$D0,$H1,$H1	# h0 -> h1
	vpaddq		$D3,$H4,$H4	# h3 -> h4

	vmovd		%x#$H0,`4*0-48-64`($ctx)	# save partially reduced
	vmovd		%x#$H1,`4*1-48-64`($ctx)
	vmovd		%x#$H2,`4*2-48-64`($ctx)
	vmovd		%x#$H3,`4*3-48-64`($ctx)
	vmovd		%x#$H4,`4*4-48-64`($ctx)
$code.=<<___ if ($win64);
	vmovdqa		0x50(%r11),%xmm6
	vmovdqa		0x60(%r11),%xmm7
	vmovdqa		0x70(%r11),%xmm8
	vmovdqa		0x80(%r11),%xmm9
	vmovdqa		0x90(%r11),%xmm10
	vmovdqa		0xa0(%r11),%xmm11
	vmovdqa		0xb0(%r11),%xmm12
	vmovdqa		0xc0(%r11),%xmm13
	vmovdqa		0xd0(%r11),%xmm14
	vmovdqa		0xe0(%r11),%xmm15
$code.=<<___ if (!$win64);
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2

.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.long	5,0,5,0,5,0,5,0
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	avx_handler,.-avx_handler

	.rva	.LSEH_begin_poly1305_init
	.rva	.LSEH_end_poly1305_init
	.rva	.LSEH_info_poly1305_init

	.rva	.LSEH_begin_poly1305_blocks
	.rva	.LSEH_end_poly1305_blocks
	.rva	.LSEH_info_poly1305_blocks

	.rva	.LSEH_begin_poly1305_emit
	.rva	.LSEH_end_poly1305_emit
	.rva	.LSEH_info_poly1305_emit
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1
	.rva	.LSEH_info_poly1305_blocks_avx_2
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3

.LSEH_info_poly1305_init:
	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init

.LSEH_info_poly1305_blocks:
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit:
	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;