3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements Poly1305 hash for x86_64.
14 # Numbers are cycles per processed byte with poly1305_blocks alone,
15 # measured with rdtsc at fixed clock frequency.
17 #			IALU/gcc-4.8(*)	AVX(**)		AVX2
20 # Westmere		1.88/+120%	-
21 # Sandy Bridge	1.39/+140%	1.10
22 # Haswell		1.14/+175%	1.11		0.65
23 # Skylake		1.13/+120%	0.96		0.51
24 # Silvermont		2.83/+95%	-
25 # VIA Nano		1.82/+150%	-
26 # Sledgehammer	1.38/+160%	-
27 # Bulldozer		2.30/+130%	0.97
29 # (*) improvement coefficients relative to clang are more modest and
30 # are ~50% on most processors; in both cases we are comparing to
32 # (**) SSE2 implementation was attempted, but among non-AVX processors
33 # it was faster than integer-only code only on older Intel P4 and
34 # Core processors, by 30-50%, with less gain the newer the processor
35 # is, but slower on contemporary ones, for example almost 2x slower
36 # on Atom; as the former are naturally disappearing, SSE2 is deemed unnecessary;
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
49 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
50 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
51 $avx = ($1>=2.19) + ($1>=2.22);
54 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
55 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
56 $avx = ($1>=2.09) + ($1>=2.10);
59 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
60 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
61 $avx = ($1>=10) + ($1>=12);
64 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
65 $avx = ($2>=3.0) + ($2>3.0);
68 open OUT,"| \"$^X\" $xlate $flavour $output";
71 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
72 my ($mac,$nonce)=($inp,$len); # *_emit arguments
73 my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
74 my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
76 sub poly1305_iteration {
77 # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
78 # output: $h0-$h2 *= $r0-$r1
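#
# Illustrative sketch, not part of the generated code: one iteration
# computes h = h*r mod 2^130-5, with h held in $h0-$h2 (base 2^64) and
# the clamped key limbs in $r0-$r1. In big-integer Perl terms (Math::BigInt
# assumed; $h and $r are hypothetical whole-number views of the limbs):
#
#	my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
#	$h->bmul($r)->bmod($p);
#
# The mulq/imulq sequence below performs this multiplication as 64x64
# schoolbook products, folding the excess above 2^130 back in via *5.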
86 mov %rax,$h0 # future $h0
96 mov $h2,$h1 # borrow $h1
100 imulq $s1,$h1 # h2*s1
105 imulq $r0,$h2 # h2*r0
107 mov \$-4,%rax # mask value
110 and $d3,%rax # last reduction step
121 ########################################################################
122 # Layout of opaque area is as follows.
124 # unsigned __int64 h[3]; # current hash value base 2^64
125 # unsigned __int64 r[2]; # key value base 2^64
130 .extern OPENSSL_ia32cap_P
133 .globl poly1305_blocks
135 .type poly1305_init,\@function,3
139 mov %rax,0($ctx) # initialize hash value
146 lea poly1305_blocks(%rip),%r10
147 lea poly1305_emit(%rip),%r11
149 $code.=<<___ if ($avx);
150 mov OPENSSL_ia32cap_P+4(%rip),%r9
151 lea poly1305_blocks_avx(%rip),%rax
152 lea poly1305_emit_avx(%rip),%rcx
153 bt \$`60-32`,%r9 # AVX?
157 $code.=<<___ if ($avx>1);
158 lea poly1305_blocks_avx2(%rip),%rax
159 bt \$`5+32`,%r9 # AVX2?
163 mov \$0x0ffffffc0fffffff,%rax
164 mov \$0x0ffffffc0ffffffc,%rcx
170 $code.=<<___ if ($flavour !~ /elf32/);
174 $code.=<<___ if ($flavour =~ /elf32/);
182 .size poly1305_init,.-poly1305_init
184 .type poly1305_blocks,\@function,4
189 jz .Lno_data # too short
199 mov $len,%r15 # reassign $len
201 mov 24($ctx),$r0 # load r
204 mov 0($ctx),$h0 # load hash value
211 add $r1,$s1 # s1 = r1 + (r1 >> 2)
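#
# Illustrative note, not generated code: 2^130 = 5 mod 2^130-5, so a
# product term landing two limbs (2^128) above its reduced position
# folds with a factor of 5/4. The key clamp zeroes the low 2 bits of
# r1, so 5*r1/4 is exact:
#
#	s1 = r1 + (r1 >> 2)	# == 5*r1/4, low 2 bits of r1 are 0
#
# which lets the overflowing products by r1 be taken as single
# multiplications by s1 instead.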
216 add 0($inp),$h0 # accumulate input
221 &poly1305_iteration();
227 mov $h0,0($ctx) # store hash value
241 .size poly1305_blocks,.-poly1305_blocks
243 .type poly1305_emit,\@function,3
247 mov 0($ctx),%r8 # load hash value
252 add \$5,%r8 # compare to modulus
256 shr \$2,%r10 # did 130-bit value overflow?
260 add 0($nonce),%rax # accumulate nonce
262 mov %rax,0($mac) # write result
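#
# Illustrative sketch of the final reduction, not generated code: the
# select is a conditional move, never a branch, so timing does not
# depend on the hash value. Over a single 130-bit integer h:
#
#	t = h + 5			# if h >= 2^130-5 ...
#	h = (t >> 130) ? t : h		# ... then t mod 2^130 is h mod p
#	tag = (h + nonce) mod 2^128	# emit low 128 bits
#
# t, h, nonce and tag are whole multi-limb quantities here, split
# across %r8-%r10 and %rax/%rcx in the code above.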
266 .size poly1305_emit,.-poly1305_emit
270 ########################################################################
271 # Layout of opaque area is as follows.
273 # unsigned __int32 h[5]; # current hash value base 2^26
274 # unsigned __int32 is_base2_26;
275 # unsigned __int64 r[2]; # key value base 2^64
276 # unsigned __int64 pad;
277 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
279 # where r^n are base 2^26 digits of powers of the multiplier key. There
280 # are 5 digits, but the last four are interleaved with their multiples
281 # of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
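#
# Illustrative sketch, not part of the generated code: for each stored
# power of r with base 2^26 digits @d, the 9 elements amount to
#
#	my @tab = ($d[0], $d[1], 5*$d[1], $d[2], 5*$d[2],
#	           $d[3], 5*$d[3], $d[4], 5*$d[4]);
#
# so the 5*digit values needed when products wrap past 2^130 are table
# lookups rather than extra vector multiplications in the inner loop.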
283 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
284 map("%xmm$_",(0..15));
287 .type __poly1305_block,\@abi-omnipotent
291 &poly1305_iteration();
294 .size __poly1305_block,.-__poly1305_block
296 .type __poly1305_init_avx,\@abi-omnipotent
303 lea 48+64($ctx),$ctx # size optimization
306 call __poly1305_block # r^2
308 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
314 mov %eax,`16*0+0-64`($ctx)
316 mov %edx,`16*0+4-64`($ctx)
323 mov %eax,`16*1+0-64`($ctx)
324 lea (%rax,%rax,4),%eax # *5
325 mov %edx,`16*1+4-64`($ctx)
326 lea (%rdx,%rdx,4),%edx # *5
327 mov %eax,`16*2+0-64`($ctx)
329 mov %edx,`16*2+4-64`($ctx)
340 mov %eax,`16*3+0-64`($ctx)
341 lea (%rax,%rax,4),%eax # *5
342 mov %edx,`16*3+4-64`($ctx)
343 lea (%rdx,%rdx,4),%edx # *5
344 mov %eax,`16*4+0-64`($ctx)
346 mov %edx,`16*4+4-64`($ctx)
355 mov %eax,`16*5+0-64`($ctx)
356 lea (%rax,%rax,4),%eax # *5
357 mov %edx,`16*5+4-64`($ctx)
358 lea (%rdx,%rdx,4),%edx # *5
359 mov %eax,`16*6+0-64`($ctx)
361 mov %edx,`16*6+4-64`($ctx)
367 mov $d1#d,`16*7+0-64`($ctx)
368 lea ($d1,$d1,4),$d1 # *5
369 mov $d2#d,`16*7+4-64`($ctx)
370 lea ($d2,$d2,4),$d2 # *5
371 mov $d1#d,`16*8+0-64`($ctx)
372 mov $d2#d,`16*8+4-64`($ctx)
375 call __poly1305_block # r^3
377 mov \$0x3ffffff,%eax # save r^3 base 2^26
381 mov %eax,`16*0+12-64`($ctx)
385 mov %edx,`16*1+12-64`($ctx)
386 lea (%rdx,%rdx,4),%edx # *5
388 mov %edx,`16*2+12-64`($ctx)
394 mov %eax,`16*3+12-64`($ctx)
395 lea (%rax,%rax,4),%eax # *5
397 mov %eax,`16*4+12-64`($ctx)
402 mov %edx,`16*5+12-64`($ctx)
403 lea (%rdx,%rdx,4),%edx # *5
405 mov %edx,`16*6+12-64`($ctx)
410 mov $d1#d,`16*7+12-64`($ctx)
411 lea ($d1,$d1,4),$d1 # *5
412 mov $d1#d,`16*8+12-64`($ctx)
415 call __poly1305_block # r^4
417 mov \$0x3ffffff,%eax # save r^4 base 2^26
421 mov %eax,`16*0+8-64`($ctx)
425 mov %edx,`16*1+8-64`($ctx)
426 lea (%rdx,%rdx,4),%edx # *5
428 mov %edx,`16*2+8-64`($ctx)
434 mov %eax,`16*3+8-64`($ctx)
435 lea (%rax,%rax,4),%eax # *5
437 mov %eax,`16*4+8-64`($ctx)
442 mov %edx,`16*5+8-64`($ctx)
443 lea (%rdx,%rdx,4),%edx # *5
445 mov %edx,`16*6+8-64`($ctx)
450 mov $d1#d,`16*7+8-64`($ctx)
451 lea ($d1,$d1,4),$d1 # *5
452 mov $d1#d,`16*8+8-64`($ctx)
454 lea -48-64($ctx),$ctx # size [de-]optimization
456 .size __poly1305_init_avx,.-__poly1305_init_avx
458 .type poly1305_blocks_avx,\@function,4
461 mov 20($ctx),%r8d # is_base2_26
487 mov $len,%r15 # reassign $len
489 mov 0($ctx),$d1 # load hash value
493 mov 24($ctx),$r0 # load r
496 ################################# base 2^26 -> base 2^64
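#
# Illustrative sketch, not generated code: five base 2^26 digits
# t[0..4] are repacked into 64-bit limbs h0, h1 plus top bits h2 as
#
#	h0 = t[0] | t[1]<<26 | t[2]<<52		# bits 0..63
#	h1 = t[2]>>12 | t[3]<<14 | t[4]<<40	# bits 64..127
#	h2 = t[4]>>24				# bits 128..129
#
# digits may exceed 26 bits after lazy reduction, so the repacked
# value can spill past bit 130, hence the extra reduction step below.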
498 and \$`-1*(1<<31)`,$d1
499 mov $d2,$r1 # borrow $r1
501 and \$`-1*(1<<31)`,$d2
515 adc \$0,$h2 # can be partially reduced...
517 mov \$-4,$d2 # ... so reduce
530 add $r1,$s1 # s1 = r1 + (r1 >> 2)
532 add 0($inp),$h0 # accumulate input
537 call __poly1305_block
539 test $padbit,$padbit # if $padbit is zero,
540 jz .Lstore_base2_64_avx # store hash in base 2^64 format
542 ################################# base 2^64 -> base 2^26
549 and \$0x3ffffff,%rax # h[0]
551 and \$0x3ffffff,%rdx # h[1]
555 and \$0x3ffffff,$h0 # h[2]
557 and \$0x3ffffff,$h1 # h[3]
561 jz .Lstore_base2_26_avx
571 .Lstore_base2_64_avx:
574 mov $h2,16($ctx) # note that is_base2_26 is zeroed
578 .Lstore_base2_26_avx:
579 mov %rax#d,0($ctx) # store hash value base 2^26
594 .Lblocks_avx_epilogue:
607 mov $len,%r15 # reassign $len
609 mov 24($ctx),$r0 # load r
612 mov 0($ctx),$h0 # load hash value
619 add $r1,$s1 # s1 = r1 + (r1 >> 2)
624 add 0($inp),$h0 # accumulate input
630 call __poly1305_block
633 ################################# base 2^64 -> base 2^26
640 and \$0x3ffffff,%rax # h[0]
642 and \$0x3ffffff,%rdx # h[1]
646 and \$0x3ffffff,$h0 # h[2]
648 and \$0x3ffffff,$h1 # h[3]
656 movl \$1,20($ctx) # set is_base2_26
658 call __poly1305_init_avx
671 .Lbase2_64_avx_epilogue:
676 vmovd 4*0($ctx),$H0 # load hash value
684 $code.=<<___ if (!$win64);
688 $code.=<<___ if ($win64);
691 vmovdqa %xmm6,0x50(%r11)
692 vmovdqa %xmm7,0x60(%r11)
693 vmovdqa %xmm8,0x70(%r11)
694 vmovdqa %xmm9,0x80(%r11)
695 vmovdqa %xmm10,0x90(%r11)
696 vmovdqa %xmm11,0xa0(%r11)
697 vmovdqa %xmm12,0xb0(%r11)
698 vmovdqa %xmm13,0xc0(%r11)
699 vmovdqa %xmm14,0xd0(%r11)
700 vmovdqa %xmm15,0xe0(%r11)
708 vmovdqu `16*3`($ctx),$D4 # preload r0^2
709 lea `16*3+64`($ctx),$ctx # size optimization
710 lea .Lconst(%rip),%rcx
712 ################################################################
714 vmovdqu 16*2($inp),$T0
715 vmovdqu 16*3($inp),$T1
716 vmovdqa 64(%rcx),$MASK # .Lmask26
718 vpsrldq \$6,$T0,$T2 # splat input
720 vpunpckhqdq $T1,$T0,$T4 # 4
721 vpunpcklqdq $T1,$T0,$T0 # 0:1
722 vpunpcklqdq $T3,$T2,$T3 # 2:3
724 vpsrlq \$40,$T4,$T4 # 4
726 vpand $MASK,$T0,$T0 # 0
728 vpand $MASK,$T1,$T1 # 1
730 vpand $MASK,$T2,$T2 # 2
731 vpand $MASK,$T3,$T3 # 3
732 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
736 # expand and copy pre-calculated table to stack
737 vmovdqu `16*1-64`($ctx),$D1
738 vmovdqu `16*2-64`($ctx),$D2
739 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
740 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
741 vmovdqa $D3,-0x90(%r11)
742 vmovdqa $D0,0x00(%rsp)
743 vpshufd \$0xEE,$D1,$D4
744 vmovdqu `16*3-64`($ctx),$D0
745 vpshufd \$0x44,$D1,$D1
746 vmovdqa $D4,-0x80(%r11)
747 vmovdqa $D1,0x10(%rsp)
748 vpshufd \$0xEE,$D2,$D3
749 vmovdqu `16*4-64`($ctx),$D1
750 vpshufd \$0x44,$D2,$D2
751 vmovdqa $D3,-0x70(%r11)
752 vmovdqa $D2,0x20(%rsp)
753 vpshufd \$0xEE,$D0,$D4
754 vmovdqu `16*5-64`($ctx),$D2
755 vpshufd \$0x44,$D0,$D0
756 vmovdqa $D4,-0x60(%r11)
757 vmovdqa $D0,0x30(%rsp)
758 vpshufd \$0xEE,$D1,$D3
759 vmovdqu `16*6-64`($ctx),$D0
760 vpshufd \$0x44,$D1,$D1
761 vmovdqa $D3,-0x50(%r11)
762 vmovdqa $D1,0x40(%rsp)
763 vpshufd \$0xEE,$D2,$D4
764 vmovdqu `16*7-64`($ctx),$D1
765 vpshufd \$0x44,$D2,$D2
766 vmovdqa $D4,-0x40(%r11)
767 vmovdqa $D2,0x50(%rsp)
768 vpshufd \$0xEE,$D0,$D3
769 vmovdqu `16*8-64`($ctx),$D2
770 vpshufd \$0x44,$D0,$D0
771 vmovdqa $D3,-0x30(%r11)
772 vmovdqa $D0,0x60(%rsp)
773 vpshufd \$0xEE,$D1,$D4
774 vpshufd \$0x44,$D1,$D1
775 vmovdqa $D4,-0x20(%r11)
776 vmovdqa $D1,0x70(%rsp)
777 vpshufd \$0xEE,$D2,$D3
778 vmovdqa 0x00(%rsp),$D4 # preload r0^2
779 vpshufd \$0x44,$D2,$D2
780 vmovdqa $D3,-0x10(%r11)
781 vmovdqa $D2,0x80(%rsp)
787 ################################################################
788 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
789 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
790 # \___________________/
791 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
792 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
793 # \___________________/ \____________________/
795 # Note that we start with inp[2:3]*r^2. This is because it
796 # doesn't depend on the reduction in the previous iteration.
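#
# Illustrative sketch, not generated code: the even/odd split above is
# Horner's rule evaluated in two independent streams, one per 64-bit
# lane,
#
#	he = (he + inp[2*i]  ) * r^2	# even-indexed blocks
#	ho = (ho + inp[2*i+1]) * r^2	# odd-indexed blocks
#
# with the final step using r^2 for the even stream and r for the odd
# one, after which the lanes are summed into a single hash value.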
797 ################################################################
798 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
799 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
800 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
801 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
802 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
804 # though note that $Tx and $Hx are "reversed" in this section,
805 # and $D4 is preloaded with r0^2...
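#
# Illustrative sketch of one vectorized limb product, not generated
# code: with h[] and r[] in base 2^26 and s[i] = 5*r[i],
#
#	d[0] = h[0]*r[0] + h[1]*s[4] + h[2]*s[3] + h[3]*s[2] + h[4]*s[1]
#	...
#	d[4] = h[0]*r[4] + h[1]*r[3] + h[2]*r[2] + h[3]*r[1] + h[4]*r[0]
#
# each 26x26-bit product fits a 64-bit lane with room for the five-way
# sums even when inputs are only partially reduced.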
807 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
808 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
809 vmovdqa $H2,0x20(%r11) # offload hash
810 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
811 vmovdqa 0x10(%rsp),$H2 # r1^2
812 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
813 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
815 vmovdqa $H0,0x00(%r11) #
816 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
817 vmovdqa $H1,0x10(%r11) #
818 vpmuludq $T3,$H2,$H1 # h3*r1
819 vpaddq $H0,$D0,$D0 # d0 += h4*s1
820 vpaddq $H1,$D4,$D4 # d4 += h3*r1
821 vmovdqa $H3,0x30(%r11) #
822 vpmuludq $T2,$H2,$H0 # h2*r1
823 vpmuludq $T1,$H2,$H1 # h1*r1
824 vpaddq $H0,$D3,$D3 # d3 += h2*r1
825 vmovdqa 0x30(%rsp),$H3 # r2^2
826 vpaddq $H1,$D2,$D2 # d2 += h1*r1
827 vmovdqa $H4,0x40(%r11) #
828 vpmuludq $T0,$H2,$H2 # h0*r1
829 vpmuludq $T2,$H3,$H0 # h2*r2
830 vpaddq $H2,$D1,$D1 # d1 += h0*r1
832 vmovdqa 0x40(%rsp),$H4 # s2^2
833 vpaddq $H0,$D4,$D4 # d4 += h2*r2
834 vpmuludq $T1,$H3,$H1 # h1*r2
835 vpmuludq $T0,$H3,$H3 # h0*r2
836 vpaddq $H1,$D3,$D3 # d3 += h1*r2
837 vmovdqa 0x50(%rsp),$H2 # r3^2
838 vpaddq $H3,$D2,$D2 # d2 += h0*r2
839 vpmuludq $T4,$H4,$H0 # h4*s2
840 vpmuludq $T3,$H4,$H4 # h3*s2
841 vpaddq $H0,$D1,$D1 # d1 += h4*s2
842 vmovdqa 0x60(%rsp),$H3 # s3^2
843 vpaddq $H4,$D0,$D0 # d0 += h3*s2
845 vmovdqa 0x80(%rsp),$H4 # s4^2
846 vpmuludq $T1,$H2,$H1 # h1*r3
847 vpmuludq $T0,$H2,$H2 # h0*r3
848 vpaddq $H1,$D4,$D4 # d4 += h1*r3
849 vpaddq $H2,$D3,$D3 # d3 += h0*r3
850 vpmuludq $T4,$H3,$H0 # h4*s3
851 vpmuludq $T3,$H3,$H1 # h3*s3
852 vpaddq $H0,$D2,$D2 # d2 += h4*s3
853 vmovdqu 16*0($inp),$H0 # load input
854 vpaddq $H1,$D1,$D1 # d1 += h3*s3
855 vpmuludq $T2,$H3,$H3 # h2*s3
856 vpmuludq $T2,$H4,$T2 # h2*s4
857 vpaddq $H3,$D0,$D0 # d0 += h2*s3
859 vmovdqu 16*1($inp),$H1 #
860 vpaddq $T2,$D1,$D1 # d1 += h2*s4
861 vpmuludq $T3,$H4,$T3 # h3*s4
862 vpmuludq $T4,$H4,$T4 # h4*s4
863 vpsrldq \$6,$H0,$H2 # splat input
864 vpaddq $T3,$D2,$D2 # d2 += h3*s4
865 vpaddq $T4,$D3,$D3 # d3 += h4*s4
866 vpsrldq \$6,$H1,$H3 #
867 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
868 vpmuludq $T1,$H4,$T0 # h1*s4
869 vpunpckhqdq $H1,$H0,$H4 # 4
870 vpaddq $T4,$D4,$D4 # d4 += h0*r4
871 vmovdqa -0x90(%r11),$T4 # r0^4
872 vpaddq $T0,$D0,$D0 # d0 += h1*s4
874 vpunpcklqdq $H1,$H0,$H0 # 0:1
875 vpunpcklqdq $H3,$H2,$H3 # 2:3
877 #vpsrlq \$40,$H4,$H4 # 4
878 vpsrldq \$`40/8`,$H4,$H4 # 4
880 vpand $MASK,$H0,$H0 # 0
882 vpand $MASK,$H1,$H1 # 1
883 vpand 0(%rcx),$H4,$H4 # .Lmask24
885 vpand $MASK,$H2,$H2 # 2
886 vpand $MASK,$H3,$H3 # 3
887 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
889 vpaddq 0x00(%r11),$H0,$H0 # add hash value
890 vpaddq 0x10(%r11),$H1,$H1
891 vpaddq 0x20(%r11),$H2,$H2
892 vpaddq 0x30(%r11),$H3,$H3
893 vpaddq 0x40(%r11),$H4,$H4
900 ################################################################
901 # Now we accumulate (inp[0:1]+hash)*r^4
902 ################################################################
903 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
904 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
905 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
906 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
907 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
909 vpmuludq $H0,$T4,$T0 # h0*r0
910 vpmuludq $H1,$T4,$T1 # h1*r0
913 vmovdqa -0x80(%r11),$T2 # r1^4
914 vpmuludq $H2,$T4,$T0 # h2*r0
915 vpmuludq $H3,$T4,$T1 # h3*r0
918 vpmuludq $H4,$T4,$T4 # h4*r0
919 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
922 vpaddq $T0,$D0,$D0 # d0 += h4*s1
923 vpmuludq $H2,$T2,$T1 # h2*r1
924 vpmuludq $H3,$T2,$T0 # h3*r1
925 vpaddq $T1,$D3,$D3 # d3 += h2*r1
926 vmovdqa -0x60(%r11),$T3 # r2^4
927 vpaddq $T0,$D4,$D4 # d4 += h3*r1
928 vpmuludq $H1,$T2,$T1 # h1*r1
929 vpmuludq $H0,$T2,$T2 # h0*r1
930 vpaddq $T1,$D2,$D2 # d2 += h1*r1
931 vpaddq $T2,$D1,$D1 # d1 += h0*r1
933 vmovdqa -0x50(%r11),$T4 # s2^4
934 vpmuludq $H2,$T3,$T0 # h2*r2
935 vpmuludq $H1,$T3,$T1 # h1*r2
936 vpaddq $T0,$D4,$D4 # d4 += h2*r2
937 vpaddq $T1,$D3,$D3 # d3 += h1*r2
938 vmovdqa -0x40(%r11),$T2 # r3^4
939 vpmuludq $H0,$T3,$T3 # h0*r2
940 vpmuludq $H4,$T4,$T0 # h4*s2
941 vpaddq $T3,$D2,$D2 # d2 += h0*r2
942 vpaddq $T0,$D1,$D1 # d1 += h4*s2
943 vmovdqa -0x30(%r11),$T3 # s3^4
944 vpmuludq $H3,$T4,$T4 # h3*s2
945 vpmuludq $H1,$T2,$T1 # h1*r3
946 vpaddq $T4,$D0,$D0 # d0 += h3*s2
948 vmovdqa -0x10(%r11),$T4 # s4^4
949 vpaddq $T1,$D4,$D4 # d4 += h1*r3
950 vpmuludq $H0,$T2,$T2 # h0*r3
951 vpmuludq $H4,$T3,$T0 # h4*s3
952 vpaddq $T2,$D3,$D3 # d3 += h0*r3
953 vpaddq $T0,$D2,$D2 # d2 += h4*s3
954 vmovdqu 16*2($inp),$T0 # load input
955 vpmuludq $H3,$T3,$T2 # h3*s3
956 vpmuludq $H2,$T3,$T3 # h2*s3
957 vpaddq $T2,$D1,$D1 # d1 += h3*s3
958 vmovdqu 16*3($inp),$T1 #
959 vpaddq $T3,$D0,$D0 # d0 += h2*s3
961 vpmuludq $H2,$T4,$H2 # h2*s4
962 vpmuludq $H3,$T4,$H3 # h3*s4
963 vpsrldq \$6,$T0,$T2 # splat input
964 vpaddq $H2,$D1,$D1 # d1 += h2*s4
965 vpmuludq $H4,$T4,$H4 # h4*s4
966 vpsrldq \$6,$T1,$T3 #
967 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
968 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
969 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
971 vpunpckhqdq $T1,$T0,$T4 # 4
972 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
973 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
975 vpunpcklqdq $T1,$T0,$T0 # 0:1
976 vpunpcklqdq $T3,$T2,$T3 # 2:3
978 #vpsrlq \$40,$T4,$T4 # 4
979 vpsrldq \$`40/8`,$T4,$T4 # 4
981 vmovdqa 0x00(%rsp),$D4 # preload r0^2
982 vpand $MASK,$T0,$T0 # 0
984 vpand $MASK,$T1,$T1 # 1
985 vpand 0(%rcx),$T4,$T4 # .Lmask24
987 vpand $MASK,$T2,$T2 # 2
988 vpand $MASK,$T3,$T3 # 3
989 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
991 ################################################################
992 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
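#
# Illustrative sketch of the carry chain, not generated code: each limb
# keeps its low 26 bits, and the h4 overflow re-enters at h0 scaled by
# 5, since 2^130 = 5 mod p:
#
#	c = h[i] >> 26; h[i] &= 0x3ffffff; h[i+1] += c		# i = 0..3
#	c = h[4] >> 26; h[4] &= 0x3ffffff; h[0]  += c*5	# wrap-around
#
# carries are not run to completion ("lazy"), so limbs may stay
# slightly above 2^26, which the product bounds above tolerate.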
997 vpaddq $D3,$H4,$H4 # h3 -> h4
1001 vpaddq $D0,$D1,$H1 # h0 -> h1
1008 vpaddq $D1,$H2,$H2 # h1 -> h2
1012 vpaddq $D0,$H0,$H0 # h4 -> h0
1016 vpaddq $D2,$H3,$H3 # h2 -> h3
1020 vpaddq $D0,$H1,$H1 # h0 -> h1
1024 vpaddq $D3,$H4,$H4 # h3 -> h4
1029 ################################################################
1030 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1032 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
1043 vmovdqa $H2,0x20(%r11)
1044 vmovdqa $H0,0x00(%r11)
1045 vmovdqa $H1,0x10(%r11)
1046 vmovdqa $H3,0x30(%r11)
1047 vmovdqa $H4,0x40(%r11)
1049 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1050 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1051 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1052 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1053 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1055 vpmuludq $T2,$D4,$D2 # d2 = h2*r0
1056 vpmuludq $T0,$D4,$D0 # d0 = h0*r0
1057 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
1058 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1059 vpmuludq $T3,$D4,$D3 # d3 = h3*r0
1060 vpmuludq $T4,$D4,$D4 # d4 = h4*r0
1062 vpmuludq $T3,$H2,$H0 # h3*r1
1063 vpaddq $H0,$D4,$D4 # d4 += h3*r1
1064 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
1065 vpmuludq $T2,$H2,$H1 # h2*r1
1066 vpaddq $H1,$D3,$D3 # d3 += h2*r1
1067 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1068 vpmuludq $T1,$H2,$H0 # h1*r1
1069 vpaddq $H0,$D2,$D2 # d2 += h1*r1
1070 vpmuludq $T0,$H2,$H2 # h0*r1
1071 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1072 vpmuludq $T4,$H3,$H3 # h4*s1
1073 vpaddq $H3,$D0,$D0 # d0 += h4*s1
1075 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
1076 vpmuludq $T2,$H4,$H1 # h2*r2
1077 vpaddq $H1,$D4,$D4 # d4 += h2*r2
1078 vpmuludq $T1,$H4,$H0 # h1*r2
1079 vpaddq $H0,$D3,$D3 # d3 += h1*r2
1080 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
1081 vpmuludq $T0,$H4,$H4 # h0*r2
1082 vpaddq $H4,$D2,$D2 # d2 += h0*r2
1083 vpmuludq $T4,$H2,$H1 # h4*s2
1084 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1085 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
1086 vpmuludq $T3,$H2,$H2 # h3*s2
1087 vpaddq $H2,$D0,$D0 # d0 += h3*s2
1089 vpmuludq $T1,$H3,$H0 # h1*r3
1090 vpaddq $H0,$D4,$D4 # d4 += h1*r3
1091 vpmuludq $T0,$H3,$H3 # h0*r3
1092 vpaddq $H3,$D3,$D3 # d3 += h0*r3
1093 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
1094 vpmuludq $T4,$H4,$H1 # h4*s3
1095 vpaddq $H1,$D2,$D2 # d2 += h4*s3
1096 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
1097 vpmuludq $T3,$H4,$H0 # h3*s3
1098 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1099 vpmuludq $T2,$H4,$H4 # h2*s3
1100 vpaddq $H4,$D0,$D0 # d0 += h2*s3
1102 vpmuludq $T0,$H2,$H2 # h0*r4
1103 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
1104 vpmuludq $T4,$H3,$H1 # h4*s4
1105 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
1106 vpmuludq $T3,$H3,$H0 # h3*s4
1107 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
1108 vpmuludq $T2,$H3,$H1 # h2*s4
1109 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1110 vpmuludq $T1,$H3,$H3 # h1*s4
1111 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
1115 vmovdqu 16*0($inp),$H0 # load input
1116 vmovdqu 16*1($inp),$H1
1118 vpsrldq \$6,$H0,$H2 # splat input
1120 vpunpckhqdq $H1,$H0,$H4 # 4
1121 vpunpcklqdq $H1,$H0,$H0 # 0:1
1122 vpunpcklqdq $H3,$H2,$H3 # 2:3
1124 vpsrlq \$40,$H4,$H4 # 4
1126 vpand $MASK,$H0,$H0 # 0
1128 vpand $MASK,$H1,$H1 # 1
1130 vpand $MASK,$H2,$H2 # 2
1131 vpand $MASK,$H3,$H3 # 3
1132 vpor 32(%rcx),$H4,$H4 # padbit, yes, always
1134 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
1135 vpaddq 0x00(%r11),$H0,$H0
1136 vpaddq 0x10(%r11),$H1,$H1
1137 vpaddq 0x20(%r11),$H2,$H2
1138 vpaddq 0x30(%r11),$H3,$H3
1139 vpaddq 0x40(%r11),$H4,$H4
1141 ################################################################
1142 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1144 vpmuludq $H0,$T4,$T0 # h0*r0
1145 vpaddq $T0,$D0,$D0 # d0 += h0*r0
1146 vpmuludq $H1,$T4,$T1 # h1*r0
1147 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1148 vpmuludq $H2,$T4,$T0 # h2*r0
1149 vpaddq $T0,$D2,$D2 # d2 += h2*r0
1150 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
1151 vpmuludq $H3,$T4,$T1 # h3*r0
1152 vpaddq $T1,$D3,$D3 # d3 += h3*r0
1153 vpmuludq $H4,$T4,$T4 # h4*r0
1154 vpaddq $T4,$D4,$D4 # d4 += h4*r0
1156 vpmuludq $H3,$T2,$T0 # h3*r1
1157 vpaddq $T0,$D4,$D4 # d4 += h3*r1
1158 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
1159 vpmuludq $H2,$T2,$T1 # h2*r1
1160 vpaddq $T1,$D3,$D3 # d3 += h2*r1
1161 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1162 vpmuludq $H1,$T2,$T0 # h1*r1
1163 vpaddq $T0,$D2,$D2 # d2 += h1*r1
1164 vpmuludq $H0,$T2,$T2 # h0*r1
1165 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1166 vpmuludq $H4,$T3,$T3 # h4*s1
1167 vpaddq $T3,$D0,$D0 # d0 += h4*s1
1169 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
1170 vpmuludq $H2,$T4,$T1 # h2*r2
1171 vpaddq $T1,$D4,$D4 # d4 += h2*r2
1172 vpmuludq $H1,$T4,$T0 # h1*r2
1173 vpaddq $T0,$D3,$D3 # d3 += h1*r2
1174 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
1175 vpmuludq $H0,$T4,$T4 # h0*r2
1176 vpaddq $T4,$D2,$D2 # d2 += h0*r2
1177 vpmuludq $H4,$T2,$T1 # h4*s2
1178 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1179 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
1180 vpmuludq $H3,$T2,$T2 # h3*s2
1181 vpaddq $T2,$D0,$D0 # d0 += h3*s2
1183 vpmuludq $H1,$T3,$T0 # h1*r3
1184 vpaddq $T0,$D4,$D4 # d4 += h1*r3
1185 vpmuludq $H0,$T3,$T3 # h0*r3
1186 vpaddq $T3,$D3,$D3 # d3 += h0*r3
1187 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
1188 vpmuludq $H4,$T4,$T1 # h4*s3
1189 vpaddq $T1,$D2,$D2 # d2 += h4*s3
1190 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
1191 vpmuludq $H3,$T4,$T0 # h3*s3
1192 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1193 vpmuludq $H2,$T4,$T4 # h2*s3
1194 vpaddq $T4,$D0,$D0 # d0 += h2*s3
1196 vpmuludq $H0,$T2,$T2 # h0*r4
1197 vpaddq $T2,$D4,$D4 # d4 += h0*r4
1198 vpmuludq $H4,$T3,$T1 # h4*s4
1199 vpaddq $T1,$D3,$D3 # d3 += h4*s4
1200 vpmuludq $H3,$T3,$T0 # h3*s4
1201 vpaddq $T0,$D2,$D2 # d2 += h3*s4
1202 vpmuludq $H2,$T3,$T1 # h2*s4
1203 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1204 vpmuludq $H1,$T3,$T3 # h1*s4
1205 vpaddq $T3,$D0,$D0 # d0 += h1*s4
1208 ################################################################
1209 # horizontal addition
1222 ################################################################
1227 vpaddq $H3,$D4,$D4 # h3 -> h4
1231 vpaddq $H0,$D1,$D1 # h0 -> h1
1238 vpaddq $H1,$D2,$D2 # h1 -> h2
1242 vpaddq $H4,$D0,$D0 # h4 -> h0
1246 vpaddq $H2,$D3,$D3 # h2 -> h3
1250 vpaddq $H0,$D1,$D1 # h0 -> h1
1254 vpaddq $H3,$D4,$D4 # h3 -> h4
1256 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
1257 vmovd $D1,`4*1-48-64`($ctx)
1258 vmovd $D2,`4*2-48-64`($ctx)
1259 vmovd $D3,`4*3-48-64`($ctx)
1260 vmovd $D4,`4*4-48-64`($ctx)
1262 $code.=<<___ if ($win64);
1263 vmovdqa 0x50(%r11),%xmm6
1264 vmovdqa 0x60(%r11),%xmm7
1265 vmovdqa 0x70(%r11),%xmm8
1266 vmovdqa 0x80(%r11),%xmm9
1267 vmovdqa 0x90(%r11),%xmm10
1268 vmovdqa 0xa0(%r11),%xmm11
1269 vmovdqa 0xb0(%r11),%xmm12
1270 vmovdqa 0xc0(%r11),%xmm13
1271 vmovdqa 0xd0(%r11),%xmm14
1272 vmovdqa 0xe0(%r11),%xmm15
1276 $code.=<<___ if (!$win64);
1282 .size poly1305_blocks_avx,.-poly1305_blocks_avx
1284 .type poly1305_emit_avx,\@function,3
1287 cmpl \$0,20($ctx) # is_base2_26?
1290 mov 0($ctx),%eax # load hash value base 2^26
1296 shl \$26,%rcx # base 2^26 -> base 2^64
1312 mov %r10,%rax # could be partially reduced, so reduce
1323 add \$5,%r8 # compare to modulus
1327 shr \$2,%r10 # did 130-bit value overflow?
1331 add 0($nonce),%rax # accumulate nonce
1333 mov %rax,0($mac) # write result
1337 .size poly1305_emit_avx,.-poly1305_emit_avx
1341 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1342 map("%ymm$_",(0..15));
1346 .type poly1305_blocks_avx2,\@function,4
1348 poly1305_blocks_avx2:
1349 mov 20($ctx),%r8d # is_base2_26
1375 mov $len,%r15 # reassign $len
1377 mov 0($ctx),$d1 # load hash value
1381 mov 24($ctx),$r0 # load r
1384 ################################# base 2^26 -> base 2^64
1386 and \$`-1*(1<<31)`,$d1
1387 mov $d2,$r1 # borrow $r1
1389 and \$`-1*(1<<31)`,$d2
1403 adc \$0,$h2 # can be partially reduced...
1405 mov \$-4,$d2 # ... so reduce
1418 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1420 .Lbase2_26_pre_avx2:
1421 add 0($inp),$h0 # accumulate input
1427 call __poly1305_block
1431 jnz .Lbase2_26_pre_avx2
1433 test $padbit,$padbit # if $padbit is zero,
1434 jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
1436 ################################# base 2^64 -> base 2^26
1443 and \$0x3ffffff,%rax # h[0]
1445 and \$0x3ffffff,%rdx # h[1]
1449 and \$0x3ffffff,$h0 # h[2]
1451 and \$0x3ffffff,$h1 # h[3]
1455 jz .Lstore_base2_26_avx2
1465 .Lstore_base2_64_avx2:
1468 mov $h2,16($ctx) # note that is_base2_26 is zeroed
1472 .Lstore_base2_26_avx2:
1473 mov %rax#d,0($ctx) # store hash value base 2^26
1488 .Lblocks_avx2_epilogue:
1499 .Lbase2_64_avx2_body:
1501 mov $len,%r15 # reassign $len
1503 mov 24($ctx),$r0 # load r
1506 mov 0($ctx),$h0 # load hash value
1513 add $r1,$s1 # s1 = r1 + (r1 >> 2)
1518 .Lbase2_64_pre_avx2:
1519 add 0($inp),$h0 # accumulate input
1525 call __poly1305_block
1529 jnz .Lbase2_64_pre_avx2
1532 ################################# base 2^64 -> base 2^26
1539 and \$0x3ffffff,%rax # h[0]
1541 and \$0x3ffffff,%rdx # h[1]
1545 and \$0x3ffffff,$h0 # h[2]
1547 and \$0x3ffffff,$h1 # h[3]
1555 movl \$1,20($ctx) # set is_base2_26
1557 call __poly1305_init_avx
1570 .Lbase2_64_avx2_epilogue:
1575 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
1576 vmovd 4*1($ctx),%x#$H1
1577 vmovd 4*2($ctx),%x#$H2
1578 vmovd 4*3($ctx),%x#$H3
1579 vmovd 4*4($ctx),%x#$H4
1583 $code.=<<___ if (!$win64);
1587 $code.=<<___ if ($win64);
1588 lea -0xf8(%rsp),%r11
1590 vmovdqa %xmm6,0x50(%r11)
1591 vmovdqa %xmm7,0x60(%r11)
1592 vmovdqa %xmm8,0x70(%r11)
1593 vmovdqa %xmm9,0x80(%r11)
1594 vmovdqa %xmm10,0x90(%r11)
1595 vmovdqa %xmm11,0xa0(%r11)
1596 vmovdqa %xmm12,0xb0(%r11)
1597 vmovdqa %xmm13,0xc0(%r11)
1598 vmovdqa %xmm14,0xd0(%r11)
1599 vmovdqa %xmm15,0xe0(%r11)
1603 lea 48+64($ctx),$ctx # size optimization
1604 lea .Lconst(%rip),%rcx
1606 # expand and copy pre-calculated table to stack
1607 vmovdqu `16*0-64`($ctx),%x#$T2
1609 vmovdqu `16*1-64`($ctx),%x#$T3
1610 vmovdqu `16*2-64`($ctx),%x#$T4
1611 vmovdqu `16*3-64`($ctx),%x#$D0
1612 vmovdqu `16*4-64`($ctx),%x#$D1
1613 vmovdqu `16*5-64`($ctx),%x#$D2
1614 vmovdqu `16*6-64`($ctx),%x#$D3
1615 vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
1616 vmovdqu `16*7-64`($ctx),%x#$D4
1617 vpermq \$0x15,$T3,$T3
1618 vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
1619 vmovdqu `16*8-64`($ctx),%x#$MASK
1620 vpermq \$0x15,$T4,$T4
1621 vpshufd \$0xc8,$T3,$T3
1622 vmovdqa $T2,0x00(%rsp)
1623 vpermq \$0x15,$D0,$D0
1624 vpshufd \$0xc8,$T4,$T4
1625 vmovdqa $T3,0x20(%rsp)
1626 vpermq \$0x15,$D1,$D1
1627 vpshufd \$0xc8,$D0,$D0
1628 vmovdqa $T4,0x40(%rsp)
1629 vpermq \$0x15,$D2,$D2
1630 vpshufd \$0xc8,$D1,$D1
1631 vmovdqa $D0,0x60(%rsp)
1632 vpermq \$0x15,$D3,$D3
1633 vpshufd \$0xc8,$D2,$D2
1634 vmovdqa $D1,0x80(%rsp)
1635 vpermq \$0x15,$D4,$D4
1636 vpshufd \$0xc8,$D3,$D3
1637 vmovdqa $D2,0xa0(%rsp)
1638 vpermq \$0x15,$MASK,$MASK
1639 vpshufd \$0xc8,$D4,$D4
1640 vmovdqa $D3,0xc0(%rsp)
1641 vpshufd \$0xc8,$MASK,$MASK
1642 vmovdqa $D4,0xe0(%rsp)
1643 vmovdqa $MASK,0x100(%rsp)
1644 vmovdqa 64(%rcx),$MASK # .Lmask26
1646 ################################################################
1648 vmovdqu 16*0($inp),%x#$T0
1649 vmovdqu 16*1($inp),%x#$T1
1650 vinserti128 \$1,16*2($inp),$T0,$T0
1651 vinserti128 \$1,16*3($inp),$T1,$T1
1654 vpsrldq \$6,$T0,$T2 # splat input
1656 vpunpckhqdq $T1,$T0,$T4 # 4
1657 vpunpcklqdq $T3,$T2,$T2 # 2:3
1658 vpunpcklqdq $T1,$T0,$T0 # 0:1
1663 vpsrlq \$40,$T4,$T4 # 4
1664 vpand $MASK,$T2,$T2 # 2
1665 vpand $MASK,$T0,$T0 # 0
1666 vpand $MASK,$T1,$T1 # 1
1667 vpand $MASK,$T3,$T3 # 3
1668 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1670 lea 0x90(%rsp),%rax # size optimization
1671 vpaddq $H2,$T2,$H2 # accumulate input
1678 ################################################################
1679 # ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4
1680 # ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3
1681 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
1682 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1683 # \________/\________/
1684 ################################################################
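#
# Illustrative sketch, not generated code: same Horner split as the
# AVX path, but four streams, one per 64-bit lane of a ymm register,
#
#	h[j] = (h[j] + inp[4*i+j]) * r^4	# j = 0..3
#
# with the final iteration multiplying lanes 0..3 by r^4, r^3, r^2
# and r^1 respectively before the horizontal summation.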
1685 #vpaddq $H2,$T2,$H2 # accumulate input
1687 vmovdqa `32*0`(%rsp),$T0 # r0^4
1689 vmovdqa `32*1`(%rsp),$T1 # r1^4
1691 vmovdqa `32*3`(%rsp),$T2 # r2^4
1693 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
1694 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
1696 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
1697 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
1698 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1699 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1700 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1702 # however, as h2 is "chronologically" the first one available, we pull
1703 # the corresponding operations up, so it's
1705 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
1706 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
1707 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
1708 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1709 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
1711 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1712 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1713 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1714 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1715 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1717 vpmuludq $H0,$T1,$T4 # h0*r1
1718 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
1719 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1720 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1721 vpmuludq $H3,$T1,$T4 # h3*r1
1722 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
1723 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1724 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1725 vmovdqa `32*4-0x90`(%rax),$T1 # s2
1727 vpmuludq $H0,$T0,$T4 # h0*r0
1728 vpmuludq $H1,$T0,$H2 # h1*r0
1729 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1730 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1731 vpmuludq $H3,$T0,$T4 # h3*r0
1732 vpmuludq $H4,$T0,$H2 # h4*r0
1733 vmovdqu 16*0($inp),%x#$T0 # load input
1734 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1735 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1736 vinserti128 \$1,16*2($inp),$T0,$T0
1738 vpmuludq $H3,$T1,$T4 # h3*s2
1739 vpmuludq $H4,$T1,$H2 # h4*s2
1740 vmovdqu 16*1($inp),%x#$T1
1741 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1742 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1743 vmovdqa `32*5-0x90`(%rax),$H2 # r3
1744 vpmuludq $H1,$T2,$T4 # h1*r2
1745 vpmuludq $H0,$T2,$T2 # h0*r2
1746 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1747 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1748 vinserti128 \$1,16*3($inp),$T1,$T1
1751 vpmuludq $H1,$H2,$T4 # h1*r3
1752 vpmuludq $H0,$H2,$H2 # h0*r3
1753 vpsrldq \$6,$T0,$T2 # splat input
1754 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1755 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1756 vpmuludq $H3,$T3,$T4 # h3*s3
1757 vpmuludq $H4,$T3,$H2 # h4*s3
1759 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1760 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1761 vpunpckhqdq $T1,$T0,$T4 # 4
1763 vpmuludq $H3,$S4,$H3 # h3*s4
1764 vpmuludq $H4,$S4,$H4 # h4*s4
1765 vpunpcklqdq $T1,$T0,$T0 # 0:1
1766 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1767 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1768 vpunpcklqdq $T3,$T2,$T3 # 2:3
1769 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
1770 vpmuludq $H1,$S4,$H0 # h1*s4
1771 vmovdqa 64(%rcx),$MASK # .Lmask26
1772 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1773 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1775 ################################################################
1776 # lazy reduction (interleaved with tail of input splat)
1780 vpaddq $D3,$H4,$H4 # h3 -> h4
1784 vpaddq $D0,$D1,$H1 # h0 -> h1
1793 vpaddq $D1,$H2,$H2 # h1 -> h2
1797 vpaddq $D4,$H0,$H0 # h4 -> h0
1799 vpand $MASK,$T2,$T2 # 2
1804 vpaddq $D2,$H3,$H3 # h2 -> h3
1806 vpaddq $T2,$H2,$H2 # modulo-scheduled
1811 vpaddq $D0,$H1,$H1 # h0 -> h1
1813 vpsrlq \$40,$T4,$T4 # 4
1817 vpaddq $D3,$H4,$H4 # h3 -> h4
1819 vpand $MASK,$T0,$T0 # 0
1820 vpand $MASK,$T1,$T1 # 1
1821 vpand $MASK,$T3,$T3 # 3
1822 vpor 32(%rcx),$T4,$T4 # padbit, yes, always
1829 ################################################################
1830 # while the above multiplications were by r^4 in all lanes, in the last
1831 # iteration we multiply the least significant lane by r^4 and the most
1832 # significant one by r, so this is a copy of the above except that
1833 # references to the precomputed table are displaced by 4...
1835 #vpaddq $H2,$T2,$H2 # accumulate input
1837 vmovdqu `32*0+4`(%rsp),$T0 # r0^4
1839 vmovdqu `32*1+4`(%rsp),$T1 # r1^4
1841 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
1843 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
1844 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
1846 vpmuludq $H2,$T0,$D2 # d2 = h2*r0
1847 vpmuludq $H2,$T1,$D3 # d3 = h2*r1
1848 vpmuludq $H2,$T2,$D4 # d4 = h2*r2
1849 vpmuludq $H2,$T3,$D0 # d0 = h2*s3
1850 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1852 vpmuludq $H0,$T1,$T4 # h0*r1
1853 vpmuludq $H1,$T1,$H2 # h1*r1
1854 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1855 vpaddq $H2,$D2,$D2 # d2 += h1*r1
1856 vpmuludq $H3,$T1,$T4 # h3*r1
1857 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
1858 vpaddq $T4,$D4,$D4 # d4 += h3*r1
1859 vpaddq $H2,$D0,$D0 # d0 += h4*s1
1861 vpmuludq $H0,$T0,$T4 # h0*r0
1862 vpmuludq $H1,$T0,$H2 # h1*r0
1863 vpaddq $T4,$D0,$D0 # d0 += h0*r0
1864 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
1865 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1866 vpmuludq $H3,$T0,$T4 # h3*r0
1867 vpmuludq $H4,$T0,$H2 # h4*r0
1868 vpaddq $T4,$D3,$D3 # d3 += h3*r0
1869 vpaddq $H2,$D4,$D4 # d4 += h4*r0
1871 vpmuludq $H3,$T1,$T4 # h3*s2
1872 vpmuludq $H4,$T1,$H2 # h4*s2
1873 vpaddq $T4,$D0,$D0 # d0 += h3*s2
1874 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1875 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
1876 vpmuludq $H1,$T2,$T4 # h1*r2
1877 vpmuludq $H0,$T2,$T2 # h0*r2
1878 vpaddq $T4,$D3,$D3 # d3 += h1*r2
1879 vpaddq $T2,$D2,$D2 # d2 += h0*r2
1881 vpmuludq $H1,$H2,$T4 # h1*r3
1882 vpmuludq $H0,$H2,$H2 # h0*r3
1883 vpaddq $T4,$D4,$D4 # d4 += h1*r3
1884 vpaddq $H2,$D3,$D3 # d3 += h0*r3
1885 vpmuludq $H3,$T3,$T4 # h3*s3
1886 vpmuludq $H4,$T3,$H2 # h4*s3
1887 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1888 vpaddq $H2,$D2,$D2 # d2 += h4*s3
1890 vpmuludq $H3,$S4,$H3 # h3*s4
1891 vpmuludq $H4,$S4,$H4 # h4*s4
1892 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
1893 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
1894 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
1895 vpmuludq $H1,$S4,$H0 # h1*s4
1896 vmovdqa 64(%rcx),$MASK # .Lmask26
1897 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
1898 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
1900 ################################################################
1901 # horizontal addition
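#
# Illustrative sketch, not generated code: each 256-bit accumulator
# holds four partial sums, folded into one 64-bit lane in two steps,
#
#	d += d shifted down by one qword within each 128-bit half
#	d += upper 128-bit half of d moved down (the vpermq below)
#
# leaving the full lane sum in the low qword of every accumulator.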
1914 vpermq \$0x2,$H3,$T3
1915 vpermq \$0x2,$H4,$T4
1916 vpermq \$0x2,$H0,$T0
1917 vpermq \$0x2,$D1,$T1
1918 vpermq \$0x2,$H2,$T2
1925 ################################################################
1930 vpaddq $D3,$H4,$H4 # h3 -> h4
1934 vpaddq $D0,$D1,$H1 # h0 -> h1
1941 vpaddq $D1,$H2,$H2 # h1 -> h2
1945 vpaddq $D4,$H0,$H0 # h4 -> h0
1949 vpaddq $D2,$H3,$H3 # h2 -> h3
1953 vpaddq $D0,$H1,$H1 # h0 -> h1
1957 vpaddq $D3,$H4,$H4 # h3 -> h4
1959 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
1960 vmovd %x#$H1,`4*1-48-64`($ctx)
1961 vmovd %x#$H2,`4*2-48-64`($ctx)
1962 vmovd %x#$H3,`4*3-48-64`($ctx)
1963 vmovd %x#$H4,`4*4-48-64`($ctx)
1965 $code.=<<___ if ($win64);
1966 vmovdqa 0x50(%r11),%xmm6
1967 vmovdqa 0x60(%r11),%xmm7
1968 vmovdqa 0x70(%r11),%xmm8
1969 vmovdqa 0x80(%r11),%xmm9
1970 vmovdqa 0x90(%r11),%xmm10
1971 vmovdqa 0xa0(%r11),%xmm11
1972 vmovdqa 0xb0(%r11),%xmm12
1973 vmovdqa 0xc0(%r11),%xmm13
1974 vmovdqa 0xd0(%r11),%xmm14
1975 vmovdqa 0xe0(%r11),%xmm15
1979 $code.=<<___ if (!$win64);
1985 .size poly1305_blocks_avx2,.-poly1305_blocks_avx2
1992 .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1994 .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
1996 .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1998 .long 5,0,5,0,5,0,5,0
2003 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2007 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2008 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2016 .extern __imp_RtlVirtualUnwind
2017 .type se_handler,\@abi-omnipotent
2031 mov 120($context),%rax # pull context->Rax
2032 mov 248($context),%rbx # pull context->Rip
2034 mov 8($disp),%rsi # disp->ImageBase
2035 mov 56($disp),%r11 # disp->HandlerData
2037 mov 0(%r11),%r10d # HandlerData[0]
2038 lea (%rsi,%r10),%r10 # prologue label
2039 cmp %r10,%rbx # context->Rip<.Lprologue
2040 jb .Lcommon_seh_tail
2042 mov 152($context),%rax # pull context->Rsp
2044 mov 4(%r11),%r10d # HandlerData[1]
2045 lea (%rsi,%r10),%r10 # epilogue label
2046 cmp %r10,%rbx # context->Rip>=.Lepilogue
2047 jae .Lcommon_seh_tail
2057 mov %rbx,144($context) # restore context->Rbx
2058 mov %rbp,160($context) # restore context->Rbp
2059 mov %r12,216($context) # restore context->R12
2060 mov %r13,224($context) # restore context->R13
2061 mov %r14,232($context) # restore context->R14
2062 mov %r15,240($context) # restore context->R15
2064 jmp .Lcommon_seh_tail
2065 .size se_handler,.-se_handler
2067 .type avx_handler,\@abi-omnipotent
2081 mov 120($context),%rax # pull context->Rax
2082 mov 248($context),%rbx # pull context->Rip
2084 mov 8($disp),%rsi # disp->ImageBase
2085 mov 56($disp),%r11 # disp->HandlerData
2087 mov 0(%r11),%r10d # HandlerData[0]
2088 lea (%rsi,%r10),%r10 # prologue label
2089 cmp %r10,%rbx # context->Rip<prologue label
2090 jb .Lcommon_seh_tail
2092 mov 152($context),%rax # pull context->Rsp
2094 mov 4(%r11),%r10d # HandlerData[1]
2095 lea (%rsi,%r10),%r10 # epilogue label
2096 cmp %r10,%rbx # context->Rip>=epilogue label
2097 jae .Lcommon_seh_tail
2099 mov 208($context),%rax # pull context->R11
2103 lea 512($context),%rdi # &context.Xmm6
2105 .long 0xa548f3fc # cld; rep movsq
2110 mov %rax,152($context) # restore context->Rsp
2111 mov %rsi,168($context) # restore context->Rsi
2112 mov %rdi,176($context) # restore context->Rdi
2114 mov 40($disp),%rdi # disp->ContextRecord
2115 mov $context,%rsi # context
2116 mov \$154,%ecx # sizeof(CONTEXT)
2117 .long 0xa548f3fc # cld; rep movsq
2120 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2121 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2122 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2123 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2124 mov 40(%rsi),%r10 # disp->ContextRecord
2125 lea 56(%rsi),%r11 # &disp->HandlerData
2126 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2127 mov %r10,32(%rsp) # arg5
2128 mov %r11,40(%rsp) # arg6
2129 mov %r12,48(%rsp) # arg7
2130 mov %rcx,56(%rsp) # arg8, (NULL)
2131 call *__imp_RtlVirtualUnwind(%rip)
2133 mov \$1,%eax # ExceptionContinueSearch
2145 .size avx_handler,.-avx_handler
2149 .rva .LSEH_begin_poly1305_init
2150 .rva .LSEH_end_poly1305_init
2151 .rva .LSEH_info_poly1305_init
2153 .rva .LSEH_begin_poly1305_blocks
2154 .rva .LSEH_end_poly1305_blocks
2155 .rva .LSEH_info_poly1305_blocks
2157 .rva .LSEH_begin_poly1305_emit
2158 .rva .LSEH_end_poly1305_emit
2159 .rva .LSEH_info_poly1305_emit
2161 $code.=<<___ if ($avx);
2162 .rva .LSEH_begin_poly1305_blocks_avx
2164 .rva .LSEH_info_poly1305_blocks_avx_1
2168 .rva .LSEH_info_poly1305_blocks_avx_2
2171 .rva .LSEH_end_poly1305_blocks_avx
2172 .rva .LSEH_info_poly1305_blocks_avx_3
2174 .rva .LSEH_begin_poly1305_emit_avx
2175 .rva .LSEH_end_poly1305_emit_avx
2176 .rva .LSEH_info_poly1305_emit_avx
2178 $code.=<<___ if ($avx>1);
2179 .rva .LSEH_begin_poly1305_blocks_avx2
2180 .rva .Lbase2_64_avx2
2181 .rva .LSEH_info_poly1305_blocks_avx2_1
2183 .rva .Lbase2_64_avx2
2185 .rva .LSEH_info_poly1305_blocks_avx2_2
2188 .rva .LSEH_end_poly1305_blocks_avx2
2189 .rva .LSEH_info_poly1305_blocks_avx2_3
2194 .LSEH_info_poly1305_init:
2197 .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
2199 .LSEH_info_poly1305_blocks:
2202 .rva .Lblocks_body,.Lblocks_epilogue
2204 .LSEH_info_poly1305_emit:
2207 .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
2209 $code.=<<___ if ($avx);
2210 .LSEH_info_poly1305_blocks_avx_1:
2213 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
2215 .LSEH_info_poly1305_blocks_avx_2:
2218 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
2220 .LSEH_info_poly1305_blocks_avx_3:
2223 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
2225 .LSEH_info_poly1305_emit_avx:
2228 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
2230 $code.=<<___ if ($avx>1);
2231 .LSEH_info_poly1305_blocks_avx2_1:
2234 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
2236 .LSEH_info_poly1305_blocks_avx2_2:
2239 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
2241 .LSEH_info_poly1305_blocks_avx2_3:
2244 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
2248 foreach (split('\n',$code)) {
2249 s/\`([^\`]*)\`/eval($1)/ge;
2250 s/%r([a-z]+)#d/%e$1/g;
2251 s/%r([0-9]+)#d/%r$1d/g;