x86_64: Add endbranch at function entries for Intel CET
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index 1dce5d61e3a081587245ff5cdd6fc0bf0ad5e2ff..e5b841260e815cb8c466b6fb0fea86bcde895c91 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1,7 +1,7 @@
 #! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
+# Licensed under the Apache License 2.0 (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 # in the file LICENSE in the source distribution or at
 # https://www.openssl.org/source/license.html
 #
 # Add AVX512F+VL+BW code path.
 #
+# November 2017
+#
+# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing. Trigger for modification was
+# observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance. Since we are likely to suppress
+# AVX512F capability flag [at least on Skylake-X], conversion serves
+# as kind of "investment protection". Note that next *lake processor,
+# Cannonlake, has AVX512IFMA code path to execute...
+#
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
 #
-#              IALU/gcc-4.8(*) AVX(**)         AVX2
+#              IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
 # P4           4.46/+120%      -
 # Core 2       2.41/+90%       -
 # Westmere     1.88/+120%      -
 # Sandy Bridge 1.39/+140%      1.10
 # Haswell      1.14/+175%      1.11            0.65
-# Skylake      1.13/+120%      0.96            0.51
+# Skylake[-X]  1.13/+120%      0.96            0.51    [0.35]
 # Silvermont   2.83/+95%       -
+# Knights L    3.60/?          1.65            1.10    0.41(***)
 # Goldmont     1.70/+180%      -
 # VIA Nano     1.82/+150%      -
 # Sledgehammer 1.38/+160%      -
 #      Core processors, 50-30%, less newer processor is, but slower on
 #      contemporary ones, for example almost 2x slower on Atom, and as
 #      former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***)        strangely enough performance seems to vary from core to core,
+#      listed result is best case;
 
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
@@ -77,11 +91,12 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
        $avx = ($1>=10) + ($1>=12);
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
 }
 
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
@@ -155,6 +170,7 @@ $code.=<<___;
 .type  poly1305_init,\@function,3
 .align 32
 poly1305_init:
+.cfi_startproc
        xor     %rax,%rax
        mov     %rax,0($ctx)            # initialize hash value
        mov     %rax,8($ctx)
@@ -206,6 +222,7 @@ $code.=<<___;
        mov     \$1,%eax
 .Lno_key:
        ret
+.cfi_endproc
 .size  poly1305_init,.-poly1305_init
 
 .type  poly1305_blocks,\@function,4
@@ -285,6 +302,7 @@ $code.=<<___;
 .type  poly1305_emit,\@function,3
 .align 32
 poly1305_emit:
+.cfi_startproc
 .Lemit:
        mov     0($ctx),%r8     # load hash value
        mov     8($ctx),%r9
@@ -295,7 +313,7 @@ poly1305_emit:
        mov     %r9,%rcx
        adc     \$0,%r9
        adc     \$0,%r10
-       shr     \$2,%r10        # did 130-bit value overfow?
+       shr     \$2,%r10        # did 130-bit value overflow?
        cmovnz  %r8,%rax
        cmovnz  %r9,%rcx
 
@@ -305,6 +323,7 @@ poly1305_emit:
        mov     %rcx,8($mac)
 
        ret
+.cfi_endproc
 .size  poly1305_emit,.-poly1305_emit
 ___
 if ($avx) {
@@ -329,15 +348,18 @@ $code.=<<___;
 .type  __poly1305_block,\@abi-omnipotent
 .align 32
 __poly1305_block:
+.cfi_startproc
 ___
        &poly1305_iteration();
 $code.=<<___;
        ret
+.cfi_endproc
 .size  __poly1305_block,.-__poly1305_block
 
 .type  __poly1305_init_avx,\@abi-omnipotent
 .align 32
 __poly1305_init_avx:
+.cfi_startproc
        mov     $r0,$h0
        mov     $r1,$h1
        xor     $h2,$h2
@@ -495,6 +517,7 @@ __poly1305_init_avx:
 
        lea     -48-64($ctx),$ctx       # size [de-]optimization
        ret
+.cfi_endproc
 .size  __poly1305_init_avx,.-__poly1305_init_avx
 
 .type  poly1305_blocks_avx,\@function,4
@@ -1360,6 +1383,7 @@ $code.=<<___;
 .type  poly1305_emit_avx,\@function,3
 .align 32
 poly1305_emit_avx:
+.cfi_startproc
        cmpl    \$0,20($ctx)    # is_base2_26?
        je      .Lemit
 
@@ -1400,7 +1424,7 @@ poly1305_emit_avx:
        mov     %r9,%rcx
        adc     \$0,%r9
        adc     \$0,%r10
-       shr     \$2,%r10        # did 130-bit value overfow?
+       shr     \$2,%r10        # did 130-bit value overflow?
        cmovnz  %r8,%rax
        cmovnz  %r9,%rcx
 
@@ -1410,6 +1434,7 @@ poly1305_emit_avx:
        mov     %rcx,8($mac)
 
        ret
+.cfi_endproc
 .size  poly1305_emit_avx,.-poly1305_emit_avx
 ___
 
@@ -1682,7 +1707,6 @@ poly1305_blocks_avx2:
 .Leven_avx2:
 .cfi_startproc
        mov             OPENSSL_ia32cap_P+8(%rip),%r10d
-       mov             \$`(1<<31|1<<30|1<<16)`,%r11d
        vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
        vmovd           4*1($ctx),%x#$H1
        vmovd           4*2($ctx),%x#$H2
@@ -1695,8 +1719,8 @@ $code.=<<___              if ($avx>2);
        cmp             \$512,$len
        jb              .Lskip_avx512
        and             %r11d,%r10d
-       cmp             %r11d,%r10d             # check for AVX512F+BW+VL
-       je              .Lblocks_avx512
+       test            \$`1<<16`,%r10d         # check for AVX512F
+       jnz             .Lblocks_avx512
 .Lskip_avx512:
 ___
 $code.=<<___   if (!$win64);
@@ -2106,10 +2130,14 @@ if ($avx>2) {
 # reason stack layout is kept identical to poly1305_blocks_avx2. If not
 # for this tail, we wouldn't have to even allocate stack frame...
 
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%ymm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%ymm$_",(25..29));
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
 my $PADBIT="%zmm30";
-my $GATHER="%ymm31";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));           # switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
 
 $code.=<<___;
 .type  poly1305_blocks_avx512,\@function,4
@@ -2117,7 +2145,8 @@ $code.=<<___;
 poly1305_blocks_avx512:
 .cfi_startproc
 .Lblocks_avx512:
-       vzeroupper
+       mov             \$15,%eax
+       kmovw           %eax,%k2
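+       # %k2 = 0b1111: the masked zmm stores of the key powers below keep
+       # only the low 256 bits (four 64-bit lanes)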
 ___
 $code.=<<___   if (!$win64);
        lea             -8(%rsp),%r11
@@ -2130,52 +2159,53 @@ $code.=<<___    if ($win64);
        vmovdqa         %xmm6,0x50(%r11)
        vmovdqa         %xmm7,0x60(%r11)
        vmovdqa         %xmm8,0x70(%r11)
-       vmovdqa32       %xmm9,0x80(%r11)
-       vmovdqa32       %xmm10,0x90(%r11)
-       vmovdqa32       %xmm11,0xa0(%r11)
-       vmovdqa32       %xmm12,0xb0(%r11)
-       vmovdqa32       %xmm13,0xc0(%r11)
-       vmovdqa32       %xmm14,0xd0(%r11)
-       vmovdqa32       %xmm15,0xe0(%r11)
+       vmovdqa         %xmm9,0x80(%r11)
+       vmovdqa         %xmm10,0x90(%r11)
+       vmovdqa         %xmm11,0xa0(%r11)
+       vmovdqa         %xmm12,0xb0(%r11)
+       vmovdqa         %xmm13,0xc0(%r11)
+       vmovdqa         %xmm14,0xd0(%r11)
+       vmovdqa         %xmm15,0xe0(%r11)
 .Ldo_avx512_body:
 ___
 $code.=<<___;
        lea             .Lconst(%rip),%rcx
        lea             48+64($ctx),$ctx        # size optimization
-       vmovdqa         96(%rcx),$T2            # .Lpermd_avx2
+       vmovdqa         96(%rcx),%y#$T2         # .Lpermd_avx2
 
        # expand pre-calculated table
-       vmovdqu32       `16*0-64`($ctx),%x#$R0
+       vmovdqu         `16*0-64`($ctx),%x#$D0  # will become expanded ${R0}
        and             \$-512,%rsp
-       vmovdqu32       `16*1-64`($ctx),%x#$R1
-       vmovdqu32       `16*2-64`($ctx),%x#$S1
-       vmovdqu32       `16*3-64`($ctx),%x#$R2
-       vmovdqu32       `16*4-64`($ctx),%x#$S2
-       vmovdqu32       `16*5-64`($ctx),%x#$R3
-       vmovdqu32       `16*6-64`($ctx),%x#$S3
-       vmovdqu32       `16*7-64`($ctx),%x#$R4
-       vmovdqu32       `16*8-64`($ctx),%x#$S4
-       vpermd          $R0,$T2,$R0             # 00003412 -> 14243444
-       vmovdqa64       64(%rcx),$MASK          # .Lmask26
-       vpermd          $R1,$T2,$R1
-       vpermd          $S1,$T2,$S1
-       vpermd          $R2,$T2,$R2
-       vmovdqa32       $R0,0x00(%rsp)          # save in case $len%128 != 0
+       vmovdqu         `16*1-64`($ctx),%x#$D1  # will become ... ${R1}
+       mov             \$0x20,%rax
+       vmovdqu         `16*2-64`($ctx),%x#$T0  # ... ${S1}
+       vmovdqu         `16*3-64`($ctx),%x#$D2  # ... ${R2}
+       vmovdqu         `16*4-64`($ctx),%x#$T1  # ... ${S2}
+       vmovdqu         `16*5-64`($ctx),%x#$D3  # ... ${R3}
+       vmovdqu         `16*6-64`($ctx),%x#$T3  # ... ${S3}
+       vmovdqu         `16*7-64`($ctx),%x#$D4  # ... ${R4}
+       vmovdqu         `16*8-64`($ctx),%x#$T4  # ... ${S4}
+       vpermd          $D0,$T2,$R0             # 00003412 -> 14243444
+       vpbroadcastq    64(%rcx),$MASK          # .Lmask26
+       vpermd          $D1,$T2,$R1
+       vpermd          $T0,$T2,$S1
+       vpermd          $D2,$T2,$R2
+       vmovdqa64       $R0,0x00(%rsp){%k2}     # save in case $len%128 != 0
         vpsrlq         \$32,$R0,$T0            # 14243444 -> 01020304
-       vpermd          $S2,$T2,$S2
-       vmovdqa32       $R1,0x20(%rsp)
+       vpermd          $T1,$T2,$S2
+       vmovdqu64       $R1,0x00(%rsp,%rax){%k2}
         vpsrlq         \$32,$R1,$T1
-       vpermd          $R3,$T2,$R3
-       vmovdqa32       $S1,0x40(%rsp)
-       vpermd          $S3,$T2,$S3
-       vpermd          $R4,$T2,$R4
-       vmovdqa32       $R2,0x60(%rsp)
-       vpermd          $S4,$T2,$S4
-       vmovdqa32       $S2,0x80(%rsp)
-       vmovdqa32       $R3,0xa0(%rsp)
-       vmovdqa32       $S3,0xc0(%rsp)
-       vmovdqa32       $R4,0xe0(%rsp)
-       vmovdqa32       $S4,0x100(%rsp)
+       vpermd          $D3,$T2,$R3
+       vmovdqa64       $S1,0x40(%rsp){%k2}
+       vpermd          $T3,$T2,$S3
+       vpermd          $D4,$T2,$R4
+       vmovdqu64       $R2,0x40(%rsp,%rax){%k2}
+       vpermd          $T4,$T2,$S4
+       vmovdqa64       $S2,0x80(%rsp){%k2}
+       vmovdqu64       $R3,0x80(%rsp,%rax){%k2}
+       vmovdqa64       $S3,0xc0(%rsp){%k2}
+       vmovdqu64       $R4,0xc0(%rsp,%rax){%k2}
+       vmovdqa64       $S4,0x100(%rsp){%k2}
 
        ################################################################
        # calculate 5th through 8th powers of the key
@@ -2279,14 +2309,6 @@ $code.=<<___;
        vpandq          $MASK,$D3,$D3
        vpaddq          $M3,$D4,$D4             # d3 -> d4
 
-___
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3));           # switch to %zmm domain
-map(s/%y/%z/,($M4,$M0,$M1,$M2,$M3));
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-$code.=<<___;
        ################################################################
        # at this point we have 14243444 in $R0-$S4 and 05060708 in
        # $D0-$D4, ...
@@ -2324,7 +2346,6 @@ $code.=<<___;
        vpaddd          $R3,$S3,$S3
        vpaddd          $R4,$S4,$S4
 
-       vpbroadcastq    %x#$MASK,$MASK
        vpbroadcastq    32(%rcx),$PADBIT        # .L129
 
        vpsrlq          \$52,$T0,$T2            # splat input
@@ -2342,7 +2363,7 @@ $code.=<<___;
        vpaddq          $H2,$T2,$H2             # accumulate input
        sub             \$192,$len
        jbe             .Ltail_avx512
-       #jmp            .Loop_avx512
+       jmp             .Loop_avx512
 
 .align 32
 .Loop_avx512:
@@ -2529,7 +2550,7 @@ $code.=<<___;
         vpaddq         $H3,$T3,$H3
         vpaddq         $H4,$T4,$H4
 
-         vmovdqu64     16*0($inp),%x#$T0
+         vmovdqu       16*0($inp),%x#$T0
        vpmuludq        $H0,$R3,$M3
        vpmuludq        $H0,$R4,$M4
        vpmuludq        $H0,$R0,$M0
@@ -2539,7 +2560,7 @@ $code.=<<___;
        vpaddq          $M0,$D0,$D0             # d0 += h0*r0
        vpaddq          $M1,$D1,$D1             # d1 += h0*r1
 
-         vmovdqu64     16*1($inp),%x#$T1
+         vmovdqu       16*1($inp),%x#$T1
        vpmuludq        $H1,$R2,$M3
        vpmuludq        $H1,$R3,$M4
        vpmuludq        $H1,$S4,$M0
@@ -2549,7 +2570,7 @@ $code.=<<___;
        vpaddq          $M0,$D0,$D0             # d0 += h1*s4
        vpaddq          $M2,$D2,$D2             # d2 += h0*r2
 
-         vinserti64x2  \$1,16*2($inp),$T0,$T0
+         vinserti128   \$1,16*2($inp),%y#$T0,%y#$T0
        vpmuludq        $H3,$R0,$M3
        vpmuludq        $H3,$R1,$M4
        vpmuludq        $H1,$R0,$M1
@@ -2559,7 +2580,7 @@ $code.=<<___;
        vpaddq          $M1,$D1,$D1             # d1 += h1*r0
        vpaddq          $M2,$D2,$D2             # d2 += h1*r1
 
-         vinserti64x2  \$1,16*3($inp),$T1,$T1
+         vinserti128   \$1,16*3($inp),%y#$T1,%y#$T1
        vpmuludq        $H4,$S4,$M3
        vpmuludq        $H4,$R0,$M4
        vpmuludq        $H3,$S2,$M0
@@ -2582,11 +2603,11 @@ $code.=<<___;
        # horizontal addition
 
        mov             \$1,%eax
-       vpsrldq         \$8,$H3,$D3
-       vpsrldq         \$8,$D4,$H4
-       vpsrldq         \$8,$H0,$D0
-       vpsrldq         \$8,$H1,$D1
-       vpsrldq         \$8,$H2,$D2
+       vpermq          \$0xb1,$H3,$D3
+       vpermq          \$0xb1,$D4,$H4
+       vpermq          \$0xb1,$H0,$D0
+       vpermq          \$0xb1,$H1,$D1
+       vpermq          \$0xb1,$H2,$D2
        vpaddq          $D3,$H3,$H3
        vpaddq          $D4,$H4,$H4
        vpaddq          $D0,$H0,$H0
@@ -2623,23 +2644,23 @@ $code.=<<___;
        # lazy reduction (interleaved with input splat)
 
        vpsrlq          \$26,$H3,$D3
-       vpandq          $MASK,$H3,$H3
+       vpand           $MASK,$H3,$H3
         vpsrldq        \$6,$T0,$T2             # splat input
         vpsrldq        \$6,$T1,$T3
         vpunpckhqdq    $T1,$T0,$T4             # 4
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
        vpsrlq          \$26,$H0,$D0
-       vpandq          $MASK,$H0,$H0
+       vpand           $MASK,$H0,$H0
         vpunpcklqdq    $T3,$T2,$T2             # 2:3
         vpunpcklqdq    $T1,$T0,$T0             # 0:1
        vpaddq          $D0,$H1,$H1             # h0 -> h1
 
        vpsrlq          \$26,$H4,$D4
-       vpandq          $MASK,$H4,$H4
+       vpand           $MASK,$H4,$H4
 
        vpsrlq          \$26,$H1,$D1
-       vpandq          $MASK,$H1,$H1
+       vpand           $MASK,$H1,$H1
         vpsrlq         \$30,$T2,$T3
         vpsrlq         \$4,$T2,$T2
        vpaddq          $D1,$H2,$H2             # h1 -> h2
@@ -2651,21 +2672,21 @@ $code.=<<___;
        vpaddq          $D4,$H0,$H0             # h4 -> h0
 
        vpsrlq          \$26,$H2,$D2
-       vpandq          $MASK,$H2,$H2
-        vpandq         $MASK,$T2,$T2           # 2
-        vpandq         $MASK,$T0,$T0           # 0
+       vpand           $MASK,$H2,$H2
+        vpand          $MASK,$T2,$T2           # 2
+        vpand          $MASK,$T0,$T0           # 0
        vpaddq          $D2,$H3,$H3             # h2 -> h3
 
        vpsrlq          \$26,$H0,$D0
-       vpandq          $MASK,$H0,$H0
+       vpand           $MASK,$H0,$H0
         vpaddq         $H2,$T2,$H2             # accumulate input for .Ltail_avx2
-        vpandq         $MASK,$T1,$T1           # 1
+        vpand          $MASK,$T1,$T1           # 1
        vpaddq          $D0,$H1,$H1             # h0 -> h1
 
        vpsrlq          \$26,$H3,$D3
-       vpandq          $MASK,$H3,$H3
-        vpandq         $MASK,$T3,$T3           # 3
-        vporq          $PADBIT,$T4,$T4         # padbit, yes, always
+       vpand           $MASK,$H3,$H3
+        vpand          $MASK,$T3,$T3           # 3
+        vpor           32(%rcx),$T4,$T4        # padbit, yes, always
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
        lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
@@ -2732,6 +2753,7 @@ $code.=<<___;
 .type  poly1305_init_base2_44,\@function,3
 .align 32
 poly1305_init_base2_44:
+.cfi_startproc
        xor     %rax,%rax
        mov     %rax,0($ctx)            # initialize hash value
        mov     %rax,8($ctx)
@@ -2773,6 +2795,7 @@ ___
 $code.=<<___;
        mov     \$1,%eax
        ret
+.cfi_endproc
 .size  poly1305_init_base2_44,.-poly1305_init_base2_44
 ___
 {
@@ -2784,6 +2807,8 @@ $code.=<<___;
 .type  poly1305_blocks_vpmadd52,\@function,4
 .align 32
 poly1305_blocks_vpmadd52:
+.cfi_startproc
+       endbranch
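+       # mark function entry as a valid indirect-branch target (Intel CET)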
        shr     \$4,$len
        jz      .Lno_data_vpmadd52              # too short
 
@@ -2890,6 +2915,7 @@ poly1305_blocks_vpmadd52:
 
 .Lno_data_vpmadd52:
        ret
+.cfi_endproc
 .size  poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
 ___
 }
@@ -2907,6 +2933,7 @@ $code.=<<___;
 .type  poly1305_blocks_vpmadd52_4x,\@function,4
 .align 32
 poly1305_blocks_vpmadd52_4x:
+.cfi_startproc
        shr     \$4,$len
        jz      .Lno_data_vpmadd52_4x           # too short
 
@@ -3331,6 +3358,7 @@ poly1305_blocks_vpmadd52_4x:
 
 .Lno_data_vpmadd52_4x:
        ret
+.cfi_endproc
 .size  poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
 ___
 }
@@ -3349,6 +3377,7 @@ $code.=<<___;
 .type  poly1305_blocks_vpmadd52_8x,\@function,4
 .align 32
 poly1305_blocks_vpmadd52_8x:
+.cfi_startproc
        shr     \$4,$len
        jz      .Lno_data_vpmadd52_8x           # too short
 
@@ -3704,6 +3733,7 @@ $code.=<<___;
 
 .Lno_data_vpmadd52_8x:
        ret
+.cfi_endproc
 .size  poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
 ___
 }
@@ -3711,6 +3741,8 @@ $code.=<<___;
 .type  poly1305_emit_base2_44,\@function,3
 .align 32
 poly1305_emit_base2_44:
+.cfi_startproc
+       endbranch
        mov     0($ctx),%r8     # load hash value
        mov     8($ctx),%r9
        mov     16($ctx),%r10
@@ -3731,7 +3763,7 @@ poly1305_emit_base2_44:
        mov     %r9,%rcx
        adc     \$0,%r9
        adc     \$0,%r10
-       shr     \$2,%r10        # did 130-bit value overfow?
+       shr     \$2,%r10        # did 130-bit value overflow?
        cmovnz  %r8,%rax
        cmovnz  %r9,%rcx
 
@@ -3741,6 +3773,7 @@ poly1305_emit_base2_44:
        mov     %rcx,8($mac)
 
        ret
+.cfi_endproc
 .size  poly1305_emit_base2_44,.-poly1305_emit_base2_44
 ___
 }      }       }
@@ -3778,12 +3811,119 @@ $code.=<<___;
 .quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 ___
 }
-
 $code.=<<___;
 .asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align 16
 ___
 
+{      # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+$code.=<<___;
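+# xor128_encrypt_n_pad(out, inp, otp, len): XOR |len| bytes of input with the
+# pad at |otp|, write the result to |out|, store the resulting ciphertext back
+# at |otp| zero-padded to a 16-byte boundary, and return the advanced |otp|
+# pointer in %rax.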
+.globl xor128_encrypt_n_pad
+.type  xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+.cfi_startproc
+       sub     $otp,$inp
+       sub     $otp,$out
+       mov     $len,%r10               # put len aside
+       shr     \$4,$len                # len / 16
+       jz      .Ltail_enc
+       nop
+.Loop_enc_xmm:
+       movdqu  ($inp,$otp),%xmm0
+       pxor    ($otp),%xmm0
+       movdqu  %xmm0,($out,$otp)
+       movdqa  %xmm0,($otp)
+       lea     16($otp),$otp
+       dec     $len
+       jnz     .Loop_enc_xmm
+
+       and     \$15,%r10               # len % 16
+       jz      .Ldone_enc
+
+.Ltail_enc:
+       mov     \$16,$len
+       sub     %r10,$len
+       xor     %eax,%eax
+.Loop_enc_byte:
+       mov     ($inp,$otp),%al
+       xor     ($otp),%al
+       mov     %al,($out,$otp)
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     %r10
+       jnz     .Loop_enc_byte
+
+       xor     %eax,%eax
+.Loop_enc_pad:
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     $len
+       jnz     .Loop_enc_pad
+
+.Ldone_enc:
+       mov     $otp,%rax
+       ret
+.cfi_endproc
+.size  xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
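+# xor128_decrypt_n_pad(out, inp, otp, len) is the decrypt counterpart: XOR the
+# ciphertext at |inp| with the pad at |otp|, write the plaintext to |out|, keep
+# the ciphertext at |otp| zero-padded to a 16-byte boundary, and return the
+# advanced |otp| pointer in %rax.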
+.globl xor128_decrypt_n_pad
+.type  xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+.cfi_startproc
+       sub     $otp,$inp
+       sub     $otp,$out
+       mov     $len,%r10               # put len aside
+       shr     \$4,$len                # len / 16
+       jz      .Ltail_dec
+       nop
+.Loop_dec_xmm:
+       movdqu  ($inp,$otp),%xmm0
+       movdqa  ($otp),%xmm1
+       pxor    %xmm0,%xmm1
+       movdqu  %xmm1,($out,$otp)
+       movdqa  %xmm0,($otp)
+       lea     16($otp),$otp
+       dec     $len
+       jnz     .Loop_dec_xmm
+
+       pxor    %xmm1,%xmm1
+       and     \$15,%r10               # len % 16
+       jz      .Ldone_dec
+
+.Ltail_dec:
+       mov     \$16,$len
+       sub     %r10,$len
+       xor     %eax,%eax
+       xor     %r11,%r11
+.Loop_dec_byte:
+       mov     ($inp,$otp),%r11b
+       mov     ($otp),%al
+       xor     %r11b,%al
+       mov     %al,($out,$otp)
+       mov     %r11b,($otp)
+       lea     1($otp),$otp
+       dec     %r10
+       jnz     .Loop_dec_byte
+
+       xor     %eax,%eax
+.Loop_dec_pad:
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     $len
+       jnz     .Loop_dec_pad
+
+.Ldone_dec:
+       mov     $otp,%rax
+       ret
+.cfi_endproc
+.size  xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
+
 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 #              CONTEXT *context,DISPATCHER_CONTEXT *disp)
 if ($win64) {
@@ -4044,4 +4184,4 @@ foreach (split('\n',$code)) {
 
        print $_,"\n";
 }
-close STDOUT;
+close STDOUT or die "error closing STDOUT";