crypto/poly1305: don't break carry chains.

author Andy Polyakov <appro@openssl.org>

Tue, 29 Mar 2016 08:02:45 +0000 (10:02 +0200)

committer Andy Polyakov <appro@openssl.org>

Mon, 4 Apr 2016 14:56:20 +0000 (16:56 +0200)
author Andy Polyakov <appro@openssl.org>
Tue, 29 Mar 2016 08:02:45 +0000 (10:02 +0200)
committer Andy Polyakov <appro@openssl.org>
Mon, 4 Apr 2016 14:56:20 +0000 (16:56 +0200)
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl

index 4c4b417fcdc82478151b000bc11fcafc663b139b..aa3f2280c6481e168ce36b4c57943b5a3b36cacf 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -10,10 +10,10 @@
  #                      IALU(*)/gcc-4.4         NEON
  #
  # ARM11xx(ARMv6)       7.78/+100%              -
-# Cortex-A5            6.30/+130%              2.96
+# Cortex-A5            6.35/+130%              2.96
  # Cortex-A8            6.25/+115%              2.36
  # Cortex-A9            5.10/+95%               2.55
-# Cortex-A15           3.79/+85%               1.25(**)
+# Cortex-A15           3.85/+85%               1.25(**)
  # Snapdragon S4                5.70/+100%              1.48(**)
  #
  # (*)  this is for -march=armv6, i.e. with bunch of ldrb loading data;
@@ -313,7 +313,8 @@ poly1305_blocks:
         adds    $h0,$h0,r1
         adcs    $h1,$h1,#0
         adcs    $h2,$h2,#0
-       adc     $h3,$h3,#0
+       adcs    $h3,$h3,#0
+       adc     $h4,$h4,#0
  
         cmp     r0,lr                   @ done yet?
         bhi     .Loop
@@ -735,9 +736,7 @@ poly1305_blocks_neon:
  .align 4
  .Leven:
         subs            $len,$len,#64
-# ifdef        __thumb2__
         it              lo
-# endif
         movlo           $in2,$zeros
  
         vmov.i32        $H4,#1<<24              @ padbit, yes, always
@@ -745,9 +744,7 @@ poly1305_blocks_neon:
         add             $inp,$inp,#64
         vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
         add             $in2,$in2,#64
-# ifdef        __thumb2__
         itt             hi
-# endif
         addhi           $tbl1,$ctx,#(48+1*9*4)
         addhi           $tbl0,$ctx,#(48+3*9*4)
  
@@ -817,9 +814,7 @@ poly1305_blocks_neon:
         vmull.u32       $D4,$H4#hi,${R0}[1]
         subs            $len,$len,#64
         vmlal.u32       $D0,$H4#hi,${S1}[1]
-# ifdef        __thumb2__
         it              lo
-# endif
         movlo           $in2,$zeros
         vmlal.u32       $D3,$H2#hi,${R1}[1]
         vld1.32         ${S4}[1],[$tbl1,:32]
@@ -946,9 +941,7 @@ poly1305_blocks_neon:
         add             $tbl1,$ctx,#(48+0*9*4)
         add             $tbl0,$ctx,#(48+1*9*4)
         adds            $len,$len,#32
-# ifdef        __thumb2__
         it              ne
-# endif
         movne           $len,#0
         bne             .Long_tail
  
@@ -990,14 +983,10 @@ poly1305_blocks_neon:
         vmlal.u32       $D2,$H0#hi,$R2
  
         vmlal.u32       $D3,$H0#hi,$R3
-# ifdef        __thumb2__
-       it              ne
-# endif
+        it             ne
          addne          $tbl1,$ctx,#(48+2*9*4)
         vmlal.u32       $D0,$H2#hi,$S3
-# ifdef        __thumb2__
-       it              ne
-# endif
+        it             ne
          addne          $tbl0,$ctx,#(48+3*9*4)
         vmlal.u32       $D4,$H1#hi,$R3
         vmlal.u32       $D1,$H3#hi,$S3
@@ -1138,7 +1127,8 @@ poly1305_emit_neon:
         adds    $h0,$h0,$g0
         adcs    $h1,$h1,#0
         adcs    $h2,$h2,#0
-       adc     $h3,$h3,#0
+       adcs    $h3,$h3,#0
+       adc     $h4,$h4,#0
  
         adds    $g0,$h0,#5              @ compare to modulus
         adcs    $g1,$h1,#0
@@ -1147,24 +1137,16 @@ poly1305_emit_neon:
         adc     $g4,$h4,#0
         tst     $g4,#4                  @ did it carry/borrow?
  
-# ifdef        __thumb2__
         it      ne
-# endif
         movne   $h0,$g0
         ldr     $g0,[$nonce,#0]
-# ifdef        __thumb2__
         it      ne
-# endif
         movne   $h1,$g1
         ldr     $g1,[$nonce,#4]
-# ifdef        __thumb2__
         it      ne
-# endif
         movne   $h2,$g2
         ldr     $g2,[$nonce,#8]
-# ifdef        __thumb2__
         it      ne
-# endif
         movne   $h3,$g3
         ldr     $g3,[$nonce,#12]
  
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl

index f1359fd44a85f2a86852ff9cfb5b66869f03af06..2e1dae3df238d157a23215ec7d88885cf105c11b 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -16,10 +16,10 @@
  #              IALU/gcc-4.9    NEON
  #
  # Apple A7     1.86/+5%        0.72
-# Cortex-A53   2.63/+58%       1.47
+# Cortex-A53   2.69/+58%       1.47
  # Cortex-A57   2.70/+7%        1.14
-# Denver       1.39/+50%       1.18(*)
-# X-Gene       2.00/+68%       2.19
+# Denver       1.64/+50%       1.18(*)
+# X-Gene       2.13/+68%       2.19
  #
  # (*)  estimate based on resources availability is less than 1.0,
  #      i.e. measured result is worse than expected, presumably binary
@@ -151,7 +151,8 @@ poly1305_blocks:
         and     $h2,$d2,#3
         add     $t0,$t0,$d2,lsr#2
         adds    $h0,$d0,$t0
-       adc     $h1,$d1,xzr
+       adcs    $h1,$d1,xzr
+       adc     $h2,$h2,xzr
  
         cbnz    $len,.Loop
  
@@ -235,7 +236,8 @@ poly1305_mult:
         and     $h2,$d2,#3
         add     $t0,$t0,$d2,lsr#2
         adds    $h0,$d0,$t0
-       adc     $h1,$d1,xzr
+       adcs    $h1,$d1,xzr
+       adc     $h2,$h2,xzr
  
         ret
  .size  poly1305_mult,.-poly1305_mult
@@ -310,7 +312,8 @@ poly1305_blocks_neon:
         and     $h2,$d2,#3
         add     $t0,$t0,$d2,lsr#2
         adds    $h0,$h0,$t0
-       adc     $h1,$h1,xzr
+       adcs    $h1,$h1,xzr
+       adc     $h2,$h2,xzr
  
  #ifdef __ARMEB__
         rev     $d0,$d0
@@ -870,7 +873,8 @@ poly1305_emit_neon:
         add     $d0,$d0,$h2,lsr#2
         and     $h2,$h2,#3
         adds    $h0,$h0,$d0
-       adc     $h1,$h1,xzr
+       adcs    $h1,$h1,xzr
+       adc     $h2,$h2,xzr
  
         adds    $d0,$h0,#5              // compare to modulus
         adcs    $d1,$h1,xzr
diff --git a/crypto/poly1305/asm/poly1305-c64xplus.pl b/crypto/poly1305/asm/poly1305-c64xplus.pl

index f750a6e5ebeeb168f5d07b61a3b08f699e6abb44..a7cf47d5f05b26e3d9307e7ba6ee5eed0daea75d 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-c64xplus.pl
+++ b/crypto/poly1305/asm/poly1305-c64xplus.pl
@@ -11,7 +11,7 @@
  #
  # October 2015
  #
-# Performance is [incredible for a 32-bit processor] 1.76 cycles per
+# Performance is [incredible for a 32-bit processor] 1.82 cycles per
  # processed byte. Comparison to compiler-generated code is problematic,
  # because results were observed to vary from 2.1 to 7.6 cpb depending
  # on compiler's ability to inline small functions. Compiler also
@@ -128,7 +128,7 @@ _poly1305_blocks:
  ||     SWAP2   $D1,$D1
  
         ADDU    $D0,B24,$D0:$H0         ; h0+=inp[0]
-||     ADD     $D0,B24,B31             ; B-copy of h0+inp[0]
+||     ADD     $D0,B24,B27             ; B-copy of h0+inp[0]
  ||     SWAP4   $D1,$D1
         ADDU    $D1,B25,$D1:$H1         ; h1+=inp[1]
  ||     MVK     3,$THREE
@@ -140,12 +140,12 @@ _poly1305_blocks:
  
  loop?:
         MPY32U  $H0,$R0,A17:A16
-||     MPY32U  B31,$R1,B17:B16         ; MPY32U        $H0,$R1,B17:B16
+||     MPY32U  B27,$R1,B17:B16         ; MPY32U        $H0,$R1,B17:B16
  ||     ADDU    $D0,$D1:$H1,B25:B24     ; ADDU          $D0,$D1:$H1,$D1:$H1
  ||     ADDU    $D2,B28,$D2:$H2         ; h2+=inp[2]
  ||     SWAP2   $D3,$D3
         MPY32U  $H0,$R2,A19:A18
-||     MPY32U  B31,$R3,B19:B18         ; MPY32U        $H0,$R3,B19:B18
+||     MPY32U  B27,$R3,B19:B18         ; MPY32U        $H0,$R3,B19:B18
  ||     ADD     $D0,$H1,A24             ; A-copy of B24
  ||     SWAP4   $D3,$D3
  || [A2]        SUB     A2,1,A2                 ; decrement loop counter
@@ -227,8 +227,8 @@ loop?:
  
         SHRU    $H4,2,B16               ; last reduction step
  ||     AND     $H4,$THREE,$H4
-|| [A2]        BNOP    loop?
         ADDAW   B16,B16,B16             ; 5*(h4>>2)
+|| [A2]        BNOP    loop?
  
         ADDU    B24,B16,B25:B24         ; B24 is h0
  || [A2]        SWAP2   $D2,$D2
@@ -236,8 +236,9 @@ loop?:
  || [A2]        SWAP4   $D2,$D2
         ADDU    B28,B27,B29:B28         ; B28 is h2
  || [A2]        ADDU    $D0,B24,$D0:$H0         ; h0+=inp[0]
-|| [A2]        ADD     $D0,B24,B31             ; B-copy of h0+inp[0]
-       ADD     B30,B29,B30             ; B30 is h3
+|| [A2]        ADD     $D0,B24,B27             ; B-copy of h0+inp[0]
+       ADDU    B30,B29,B31:B30         ; B30 is h3
+       ADD     B31,$H4,$H4
  || [A2]        ADDU    $D1,B26,$D1:$H1         ; h1+=inp[1]
  ;;===== branch to loop? is taken here
  
diff --git a/crypto/poly1305/asm/poly1305-ppc.pl b/crypto/poly1305/asm/poly1305-ppc.pl

index 46130c93276cd012652e4930338a4d80e0845a94..07da9d10b610d8b72e1ab7a511eabe5a207c0118 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-ppc.pl
+++ b/crypto/poly1305/asm/poly1305-ppc.pl
@@ -17,11 +17,10 @@
  #                      -m32            -m64
  #
  # Freescale e300       14.8/+80%       -
-# PPC74x0              7.40/+60%       -
-# PPC970               7.20/+114%      3.51/+205%
-# POWER6               3.96/+250%      2.02/+170%
-# POWER7               3.67/+260%      1.87/+100%
-# POWER8               -               2.13/+200%
+# PPC74x0              7.60/+60%       -
+# PPC970               7.00/+114%      3.51/+205%
+# POWER7               3.75/+260%      1.93/+100%
+# POWER8               -               2.03/+200%
  #
  # Do we need floating-point implementation for PPC? Results presented
  # in poly1305_ieee754.c are tricky to compare to, because they are for
@@ -212,6 +211,7 @@ $code.=<<___;
         add     $t0,$t0,$t1
         addc    $h0,$d0,$t0
         addze   $h1,$d1
+       addze   $h2,$h2
  
         bdnz    Loop
  
@@ -518,6 +518,7 @@ $code.=<<___;
         addze   $h1,$h1
         addze   $h2,$h2
         addze   $h3,$h3
+       addze   $h4,$h4
  
         bdnz    Loop
  
diff --git a/crypto/poly1305/asm/poly1305-ppcfp.pl b/crypto/poly1305/asm/poly1305-ppcfp.pl

index 061a55637792ff7575384a27c7055b1b024a807f..c8636a46edf08c228bf1098a59f26ab69e6a8afc 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-ppcfp.pl
+++ b/crypto/poly1305/asm/poly1305-ppcfp.pl
@@ -15,8 +15,8 @@
  # and improvement coefficients relative to gcc-generated code.
  #
  # Freescale e300       9.78/+30%
-# PPC74x0              7.08/+50%
-# PPC970               6.24/+80%
+# PPC74x0              6.92/+50%
+# PPC970               6.03/+80%
  # POWER7               3.50/+30%
  # POWER8               3.75/+10%
  
diff --git a/crypto/poly1305/asm/poly1305-s390x.pl b/crypto/poly1305/asm/poly1305-s390x.pl

index 49b3f79f1d5bc5d078019f940b1aae0e62cf22f2..141ba8d0bd503203e1d487f80c97914511328ca2 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-s390x.pl
+++ b/crypto/poly1305/asm/poly1305-s390x.pl
@@ -11,7 +11,7 @@
  #
  # June 2015
  #
-# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
+# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
  # code. For older compiler improvement coefficient is >3x, because
  # then base 2^64 and base 2^32 implementations are compared.
  #
@@ -138,11 +138,12 @@ poly1305_blocks:
         ngr     $h0,$h2
         srlg    $t0,$h2,2
         algr    $h0,$t0
+       lghi    $t1,3
+       ngr     $h2,$t1
  
         algr    $h0,$d0lo
-       lghi    $t1,3
         alcgr   $h1,$d1hi               # $d1hi is still zero
-       ngr     $h2,$t1
+       alcgr   $h2,$d1hi               # $d1hi is still zero
  
         brct$g  $len,.Loop
  
diff --git a/crypto/poly1305/asm/poly1305-sparcv9.pl b/crypto/poly1305/asm/poly1305-sparcv9.pl

index 5452887981d096f6a138bff1a37a8280caa3a838..497e27097dbd56862e6739a7e331e350b3b2911f 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-sparcv9.pl
+++ b/crypto/poly1305/asm/poly1305-sparcv9.pl
@@ -16,10 +16,10 @@
  #
  #                      IALU(*)         FMA
  #
-# UltraSPARC III       11.9(**)
-# SPARC T3             7.85
-# SPARC T4             1.67(***)       6.55
-# SPARC64 X            5.54            3.64
+# UltraSPARC III       12.3(**)
+# SPARC T3             7.92
+# SPARC T4             1.70(***)       6.55
+# SPARC64 X            5.60            3.64
  #
  # (*)  Comparison to compiler-generated code is really problematic,
  #      because latter's performance varies too much depending on too
@@ -251,8 +251,9 @@ poly1305_blocks:
         addcc   $t0,$d0,$h0
         addccc  %g0,$h1,$h1
         addccc  %g0,$h2,$h2
+       addccc  %g0,$h3,$h3
         brnz,pt $len,.Loop
-       addc    %g0,$h3,$h3
+       addc    %g0,$h4,$h4
  
         st      $h1,[$ctx+0]            ! store hash value
         st      $h0,[$ctx+4]
@@ -295,6 +296,7 @@ poly1305_blocks_vis3:
         neg     $shr,$shl
  
         srlx    $R1,2,$S1
+       b       .Loop_vis3
         add     $R1,$S1,$S1
  
  .Loop_vis3:
@@ -342,8 +344,9 @@ poly1305_blocks_vis3:
         add     $T1,$T0,$T0
  
         addcc   $T0,$D0,$H0
+       addxccc %g0,$D1,$H1
         brnz,pt $len,.Loop_vis3
-       addxc   %g0,$D1,$H1
+       addxc   %g0,$H2,$H2
  
         stx     $H0,[$ctx+0]            ! store hash value
         stx     $H1,[$ctx+8]
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl

index 01c3cbcda93f05aecc2a187b29ec3955b96fbd4c..97d0a81bea4377889cbf01dfe61ae8762a76960d 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -299,6 +299,7 @@ if ($sse2) {
         &adc    ("ebx",0);
         &adc    ("ecx",0);
         &adc    ("esi",0);
+       &adc    ("edi",0);
  
         &cmp    ("ebp",&wparam(2));             # done yet?
         &jne    (&label("loop"));
@@ -1166,11 +1167,12 @@ my $addr = shift;
         &shr    ("edi",2);
         &lea    ("ebp",&DWP(0,"edi","edi",4));  # *5
          &mov   ("edi",&wparam(1));             # output
-       add     ("eax","ebp");
+       &add    ("eax","ebp");
          &mov   ("ebp",&wparam(2));             # key
-       adc     ("ebx",0);
-       adc     ("ecx",0);
-       adc     ("edx",0);
+       &adc    ("ebx",0);
+       &adc    ("ecx",0);
+       &adc    ("edx",0);
+       &adc    ("esi",0);
  
         &movd   ($D0,"eax");                    # offload original hash value
         &add    ("eax",5);                      # compare to modulus
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl

index 8977d563a25166b5c3bfac9bb952703c40962cfd..7d676119a25ca0ef9fc0550c35af24a3594ba2fc 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -15,16 +15,16 @@
  # measured with rdtsc at fixed clock frequency.
  #
  #              IALU/gcc-4.8(*) AVX(**)         AVX2
-# P4           4.90/+120%      -
-# Core 2       2.39/+90%       -
-# Westmere     1.86/+120%      -
+# P4           4.46/+120%      -
+# Core 2       2.41/+90%       -
+# Westmere     1.88/+120%      -
  # Sandy Bridge 1.39/+140%      1.10
-# Haswell      1.10/+175%      1.11            0.65
-# Skylake      1.12/+120%      0.96            0.51
+# Haswell      1.14/+175%      1.11            0.65
+# Skylake      1.13/+120%      0.96            0.51
  # Silvermont   2.83/+95%       -
  # VIA Nano     1.82/+150%      -
  # Sledgehammer 1.38/+160%      -
-# Bulldozer    2.21/+130%      0.97
+# Bulldozer    2.30/+130%      0.97
  #
  # (*)  improvement coefficients relative to clang are more modest and
  #      are ~50% on most processors, in both cases we are comparing to
@@ -114,6 +114,7 @@ $code.=<<___;
         add     $d3,%rax
         add     %rax,$h0
         adc     \$0,$h1
+       adc     \$0,$h2
  ___
  }
  
@@ -184,8 +185,8 @@ $code.=<<___;
  .align 32
  poly1305_blocks:
  .Lblocks:
-       sub     \$16,$len               # too short?
-       jc      .Lno_data
+       shr     \$4,$len
+       jz      .Lno_data               # too short
  
         push    %rbx
         push    %rbp
@@ -220,8 +221,8 @@ ___
         &poly1305_iteration();
  $code.=<<___;
         mov     $r1,%rax
-       sub     \$16,%r15               # len-=16
-       jnc     .Loop
+       dec     %r15                    # len-=16
+       jnz     .Loop
  
         mov     $h0,0($ctx)             # store hash value
         mov     $h1,8($ctx)
@@ -521,6 +522,7 @@ poly1305_blocks_avx:
         add     $d2,$d1                 # =*5
         add     $d1,$h0
         adc     \$0,$h1
+       adc     \$0,$h2
  
         mov     $s1,$r1
         mov     $s1,%rax
@@ -1315,6 +1317,7 @@ poly1305_emit_avx:
         add     %rcx,%rax
         add     %rax,%r8
         adc     \$0,%r9
+       adc     \$0,%r10
  
         mov     %r8,%rax
         add     \$5,%r8         # compare to modulus
@@ -1407,6 +1410,7 @@ poly1305_blocks_avx2:
         add     $d2,$d1                 # =*5
         add     $d1,$h0
         adc     \$0,$h1
+       adc     \$0,$h2
  
         mov     $s1,$r1
         mov     $s1,%rax
diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c

index b500f2e7cb6e5228c7dff7459f9730fc54f98826..6bec8b30f885fd04c6afe3694590812efc76457c 100644 (file)
--- a/crypto/poly1305/poly1305.c
+++ b/crypto/poly1305/poly1305.c
@@ -207,7 +207,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
          c = (h2 >> 2) + (h2 & ~3UL);
          h2 &= 3;
          h0 += c;
-        h1 += (c = CONSTANT_TIME_CARRY(h0,c));   /* doesn't overflow */
+        h1 += (c = CONSTANT_TIME_CARRY(h0,c));
+        h2 += CONSTANT_TIME_CARRY(h1,c);
+        /*
+         * Occasional overflows to 3rd bit of h2 are taken care of
+         * "naturally". If after this point we end up at the top of
+         * this loop, then the overflow bit will be accounted for
+         * in next iteration. If we end up in poly1305_emit, then
+         * comparison to modulus below will still count as "carry
+         * into 131st bit", so that properly reduced value will be
+         * picked in conditional move.
+         */
  
          inp += POLY1305_BLOCK_SIZE;
          len -= POLY1305_BLOCK_SIZE;
@@ -231,12 +241,12 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
      h1 = st->h[1];
      h2 = st->h[2];
  
-    /* compute h + -p */
+    /* compare to modulus by computing h + -p */
      g0 = (u64)(t = (u128)h0 + 5);
      g1 = (u64)(t = (u128)h1 + (t >> 64));
      g2 = h2 + (u64)(t >> 64);
  
-    /* if there was carry into 130th bit, h1:h0 = g1:g0 */
+    /* if there was carry into 131st bit, h1:h0 = g1:g0 */
      mask = 0 - (g2 >> 2);
      g0 &= mask;
      g1 &= mask;
@@ -361,7 +371,17 @@ poly1305_blocks(void *ctx, const unsigned char *inp, size_t len, u32 padbit)
          h0 += c;
          h1 += (c = CONSTANT_TIME_CARRY(h0,c));
          h2 += (c = CONSTANT_TIME_CARRY(h1,c));
-        h3 += (c = CONSTANT_TIME_CARRY(h2,c));   /* doesn't overflow */
+        h3 += (c = CONSTANT_TIME_CARRY(h2,c));
+        h4 += CONSTANT_TIME_CARRY(h3,c);
+        /*
+         * Occasional overflows to 3rd bit of h4 are taken care of
+         * "naturally". If after this point we end up at the top of
+         * this loop, then the overflow bit will be accounted for
+         * in next iteration. If we end up in poly1305_emit, then
+         * comparison to modulus below will still count as "carry
+         * into 131st bit", so that properly reduced value will be
+         * picked in conditional move.
+         */
  
          inp += POLY1305_BLOCK_SIZE;
          len -= POLY1305_BLOCK_SIZE;
@@ -389,14 +409,14 @@ static void poly1305_emit(void *ctx, unsigned char mac[16],
      h3 = st->h[3];
      h4 = st->h[4];
  
-    /* compute h + -p */
+    /* compare to modulus by computing h + -p */
      g0 = (u32)(t = (u64)h0 + 5);
      g1 = (u32)(t = (u64)h1 + (t >> 32));
      g2 = (u32)(t = (u64)h2 + (t >> 32));
      g3 = (u32)(t = (u64)h3 + (t >> 32));
      g4 = h4 + (u32)(t >> 32);
  
-    /* if there was carry into 130th bit, h3:h0 = g3:g0 */
+    /* if there was carry into 131st bit, h3:h0 = g3:g0 */
      mask = 0 - (g4 >> 2);
      g0 &= mask;
      g1 &= mask;
@@ -728,6 +748,58 @@ static const struct poly1305_test poly1305_tests[] = {
       "99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
       "2637408fe13086ea73f971e3425e2820"
      },
+    /*
+     * test vectors from Hanno Böck
+     */
+    {
+     "cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc"
+     "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc"
+     "ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc"
+     "cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc"
+     "ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc"
+     "ccccfffffff50000000000000000000000000000000000000000000000000000"
+     "00ffffffe7000000000000000000000000000000000000000000000000000000"
+     "0000000000000000000000000000000000000000000000000000719205a8521d"
+     "fc",
+     "7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc",
+     "8559b876eceed66eb37798c0457baff9"
+    },
+    {
+     "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000"
+     "00000000800264",
+     "e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+     "00bd1258978e205444c9aaaa82006fed"
+    },
+    {
+     "02fc",
+     "0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c",
+     "06120c0c0c0c0c0c0c0c0c0c0c0c0c0c"
+    },
+    {
+     "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b"
+     "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+     "7b6e7b001300000000b300000000000000000000000000000000000000000000"
+     "f20000000000000000000000000000000000002000efff000900000000000000"
+     "0000000000100000000009000000640000000000000000000000001300000000"
+     "b300000000000000000000000000000000000000000000f20000000000000000"
+     "000000000000000000002000efff00090000000000000000007a000010000000"
+     "000900000064000000000000000000000000000000000000000000000000fc",
+     "00ff0000000000000000000000000000""00000000001e00000000000000007b7b",
+     "33205bbf9e9f8f7212ab9e2ab9b7e4a5"
+    },
+    {
+     "7777777777777777777777777777777777777777777777777777777777777777"
+     "7777777777777777777777777777777777777777777777777777777777777777"
+     "777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac"
+     "ec0100acacac2caca2acacacacacacacacacacac64f2",
+     "0000007f0000007f0100002000000000""0000cf77777777777777777777777777",
+     "02ee7c8c546ddeb1a467e4c3981158b9"
+    },
      /*
       * test vectors from Andrew Moon
       */
author	Andy Polyakov <appro@openssl.org>
	Tue, 29 Mar 2016 08:02:45 +0000 (10:02 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Mon, 4 Apr 2016 14:56:20 +0000 (16:56 +0200)
crypto/poly1305/asm/poly1305-armv4.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-armv8.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-c64xplus.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-ppc.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-ppcfp.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-s390x.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-sparcv9.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-x86.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-x86_64.pl		patch \| blob \| history
crypto/poly1305/poly1305.c		patch \| blob \| history