# IALU(*)/gcc-4.4 NEON
#
# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.30/+130% 2.96
+# Cortex-A5 6.35/+130% 2.96
# Cortex-A8 6.25/+115% 2.36
# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.79/+85% 1.25(**)
+# Cortex-A15 3.85/+85% 1.25(**)
# Snapdragon S4 5.70/+100% 1.48(**)
#
# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
adds $h0,$h0,r1
adcs $h1,$h1,#0
adcs $h2,$h2,#0
- adc $h3,$h3,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
cmp r0,lr @ done yet?
bhi .Loop
.align 4
.Leven:
subs $len,$len,#64
-# ifdef __thumb2__
it lo
-# endif
movlo $in2,$zeros
vmov.i32 $H4,#1<<24 @ padbit, yes, always
add $inp,$inp,#64
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
-# ifdef __thumb2__
itt hi
-# endif
addhi $tbl1,$ctx,#(48+1*9*4)
addhi $tbl0,$ctx,#(48+3*9*4)
vmull.u32 $D4,$H4#hi,${R0}[1]
subs $len,$len,#64
vmlal.u32 $D0,$H4#hi,${S1}[1]
-# ifdef __thumb2__
it lo
-# endif
movlo $in2,$zeros
vmlal.u32 $D3,$H2#hi,${R1}[1]
vld1.32 ${S4}[1],[$tbl1,:32]
add $tbl1,$ctx,#(48+0*9*4)
add $tbl0,$ctx,#(48+1*9*4)
adds $len,$len,#32
-# ifdef __thumb2__
it ne
-# endif
movne $len,#0
bne .Long_tail
vmlal.u32 $D2,$H0#hi,$R2
vmlal.u32 $D3,$H0#hi,$R3
-# ifdef __thumb2__
- it ne
-# endif
+ it ne
addne $tbl1,$ctx,#(48+2*9*4)
vmlal.u32 $D0,$H2#hi,$S3
-# ifdef __thumb2__
- it ne
-# endif
+ it ne
addne $tbl0,$ctx,#(48+3*9*4)
vmlal.u32 $D4,$H1#hi,$R3
vmlal.u32 $D1,$H3#hi,$S3
adds $h0,$h0,$g0
adcs $h1,$h1,#0
adcs $h2,$h2,#0
- adc $h3,$h3,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
-# ifdef __thumb2__
it ne
-# endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
-# ifdef __thumb2__
it ne
-# endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
-# ifdef __thumb2__
it ne
-# endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
-# ifdef __thumb2__
it ne
-# endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.63/+58% 1.47
+# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
-# Denver 1.39/+50% 1.18(*)
-# X-Gene 2.00/+68% 2.19
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.19
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
- adc $h1,$d1,xzr
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
cbnz $len,.Loop
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
- adc $h1,$d1,xzr
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
ret
.size poly1305_mult,.-poly1305_mult
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$h0,$t0
- adc $h1,$h1,xzr
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
#ifdef __ARMEB__
rev $d0,$d0
add $d0,$d0,$h2,lsr#2
and $h2,$h2,#3
adds $h0,$h0,$d0
- adc $h1,$h1,xzr
+ adcs $h1,$h1,xzr
+ adc $h2,$h2,xzr
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
#
# October 2015
#
-# Performance is [incredible for a 32-bit processor] 1.76 cycles per
+# Performance is [incredible for a 32-bit processor] 1.82 cycles per
# processed byte. Comparison to compiler-generated code is problematic,
# because results were observed to vary from 2.1 to 7.6 cpb depending
# on compiler's ability to inline small functions. Compiler also
|| SWAP2 $D1,$D1
ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
-|| ADD $D0,B24,B31 ; B-copy of h0+inp[0]
+|| ADD $D0,B24,B27 ; B-copy of h0+inp[0]
|| SWAP4 $D1,$D1
ADDU $D1,B25,$D1:$H1 ; h1+=inp[1]
|| MVK 3,$THREE
loop?:
MPY32U $H0,$R0,A17:A16
-|| MPY32U B31,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
+|| MPY32U B27,$R1,B17:B16 ; MPY32U $H0,$R1,B17:B16
|| ADDU $D0,$D1:$H1,B25:B24 ; ADDU $D0,$D1:$H1,$D1:$H1
|| ADDU $D2,B28,$D2:$H2 ; h2+=inp[2]
|| SWAP2 $D3,$D3
MPY32U $H0,$R2,A19:A18
-|| MPY32U B31,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
+|| MPY32U B27,$R3,B19:B18 ; MPY32U $H0,$R3,B19:B18
|| ADD $D0,$H1,A24 ; A-copy of B24
|| SWAP4 $D3,$D3
|| [A2] SUB A2,1,A2 ; decrement loop counter
SHRU $H4,2,B16 ; last reduction step
|| AND $H4,$THREE,$H4
-|| [A2] BNOP loop?
ADDAW B16,B16,B16 ; 5*(h4>>2)
+|| [A2] BNOP loop?
ADDU B24,B16,B25:B24 ; B24 is h0
|| [A2] SWAP2 $D2,$D2
|| [A2] SWAP4 $D2,$D2
ADDU B28,B27,B29:B28 ; B28 is h2
|| [A2] ADDU $D0,B24,$D0:$H0 ; h0+=inp[0]
-|| [A2] ADD $D0,B24,B31 ; B-copy of h0+inp[0]
- ADD B30,B29,B30 ; B30 is h3
+|| [A2] ADD $D0,B24,B27 ; B-copy of h0+inp[0]
+ ADDU B30,B29,B31:B30 ; B30 is h3
+ ADD B31,$H4,$H4
|| [A2] ADDU $D1,B26,$D1:$H1 ; h1+=inp[1]
;;===== branch to loop? is taken here
# -m32 -m64
#
# Freescale e300 14.8/+80% -
-# PPC74x0 7.40/+60% -
-# PPC970 7.20/+114% 3.51/+205%
-# POWER6 3.96/+250% 2.02/+170%
-# POWER7 3.67/+260% 1.87/+100%
-# POWER8 - 2.13/+200%
+# PPC74x0 7.60/+60% -
+# PPC970 7.00/+114% 3.51/+205%
+# POWER7 3.75/+260% 1.93/+100%
+# POWER8 - 2.03/+200%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
add $t0,$t0,$t1
addc $h0,$d0,$t0
addze $h1,$d1
+ addze $h2,$h2
bdnz Loop
addze $h1,$h1
addze $h2,$h2
addze $h3,$h3
+ addze $h4,$h4
bdnz Loop
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300 9.78/+30%
-# PPC74x0 7.08/+50%
-# PPC970 6.24/+80%
+# PPC74x0 6.92/+50%
+# PPC970 6.03/+80%
# POWER7 3.50/+30%
# POWER8 3.75/+10%
#
# June 2015
#
-# ~6.4/2.2 cpb on z10/z196+, >2x improvement over compiler-generated
+# ~6.6/2.3 cpb on z10/z196+, >2x improvement over compiler-generated
# code. For older compiler improvement coefficient is >3x, because
# then base 2^64 and base 2^32 implementations are compared.
#
ngr $h0,$h2
srlg $t0,$h2,2
algr $h0,$t0
+ lghi $t1,3
+ ngr $h2,$t1
algr $h0,$d0lo
- lghi $t1,3
alcgr $h1,$d1hi # $d1hi is still zero
- ngr $h2,$t1
+ alcgr $h2,$d1hi # $d1hi is still zero
brct$g $len,.Loop
#
# IALU(*) FMA
#
-# UltraSPARC III 11.9(**)
-# SPARC T3 7.85
-# SPARC T4 1.67(***) 6.55
-# SPARC64 X 5.54 3.64
+# UltraSPARC III 12.3(**)
+# SPARC T3 7.92
+# SPARC T4 1.70(***) 6.55
+# SPARC64 X 5.60 3.64
#
# (*) Comparison to compiler-generated code is really problematic,
# because latter's performance varies too much depending on too
addcc $t0,$d0,$h0
addccc %g0,$h1,$h1
addccc %g0,$h2,$h2
+ addccc %g0,$h3,$h3
brnz,pt $len,.Loop
- addc %g0,$h3,$h3
+ addc %g0,$h4,$h4
st $h1,[$ctx+0] ! store hash value
st $h0,[$ctx+4]
neg $shr,$shl
srlx $R1,2,$S1
+ b .Loop_vis3
add $R1,$S1,$S1
.Loop_vis3:
add $T1,$T0,$T0
addcc $T0,$D0,$H0
+ addxccc %g0,$D1,$H1
brnz,pt $len,.Loop_vis3
- addxc %g0,$D1,$H1
+ addxc %g0,$H2,$H2
stx $H0,[$ctx+0] ! store hash value
stx $H1,[$ctx+8]
&adc ("ebx",0);
&adc ("ecx",0);
&adc ("esi",0);
+ &adc ("edi",0);
&cmp ("ebp",&wparam(2)); # done yet?
&jne (&label("loop"));
&shr ("edi",2);
&lea ("ebp",&DWP(0,"edi","edi",4)); # *5
&mov ("edi",&wparam(1)); # output
- add ("eax","ebp");
+ &add ("eax","ebp");
&mov ("ebp",&wparam(2)); # key
- adc ("ebx",0);
- adc ("ecx",0);
- adc ("edx",0);
+ &adc ("ebx",0);
+ &adc ("ecx",0);
+ &adc ("edx",0);
+ &adc ("esi",0);
&movd ($D0,"eax"); # offload original hash value
&add ("eax",5); # compare to modulus
# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
-# P4 4.90/+120% -
-# Core 2 2.39/+90% -
-# Westmere 1.86/+120% -
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.10/+175% 1.11 0.65
-# Skylake 1.12/+120% 0.96 0.51
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake 1.13/+120% 0.96 0.51
# Silvermont 2.83/+95% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
-# Bulldozer 2.21/+130% 0.97
+# Bulldozer 2.30/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
add $d3,%rax
add %rax,$h0
adc \$0,$h1
+ adc \$0,$h2
___
}
.align 32
poly1305_blocks:
.Lblocks:
- sub \$16,$len # too short?
- jc .Lno_data
+ shr \$4,$len
+ jz .Lno_data # too short
push %rbx
push %rbp
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
- sub \$16,%r15 # len-=16
- jnc .Loop
+ dec %r15 # len-=16
+ jnz .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
+ adc \$0,%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
c = (h2 >> 2) + (h2 & ~3UL);
h2 &= 3;
h0 += c;
- h1 += (c = CONSTANT_TIME_CARRY(h0,c)); /* doesn't overflow */
+ h1 += (c = CONSTANT_TIME_CARRY(h0,c));
+ h2 += CONSTANT_TIME_CARRY(h1,c);
+ /*
+ * Occasional overflows into the 3rd bit of h2 are taken care
+ * of "naturally". If after this point we end up at the top of
+ * this loop, then the overflow bit will be accounted for in
+ * the next iteration. If we end up in poly1305_emit, then the
+ * comparison to the modulus below will still count as a "carry
+ * into 131st bit", so that the properly reduced value will be
+ * picked in the conditional move.
+ */
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
h1 = st->h[1];
h2 = st->h[2];
- /* compute h + -p */
+ /* compare to modulus by computing h + -p */
g0 = (u64)(t = (u128)h0 + 5);
g1 = (u64)(t = (u128)h1 + (t >> 64));
g2 = h2 + (u64)(t >> 64);
- /* if there was carry into 130th bit, h1:h0 = g1:g0 */
+ /* if there was carry into 131st bit, h1:h0 = g1:g0 */
mask = 0 - (g2 >> 2);
g0 &= mask;
g1 &= mask;
h0 += c;
h1 += (c = CONSTANT_TIME_CARRY(h0,c));
h2 += (c = CONSTANT_TIME_CARRY(h1,c));
- h3 += (c = CONSTANT_TIME_CARRY(h2,c)); /* doesn't overflow */
+ h3 += (c = CONSTANT_TIME_CARRY(h2,c));
+ h4 += CONSTANT_TIME_CARRY(h3,c);
+ /*
+ * Occasional overflows into the 3rd bit of h4 are taken care
+ * of "naturally". If after this point we end up at the top of
+ * this loop, then the overflow bit will be accounted for in
+ * the next iteration. If we end up in poly1305_emit, then the
+ * comparison to the modulus below will still count as a "carry
+ * into 131st bit", so that the properly reduced value will be
+ * picked in the conditional move.
+ */
inp += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
h3 = st->h[3];
h4 = st->h[4];
- /* compute h + -p */
+ /* compare to modulus by computing h + -p */
g0 = (u32)(t = (u64)h0 + 5);
g1 = (u32)(t = (u64)h1 + (t >> 32));
g2 = (u32)(t = (u64)h2 + (t >> 32));
g3 = (u32)(t = (u64)h3 + (t >> 32));
g4 = h4 + (u32)(t >> 32);
- /* if there was carry into 130th bit, h3:h0 = g3:g0 */
+ /* if there was carry into 131st bit, h3:h0 = g3:g0 */
mask = 0 - (g4 >> 2);
g0 &= mask;
g1 &= mask;
"99e5822dd4173c995e3dae0ddefb9774""3fde3b080134b39f76e9bf8d0e88d546",
"2637408fe13086ea73f971e3425e2820"
},
+ /*
+ * test vectors from Hanno Böck
+ */
+ {
+ "cccccccccccccccccccccccccccccccccccccccccccccccccc80cccccccccccc"
+ "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccecccccc"
+ "ccccccccccccccccccccccccccccccc5cccccccccccccccccccccccccccccccc"
+ "cccccccccce3cccccccccccccccccccccccccccccccccccccccccccccccccccc"
+ "ccccccccaccccccccccccccccccccce6cccccccccc000000afcccccccccccccc"
+ "ccccfffffff50000000000000000000000000000000000000000000000000000"
+ "00ffffffe7000000000000000000000000000000000000000000000000000000"
+ "0000000000000000000000000000000000000000000000000000719205a8521d"
+ "fc",
+ "7f1b0264000000000000000000000000""0000000000000000cccccccccccccccc",
+ "8559b876eceed66eb37798c0457baff9"
+ },
+ {
+ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa0000000000"
+ "00000000800264",
+ "e0001600000000000000000000000000""0000aaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+ "00bd1258978e205444c9aaaa82006fed"
+ },
+ {
+ "02fc",
+ "0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c""0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c0c",
+ "06120c0c0c0c0c0c0c0c0c0c0c0c0c0c"
+ },
+ {
+ "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b6e7b007b7b7b7b7b7b7b7b7b"
+ "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7a7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b5c7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b7b"
+ "7b6e7b001300000000b300000000000000000000000000000000000000000000"
+ "f20000000000000000000000000000000000002000efff000900000000000000"
+ "0000000000100000000009000000640000000000000000000000001300000000"
+ "b300000000000000000000000000000000000000000000f20000000000000000"
+ "000000000000000000002000efff00090000000000000000007a000010000000"
+ "000900000064000000000000000000000000000000000000000000000000fc",
+ "00ff0000000000000000000000000000""00000000001e00000000000000007b7b",
+ "33205bbf9e9f8f7212ab9e2ab9b7e4a5"
+ },
+ {
+ "7777777777777777777777777777777777777777777777777777777777777777"
+ "7777777777777777777777777777777777777777777777777777777777777777"
+ "777777777777777777777777ffffffe9e9acacacacacacacacacacac0000acac"
+ "ec0100acacac2caca2acacacacacacacacacacac64f2",
+ "0000007f0000007f0100002000000000""0000cf77777777777777777777777777",
+ "02ee7c8c546ddeb1a467e4c3981158b9"
+ },
/*
* test vectors from Andrew Moon
*/