-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
-# P4 4.90/+120% -
-# Core 2 2.39/+90% -
-# Westmere 1.86/+120% -
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.10/+175% 1.11 0.65
-# Skylake 1.12/+120% 0.96 0.51
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake 1.13/+120% 0.96 0.51
# Silvermont 2.83/+95% -
+# Goldmont 1.70/+180% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
-# Bulldozer 2.21/+130% 0.97
+# Bulldozer 2.30/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
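+# note: quoting $^X, $xlate and $output keeps the piped open() working
+# when the perl binary or the build tree lives in a path with spaces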
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
add $d3,%rax
add %rax,$h0
adc \$0,$h1
+ adc \$0,$h2
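+					# the reduction carry can ripple past
+					# h1 into h2 (bits 128+ of the
+					# accumulator), so it has to be
+					# propagated here as well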
___
}
.extern OPENSSL_ia32cap_P
.globl poly1305_init
-.type poly1305_init,\@function,2
+.hidden poly1305_init
+.globl poly1305_blocks
+.hidden poly1305_blocks
+.globl poly1305_emit
+.hidden poly1305_emit
+
+.type poly1305_init,\@function,3
.align 32
poly1305_init:
xor %rax,%rax
and 8($inp),%rcx
mov %rax,24($ctx)
mov %rcx,32($ctx)
-
+___
+$code.=<<___ if ($flavour !~ /elf32/);
mov %r10,0(%rdx)
mov %r11,8(%rdx)
-
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
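+# %rdx is poly1305_init's third argument: a two-entry table that
+# receives pointers to the selected blocks/emit implementations
+# (hence the ",3" in .type above); the x32/elf32 path stores them
+# as 32-bit pointers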
+$code.=<<___;
mov \$1,%eax
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
-.globl poly1305_blocks
.type poly1305_blocks,\@function,4
.align 32
poly1305_blocks:
- sub \$16,$len # too short?
- jc .Lno_data
+.Lblocks:
+ shr \$4,$len
+ jz .Lno_data # too short
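+					# len now counts 16-byte blocks; the
+					# main loop consumes one per iteration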
push %rbx
push %rbp
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
- sub \$16,%r15 # len-=16
- jnc .Loop
+	dec	%r15			# decrement block count (len is in blocks now)
+ jnz .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
ret
.size poly1305_blocks,.-poly1305_blocks
-.globl poly1305_emit
.type poly1305_emit,\@function,3
.align 32
poly1305_emit:
+.Lemit:
mov 0($ctx),%r8 # load hash value
mov 8($ctx),%r9
mov 16($ctx),%r10
cmp \$128,$len
jae .Lblocks_avx
test %r8d,%r8d
- jz poly1305_blocks
+ jz .Lblocks
.Lblocks_avx:
and \$-16,$len
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
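+					# the backticks make perl evaluate the
+					# masks at script time, so every
+					# assembler flavour targeted by the
+					# xlate script sees a plain integer
+					# literal rather than a << expression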
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
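+					# 2^130 = 5 mod p, so bits 130+ of the
+					# base 2^26 value are folded back in
+					# multiplied by 5; as in the scalar
+					# loop the carry can reach h2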
mov $s1,$r1
mov $s1,%rax
vpaddq $T3,$D0,$D0 # d0 += h1*s4
.Lshort_tail_avx:
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
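+					# each register holds two 64-bit
+					# partial sums; shifting the high
+					# quadword down and adding collapses
+					# them into the low lane ahead of the
+					# lazy reduction below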
+
################################################################
# lazy reduction
vpand $MASK,$D3,$D3
vpaddq $H3,$D4,$D4 # h3 -> h4
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D2,$T2
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D4,$T4
- vpaddq $T2,$D2,$H2
- vpaddq $T0,$D0,$H0
- vpaddq $T1,$D1,$H1
- vpaddq $T3,$D3,$H3
- vpaddq $T4,$D4,$H4
-
- vmovd $H0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $H1,`4*1-48-64`($ctx)
- vmovd $H2,`4*2-48-64`($ctx)
- vmovd $H3,`4*3-48-64`($ctx)
- vmovd $H4,`4*4-48-64`($ctx)
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
vmovdqa 0x50(%r11),%xmm6
.align 32
poly1305_emit_avx:
cmpl \$0,20($ctx) # is_base2_26?
- je poly1305_emit
+ je .Lemit
mov 0($ctx),%eax # load hash value base 2^26
mov 4($ctx),%ecx
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
+ adc \$0,%r10
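+					# %r10 holds bits 128+ of the hash,
+					# so the carry out of %r9 belongs there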
mov %r8,%rax
add \$5,%r8 # compare to modulus
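+					# h+5 carries out of bit 130 exactly
+					# when h >= p = 2^130-5; that carry
+					# selects whether the reduced value
+					# is emitted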
cmp \$128,$len
jae .Lblocks_avx2
test %r8d,%r8d
- jz poly1305_blocks
+ jz .Lblocks
.Lblocks_avx2:
and \$-16,$len
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
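+					# the ymm registers hold four 64-bit
+					# partial sums: vpsrldq folded the odd
+					# lanes into the even ones, and vpermq
+					# with immediate 2 pulls the high
+					# 128-bit half's sum down so the total
+					# lands in lane 0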
+
################################################################
# lazy reduction
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H0,$T0
- vpsrldq \$8,$H1,$T1
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpaddq $T2,$H2,$H2
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
-
- vpermq \$0x2,$H2,$T2
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$H1,$T1
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpaddq $T2,$H2,$H2
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
-
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
-.long 1<<24,0,1<<24,0,1<<24,0,1<<24,0
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
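+					# 2^24 in the top 26-bit limb (which
+					# covers bits 104-129) is the pad bit
+					# at position 128, i.e. the 129th bit,
+					# hence the label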
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lfive: