-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
-# P4 4.90/+120% -
-# Core 2 2.39/+90% -
-# Westmere 1.86/+120% -
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.10/+175% 1.11 0.65
-# Skylake 1.12/+120% 0.96 0.51
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake 1.13/+120% 0.96 0.51
# Silvermont 2.83/+95% -
+# Goldmont 1.70/+180% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
-# Bulldozer 2.21/+130% 0.97
+# Bulldozer 2.30/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
$avx = ($2>=3.0) + ($2>3.0);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
add $d3,%rax
add %rax,$h0
adc \$0,$h1
+ adc \$0,$h2
___
}
.extern OPENSSL_ia32cap_P
.globl poly1305_init
+.hidden poly1305_init
.globl poly1305_blocks
+.hidden poly1305_blocks
.globl poly1305_emit
+.hidden poly1305_emit
+
.type poly1305_init,\@function,3
.align 32
poly1305_init:
and 8($inp),%rcx
mov %rax,24($ctx)
mov %rcx,32($ctx)
-
+___
+$code.=<<___ if ($flavour !~ /elf32/);
mov %r10,0(%rdx)
mov %r11,8(%rdx)
-
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
mov \$1,%eax
.Lno_key:
ret
.align 32
poly1305_blocks:
.Lblocks:
- sub \$16,$len # too short?
- jc .Lno_data
+ shr \$4,$len
+ jz .Lno_data # too short
push %rbx
push %rbp
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
- sub \$16,%r15 # len-=16
- jnc .Loop
+ dec %r15 # len-=16
+ jnz .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
+ adc \$0,%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
- and \$-1<<31,$d1
+ and \$`-1*(1<<31)`,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
- and \$-1<<31,$d2
+ and \$`-1*(1<<31)`,$d2
shr \$6,$d1
shl \$52,$r1
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
+ adc \$0,$h2
mov $s1,$r1
mov $s1,%rax
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
-.long 1<<24,0,1<<24,0,1<<24,0,1<<24,0
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lfive: