-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# IALU(*)/gcc-4.4 NEON
#
# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.30/+130% 2.96
+# Cortex-A5 6.35/+130% 3.00
# Cortex-A8 6.25/+115% 2.36
# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.79/+85% 1.25(**)
+# Cortex-A15 3.85/+85% 1.25(**)
# Snapdragon S4 5.70/+100% 1.48(**)
#
# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
# to improve Cortex-A9 result, but then A5/A7 lose more than 20%;
-$flavour = shift;
-if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
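+# For example (illustrative invocations only):
+#   "perl poly1305-armv4.pl linux32 poly1305-armv4.S" sets both variables,
+#   "perl poly1305-armv4.pl poly1305-armv4.S" leaves $flavour undefined.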
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
+ open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
} else {
- open STDOUT,">$output";
+ $output and open STDOUT,">$output";
}
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
+.text
+
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
and r5,r5,r3
#if __ARM_MAX_ARCH__>=7
- tst r12,#1 @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ adr r12,.Lpoly1305_emit
+ adr r10,.Lpoly1305_emit_neon
+ itt ne
movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
movne r12,r10
+ orr r11,r11,#1 @ thumb-ify address
+ orr r12,r12,#1
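+ @ (bit 0 set in the address marks a Thumb target for BX/BLX interworking)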
# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
+ addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
ldrb r9,[$inp,#11]
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
+.Lpoly1305_blocks:
stmdb sp!,{r3-r11,lr}
ands $len,$len,#-16
adds $h0,$h0,r1
adcs $h1,$h1,#0
adcs $h2,$h2,#0
- adc $h3,$h3,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
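+ @ (the adcs/adc above propagate the carry all the way into the top word)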
cmp r0,lr @ done yet?
bhi .Loop
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied
+ @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+ @ data we are looking at three times as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+ @ This means that result of reduction has to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+ @ because it allows using paddd in place of paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
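+ @
+ @ (For reference, the 57-, 58- and 59-bit counts above come from
+ @ rounding the number of 52-bit addends up to a power of two:
+ @ 5*5 < 2^5 gives 57 bits, (5*4+1)*3 < 2^6 gives 58 bits, and
+ @ 5*5*3 < 2^7 gives 59 bits.)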
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
cmp $len,#64
bhs .Lenter_neon
tst ip,ip @ is_base2_26?
- beq poly1305_blocks
+ beq .Lpoly1305_blocks
.Lenter_neon:
stmdb sp!,{r4-r7}
.align 4
.Leven:
subs $len,$len,#64
-# ifdef __thumb2__
it lo
-# endif
movlo $in2,$zeros
vmov.i32 $H4,#1<<24 @ padbit, yes, always
add $inp,$inp,#64
vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
add $in2,$in2,#64
-# ifdef __thumb2__
itt hi
-# endif
addhi $tbl1,$ctx,#(48+1*9*4)
addhi $tbl0,$ctx,#(48+3*9*4)
vmull.u32 $D4,$H4#hi,${R0}[1]
subs $len,$len,#64
vmlal.u32 $D0,$H4#hi,${S1}[1]
-# ifdef __thumb2__
it lo
-# endif
movlo $in2,$zeros
vmlal.u32 $D3,$H2#hi,${R1}[1]
vld1.32 ${S4}[1],[$tbl1,:32]
# endif
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
vshr.u64 $T0,$D3,#26
vmovn.i64 $D3#lo,$D3
vbic.i32 $H3,#0xfc000000
vshrn.u64 $T1#lo,$D2,#26
vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
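+ @ (64-bit add because the 32-bit sum can overflow; re-narrowed below)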
vsri.u32 $H2,$H1,#20
vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
vshl.u32 $H1,$H1,#6
vbic.i32 $D2#lo,#0xfc000000
vbic.i32 $H2,#0xfc000000
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
vsri.u32 $H1,$H0,#26
vbic.i32 $H0,#0xfc000000
vshr.u32 $T1#lo,$D3#lo,#26
vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
vbic.i32 $H1,#0xfc000000
add $tbl1,$ctx,#(48+0*9*4)
add $tbl0,$ctx,#(48+1*9*4)
adds $len,$len,#32
-# ifdef __thumb2__
it ne
-# endif
movne $len,#0
bne .Long_tail
vmlal.u32 $D2,$H0#hi,$R2
vmlal.u32 $D3,$H0#hi,$R3
-# ifdef __thumb2__
- it ne
-# endif
+ it ne
addne $tbl1,$ctx,#(48+2*9*4)
vmlal.u32 $D0,$H2#hi,$S3
-# ifdef __thumb2__
- it ne
-# endif
+ it ne
addne $tbl0,$ctx,#(48+3*9*4)
vmlal.u32 $D4,$H1#hi,$R3
vmlal.u32 $D1,$H3#hi,$S3
vmlal.u32 $D2,$H3#lo,$S4
.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
vadd.i64 $D1,$D1,$T0 @ h0 -> h1
vadd.i64 $D4,$D4,$T1 @ h3 -> h4
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
-
cmp $len,#0
bne .Leven
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
adds $h0,$h0,$g0
adcs $h1,$h1,#0
adcs $h2,$h2,#0
- adc $h3,$h3,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
adds $g0,$h0,#5 @ compare to modulus
adcs $g1,$h1,#0
adc $g4,$h4,#0
tst $g4,#4 @ did it carry/borrow?
-# ifdef __thumb2__
it ne
-# endif
movne $h0,$g0
ldr $g0,[$nonce,#0]
-# ifdef __thumb2__
it ne
-# endif
movne $h1,$g1
ldr $g1,[$nonce,#4]
-# ifdef __thumb2__
it ne
-# endif
movne $h2,$g2
ldr $g2,[$nonce,#8]
-# ifdef __thumb2__
it ne
-# endif
movne $h3,$g3
ldr $g3,[$nonce,#12]
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
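+@ note: under _WIN32 the word above holds the address of OPENSSL_armcap_P
+@ itself rather than a PC-relative offset, so poly1305_init loads the
+@ capability vector with ldr r12,[r12] instead of ldr r12,[r11,r12]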
#endif
___
} }
print $_,"\n";
}
-close STDOUT; # enforce flush
+close STDOUT or die "error closing STDOUT"; # enforce flush