-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
-# byte, which is also >80% faster than integer-only code.
+# byte, which is also >80% faster than integer-only code. Cortex-A15
+# is even faster spending 5.6 cycles per byte outperforming integer-
+# only code by factor of 2.
# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
#include "arm_arch.h"
.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
- sub r3,pc,#8 @ sha1_block_data_order
+.Lsha1_block:
+ adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp @ preserve carry
+#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha1_block_data_order
+.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
- sub sp,sp,#64 @ alloca
+ sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
- bic sp,sp,#15 @ align for 128-bit stores
+ bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
- mov $Xfer,sp
+ mov sp,$Xfer @ alloca
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
+ it eq
moveq sp,$saved_sp
add $e,$e,$Ki
+ it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
+ itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+
+ # this fix-up provides Thumb encoding in conjunction with INST
+ $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;