# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
+ open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
} else {
- open STDOUT,">$output";
+ $output and open STDOUT,">$output";
}
($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ adr r12,.Lpoly1305_emit
+ adr r10,.Lpoly1305_emit_neon
+ itt ne
movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
movne r12,r10
+ orr r11,r11,#1 @ thumb-ify address
+ orr r12,r12,#1
# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
+ addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
ldrb r9,[$inp,#11]
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
#endif
___
} }
print $_,"\n";
}
-close STDOUT; # enforce flush
+close STDOUT or die "error closing STDOUT"; # enforce flush