cflags => "-isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common",
sys_id => "iOS",
},
+ "ios-cross" => {
+ inherit_from => [ "darwin-common", asm("armv4_asm") ],
+ # It should be possible to go below iOS 6 and even add -arch armv6,
+ # thus targeting pre-3GS iPhones, but that is assumed to be irrelevant
+ # at this point (and an SDK for it is no longer available to download).
+ cflags => "-arch armv7 -mios-version-min=6.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common",
+ sys_id => "iOS",
+ perlasm_scheme => "ios32",
+ },
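+ # A minimal usage sketch (the SDK location below is an assumption and
+ # depends on the local Xcode installation):
+ #   CROSS_TOP=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer \
+ #   CROSS_SDK=iPhoneOS.sdk ./Configure ios-cross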
"ios64-cross" => {
inherit_from => [ "darwin-common", asm("aarch64_asm") ],
cflags => "-arch arm64 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common",
$(PERL) alphacpuid.pl > $$preproc && \
$(CC) -E -P $$preproc > $@ && rm $$preproc)
arm64cpuid.S: arm64cpuid.pl; $(PERL) arm64cpuid.pl $(PERLASM_SCHEME) > $@
+armv4cpuid.S: armv4cpuid.pl; $(PERL) armv4cpuid.pl $(PERLASM_SCHEME) > $@
subdirs:
@target=all; $(RECURSIVE_MAKE)
# Profiler-assisted and platform-specific optimization resulted in a 16%
# improvement on the Cortex-A8 core and ~21.5 cycles per byte.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
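+# The first argument is either a perlasm flavour (e.g. "ios32") or the
+# output file name; with a flavour other than "void", output is piped
+# through arm-xlate.pl so the code can be adapted for the target assembler.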
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$s0="r0";
$s1="r1";
.code 32
#else
.syntax unified
-# ifdef __thumb2__
+# if defined(__thumb2__) && !defined(__APPLE__)
.thumb
# else
.code 32
adr r3,AES_encrypt
#endif
stmdb sp!,{r1,r4-r12,lr}
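+@ Apple's assembler appears not to accept symbol-difference expressions
+@ such as #AES_encrypt-AES_Te as immediates, so on __APPLE__ the table
+@ address is taken directly with adr instead.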
+#ifdef __APPLE__
+ adr $tbl,AES_Te
+#else
+ sub $tbl,r3,#AES_encrypt-AES_Te @ Te
+#endif
mov $rounds,r0 @ inp
mov $key,r2
- sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
- sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
-
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
+#ifdef __APPLE__
+ adr $tbl,AES_Te+1024 @ Te4
+#else
+ sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
+#endif
+
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
adr r3,AES_decrypt
#endif
stmdb sp!,{r1,r4-r12,lr}
+#ifdef __APPLE__
+ adr $tbl,AES_Td
+#else
+ sub $tbl,r3,#AES_decrypt-AES_Td @ Td
+#endif
mov $rounds,r0 @ inp
mov $key,r2
- sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
#
# <ard.biesheuvel@linaro.org>
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
my @XMM=map("q$_",(0..15));
.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
-#ifdef __thumb2__
+#if defined(__thumb2__) && !defined(__APPLE__)
.thumb
#else
.code 32
_bsaes_decrypt8:
adr $const,_bsaes_decrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0ISR
+#else
add $const,$const,#.LM0ISR-_bsaes_decrypt8
+#endif
vldmia $const!, {@XMM[8]} @ .LM0ISR
veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
_bsaes_encrypt8:
adr $const,_bsaes_encrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0SR
+#else
sub $const,$const,#_bsaes_encrypt8-.LM0SR
+#endif
vldmia $const!, {@XMM[8]} @ .LM0SR
_bsaes_encrypt8_alt:
_bsaes_key_convert:
adr $const,_bsaes_key_convert
vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
+#ifdef __APPLE__
+ adr $const,.LM0
+#else
sub $const,$const,#_bsaes_key_convert-.LM0
+#endif
vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
vmov.i8 @XMM[8], #0x01 @ bit masks
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
+#ifdef __APPLE__
+ mov $ctr, #.LREVM0SR-.LM0
+ add $ctr, $const, $ctr
+#else
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
+#endif
vldmia $keysched, {@XMM[4]} @ load round0 key
#else
ldr r12, [$key, #244]
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
+#ifdef __APPLE__
+ mov $const, #.LREVM0SR-.LSR
+ sub $const, $ctr, $const
+#else
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
+#endif
bl _bsaes_encrypt8_alt
rev r8, r8
#endif
sub sp, sp, #0x10
- vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
+ vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
- vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
+ vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
#include <string.h>
#include <setjmp.h>
#include <signal.h>
-#include <crypto.h>
+#include <openssl/crypto.h>
#include "arm_arch.h"
+++ /dev/null
-#include "arm_arch.h"
-
-.text
-.code 32
-
-.align 5
-.global OPENSSL_atomic_add
-.type OPENSSL_atomic_add,%function
-OPENSSL_atomic_add:
-#if __ARM_ARCH__>=6
-.Ladd: ldrex r2,[r0]
- add r3,r2,r1
- strex r2,r3,[r0]
- cmp r2,#0
- bne .Ladd
- mov r0,r3
- bx lr
-#else
- stmdb sp!,{r4-r6,lr}
- ldr r2,.Lspinlock
- adr r3,.Lspinlock
- mov r4,r0
- mov r5,r1
- add r6,r3,r2 @ &spinlock
- b .+8
-.Lspin: bl sched_yield
- mov r0,#-1
- swp r0,r0,[r6]
- cmp r0,#0
- bne .Lspin
-
- ldr r2,[r4]
- add r2,r2,r5
- str r2,[r4]
- str r0,[r6] @ release spinlock
- ldmia sp!,{r4-r6,lr}
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-#endif
-.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
-
-.global OPENSSL_cleanse
-.type OPENSSL_cleanse,%function
-OPENSSL_cleanse:
- eor ip,ip,ip
- cmp r1,#7
- subhs r1,r1,#4
- bhs .Lot
- cmp r1,#0
- beq .Lcleanse_done
-.Little:
- strb ip,[r0],#1
- subs r1,r1,#1
- bhi .Little
- b .Lcleanse_done
-
-.Lot: tst r0,#3
- beq .Laligned
- strb ip,[r0],#1
- sub r1,r1,#1
- b .Lot
-.Laligned:
- str ip,[r0],#4
- subs r1,r1,#4
- bhs .Laligned
- adds r1,r1,#4
- bne .Little
-.Lcleanse_done:
-#if __ARM_ARCH__>=5
- bx lr
-#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-#endif
-.size OPENSSL_cleanse,.-OPENSSL_cleanse
-
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.align 5
-.global _armv7_neon_probe
-.type _armv7_neon_probe,%function
-_armv7_neon_probe:
- vorr q0,q0,q0
- bx lr
-.size _armv7_neon_probe,.-_armv7_neon_probe
-
-.global _armv7_tick
-.type _armv7_tick,%function
-_armv7_tick:
- mrrc p15,1,r0,r1,c14 @ CNTVCT
- bx lr
-.size _armv7_tick,.-_armv7_tick
-
-.global _armv8_aes_probe
-.type _armv8_aes_probe,%function
-_armv8_aes_probe:
- .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
- bx lr
-.size _armv8_aes_probe,.-_armv8_aes_probe
-
-.global _armv8_sha1_probe
-.type _armv8_sha1_probe,%function
-_armv8_sha1_probe:
- .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
- bx lr
-.size _armv8_sha1_probe,.-_armv8_sha1_probe
-
-.global _armv8_sha256_probe
-.type _armv8_sha256_probe,%function
-_armv8_sha256_probe:
- .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
- bx lr
-.size _armv8_sha256_probe,.-_armv8_sha256_probe
-.global _armv8_pmull_probe
-.type _armv8_pmull_probe,%function
-_armv8_pmull_probe:
- .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
- bx lr
-.size _armv8_pmull_probe,.-_armv8_pmull_probe
-#endif
-
-.global OPENSSL_wipe_cpu
-.type OPENSSL_wipe_cpu,%function
-OPENSSL_wipe_cpu:
-#if __ARM_MAX_ARCH__>=7
- ldr r0,.LOPENSSL_armcap
- adr r1,.LOPENSSL_armcap
- ldr r0,[r1,r0]
-#endif
- eor r2,r2,r2
- eor r3,r3,r3
- eor ip,ip,ip
-#if __ARM_MAX_ARCH__>=7
- tst r0,#1
- beq .Lwipe_done
- veor q0, q0, q0
- veor q1, q1, q1
- veor q2, q2, q2
- veor q3, q3, q3
- veor q8, q8, q8
- veor q9, q9, q9
- veor q10, q10, q10
- veor q11, q11, q11
- veor q12, q12, q12
- veor q13, q13, q13
- veor q14, q14, q14
- veor q15, q15, q15
-.Lwipe_done:
-#endif
- mov r0,sp
-#if __ARM_ARCH__>=5
- bx lr
-#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-#endif
-.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
-
-.global OPENSSL_instrument_bus
-.type OPENSSL_instrument_bus,%function
-OPENSSL_instrument_bus:
- eor r0,r0,r0
-#if __ARM_ARCH__>=5
- bx lr
-#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-#endif
-.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
-
-.global OPENSSL_instrument_bus2
-.type OPENSSL_instrument_bus2,%function
-OPENSSL_instrument_bus2:
- eor r0,r0,r0
-#if __ARM_ARCH__>=5
- bx lr
-#else
- tst lr,#1
- moveq pc,lr
- .word 0xe12fff1e @ bx lr
-#endif
-.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
-
-.align 5
-#if __ARM_MAX_ARCH__>=7
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.LOPENSSL_armcap
-#endif
-#if __ARM_ARCH__>=6
-.align 5
-#else
-.Lspinlock:
-.word atomic_add_spinlock-.Lspinlock
-.align 5
-
-.data
-.align 2
-atomic_add_spinlock:
-.word 0
-#endif
-
-.comm OPENSSL_armcap_P,4,4
-.hidden OPENSSL_armcap_P
--- /dev/null
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+.code 32
+
+.align 5
+.global OPENSSL_atomic_add
+.type OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd: ldrex r2,[r0]
+ add r3,r2,r1
+ strex r2,r3,[r0]
+ cmp r2,#0
+ bne .Ladd
+ mov r0,r3
+ bx lr
+#else
+ stmdb sp!,{r4-r6,lr}
+ ldr r2,.Lspinlock
+ adr r3,.Lspinlock
+ mov r4,r0
+ mov r5,r1
+ add r6,r3,r2 @ &spinlock
+ b .+8
+.Lspin: bl sched_yield
+ mov r0,#-1
+ swp r0,r0,[r6]
+ cmp r0,#0
+ bne .Lspin
+
+ ldr r2,[r4]
+ add r2,r2,r5
+ str r2,[r4]
+ str r0,[r6] @ release spinlock
+ ldmia sp!,{r4-r6,lr}
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.global OPENSSL_cleanse
+.type OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+ eor ip,ip,ip
+ cmp r1,#7
+ subhs r1,r1,#4
+ bhs .Lot
+ cmp r1,#0
+ beq .Lcleanse_done
+.Little:
+ strb ip,[r0],#1
+ subs r1,r1,#1
+ bhi .Little
+ b .Lcleanse_done
+
+.Lot: tst r0,#3
+ beq .Laligned
+ strb ip,[r0],#1
+ sub r1,r1,#1
+ b .Lot
+.Laligned:
+ str ip,[r0],#4
+ subs r1,r1,#4
+ bhs .Laligned
+ adds r1,r1,#4
+ bne .Little
+.Lcleanse_done:
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_cleanse,.-OPENSSL_cleanse
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.align 5
+.global _armv7_neon_probe
+.type _armv7_neon_probe,%function
+_armv7_neon_probe:
+ vorr q0,q0,q0
+ bx lr
+.size _armv7_neon_probe,.-_armv7_neon_probe
+
+.global _armv7_tick
+.type _armv7_tick,%function
+_armv7_tick:
+#ifdef __APPLE__
+ mrrc p15,0,r0,r1,c14 @ CNTPCT
+#else
+ mrrc p15,1,r0,r1,c14 @ CNTVCT
+#endif
+ bx lr
+.size _armv7_tick,.-_armv7_tick
+
+.global _armv8_aes_probe
+.type _armv8_aes_probe,%function
+_armv8_aes_probe:
+ .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
+ bx lr
+.size _armv8_aes_probe,.-_armv8_aes_probe
+
+.global _armv8_sha1_probe
+.type _armv8_sha1_probe,%function
+_armv8_sha1_probe:
+ .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
+ bx lr
+.size _armv8_sha1_probe,.-_armv8_sha1_probe
+
+.global _armv8_sha256_probe
+.type _armv8_sha256_probe,%function
+_armv8_sha256_probe:
+ .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
+ bx lr
+.size _armv8_sha256_probe,.-_armv8_sha256_probe
+.global _armv8_pmull_probe
+.type _armv8_pmull_probe,%function
+_armv8_pmull_probe:
+ .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
+ bx lr
+.size _armv8_pmull_probe,.-_armv8_pmull_probe
+#endif
+
+.global OPENSSL_wipe_cpu
+.type OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+#if __ARM_MAX_ARCH__>=7
+ ldr r0,.LOPENSSL_armcap
+ adr r1,.LOPENSSL_armcap
+ ldr r0,[r1,r0]
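+@ On __APPLE__ the .LOPENSSL_armcap reference resolves to a pointer to
+@ OPENSSL_armcap_P rather than to the variable itself (presumably a
+@ Mach-O non-lazy symbol pointer), hence the extra load below.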
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+#endif
+ eor r2,r2,r2
+ eor r3,r3,r3
+ eor ip,ip,ip
+#if __ARM_MAX_ARCH__>=7
+ tst r0,#1
+ beq .Lwipe_done
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+ veor q3, q3, q3
+ veor q8, q8, q8
+ veor q9, q9, q9
+ veor q10, q10, q10
+ veor q11, q11, q11
+ veor q12, q12, q12
+ veor q13, q13, q13
+ veor q14, q14, q14
+ veor q15, q15, q15
+.Lwipe_done:
+#endif
+ mov r0,sp
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global OPENSSL_instrument_bus
+.type OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+ eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global OPENSSL_instrument_bus2
+.type OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+ eor r0,r0,r0
+#if __ARM_ARCH__>=5
+ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr
+ .word 0xe12fff1e @ bx lr
+#endif
+.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align 5
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.
+#endif
+#if __ARM_ARCH__>=6
+.align 5
+#else
+.Lspinlock:
+.word atomic_add_spinlock-.Lspinlock
+.align 5
+
+.data
+.align 2
+atomic_add_spinlock:
+.word 0
+#endif
+
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+___
+
+print $code;
+close STDOUT;
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$code=<<___;
#include "arm_arch.h"
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
- vmov.32 $a, r2, r1
- vmov.32 $b, r12, r3
+ vmov $a, r2, r1
+ vmov $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-bn_mul_mont
+.word OPENSSL_armcap_P-.Lbn_mul_mont
#endif
.global bn_mul_mont
.align 5
bn_mul_mont:
+.Lbn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
# operation. Keep in mind that +200% means 3x improvement.
$flavour = shift;
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-die "can't locate arm-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$code.=<<___;
#include "arm_arch.h"
.align 4
ecp_nistz256_mul_by_2:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_mul_by_2
+ bl __ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
-.type _ecp_nistz256_mul_by_2,%function
+.type __ecp_nistz256_mul_by_2,%function
.align 4
-_ecp_nistz256_mul_by_2:
+__ecp_nistz256_mul_by_2:
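+@ Renamed with a double underscore: on Mach-O the exported
+@ ecp_nistz256_mul_by_2 becomes _ecp_nistz256_mul_by_2, so a local helper
+@ named _ecp_nistz256_mul_by_2 would clash with it; likewise for the
+@ other helpers below.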
ldr $a0,[$a_ptr,#0]
ldr $a1,[$a_ptr,#4]
ldr $a2,[$a_ptr,#8]
movcs $ff,#-1 @ $ff = carry ? -1 : 0
b .Lreduce_by_sub
-.size _ecp_nistz256_mul_by_2,.-_ecp_nistz256_mul_by_2
+.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@ const BN_ULONG r2[8]);
.align 4
ecp_nistz256_add:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_add
+ bl __ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_add,.-ecp_nistz256_add
-.type _ecp_nistz256_add,%function
+.type __ecp_nistz256_add,%function
.align 4
-_ecp_nistz256_add:
+__ecp_nistz256_add:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
str $a7,[$r_ptr,#28]
mov pc,lr
-.size _ecp_nistz256_add,.-_ecp_nistz256_add
+.size __ecp_nistz256_add,.-__ecp_nistz256_add
@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_mul_by_3
.align 4
ecp_nistz256_mul_by_3:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_mul_by_3
+ bl __ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
-.type _ecp_nistz256_mul_by_3,%function
+.type __ecp_nistz256_mul_by_3,%function
.align 4
-_ecp_nistz256_mul_by_3:
+__ecp_nistz256_mul_by_3:
str lr,[sp,#-4]! @ push lr
@ As multiplication by 3 is performed as 2*n+n, below are inline
- @ copies of _ecp_nistz256_mul_by_2 and _ecp_nistz256_add, see
+ @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
@ corresponding subroutines for details.
ldr $a0,[$a_ptr,#0]
.align 4
ecp_nistz256_div_by_2:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_div_by_2
+ bl __ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
-.type _ecp_nistz256_div_by_2,%function
+.type __ecp_nistz256_div_by_2,%function
.align 4
-_ecp_nistz256_div_by_2:
+__ecp_nistz256_div_by_2:
@ ret = (a is odd ? a+mod : a) >> 1
ldr $a0,[$a_ptr,#0]
str $a7,[$r_ptr,#28]
mov pc,lr
-.size _ecp_nistz256_div_by_2,.-_ecp_nistz256_div_by_2
+.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
-@ const BN_ULONG r2[8]);
+@ const BN_ULONG r2[8]);
.globl ecp_nistz256_sub
.type ecp_nistz256_sub,%function
.align 4
ecp_nistz256_sub:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_sub
+ bl __ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_sub,.-ecp_nistz256_sub
-.type _ecp_nistz256_sub,%function
+.type __ecp_nistz256_sub,%function
.align 4
-_ecp_nistz256_sub:
+__ecp_nistz256_sub:
str lr,[sp,#-4]! @ push lr
ldr $a0,[$a_ptr,#0]
str $a7,[$r_ptr,#28]
mov pc,lr
-.size _ecp_nistz256_sub,.-_ecp_nistz256_sub
+.size __ecp_nistz256_sub,.-__ecp_nistz256_sub
@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl ecp_nistz256_neg
.align 4
ecp_nistz256_neg:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_neg
+ bl __ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_neg,.-ecp_nistz256_neg
-.type _ecp_nistz256_neg,%function
+.type __ecp_nistz256_neg,%function
.align 4
-_ecp_nistz256_neg:
+__ecp_nistz256_neg:
ldr $a0,[$a_ptr,#0]
eor $ff,$ff,$ff
ldr $a1,[$a_ptr,#4]
sbc $ff,$ff,$ff
b .Lreduce_by_add
-.size _ecp_nistz256_neg,.-_ecp_nistz256_neg
+.size __ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
stmdb sp!,{r4-r12,lr}
- bl _ecp_nistz256_mul_mont
+ bl __ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
ldmia sp!,{r4-r12,pc}
#else
#endif
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
-.type _ecp_nistz256_mul_mont,%function
+.type __ecp_nistz256_mul_mont,%function
.align 4
-_ecp_nistz256_mul_mont:
+__ecp_nistz256_mul_mont:
stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
ldr $bj,[$b_ptr,#0] @ b[0]
@ "other way around", namely subtract modulus from result
@ and if it borrowed, add modulus back.
- subs @acc[1],@acc[1],#-1 @ compare to modulus
- sbcs @acc[2],@acc[2],#-1
- sbcs @acc[3],@acc[3],#-1
+ adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
+ adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
+ adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
sbcs @acc[4],@acc[4],#0
sbcs @acc[5],@acc[5],#0
sbcs @acc[6],@acc[6],#0
sbcs @acc[7],@acc[7],#1
- sbcs @acc[8],@acc[8],#-1
+ adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
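+	@ adds/adcs with #1/#0 compute the same results and flags as the
+	@ original subs/sbcs with #-1 (kept as trailing comments); the rewrite
+	@ avoids #-1 immediates, which some assemblers appear to reject.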
ldr lr,[sp,#44] @ restore lr
sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
add sp,sp,#48
str @acc[8],[$r_ptr,#28]
mov pc,lr
-.size _ecp_nistz256_mul_mont,.-_ecp_nistz256_mul_mont
+.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}
{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
-# register bank at return from _ecp_nistz256_mul_mont, so that we can
+# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that the functions below
# use a custom calling sequence, accepting 256-bit input in registers, an
# output pointer in r0 ($r_ptr), and an optional pointer in r2 ($b_ptr).
mov pc,lr
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
-.type __ecp_nistz256_mul_by_2,%function
+.type __ecp_nistz256_add_self,%function
.align 4
-__ecp_nistz256_mul_by_2:
+__ecp_nistz256_add_self:
adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
adcs $a1,$a1,$a1
adcs $a2,$a2,$a2
str $a7,[$r_ptr,#28]
mov pc,lr
-.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
+.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
___
stmia r3,{r4-r11}
add $r_ptr,sp,#$S
- bl _ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
+ bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
add $b_ptr,$a_ptr,#32
add $a_ptr,$a_ptr,#32
add $r_ptr,sp,#$Zsqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
ldr $b_ptr,[sp,#32*5+4]
add $a_ptr,$b_ptr,#32
add $b_ptr,$b_ptr,#64
add $r_ptr,sp,#$tmp0
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
ldr $r_ptr,[sp,#32*5]
add $r_ptr,$r_ptr,#64
- bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(res_z, tmp0);
+ bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$M
- bl _ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
+ bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$Zsqr
add $r_ptr,sp,#$Zsqr
- bl _ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
+ bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
add $a_ptr,sp,#$S
add $b_ptr,sp,#$S
add $r_ptr,sp,#$tmp0
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
add $a_ptr,sp,#$Zsqr
add $b_ptr,sp,#$M
add $r_ptr,sp,#$M
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$tmp0
add $r_ptr,$r_ptr,#32
- bl _ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
+ bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
add $a_ptr,sp,#$M
add $r_ptr,sp,#$M
- bl _ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
+ bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
add $a_ptr,sp,#$in_x
add $b_ptr,sp,#$S
add $r_ptr,sp,#$S
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
add $r_ptr,sp,#$tmp0
- bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(tmp0, S);
+ bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
ldr $r_ptr,[sp,#32*5]
add $a_ptr,sp,#$M
add $b_ptr,sp,#$M
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
add $b_ptr,sp,#$tmp0
bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
add $a_ptr,sp,#$M
add $b_ptr,sp,#$S
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
ldr $r_ptr,[sp,#32*5]
add $b_ptr,$r_ptr,#32
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$in2_z
add $r_ptr,sp,#$Z2sqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$Z1sqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$Z2sqr
add $r_ptr,sp,#$S1
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$Z1sqr
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
add $a_ptr,sp,#$in1_y
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$S1
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
add $a_ptr,sp,#$in2_y
add $b_ptr,sp,#$S2
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$R
str $a0,[sp,#32*18+12]
add $r_ptr,sp,#$U1
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
add $a_ptr,sp,#$in2_x
add $b_ptr,sp,#$Z1sqr
add $r_ptr,sp,#$U2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
add $b_ptr,sp,#$U1
add $r_ptr,sp,#$H
add $a_ptr,sp,#$R
add $b_ptr,sp,#$R
add $r_ptr,sp,#$Rsqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$res_z
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$H
add $r_ptr,sp,#$Hsqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
add $a_ptr,sp,#$in2_z
add $b_ptr,sp,#$res_z
add $r_ptr,sp,#$res_z
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$Hsqr
add $r_ptr,sp,#$Hcub
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
add $a_ptr,sp,#$Hsqr
add $b_ptr,sp,#$U1
add $r_ptr,sp,#$U2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
add $r_ptr,sp,#$Hsqr
- bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(Hsqr, U2);
+ bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
add $b_ptr,sp,#$Rsqr
add $r_ptr,sp,#$res_x
add $a_ptr,sp,#$Hcub
add $b_ptr,sp,#$S1
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$res_y
add $r_ptr,sp,#$res_y
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
add $b_ptr,sp,#$S2
bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
add $a_ptr,sp,#$in1_z
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$Z1sqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
add $a_ptr,sp,#$Z1sqr
add $b_ptr,sp,#$in2_x
add $r_ptr,sp,#$U2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
add $b_ptr,sp,#$in1_x
add $r_ptr,sp,#$H
add $a_ptr,sp,#$Z1sqr
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$in1_z
add $r_ptr,sp,#$res_z
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
add $a_ptr,sp,#$in2_y
add $b_ptr,sp,#$S2
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
add $b_ptr,sp,#$in1_y
add $r_ptr,sp,#$R
add $a_ptr,sp,#$H
add $b_ptr,sp,#$H
add $r_ptr,sp,#$Hsqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$R
add $r_ptr,sp,#$Rsqr
- bl _ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
+ bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
add $a_ptr,sp,#$H
add $b_ptr,sp,#$Hsqr
add $r_ptr,sp,#$Hcub
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
add $a_ptr,sp,#$Hsqr
add $b_ptr,sp,#$in1_x
add $r_ptr,sp,#$U2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
add $r_ptr,sp,#$Hsqr
- bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(Hsqr, U2);
+ bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
add $b_ptr,sp,#$Rsqr
add $r_ptr,sp,#$res_x
add $a_ptr,sp,#$Hcub
add $b_ptr,sp,#$in1_y
add $r_ptr,sp,#$S2
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
add $a_ptr,sp,#$R
add $b_ptr,sp,#$res_y
add $r_ptr,sp,#$res_y
- bl _ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
+ bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
add $b_ptr,sp,#$S2
bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
# *native* byte order on the current platform. See gcm128.c for a working
# example...
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$Xi="r0"; # argument block
$Htbl="r1";
.text
.code 32
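+@ ldrplb/ldrneb below use the pre-UAL suffix order; Apple's assembler
+@ appears to accept only the UAL order (ldrbpl/ldrbne), hence the aliases.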
+#ifdef __APPLE__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+
.type rem_4bit,%object
.align 5
rem_4bit:
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
- vld1.64 $IN#hi,[r1,:64]! @ load H
+ vld1.64 $IN#hi,[r1]! @ load H
vmov.i8 $t0,#0xe1
- vld1.64 $IN#lo,[r1,:64]
+ vld1.64 $IN#lo,[r1]
vshl.i64 $t0#hi,#57
vshr.u64 $t0#lo,#63 @ t0=0xc2....01
vdup.8 $t1,$IN#hi[7]
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
- vld1.64 $IN#hi,[$Xi,:64]! @ load Xi
- vld1.64 $IN#lo,[$Xi,:64]!
+ vld1.64 $IN#hi,[$Xi]! @ load Xi
+ vld1.64 $IN#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
- vld1.64 $Xl#hi,[$Xi,:64]! @ load Xi
- vld1.64 $Xl#lo,[$Xi,:64]!
+ vld1.64 $Xl#hi,[$Xi]! @ load Xi
+ vld1.64 $Xl#lo,[$Xi]!
vmov.i64 $k48,#0x0000ffffffffffff
vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
vmov.i64 $k32,#0x00000000ffffffff
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
- vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
- vst1.64 $Xl#lo,[$Xi,:64]
+ vst1.64 $Xl#hi,[$Xi]! @ write out Xi
+ vst1.64 $Xl#lo,[$Xi]
ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
$line = &$opcode($arg);
} elsif ($mnemonic) {
$line = $c.$mnemonic;
- $line.= "\t$arg" if ($arg);
+ $line.= "\t$arg" if ($arg ne "");
}
}
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0";
$inp="r1";
sub r3,pc,#8 @ sha1_block_data_order
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; $t0="r0";
$inp="r1"; $t4="r1";
.code 32
#else
.syntax unified
-# ifdef __thumb2__
+# if defined(__thumb2__) && !defined(__APPLE__)
# define adrl adr
.thumb
# else
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
+.word OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align 5
.global sha256_block_data_order
.type sha256_block_data_order,%function
sha256_block_data_order:
+.Lsha256_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha256_block_data_order
#else
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
stmdb sp!,{r4-r12,lr}
sub $H,sp,#16*4+16
- adrl $Ktbl,K256
+ adr $Ktbl,K256
bic $H,$H,#15 @ align for 128-bit stores
mov $t2,sp
mov sp,$H @ alloca
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# ifdef __thumb2__
+# if defined(__thumb2__) && !defined(__APPLE__)
# define INST(a,b,c,d) .byte c,d|0xc,a,b
# else
# define INST(a,b,c,d) .byte a,b,c,d
sha256_block_data_order_armv8:
.LARMv8:
vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
+# ifdef __APPLE__
+ sub $Ktbl,$Ktbl,#256+32
+# elif defined(__thumb2__)
adr $Ktbl,.LARMv8
sub $Ktbl,$Ktbl,#.LARMv8-K256
# else
$lo="LO";
# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
$ctx="r0"; # parameter block
$inp="r1";
#endif
.text
-#if __ARM_ARCH__<7
+#if __ARM_ARCH__<7 || defined(__APPLE__)
.code 32
#else
.syntax unified
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha512_block_data_order
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
.skip 32-4
#else
.skip 32
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
+.Lsha512_block_data_order:
#if __ARM_ARCH__<7
sub r3,pc,#8 @ sha512_block_data_order
#else
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
tst r12,#1
bne .LNEON
#endif
.LNEON:
dmb @ errata #451034 on early Cortex A8
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
+ adr $Ktbl,K512
VFP_ABI_PUSH
- adrl $Ktbl,K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___