# define __ARM_ARCH__ __LINUX_ARM_ARCH__
#endif
-.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
#undef __thumb2__
#endif
+.text
+
.type AES_Te,%object
.align 5
AES_Te:
$prefix="aes_v8";
+$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
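+# DCB is armasm's byte-data directive, the counterpart of GNU as's
+# .byte; routing the directive through $_byte lets the same generated
+# text assemble with either toolchain.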
+
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
-.text
___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,d|0xc,a,b
+#else
.code 32
-#undef __thumb2__
+# define INST(a,b,c,d) $_byte a,b,c,d
+#endif
+
+.text
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax;
# instructions the assembler may not know are emitted as raw byte
# sequences, which works because ARMv7 instructions are always encoded
# little-endian. The correct solution is to use the .inst directive,
# but older assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
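+ # Worked example (the opcode value 0xf3b00300 for "aese" is an
+ # assumption about the opcode table, which is outside this hunk):
+ # "aese q0,q1" gives $word=0xf3b00302 and emits
+ # INST(0x02,0x03,0xb0,0xf3).  In ARM mode INST keeps the bytes as-is;
+ # in Thumb mode it reorders them to 0xb0,0xff,0x02,0x03, with d|0xc
+ # rewriting the A32 NEON prefix 0xf3 to its T32 counterpart 0xff,
+ # yielding the T32 halfwords 0xffb0,0x0302 of the same instruction.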
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
- s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
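+ # Unified Thumb-2 syntax rejects a conditional instruction outside an
+ # IT block, so a source-level "mov.cc" is rewritten to plain "movcc"
+ # preceded by an explicit "it cc"; in ARM mode the assembler accepts
+ # the it hint without emitting code, so the output works either way.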
+
print $_,"\n";
}
}
.arch armv7-a
.fpu neon
-.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
#if defined(__thumb2__) && !defined(__APPLE__)
.thumb
# undef __thumb2__
#endif
+.text
+
.type _bsaes_decrypt8,%function
.align 4
_bsaes_decrypt8:
#ifndef __thumb__
blo AES_cbc_encrypt
#else
- bhs 1f
+ bhs .Lcbc_do_bsaes
b AES_cbc_encrypt
-1:
+.Lcbc_do_bsaes:
#endif
#endif
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
#undef __thumb2__
#endif
+.text
+
.align 5
.global OPENSSL_atomic_add
.type OPENSSL_atomic_add,%function
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif
+
+.text
___
################
# private interface to mul_1x1_ialu
#if __ARM_MAX_ARCH__>=7
stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
adr r10,.LOPENSSL_armcap
ldr r12,[r12,r10]
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV7_NEON
itt ne
ldrne r10,[sp],#8
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.
+# endif
#endif
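+@ Three schemes to reach one variable: on Linux the literal above is a
+@ link-time offset, and adr plus ldr reach the value with no loader
+@ help; on Apple the same offset leads to a non-lazy symbol pointer,
+@ hence the extra dereference; on Windows the literal holds the address
+@ itself, fixed up by the loader (a PC-relative word against an
+@ external symbol is presumably not expressible there), again followed
+@ by one dereference.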
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lbn_mul_mont
+# endif
#endif
.global bn_mul_mont
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
- adr r0,.Lbn_mul_mont
- ldr r2,.LOPENSSL_armcap
+ ldr r0,.LOPENSSL_armcap
+# if !defined(_WIN32)
+ adr r2,.Lbn_mul_mont
ldr r0,[r0,r2]
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r0,[r0]
-#endif
+# endif
tst r0,#ARMV7_NEON @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#endif
#define ldrhsb ldrbhs
#endif
+.text
+
.align 5
.Lsigma:
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
.long 1,0,0,0
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.LChaCha20_ctr32
+# endif
#else
.word -1
#endif
cmp r2,#192 @ test len
bls .Lshort
ldr r4,[r14,#-32]
+# if !defined(_WIN32)
ldr r4,[r14,r4]
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r4,[r4]
# endif
tst r4,#ARMV7_NEON
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
die "insane number of elements" if ($#arr != 64*16*37-1);
$code.=<<___;
+.rodata
.globl ecp_nistz256_precomputed
.type ecp_nistz256_precomputed,%object
.align 12
}
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
+
+.text
.align 5
.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial
.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__) || defined(__clang__)
.syntax unified
#define ldrplb ldrbpl
.code 32
#endif
+.text
+
.type rem_4bit,%object
.align 5
rem_4bit:
{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
-.text
___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=<<___ if ($flavour !~ /64/);
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
+$code.=<<___ if ($flavour !~ /64/);
.fpu neon
-.code 32
-#undef __thumb2__
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,0xef,a,b
+#else
+.code 32
+# define INST(a,b,c,d) $_byte a,b,c,0xf2
+#endif
+
+.text
___
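+# Unlike the AES case, the NEON prefix byte is supplied as a constant:
+# the pmull-class opcodes all carry A32 top byte 0xf2, whose T32
+# counterpart 0xef cannot be derived by OR-ing bits in, so each INST
+# variant hard-codes its prefix instead of computing d|0xc.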
################################################################################
# ARMv7 instructions are always encoded little-endian, so the
# unsupported mnemonics below can be emitted as raw byte sequences.
# The correct solution is to use the .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
# fix up remaining new-style suffixes
s/\],#[0-9]+/]!/o;
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/^(\s+)b\./$1b/o or
s/^(\s+)ret/$1bx\tlr/o;
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
+
print $_,"\n";
}
}
if ($flavour =~ /linux/) { ".fpu\t".join(',',@_); }
else { ""; }
};
+my $rodata = sub {
+ SWITCH: for ($flavour) {
+ /linux/ && return ".section\t.rodata";
+ /ios/ && return ".section\t__TEXT,__const";
+ last;
+ }
+};
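+# Usage sketch: a module marks read-only data by emitting a bare
+# ".rodata" in its here-doc; linux flavours turn it into
+# ".section .rodata", ios into ".section __TEXT,__const", and other
+# flavours fall through, returning undef and presumably leaving the
+# directive to flavour-specific handling downstream.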
my $hidden = sub {
if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); }
else { ".hidden\t".join(',',@_); }
$code.=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
and r4,r4,r10
#if __ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
# endif
#endif
#if __ARM_MAX_ARCH__>=7
tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ adr r12,.Lpoly1305_emit
+ adr r10,.Lpoly1305_emit_neon
+ itt ne
movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
movne r12,r10
+ orr r11,r11,#1 @ thumb-ify address
+ orr r12,r12,#1
# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
+ addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
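+@ BX/BLX select the instruction set from bit 0 of the target address,
+@ so Thumb function pointers must carry bit 0 set.  adr always yields
+@ an even address, hence the explicit orr #1 in the Thumb path above;
+@ the ARM path computes ARM-mode addresses, where bit 0 stays clear.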
ldrb r9,[$inp,#11]
.type poly1305_emit,%function
.align 5
poly1305_emit:
+.Lpoly1305_emit:
stmdb sp!,{r4-r11}
.Lpoly1305_emit_enter:
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
ldr ip,[$ctx,#36] @ is_base2_26
ands $len,$len,#-16
beq .Lno_data_neon
.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
+.Lpoly1305_emit_neon:
ldr ip,[$ctx,#36] @ is_base2_26
stmdb sp!,{r4-r11}
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
#endif
___
} }
$code.=<<___;
#include "arm_arch.h"
-.text
-
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.type iotas32, %object
.align 5
iotas32:
$code.=<<___;
blo .Lround2x
+#if __ARM_ARCH__>=5
ldr pc,[sp,#440]
+#else
+ ldr lr,[sp,#440]
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
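+@ This fallback runs on plain ARMv4 as well as ARMv4T+: bit 0 of lr is
+@ only ever set by a Thumb caller, which cannot exist on a core without
+@ Thumb, so the moveq path covers ARMv4 and bx lr handles the
+@ interworking return everywhere else.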
.size KeccakF1600_int,.-KeccakF1600_int
.type KeccakF1600, %function
stmia @E[1], {@C[0]-@C[9]}
add sp,sp,#440+20
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
.Labsorb_abort:
add sp,sp,#456+32
mov r0,$len @ return value
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
+#else
+ ldmia sp!,{r4-r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size SHA3_absorb,.-SHA3_absorb
___
}
.align 4
.Lsqueeze_done:
add sp,sp,#24
+#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
+#else
+ ldmia sp!,{r4-r10,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
.size SHA3_squeeze,.-SHA3_squeeze
___
}
subs r3, r3, #1
bne .Loop_neon
- bx lr
+ ret
.size KeccakF1600_neon,.-KeccakF1600_neon
.global SHA3_absorb_neon
$code=<<___;
#include "arm_arch.h"
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.global sha1_block_data_order
.type sha1_block_data_order,%function
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
.Lsha1_block:
- adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
+ adr r3,.Lsha1_block
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV8_SHA1
bne .LARMv8
tst r12,#ARMV7_NEON
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha1_block
+# endif
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# define INST(a,b,c,d) $_byte c,d|0xf,a,b
# else
-# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# define INST(a,b,c,d) $_byte a,b,c,d|0x10
# endif
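+@ The |0x10 and |0xf adjustments presumably reconstruct the
+@ mode-specific NEON prefix byte (A32 0xf2.., T32 0xef..) from a common
+@ opcode-table value; the tables themselves are outside this hunk.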
.type sha1_block_data_order_armv8,%function
# define __ARM_MAX_ARCH__ 7
#endif
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.type K256,%object
.align 5
K256:
.word 0 @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha256_block_data_order
+# endif
#endif
.align 5
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV8_SHA256
bne .LARMv8
tst r12,#ARMV7_NEON
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if defined(__thumb2__)
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# define INST(a,b,c,d) $_byte c,d|0xc,a,b
# else
-# define INST(a,b,c,d) .byte a,b,c,d
+# define INST(a,b,c,d) $_byte a,b,c,d
# endif
.type sha256_block_data_order_armv8,%function
add $Ktbl,$Ktbl,#8
___
}
+
+my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");
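+# DCDU is armasm's word-data directive without implied alignment,
+# matching GNU as's .word; plain DCD would force 4-byte alignment.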
+
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+# define WORD64(hi0,lo0,hi1,lo1) $_word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+# define WORD64(hi0,lo0,hi1,lo1) $_word hi0,lo0, hi1,lo1
#endif
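+@ e.g. the first SHA-512 round constant 0x428a2f98d728ae22 is written
+@ WORD64(0x428a2f98,0xd728ae22); the macro stores the low word first on
+@ little-endian targets and the high word first on big-endian ones, so
+@ the in-memory 64-bit value is correct either way.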
-.text
#if defined(__thumb2__)
.syntax unified
.thumb
.code 32
#endif
+.text
+
.type K512,%object
.align 5
K512:
.size K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
.word OPENSSL_armcap_P-.Lsha512_block_data_order
+# endif
.skip 32-4
#else
.skip 32
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
ldr r12,[r3,r12] @ OPENSSL_armcap_P
-#ifdef __APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
ldr r12,[r12]
-#endif
+# endif
tst r12,#ARMV7_NEON
bne .LNEON
#endif