From 3405db97e5448c784729b56837f3f8c776a01067 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Fri, 15 Feb 2019 09:44:39 +0100
Subject: [PATCH] ARM assembly pack: make it Windows-friendly.

"Windows friendliness" means a) flipping .thumb and .text directives,
b) always generating Thumb-2 code when asked(*), and c) Windows-specific
references to external OPENSSL_armcap_P.

(*) So far *some* modules were compiled as .code 32 even when Thumb-2
was targeted. That works at the hardware level, because the processor
can alternate between the two modes with no overhead, but clang
--target=arm-windows's builtin assembler simply refuses to compile
.code 32...

Reviewed-by: Paul Dale
Reviewed-by: Richard Levitte
(Merged from https://github.com/openssl/openssl/pull/8252)
---
 crypto/aes/asm/aes-armv4.pl           |  3 +-
 crypto/aes/asm/aesv8-armx.pl          | 24 ++++++++----
 crypto/aes/asm/bsaes-armv7.pl         |  7 ++--
 crypto/armv4cpuid.pl                  |  3 +-
 crypto/bn/asm/armv4-gf2m.pl           | 13 ++++++--
 crypto/bn/asm/armv4-mont.pl           | 17 +++++++---
 crypto/chacha/asm/chacha-armv4.pl     | 11 ++++--
 crypto/ec/asm/ecp_nistz256-armv4.pl   |  4 ++-
 crypto/modes/asm/ghash-armv4.pl       |  3 +-
 crypto/modes/asm/ghashv8-armx.pl      | 26 +++++++++++----
 crypto/perlasm/arm-xlate.pl           |  7 ++++
 crypto/poly1305/asm/poly1305-armv4.pl | 48 +++++++++++++--------------
 crypto/sha/asm/keccak1600-armv4.pl    | 34 +++++++++++++++++--
 crypto/sha/asm/sha1-armv4-large.pl    | 20 +++++++----
 crypto/sha/asm/sha256-armv4.pl        | 18 +++++++---
 crypto/sha/asm/sha512-armv4.pl        | 20 ++++++++---
 16 files changed, 185 insertions(+), 73 deletions(-)

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index abb2cc79a3..456a469679 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -76,7 +76,6 @@ $code=<<___;
 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
 #endif
 
-.text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 .thumb
@@ -85,6 +84,8 @@ $code=<<___;
 #undef __thumb2__
 #endif
 
+.text
+
 .type	AES_Te,%object
 .align	5
 AES_Te:
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index 9ab2158c7d..81bc1cbf1c 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -53,18 +53,27 @@ open OUT,"| \"$^X\" $xlate $flavour $output";
 
 $prefix="aes_v8";
 
+$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
+
 $code=<<___;
 #include "arm_arch.h"
 
 #if __ARM_MAX_ARCH__>=7
-.text
 ___
-$code.=".arch	armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".arch	armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
 $code.=<<___				if ($flavour !~ /64/);
 .arch	armv7-a	// don't confuse not-so-latest binutils with argv8 :-)
 .fpu	neon
+#ifdef	__thumb2__
+.syntax	unified
+.thumb
+# define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
+#else
 .code	32
-#undef	__thumb2__
+# define INST(a,b,c,d)	$_byte	a,b,c,d
+#endif
+
+.text
 ___
 
 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
@@ -955,7 +964,7 @@ if ($flavour =~ /64/) {	######## 64-bit code
 	# since ARMv7 instructions are always encoded little-endian.
 	# correct solution is to use .inst directive, but older
 	# assemblers don't implement it:-(
-	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
@@ -996,14 +1005,17 @@ if ($flavour =~ /64/) {	######## 64-bit code
 	s/\],#[0-9]+/]!/o;
 
 	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
-	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
 	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
 	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
 	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
 	s/^(\s+)b\./$1b/o				or
-	s/^(\s+)mov\./$1mov/o				or
 	s/^(\s+)ret/$1bx\tlr/o;
 
+	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+	    print "	it	$2\n";
+	}
+
 	print $_,"\n";
 }
 }
diff --git a/crypto/aes/asm/bsaes-armv7.pl b/crypto/aes/asm/bsaes-armv7.pl
index 5df195b9ef..7f5219bc75 100644
--- a/crypto/aes/asm/bsaes-armv7.pl
+++ b/crypto/aes/asm/bsaes-armv7.pl
@@ -728,7 +728,6 @@ $code.=<<___;
 .arch	armv7-a
 .fpu	neon
 
-.text
 .syntax	unified 	@ ARMv7-capable assembler is expected to handle this
 #if defined(__thumb2__) && !defined(__APPLE__)
 .thumb
@@ -737,6 +736,8 @@ $code.=<<___;
 # undef __thumb2__
 #endif
 
+.text
+
 .type	_bsaes_decrypt8,%function
 .align	4
 _bsaes_decrypt8:
@@ -1125,9 +1126,9 @@ bsaes_cbc_encrypt:
 #ifndef	__thumb__
 	blo	AES_cbc_encrypt
 #else
-	bhs	1f
+	bhs	.Lcbc_do_bsaes
 	b	AES_cbc_encrypt
-1:
+.Lcbc_do_bsaes:
 #endif
 #endif
 
diff --git a/crypto/armv4cpuid.pl b/crypto/armv4cpuid.pl
index 3a7be6e541..f8aeec64f0 100644
--- a/crypto/armv4cpuid.pl
+++ b/crypto/armv4cpuid.pl
@@ -21,7 +21,6 @@ open OUT,"| \"$^X\" $xlate $flavour $output";
 $code.=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__) && !defined(__APPLE__)
 .syntax	unified
 .thumb
@@ -30,6 +29,8 @@ $code.=<<___;
 #undef __thumb2__
 #endif
 
+.text
+
 .align	5
 .global	OPENSSL_atomic_add
 .type	OPENSSL_atomic_add,%function
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 442ae46953..0bf6f63ec5 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -57,13 +57,14 @@ if ($flavour && $flavour ne "void") {
 $code=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
 #else
 .code	32
 #endif
+
+.text
 ___
 ################
 # private interface to mul_1x1_ialu
@@ -176,11 +177,13 @@ bn_GF2m_mul_2x2:
 #if __ARM_MAX_ARCH__>=7
 	stmdb	sp!,{r10,lr}
 	ldr	r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
 	adr	r10,.LOPENSSL_armcap
 	ldr	r12,[r12,r10]
-#ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r12,[r12]
-#endif
+# endif
 	tst	r12,#ARMV7_NEON
 	itt	ne
 	ldrne	r10,[sp],#8
@@ -310,7 +313,11 @@ $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.
+# endif
 #endif
 .asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by "
 .align	5
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index b4f6c06633..7e0a4d8145 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -97,7 +97,6 @@ $_num="$num,#15*4";	$_bpend=$_num;
 $code=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -105,10 +104,16 @@ $code=<<___;
 .code	32
 #endif
 
+.text
+
 #if __ARM_MAX_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.Lbn_mul_mont
+# endif
 #endif
 
 .global	bn_mul_mont
@@ -122,12 +127,14 @@ bn_mul_mont:
 #if __ARM_MAX_ARCH__>=7
 	tst	ip,#7
 	bne	.Lialu
-	adr	r0,.Lbn_mul_mont
-	ldr	r2,.LOPENSSL_armcap
+	ldr	r0,.LOPENSSL_armcap
+#if !defined(_WIN32)
+	adr	r2,.Lbn_mul_mont
 	ldr	r0,[r0,r2]
-#ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r0,[r0]
-#endif
+# endif
 	tst	r0,#ARMV7_NEON		@ NEON available?
 	ldmia	sp, {r0,r2}
 	beq	.Lialu
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl
index 9bbfc6b376..c4402961d4 100755
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -171,7 +171,6 @@ my @ret;
 $code.=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
 #endif
@@ -185,6 +184,8 @@ $code.=<<___;
 #define ldrhsb	ldrbhs
 #endif
 
+.text
+
 .align	5
 .Lsigma:
 .long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
@@ -192,7 +193,11 @@ $code.=<<___;
 .long	1,0,0,0
 #if __ARM_MAX_ARCH__>=7
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.LChaCha20_ctr32
+# endif
 #else
 .word	-1
 #endif
@@ -219,8 +224,10 @@ ChaCha20_ctr32:
 	cmp	r2,#192			@ test len
 	bls	.Lshort
 	ldr	r4,[r14,#-32]
+# if !defined(_WIN32)
 	ldr	r4,[r14,r4]
-# ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r4,[r4]
 # endif
 	tst	r4,#ARMV7_NEON
diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl
index 4005a6fbdc..43a675b4b2 100755
--- a/crypto/ec/asm/ecp_nistz256-armv4.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv4.pl
@@ -51,7 +51,6 @@ if ($flavour && $flavour ne "void") {
 $code.=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -80,6 +79,7 @@ close TABLE;
 die "insane number of elements" if ($#arr != 64*16*37-1);
 
 $code.=<<___;
+.rodata
 .globl	ecp_nistz256_precomputed
 .type	ecp_nistz256_precomputed,%object
 .align	12
@@ -104,6 +104,8 @@ for(1..37) {
 }
 $code.=<<___;
 .size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
+
+.text
 .align	5
 .LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
 .long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 759d29c49f..1391b1b6e0 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -142,7 +142,6 @@ ___
 $code=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__) || defined(__clang__)
 .syntax	unified
 #define ldrplb  ldrbpl
@@ -154,6 +153,8 @@ $code=<<___;
 .code	32
 #endif
 
+.text
+
 .type	rem_4bit,%object
 .align	5
 rem_4bit:
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index ea05950309..e891583312 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -66,18 +66,26 @@ $inc="x12";
 {
 my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
 my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
 
 $code=<<___;
 #include "arm_arch.h"
 
 #if __ARM_MAX_ARCH__>=7
-.text
 ___
-$code.=".arch	armv8-a+crypto\n" if ($flavour =~ /64/);
-$code.=<<___		if ($flavour !~ /64/);
+$code.=".arch	armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
+$code.=<<___	if ($flavour !~ /64/);
 .fpu	neon
-.code	32
-#undef	__thumb2__
+#ifdef __thumb2__
+.syntax	unified
+.thumb
+# define INST(a,b,c,d)	$_byte	c,0xef,a,b
+#else
+.code	32
+# define INST(a,b,c,d)	$_byte	a,b,c,0xf2
+#endif
+
+.text
 ___
 
 ################################################################################
@@ -752,7 +760,7 @@ if ($flavour =~ /64/) {	######## 64-bit code
 	# since ARMv7 instructions are always encoded little-endian.
 	# correct solution is to use .inst directive, but older
 	# assemblers don't implement it:-(
-	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
 			$word&0xff,($word>>8)&0xff,
 			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
@@ -767,13 +775,17 @@ if ($flavour =~ /64/) {	######## 64-bit code
 	# fix up remaining new-style suffixes
 	s/\],#[0-9]+/]!/o;
 
-	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o			or
 	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
 	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
 	s/^(\s+)b\./$1b/o						or
 	s/^(\s+)ret/$1bx\tlr/o;
 
+	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+	    print "	it	$2\n";
+	}
+
 	print $_,"\n";
 }
 }
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index d78a8baac1..b953f1fc19 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -28,6 +28,13 @@ my $fpu = sub {
     if ($flavour =~ /linux/)	{ ".fpu\t".join(',',@_); }
     else			{ ""; }
 };
+my $rodata = sub {
+    SWITCH: for ($flavour) {
+	/linux/		&& return ".section\t.rodata";
+	/ios/		&& return ".section\t__TEXT,__const";
+	last;
+    }
+};
 my $hidden = sub {
     if ($flavour =~ /ios/)	{ ".private_extern\t".join(',',@_); }
     else			{ ".hidden\t".join(',',@_); }
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 7003cc7770..38622af1ab 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -48,7 +48,6 @@ if ($flavour && $flavour ne "void") {
 $code.=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -56,6 +55,8 @@ $code.=<<___;
 .code	32
 #endif
 
+.text
+
 .globl	poly1305_emit
 .globl	poly1305_blocks
 .globl	poly1305_init
@@ -100,8 +101,10 @@ poly1305_init:
 	and	r4,r4,r10
 
 #if	__ARM_MAX_ARCH__>=7
+# if !defined(_WIN32)
 	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
-# ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r12,[r12]
 # endif
 #endif
@@ -116,31 +119,21 @@ poly1305_init:
 
 #if	__ARM_MAX_ARCH__>=7
 	tst	r12,#ARMV7_NEON		@ check for NEON
-# ifdef	__APPLE__
-	adr	r9,poly1305_blocks_neon
-	adr	r11,poly1305_blocks
-#  ifdef __thumb2__
-	it	ne
-#  endif
+# ifdef __thumb2__
+	adr	r9,.Lpoly1305_blocks_neon
+	adr	r11,.Lpoly1305_blocks
+	adr	r12,.Lpoly1305_emit
+	adr	r10,.Lpoly1305_emit_neon
+	itt	ne
 	movne	r11,r9
-	adr	r12,poly1305_emit
-	adr	r10,poly1305_emit_neon
-#  ifdef __thumb2__
-	it	ne
-#  endif
 	movne	r12,r10
+	orr	r11,r11,#1	@ thumb-ify address
+	orr	r12,r12,#1
 # else
-#  ifdef __thumb2__
-	itete	eq
-#  endif
-	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
-	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
-	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
-	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef	__thumb2__
-	orr	r12,r12,#1	@ thumb-ify address
-	orr	r11,r11,#1
+	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
+	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
 # endif
 #endif
 	ldrb	r9,[$inp,#11]
@@ -352,6 +345,7 @@ $code.=<<___;
 .type	poly1305_emit,%function
 .align	5
 poly1305_emit:
+.Lpoly1305_emit:
 	stmdb	sp!,{r4-r11}
 .Lpoly1305_emit_enter:
 
@@ -671,6 +665,7 @@ poly1305_init_neon:
 .type	poly1305_blocks_neon,%function
 .align	5
 poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
 	ldr	ip,[$ctx,#36]		@ is_base2_26
 	ands	$len,$len,#-16
 	beq	.Lno_data_neon
@@ -1157,6 +1152,7 @@ poly1305_blocks_neon:
 .type	poly1305_emit_neon,%function
 .align	5
 poly1305_emit_neon:
+.Lpoly1305_emit_neon:
 	ldr	ip,[$ctx,#36]		@ is_base2_26
 
 	stmdb	sp!,{r4-r11}
@@ -1229,7 +1225,11 @@ poly1305_emit_neon:
 .Lzeros:
 .long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.Lpoly1305_init
+# endif
 #endif
 ___
 }	}
diff --git a/crypto/sha/asm/keccak1600-armv4.pl b/crypto/sha/asm/keccak1600-armv4.pl
index 44504fa8ac..be411ed74b 100755
--- a/crypto/sha/asm/keccak1600-armv4.pl
+++ b/crypto/sha/asm/keccak1600-armv4.pl
@@ -113,8 +113,6 @@ my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50));
 $code.=<<___;
 #include "arm_arch.h"
 
-.text
-
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -122,6 +120,8 @@ $code.=<<___;
 .code	32
 #endif
 
+.text
+
 .type	iotas32, %object
 .align	5
 iotas32:
@@ -691,7 +691,14 @@ ___
 $code.=<<___;
 	blo	.Lround2x
 
+#if __ARM_ARCH__>=5
 	ldr	pc,[sp,#440]
+#else
+	ldr	lr,[sp,#440]
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	KeccakF1600_int,.-KeccakF1600_int
 
 .type	KeccakF1600, %function
@@ -730,7 +737,14 @@ KeccakF1600:
 	stmia	@E[1], {@C[0]-@C[9]}
 
 	add	sp,sp,#440+20
+#if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	KeccakF1600,.-KeccakF1600
 ___
 { my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));
@@ -905,7 +919,14 @@ SHA3_absorb:
 .Labsorb_abort:
 	add	sp,sp,#456+32
 	mov	r0,$len			@ return value
+#if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	SHA3_absorb,.-SHA3_absorb
 ___
 }
@@ -1055,7 +1076,14 @@ SHA3_squeeze:
 .align	4
 .Lsqueeze_done:
 	add	sp,sp,#24
+#if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r10,pc}
+#else
+	ldmia	sp!,{r4-r10,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	SHA3_squeeze,.-SHA3_squeeze
 ___
 }
@@ -1265,7 +1293,7 @@ KeccakF1600_neon:
 	subs	r3, r3, #1
 	bne	.Loop_neon
 
-	bx	lr
+	ret
 .size	KeccakF1600_neon,.-KeccakF1600_neon
 
 .global	SHA3_absorb_neon
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index cdacbd4633..cd0b95ade8 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -187,7 +187,6 @@ ___
 $code=<<___;
 #include "arm_arch.h"
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -195,6 +194,8 @@ $code=<<___;
 .code	32
 #endif
 
+.text
+
 .global	sha1_block_data_order
 .type	sha1_block_data_order,%function
 
@@ -202,12 +203,14 @@ $code=<<___;
 sha1_block_data_order:
 #if __ARM_MAX_ARCH__>=7
 .Lsha1_block:
-	adr	r3,.Lsha1_block
 	ldr	r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
+	adr	r3,.Lsha1_block
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r12,[r12]
-#endif
+# endif
 	tst	r12,#ARMV8_SHA1
 	bne	.LARMv8
 	tst	r12,#ARMV7_NEON
@@ -311,7 +314,11 @@ $code.=<<___;
 .LK_60_79:	.word	0xca62c1d6
 #if __ARM_MAX_ARCH__>=7
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.Lsha1_block
+# endif
 #endif
 .asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by "
 .align	5
@@ -613,14 +620,15 @@ my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
 my @MSG=map("q$_",(4..7));
 my @Kxx=map("q$_",(8..11));
 my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
 
 # if defined(__thumb2__)
-#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
+#  define INST(a,b,c,d)	$_byte	c,d|0xf,a,b
 # else
-#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
+#  define INST(a,b,c,d)	$_byte	a,b,c,d|0x10
 # endif
 
 .type	sha1_block_data_order_armv8,%function
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index e0512b4d41..cd01fcb57b 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -181,7 +181,6 @@ $code=<<___;
 # define __ARM_MAX_ARCH__ 7
 #endif
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -189,6 +188,8 @@ $code=<<___;
 .code	32
 #endif
 
+.text
+
 .type	K256,%object
 .align	5
 K256:
@@ -212,7 +213,11 @@ K256:
 .word	0				@ terminator
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.Lsha256_block_data_order
+# endif
 #endif
 .align	5
 
@@ -227,10 +232,12 @@ sha256_block_data_order:
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r12,[r12]
-#endif
+# endif
 	tst	r12,#ARMV8_SHA256
 	bne	.LARMv8
 	tst	r12,#ARMV7_NEON
@@ -598,14 +605,15 @@ my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
 my @MSG=map("q$_",(8..11));
 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
 my $Ktbl="r3";
+my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 
 # if defined(__thumb2__)
-#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
 # else
-#  define INST(a,b,c,d)	.byte	a,b,c,d
+#  define INST(a,b,c,d)	$_byte	a,b,c,d
 # endif
 
 .type	sha256_block_data_order_armv8,%function
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 795c4cbb45..39c943b449 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -196,6 +196,9 @@ $code.=<<___;
 	add	$Ktbl,$Ktbl,#8
 ___
 }
+
+my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");
+
 $code=<<___;
 #ifndef __KERNEL__
 # include "arm_arch.h"
@@ -211,14 +214,13 @@ $code=<<___;
 #ifdef __ARMEL__
 # define LO 0
 # define HI 4
-# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+# define WORD64(hi0,lo0,hi1,lo1)	$_word	lo0,hi0, lo1,hi1
 #else
 # define HI 0
 # define LO 4
-# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+# define WORD64(hi0,lo0,hi1,lo1)	$_word	hi0,lo0, hi1,lo1
 #endif
 
-.text
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
@@ -227,6 +229,8 @@ $code=<<___;
 .code	32
 #endif
 
+.text
+
 .type	K512,%object
 .align	5
 K512:
@@ -273,7 +277,11 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 .LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
 .word	OPENSSL_armcap_P-.Lsha512_block_data_order
+# endif
 .skip	32-4
 #else
 .skip	32
@@ -290,10 +298,12 @@ sha512_block_data_order:
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
 	ldr	r12,.LOPENSSL_armcap
+# if !defined(_WIN32)
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
-#ifdef	__APPLE__
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
 	ldr	r12,[r12]
-#endif
+# endif
 	tst	r12,#ARMV7_NEON
 	bne	.LNEON
 #endif
-- 
2.25.1