From 11208dcfb9105e8afa37233185decefd45e89e17 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 23 Sep 2015 18:41:27 +0200 Subject: [PATCH] ARMv4 assembly pack: implement support for Thumb2. As some of ARM processors, more specifically Cortex-Mx series, are Thumb2-only, we need to support Thumb2-only builds even in assembly. Reviewed-by: Tim Hudson --- crypto/aes/asm/aes-armv4.pl | 24 ++++++--------- crypto/armv4cpuid.pl | 24 +++++++++++++++ crypto/bn/asm/armv4-gf2m.pl | 36 ++++++++++++++++++---- crypto/bn/asm/armv4-mont.pl | 35 +++++++++++++++++---- crypto/ec/asm/ecp_nistz256-armv4.pl | 38 +++++++++++++++++++++++ crypto/modes/asm/ghash-armv4.pl | 45 ++++++++++++++++++++++++--- crypto/sha/asm/sha1-armv4-large.pl | 47 +++++++++++++++++++++++++---- crypto/sha/asm/sha256-armv4.pl | 14 +++------ crypto/sha/asm/sha512-armv4.pl | 16 ++++------ 9 files changed, 223 insertions(+), 56 deletions(-) diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl index 0f7ec39d56..c3d166ff5f 100644 --- a/crypto/aes/asm/aes-armv4.pl +++ b/crypto/aes/asm/aes-armv4.pl @@ -70,15 +70,11 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) .thumb -# else +#else .code 32 -# endif #endif .type AES_Te,%object @@ -193,7 +189,7 @@ AES_Te: .type AES_encrypt,%function .align 5 AES_encrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ AES_encrypt #else adr r3,AES_encrypt @@ -443,19 +439,19 @@ _armv4_AES_encrypt: .align 5 AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ AES_set_encrypt_key #else adr r3,AES_set_encrypt_key #endif teq r0,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 @@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt ne @ Thumb2 thing, sanity check in ARM #endif movne r0,#-1 @@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-16] subs $rounds,$rounds,#1 str $s3,[$key,#-12] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#216 @@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key: str $s2,[$key,#-24] subs $rounds,$rounds,#1 str $s3,[$key,#-20] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,$key,#256 @@ -969,7 +965,7 @@ AES_Td: .type AES_decrypt,%function .align 5 AES_decrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ AES_decrypt #else adr r3,AES_decrypt diff --git a/crypto/armv4cpuid.pl b/crypto/armv4cpuid.pl index 1c447187c7..c66962350d 100644 --- a/crypto/armv4cpuid.pl +++ b/crypto/armv4cpuid.pl @@ -15,7 +15,12 @@ $code.=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif .align 5 .global OPENSSL_atomic_add @@ -59,6 +64,9 @@ OPENSSL_atomic_add: OPENSSL_cleanse: eor ip,ip,ip cmp r1,#7 +#ifdef __thumb2__ + itt hs +#endif subhs r1,r1,#4 bhs .Lot cmp r1,#0 @@ -116,27 +124,43 @@ _armv7_tick: .global _armv8_aes_probe .type _armv8_aes_probe,%function _armv8_aes_probe: +#if defined(__thumb2__) && !defined(__APPLE__) + .byte 0xb0,0xff,0x00,0x03 @ aese.8 q0,q0 +#else .byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0 +#endif bx lr .size 
_armv8_aes_probe,.-_armv8_aes_probe .global _armv8_sha1_probe .type _armv8_sha1_probe,%function _armv8_sha1_probe: +#if defined(__thumb2__) && !defined(__APPLE__) + .byte 0x00,0xef,0x40,0x0c @ sha1c.32 q0,q0,q0 +#else .byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0 +#endif bx lr .size _armv8_sha1_probe,.-_armv8_sha1_probe .global _armv8_sha256_probe .type _armv8_sha256_probe,%function _armv8_sha256_probe: +#if defined(__thumb2__) && !defined(__APPLE__) + .byte 0x00,0xff,0x40,0x0c @ sha256h.32 q0,q0,q0 +#else .byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0 +#endif bx lr .size _armv8_sha256_probe,.-_armv8_sha256_probe .global _armv8_pmull_probe .type _armv8_pmull_probe,%function _armv8_pmull_probe: +#if defined(__thumb2__) && !defined(__APPLE__) + .byte 0xa0,0xef,0x00,0x0e @ vmull.p64 q0,d0,d0 +#else .byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0 +#endif bx lr .size _armv8_pmull_probe,.-_armv8_pmull_probe #endif diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl index a0b018c508..227581e10c 100644 --- a/crypto/bn/asm/armv4-gf2m.pl +++ b/crypto/bn/asm/armv4-gf2m.pl @@ -51,7 +51,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif ___ ################ # private interface to mul_1x1_ialu @@ -132,11 +137,17 @@ mul_1x1_ialu: eor $hi,$hi,$t0,lsr#8 ldr $t0,[sp,$i0] @ tab[b >> 30 ] +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#30 eorne $hi,$hi,$b,lsr#2 tst $a,#1<<31 eor $lo,$lo,$t1,lsl#27 eor $hi,$hi,$t1,lsr#5 +#ifdef __thumb2__ + itt ne +#endif eorne $lo,$lo,$b,lsl#31 eorne $hi,$hi,$b,lsr#1 eor $lo,$lo,$t0,lsl#30 @@ -156,20 +167,33 @@ $code.=<<___; .align 5 bn_GF2m_mul_2x2: #if __ARM_MAX_ARCH__>=7 + stmdb sp!,{r10,lr} ldr r12,.LOPENSSL_armcap -.Lpic: ldr r12,[pc,r12] - tst r12,#1 + adr r10,.LOPENSSL_armcap + ldr r12,[r12,r10] +#ifdef __APPLE__ + ldr r12,[r12] +#endif + tst r12,#ARMV7_NEON + itt ne + ldrne r10,[sp],#8 bne .LNEON + stmdb sp!,{r4-r9} +#else + stmdb sp!,{r4-r10,lr} #endif ___ $ret="r10"; # reassigned 1st argument $code.=<<___; - stmdb sp!,{r4-r10,lr} mov $ret,r0 @ reassign 1st argument mov $b,r3 @ $b=b1 + sub r7,sp,#36 + mov r8,sp + and r7,r7,#-32 ldr r3,[sp,#32] @ load b0 mov $mask,#7<<2 - sub sp,sp,#32 @ allocate tab[8] + mov sp,r7 @ allocate tab[8] + str r8,[r7,#32] bl mul_1x1_ialu @ a1·b1 str $lo,[$ret,#8] @@ -193,6 +217,7 @@ ___ $code.=<<___; ldmia $ret,{@r[0]-@r[3]} eor $lo,$lo,$hi + ldr sp,[sp,#32] @ destroy tab[8] eor $hi,$hi,@r[1] eor $lo,$lo,@r[0] eor $hi,$hi,@r[2] @@ -200,7 +225,6 @@ $code.=<<___; eor $hi,$hi,@r[3] str $hi,[$ret,#8] eor $lo,$lo,$hi - add sp,sp,#32 @ destroy tab[8] str $lo,[$ret,#4] #if __ARM_ARCH__>=5 @@ -279,7 +303,7 @@ $code.=<<___; #if __ARM_MAX_ARCH__>=7 .align 5 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-(.Lpic+8) +.word OPENSSL_armcap_P-. 
#endif .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by " .align 5 diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 59f218b5cf..bd56f989c7 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -82,7 +82,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #if __ARM_MAX_ARCH__>=7 .align 5 @@ -101,7 +106,7 @@ bn_mul_mont: #if __ARM_MAX_ARCH__>=7 tst ip,#7 bne .Lialu - adr r0,bn_mul_mont + adr r0,.Lbn_mul_mont ldr r2,.LOPENSSL_armcap ldr r0,[r0,r2] #ifdef __APPLE__ @@ -117,6 +122,9 @@ bn_mul_mont: #endif cmp ip,#2 mov $num,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif movlt r0,#0 addlt sp,sp,#2*4 blt .Labrt @@ -164,10 +172,11 @@ bn_mul_mont: ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= + mov $tj,sp str $nhi,[$num,#4] @ tp[num]= .Louter: - sub $tj,$num,sp @ "original" $num-1 value + sub $tj,$num,$tj @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] ldr $bi,[$tp,#4]! @ *(++bp) sub $np,$np,$tj @ "rewind" np to &np[1] @@ -212,11 +221,16 @@ bn_mul_mont: str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj +#ifdef __thumb2__ + itt ne +#endif + movne $tj,sp bne .Louter ldr $rp,[$_rp] @ pull rp + mov $aj,sp add $num,$num,#4 @ $num to point at &tp[num] - sub $aj,$num,sp @ "original" num value + sub $aj,$num,$aj @ "original" num value mov $tp,sp @ "rewind" $tp mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] @@ -242,7 +256,8 @@ bn_mul_mont: cmp $tp,$num bne .Lcopy - add sp,$num,#4 @ skip over tp[num+1] + mov sp,$num + add sp,sp,#4 @ skip over tp[num+1] ldmia sp!,{r4-r12,lr} @ restore registers add sp,sp,#2*4 @ skip over {r0,r2} mov r0,#1 @@ -283,6 +298,7 @@ bn_mul8x_mont_neon: stmdb sp!,{r4-r11} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load rest of parameter block + mov ip,sp sub $toutptr,sp,#16 vld1.32 {${Bi}[0]}, [$bptr,:32]! @@ -638,8 +654,9 @@ bn_mul8x_mont_neon: bne .LNEON_sub ldr r10, [$aptr] @ load top-most bit + mov r11,sp veor q0,q0,q0 - sub r11,$bptr,sp @ this is num*4 + sub r11,$bptr,r11 @ this is num*4 veor q1,q1,q1 mov $aptr,sp sub $rptr,$rptr,r11 @ rewind $rptr @@ -649,27 +666,33 @@ bn_mul8x_mont_neon: .LNEON_copy_n_zap: ldmia $aptr!, {r4-r7} ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! @ wipe + it cc movcc r11,r7 ldmia $aptr, {r4-r7} stmia $rptr!, {r8-r11} sub $aptr,$aptr,#16 ldmia $rptr, {r8-r11} + it cc movcc r8, r4 vst1.64 {q0-q1}, [$aptr,:256]! @ wipe + itt cc movcc r9, r5 movcc r10,r6 vst1.64 {q0-q1}, [$nptr,:256]! 
@ wipe + it cc movcc r11,r7 teq $aptr,$bptr @ preserves carry stmia $rptr!, {r8-r11} bne .LNEON_copy_n_zap - sub sp,ip,#96 + mov sp,ip vldmia sp!,{d8-d15} ldmia sp!,{r4-r11} ret @ bx lr diff --git a/crypto/ec/asm/ecp_nistz256-armv4.pl b/crypto/ec/asm/ecp_nistz256-armv4.pl index b49b77ea3e..aeeb190335 100755 --- a/crypto/ec/asm/ecp_nistz256-armv4.pl +++ b/crypto/ec/asm/ecp_nistz256-armv4.pl @@ -45,7 +45,12 @@ $code.=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif ___ ######################################################################## # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 @@ -162,6 +167,9 @@ __ecp_nistz256_mul_by_2: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 +#ifdef __thumb2__ + it cs +#endif movcs $ff,#-1 @ $ff = carry ? -1 : 0 b .Lreduce_by_sub @@ -213,6 +221,9 @@ __ecp_nistz256_add: adcs $a6,$a6,$t2 mov $ff,#0 adcs $a7,$a7,$t3 +#ifdef __thumb2__ + it cs +#endif movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry ldr lr,[sp],#4 @ pop lr @@ -286,6 +297,9 @@ __ecp_nistz256_mul_by_3: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 +#ifdef __thumb2__ + it cs +#endif movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry subs $a0,$a0,$ff @ subtract synthesized modulus, see @@ -318,6 +332,9 @@ __ecp_nistz256_mul_by_3: adcs $a6,$a6,$t2 mov $ff,#0 adcs $a7,$a7,$t3 +#ifdef __thumb2__ + it cs +#endif movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry ldr lr,[sp],#4 @ pop lr @@ -781,6 +798,9 @@ ecp_nistz256_gather_w5: cmp $index,#0 mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif subne $index,$index,#1 movne $mask,#-1 add $inp,$inp,$index,lsl#2 @@ -887,6 +907,9 @@ ecp_nistz256_gather_w7: cmp $index,#0 mov $mask,#0 +#ifdef __thumb2__ + itt ne +#endif subne $index,$index,#1 movne $mask,#-1 add $inp,$inp,$index @@ -1180,6 +1203,9 @@ __ecp_nistz256_add_self: adcs $a6,$a6,$a6 mov $ff,#0 adcs $a7,$a7,$a7 +#ifdef __thumb2__ + it cs +#endif movcs $ff,#-1 @ $ff = carry ? 
-1 : 0 subs $a0,$a0,$ff @ subtract synthesized modulus @@ -1369,6 +1395,9 @@ ecp_nistz256_point_add: stmia r3!,{r4-r11} ldmia $b_ptr,{r4-r11} cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif movne r12,#-1 stmia r3,{r4-r11} str r12,[sp,#32*18+8] @ !in2infty @@ -1395,6 +1424,9 @@ ecp_nistz256_point_add: stmia r3!,{r4-r11} ldmia $a_ptr,{r4-r11} cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif movne r12,#-1 stmia r3,{r4-r11} str r12,[sp,#32*18+4] @ !in1infty @@ -1636,6 +1668,9 @@ ecp_nistz256_point_add_affine: stmia r3!,{r4-r11} ldmia $a_ptr,{r4-r11} cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif movne r12,#-1 stmia r3,{r4-r11} str r12,[sp,#32*15+4] @ !in1infty @@ -1661,6 +1696,9 @@ ecp_nistz256_point_add_affine: orr r12,r12,r11 stmia r3!,{r4-r11} cmp r12,#0 +#ifdef __thumb2__ + it ne +#endif movne r12,#-1 str r12,[sp,#32*15+8] @ !in2infty diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl index 2d225cf6d8..1506e5b203 100644 --- a/crypto/modes/asm/ghash-armv4.pl +++ b/crypto/modes/asm/ghash-armv4.pl @@ -136,7 +136,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif #ifdef __APPLE__ #define ldrplb ldrbpl @@ -154,19 +159,27 @@ rem_4bit: .type rem_4bit_get,%function rem_4bit_get: - sub $rem_4bit,pc,#8 - sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit +#if defined(__thumb2__) + adr $rem_4bit,rem_4bit +#else + sub $rem_4bit,pc,#8+32 @ &rem_4bit +#endif b .Lrem_4bit_got nop + nop .size rem_4bit_get,.-rem_4bit_get .global gcm_ghash_4bit .type gcm_ghash_4bit,%function +.align 4 gcm_ghash_4bit: - sub r12,pc,#8 +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif add $len,$inp,$len @ $len to point at the end stmdb sp!,{r3-r11,lr} @ save $len/end too - sub r12,r12,#48 @ &rem_4bit ldmia r12,{r4-r11} @ copy rem_4bit ... stmdb sp!,{r4-r11} @ ... to stack @@ -213,6 +226,9 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$inp,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -223,6 +239,9 @@ gcm_ghash_4bit: add $nhi,$nhi,$nhi ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] eor $Zll,$Tll,$Zll,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $Tll,[$Xi,$cnt] eor $Zll,$Zll,$Zlh,lsl#28 eor $Zlh,$Tlh,$Zlh,lsr#4 @@ -230,8 +249,14 @@ gcm_ghash_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 +#ifdef __thumb2__ + it pl +#endif eorpl $nlo,$nlo,$Tll eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] @@ -241,7 +266,11 @@ gcm_ghash_4bit: add $inp,$inp,#16 mov $nhi,$Zll ___ - &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); + &Zsmash("cmp\t$inp,$len","\n". + "#ifdef __thumb2__\n". + " it ne\n". + "#endif\n". 
+ " ldrneb $nlo,[$inp,#15]"); $code.=<<___; bne .Louter @@ -299,6 +328,9 @@ gcm_gmult_4bit: eor $Zlh,$Zlh,$Zhl,lsl#28 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] eor $Zhl,$Thl,$Zhl,lsr#4 +#ifdef __thumb2__ + it pl +#endif ldrplb $nlo,[$Xi,$cnt] eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 @@ -316,6 +348,9 @@ gcm_gmult_4bit: eor $Zhl,$Thl,$Zhl,lsr#4 eor $Zhl,$Zhl,$Zhh,lsl#28 eor $Zhh,$Thh,$Zhh,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl $nhi,$nlo,#0xf0 andpl $nlo,$nlo,#0x0f eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl index 356b52fc1b..9d34e04f7b 100644 --- a/crypto/sha/asm/sha1-armv4-large.pl +++ b/crypto/sha/asm/sha1-armv4-large.pl @@ -181,7 +181,12 @@ $code=<<___; #include "arm_arch.h" .text +#if defined(__thumb2__) && !defined(__APPLE__) +.syntax unified +.thumb +#else .code 32 +#endif .global sha1_block_data_order .type sha1_block_data_order,%function @@ -189,7 +194,8 @@ $code=<<___; .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr r12,[r3,r12] @ OPENSSL_armcap_P #ifdef __APPLE__ @@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) { &BODY_00_15(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 ___ @@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) { &BODY_20_39(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) { &BODY_40_59(@V); unshift(@V,pop(@V)); } $code.=<<___; +#if defined(__thumb2__) && !defined(__APPLE__) + mov $t3,sp + teq $Xi,$t3 +#else teq $Xi,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr $K,.LK_60_79 @@ -283,7 +304,7 @@ $code.=<<___; .LK_60_79: .word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by " .align 5 @@ -458,6 +479,7 @@ sub Xuplast_80 () &teq ($inp,$len); &sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX + &it ("eq"); &subeq ($inp,$inp,64); # reload last block to avoid SEGV &vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!"); eval(shift(@insns)); @@ -508,12 +530,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov $saved_sp,sp - sub sp,sp,#64 @ alloca + sub $Xfer,sp,#64 adr $K_XX_XX,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic $Xfer,$Xfer,#15 @ align for 128-bit stores ldmia $ctx,{$a,$b,$c,$d,$e} @ load context - mov $Xfer,sp + mov sp,$Xfer @ alloca vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! 
@ handles unaligned veor $zero,$zero,$zero @@ -560,10 +582,13 @@ $code.=<<___; add $b,$b,$t0 add $c,$c,$t1 add $d,$d,$Xfer + it eq moveq sp,$saved_sp add $e,$e,$Ki + it ne ldrne $Ki,[sp] stmia $ctx,{$a,$b,$c,$d,$e} + itt ne addne $Xfer,sp,#3*16 bne .Loop_neon @@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14)); $code.=<<___; #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) && !defined(__APPLE__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -677,7 +709,10 @@ ___ # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( - sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + + # this fix-up provides Thumb encoding in conjunction with INST + $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000); + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl index efee1fb1f3..c65073b7f9 100644 --- a/crypto/sha/asm/sha256-armv4.pl +++ b/crypto/sha/asm/sha256-armv4.pl @@ -175,16 +175,12 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) -# define adrl adr .thumb -# else +# define adrl adr +#else .code 32 -# endif #endif .type K256,%object @@ -218,10 +214,10 @@ K256: .type sha256_block_data_order,%function sha256_block_data_order: .Lsha256_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,sha256_block_data_order + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl index 77d6c5eae9..a83d6772c8 100644 --- a/crypto/sha/asm/sha512-armv4.pl +++ b/crypto/sha/asm/sha512-armv4.pl @@ -212,16 +212,12 @@ $code=<<___; #endif .text -#if __ARM_ARCH__<7 || defined(__APPLE__) -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# ifdef __thumb2__ -# define adrl adr .thumb -# else -.code 32 -# endif +# define adrl adr +#else +.code 32 #endif .type K512,%object @@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .type sha512_block_data_order,%function sha512_block_data_order: .Lsha512_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap -- 2.25.1
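
The bulk of the changes above follow one pattern: in ARM state nearly every instruction can carry a condition code, so `moveq`, `subeq`, `eorne` and friends assemble directly, while in Thumb-2 state conditional execution (other than branches) is only legal inside an IT (If-Then) block. Hence the `it`/`itt` instructions added under `#ifdef __thumb2__`; in an ARM-mode build they are compiled out (the patch's own comments call them a "Thumb2 thing, sanity check in ARM"). Below is a minimal standalone sketch of that pattern, assuming GNU as; it is not taken from the patch — the function name `clamp_nonzero`, its register use and its behaviour are invented for illustration, and the `!defined(__APPLE__)` guard the patch uses alongside `__thumb2__` is dropped for brevity.

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text
.globl	clamp_nonzero
.type	clamp_nonzero,%function
.align	2
@ returns -1 in r0 (and clears r1) if called with r0==0,
@ otherwise returns r0 unchanged
clamp_nonzero:
	cmp	r0,#0			@ sets EQ if the argument is zero
#ifdef __thumb2__
	itt	eq			@ next two instructions execute only if EQ
#endif
	moveq	r0,#-1			@ plain conditional in ARM; needs the IT block in Thumb-2
	moveq	r1,#0
	bx	lr
.size	clamp_nonzero,.-clamp_nonzero

The other recurring change, replacing the `sub rX,pc,#8` idiom with `adr` (or a label difference such as `OPENSSL_armcap_P-.`), follows from the same state split: reading `pc` yields the current instruction address plus 8 only in ARM state, whereas Thumb state yields plus 4, so the fixed-offset trick cannot be shared between the two and an assembler-computed PC-relative form is used instead.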