From 3a97ebb16b0d1eaab83171b9220280dacd99bf04 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Fri, 6 Jun 2014 21:27:18 +0200
Subject: [PATCH] ARM assembly pack: get ARMv7 instruction endianness right.

Pointed out and suggested by: Ard Biesheuvel.

(cherry picked from commit 5dcf70a1c57c2019bfad640fe14fd4a73212860a)
---
 crypto/aes/asm/aes-armv4.pl        |  5 ++-
 crypto/armv4cpuid.S                | 70 +++++++++++++++++++-----------
 crypto/bn/asm/armv4-gf2m.pl        |  3 +-
 crypto/bn/asm/armv4-mont.pl        | 10 ++++-
 crypto/modes/asm/ghash-armv4.pl    |  5 ++-
 crypto/sha/asm/sha1-armv4-large.pl | 20 ++++++---
 crypto/sha/asm/sha256-armv4.pl     | 20 ++++++---
 crypto/sha/asm/sha512-armv4.pl     |  3 +-
 8 files changed, 89 insertions(+), 47 deletions(-)

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index a5d97ce053..4f8917089f 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -715,8 +715,8 @@ _armv4_AES_set_encrypt_key:
 .Ldone:	mov	r0,#0
 	ldmia	sp!,{r4-r12,lr}
 .Labrt:
-#if defined(__thumb2__) && __ARM_ARCH__>=7
-	.short	0x4770			@ bx lr in Thumb2 encoding
+#if __ARM_ARCH__>=5
+	ret				@ bx lr
 #else
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
@@ -1203,6 +1203,7 @@ _armv4_AES_decrypt:
 ___
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
 
 open SELF,$0;
 while(<SELF>) {
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 4f6ae17232..add11d405e 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -7,42 +7,46 @@
 .global	_armv7_neon_probe
 .type	_armv7_neon_probe,%function
 _armv7_neon_probe:
-	.word	0xf26ee1fe	@ vorr	q15,q15,q15
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0xf0,0x01,0x60,0xf2	@ vorr	q8,q8,q8
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv7_neon_probe,.-_armv7_neon_probe
 
 .global	_armv7_tick
 .type	_armv7_tick,%function
 _armv7_tick:
-	mrrc	p15,1,r0,r1,c14		@ CNTVCT
-	.word	0xe12fff1e	@ bx	lr
+	mrrc	p15,1,r0,r1,c14		@ CNTVCT
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	_armv7_tick,.-_armv7_tick
 
 .global	_armv8_aes_probe
 .type	_armv8_aes_probe,%function
 _armv8_aes_probe:
-	.word	0xf3b00300	@ aese.8	q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_aes_probe,.-_armv8_aes_probe
 
 .global	_armv8_sha1_probe
 .type	_armv8_sha1_probe,%function
 _armv8_sha1_probe:
-	.word	0xf2000c40	@ sha1c.32	q0,q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_sha1_probe,.-_armv8_sha1_probe
 
 .global	_armv8_sha256_probe
 .type	_armv8_sha256_probe,%function
 _armv8_sha256_probe:
-	.word	0xf3000c40	@ sha256h.32	q0,q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_sha256_probe,.-_armv8_sha256_probe
 
 .global	_armv8_pmull_probe
 .type	_armv8_pmull_probe,%function
 _armv8_pmull_probe:
-	.word	0xf2a00e00	@ vmull.p64	q0,d0,d0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_pmull_probe,.-_armv8_pmull_probe
 .align	5
@@ -56,7 +60,7 @@ OPENSSL_atomic_add:
 	cmp	r2,#0
 	bne	.Ladd
 	mov	r0,r3
-	.word	0xe12fff1e	@ bx	lr
+	bx	lr
 #else
 	stmdb	sp!,{r4-r6,lr}
 	ldr	r2,.Lspinlock
@@ -109,9 +113,13 @@ OPENSSL_cleanse:
 	adds	r1,r1,#4
 	bne	.Little
 .Lcleanse_done:
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 
 .global	OPENSSL_wipe_cpu
@@ -125,41 +133,53 @@ OPENSSL_wipe_cpu:
 	eor	ip,ip,ip
 	tst	r0,#1
 	beq	.Lwipe_done
-	.word	0xf3000150	@ veor	q0, q0, q0
-	.word	0xf3022152	@ veor	q1, q1, q1
-	.word	0xf3044154	@ veor	q2, q2, q2
-	.word	0xf3066156	@ veor	q3, q3, q3
-	.word	0xf34001f0	@ veor	q8, q8, q8
-	.word	0xf34221f2	@ veor	q9, q9, q9
-	.word	0xf34441f4	@ veor	q10, q10, q10
-	.word	0xf34661f6	@ veor	q11, q11, q11
-	.word	0xf34881f8	@ veor	q12, q12, q12
-	.word	0xf34aa1fa	@ veor	q13, q13, q13
-	.word	0xf34cc1fc	@ veor	q14, q14, q14
-	.word	0xf34ee1fe	@ veor	q15, q15, q15
+	.byte	0x50,0x01,0x00,0xf3	@ veor	q0, q0, q0
+	.byte	0x52,0x21,0x02,0xf3	@ veor	q1, q1, q1
+	.byte	0x54,0x41,0x04,0xf3	@ veor	q2, q2, q2
+	.byte	0x56,0x61,0x06,0xf3	@ veor	q3, q3, q3
+	.byte	0xf0,0x01,0x40,0xf3	@ veor	q8, q8, q8
+	.byte	0xf2,0x21,0x42,0xf3	@ veor	q9, q9, q9
+	.byte	0xf4,0x41,0x44,0xf3	@ veor	q10, q10, q10
+	.byte	0xf6,0x61,0x46,0xf3	@ veor	q11, q11, q11
+	.byte	0xf8,0x81,0x48,0xf3	@ veor	q12, q12, q12
+	.byte	0xfa,0xa1,0x4a,0xf3	@ veor	q13, q13, q13
+	.byte	0xfc,0xc1,0x4c,0xf3	@ veor	q14, q14, q14
+	.byte	0xfe,0xe1,0x4e,0xf3	@ veor	q15, q15, q15
 .Lwipe_done:
 	mov	r0,sp
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
 .global	OPENSSL_instrument_bus
 .type	OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
 	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 
 .global	OPENSSL_instrument_bus2
 .type	OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
 	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 
 .align	5
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c66495040c..b781afbf89 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -202,7 +202,7 @@ bn_GF2m_mul_2x2:
 	veor	$r, $r, $t2
 
 	vst1.32	{$r}, [r0]
-	bx	lr
+	ret		@ bx lr
 .align	4
 .Lialu:
 #endif
@@ -273,6 +273,7 @@ foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index fe81f9b6f6..72bad8e308 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -230,9 +230,14 @@ bn_mul_mont:
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
-.Labrt:	tst	lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+	ret				@ bx lr
+#else
+	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	bn_mul_mont,.-bn_mul_mont
 ___
 {
@@ -650,7 +655,7 @@ bn_mul8x_mont_neon:
 	sub	sp,ip,#96
 	vldmia	sp!,{d8-d15}
 	ldmia	sp!,{r4-r11}
-	bx	lr
+	ret				@ bx lr
 .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
 ___
@@ -665,5 +670,6 @@ ___
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
 print $code;
 close STDOUT;
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 0b0dcc8a68..0023bf994b 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -386,7 +386,7 @@ gcm_init_neon:
 	veor	$IN,$IN,$t0		@ twisted H
 	vstmia	r0,{$IN}
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_init_neon,.-gcm_init_neon
 
 .global	gcm_gmult_neon
@@ -470,7 +470,7 @@ $code.=<<___;
 	vst1.64	$Xl#hi,[$Xi,:64]!	@ write out Xi
 	vst1.64	$Xl#lo,[$Xi,:64]
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -484,6 +484,7 @@ foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 43a1b9fd7f..50bd07b331 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -631,7 +631,7 @@ $code.=<<___;
 	vst1.32	{$E\[0]},[$ctx]
 
 	vldmia	sp!,{d8-d15}
-	bx	lr
+	ret					@ bx lr
 .size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
@@ -648,13 +648,18 @@ ___
 sub unsha1 {
 my ($mnemonic,$arg)=@_;
 
-    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-    &&
-    sprintf ".long\t0x%08x\t@ %s %s",
-			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-					  |(($2&7)<<17)|(($2&8)<<4)
-					  |(($3&7)<<1) |(($3&8)<<2),
+    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+				     |(($2&7)<<17)|(($2&8)<<4)
+				     |(($3&7)<<1) |(($3&8)<<2);
+	# since ARMv7 instructions are always encoded little-endian.
+	# correct solution is to use .inst directive, but older
+	# assemblers don't implement it:-(
+	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
+    }
 }
@@ -664,6 +669,7 @@ foreach (split($/,$code)) {
 	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
 
+	s/\bret\b/bx lr/o		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4
 
 	print $_,$/;
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 5e5c54ec18..505ca8f350 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -608,7 +608,7 @@ $code.=<<___;
 
 	vst1.32	{$ABCD,$EFGH},[$ctx]
 
-	bx	lr
+	ret		@ bx lr
 .size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
@@ -626,13 +626,18 @@ ___
 sub unsha256 {
 my ($mnemonic,$arg)=@_;
 
-    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-    &&
-    sprintf ".long\t0x%08x\t@ %s %s",
-			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-					  |(($2&7)<<17)|(($2&8)<<4)
-					  |(($3&7)<<1) |(($3&8)<<2),
+    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+				     |(($2&7)<<17)|(($2&8)<<4)
+				     |(($3&7)<<1) |(($3&8)<<2);
+	# since ARMv7 instructions are always encoded little-endian.
+	# correct solution is to use .inst directive, but older
+	# assemblers don't implement it:-(
+	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
+    }
 }
@@ -642,6 +647,7 @@ foreach (split($/,$code)) {
 	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
 
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index d3065794b3..1d5275b917 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -584,7 +584,7 @@ $code.=<<___;
 	bne	.Loop_neon
 
 	vldmia	sp!,{d8-d15}		@ epilogue
-	bx	lr
+	ret				@ bx lr
 #endif
 ___
 }
@@ -597,5 +597,6 @@ ___
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
 print $code;
 close STDOUT;			# enforce flush
-- 
2.25.1
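
A note on the technique, appended for reference and not part of the patch itself: ARMv7 instruction encodings are always little-endian, even on CPUs running big-endian data, so an opcode hand-assembled with .word/.long gets byte-swapped by a big-endian assembler, while an explicit .byte sequence does not. The standalone Perl sketch below mirrors the emission logic that unsha1()/unsha256() switch to above; the helper name emit_le_insn and the driver line are illustrative, not taken from the patch.

#!/usr/bin/env perl
# Sketch: emit a fixed 32-bit ARMv7 instruction as four .byte values,
# least-significant byte first, so the bytes land in instruction-memory
# order regardless of the assembler's target data endianness.
use strict;
use warnings;

# Illustrative helper; the patch inlines this logic in unsha1()/unsha256().
sub emit_le_insn {
    my ($word, $comment) = @_;
    return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ %s",
                    $word         & 0xff, ($word >>  8) & 0xff,
                   ($word >> 16)  & 0xff, ($word >> 24) & 0xff,
                    $comment;
}

# 0xf26ee1fe is "vorr q15,q15,q15", the old NEON probe instruction that
# the patch rewrites; ".word 0xf26ee1fe" would be byte-swapped on a
# big-endian assembler, the line printed here would not.
print emit_le_insn(0xf26ee1fe, "vorr q15,q15,q15"), "\n";
# Output: .byte 0xfe,0xe1,0x6e,0xf2 @ vorr q15,q15,q15

Note also the ordering of the ret/bx-lr substitutions the patch adds: in the whole-$code rewrites (aes-armv4.pl, armv4-mont.pl, sha512-armv4.pl) the existing "bx lr" sites are turned into ".word 0xe12fff1e" before "ret" becomes a genuine "bx lr", and in the per-line loops the "ret" match short-circuits the "bx lr" rewrite via "or". Either way, "ret" is used only on NEON/ARMv7-only paths and so may safely end up as a real "bx lr", while pre-existing return sites remain assemblable with -march=armv4.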