From 3a97ebb16b0d1eaab83171b9220280dacd99bf04 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Fri, 6 Jun 2014 21:27:18 +0200
Subject: [PATCH] ARM assembly pack: get ARMv7 instruction endianness right.

Pointed out and suggested by: Ard Biesheuvel.

(cherry picked from commit 5dcf70a1c57c2019bfad640fe14fd4a73212860a)
---
 crypto/aes/asm/aes-armv4.pl        |  5 ++-
 crypto/armv4cpuid.S                | 70 +++++++++++++++++++-----------
 crypto/bn/asm/armv4-gf2m.pl        |  3 +-
 crypto/bn/asm/armv4-mont.pl        | 10 ++++-
 crypto/modes/asm/ghash-armv4.pl    |  5 ++-
 crypto/sha/asm/sha1-armv4-large.pl | 20 ++++++---
 crypto/sha/asm/sha256-armv4.pl     | 20 ++++++---
 crypto/sha/asm/sha512-armv4.pl     |  3 +-
 8 files changed, 89 insertions(+), 47 deletions(-)

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl
index a5d97ce053..4f8917089f 100644
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -715,8 +715,8 @@ _armv4_AES_set_encrypt_key:
 .Ldone:	mov	r0,#0
 	ldmia	sp!,{r4-r12,lr}
 .Labrt:
-#if defined(__thumb2__) && __ARM_ARCH__>=7
-	.short	0x4770			@ bx lr in Thumb2 encoding
+#if __ARM_ARCH__>=5
+	ret				@ bx lr
 #else
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
@@ -1203,6 +1203,7 @@ _armv4_AES_decrypt:
 ___
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
 
 open SELF,$0;
 while(<SELF>) {
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
index 4f6ae17232..add11d405e 100644
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@@ -7,42 +7,46 @@
 .global	_armv7_neon_probe
 .type	_armv7_neon_probe,%function
 _armv7_neon_probe:
-	.word	0xf26ee1fe	@ vorr	q15,q15,q15
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0xf0,0x01,0x60,0xf2	@ vorr	q8,q8,q8
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv7_neon_probe,.-_armv7_neon_probe
 
 .global	_armv7_tick
 .type	_armv7_tick,%function
 _armv7_tick:
-	mrrc	p15,1,r0,r1,c14		@ CNTVCT
-	.word	0xe12fff1e	@ bx	lr
+	mrrc	p15,1,r0,r1,c14		@ CNTVCT
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	_armv7_tick,.-_armv7_tick
 
 .global	_armv8_aes_probe
 .type	_armv8_aes_probe,%function
 _armv8_aes_probe:
-	.word	0xf3b00300	@ aese.8	q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_aes_probe,.-_armv8_aes_probe
 
 .global	_armv8_sha1_probe
 .type	_armv8_sha1_probe,%function
 _armv8_sha1_probe:
-	.word	0xf2000c40	@ sha1c.32	q0,q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_sha1_probe,.-_armv8_sha1_probe
 
 .global	_armv8_sha256_probe
 .type	_armv8_sha256_probe,%function
 _armv8_sha256_probe:
-	.word	0xf3000c40	@ sha256h.32	q0,q0,q0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_sha256_probe,.-_armv8_sha256_probe
 
 .global	_armv8_pmull_probe
 .type	_armv8_pmull_probe,%function
 _armv8_pmull_probe:
-	.word	0xf2a00e00	@ vmull.p64	q0,d0,d0
-	.word	0xe12fff1e	@ bx	lr
+	.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
+	.byte	0x1e,0xff,0x2f,0xe1	@ bx	lr
 .size	_armv8_pmull_probe,.-_armv8_pmull_probe
 .align	5
@@ -56,7 +60,7 @@ OPENSSL_atomic_add:
 	cmp	r2,#0
 	bne	.Ladd
 	mov	r0,r3
-	.word	0xe12fff1e	@ bx	lr
+	bx	lr
 #else
 	stmdb	sp!,{r4-r6,lr}
 	ldr	r2,.Lspinlock
@@ -109,9 +113,13 @@ OPENSSL_cleanse:
 	adds	r1,r1,#4
 	bne	.Little
 .Lcleanse_done:
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_cleanse,.-OPENSSL_cleanse
 
 .global	OPENSSL_wipe_cpu
@@ -125,41 +133,53 @@ OPENSSL_wipe_cpu:
 	eor	ip,ip,ip
 	tst	r0,#1
 	beq	.Lwipe_done
-	.word	0xf3000150	@ veor	q0, q0, q0
-	.word	0xf3022152	@ veor	q1, q1, q1
-	.word	0xf3044154	@ veor	q2, q2, q2
-	.word	0xf3066156	@ veor	q3, q3, q3
-	.word	0xf34001f0	@ veor	q8, q8, q8
-	.word	0xf34221f2	@ veor	q9, q9, q9
-	.word	0xf34441f4	@ veor	q10, q10, q10
-	.word	0xf34661f6	@ veor	q11, q11, q11
-	.word	0xf34881f8	@ veor	q12, q12, q12
-	.word	0xf34aa1fa	@ veor	q13, q13, q13
-	.word	0xf34cc1fc	@ veor	q14, q14, q14
-	.word	0xf34ee1fe	@ veor	q15, q15, q15
+	.byte	0x50,0x01,0x00,0xf3	@ veor	q0, q0, q0
+	.byte	0x52,0x21,0x02,0xf3	@ veor	q1, q1, q1
+	.byte	0x54,0x41,0x04,0xf3	@ veor	q2, q2, q2
+	.byte	0x56,0x61,0x06,0xf3	@ veor	q3, q3, q3
+	.byte	0xf0,0x01,0x40,0xf3	@ veor	q8, q8, q8
+	.byte	0xf2,0x21,0x42,0xf3	@ veor	q9, q9, q9
+	.byte	0xf4,0x41,0x44,0xf3	@ veor	q10, q10, q10
+	.byte	0xf6,0x61,0x46,0xf3	@ veor	q11, q11, q11
+	.byte	0xf8,0x81,0x48,0xf3	@ veor	q12, q12, q12
+	.byte	0xfa,0xa1,0x4a,0xf3	@ veor	q13, q13, q13
+	.byte	0xfc,0xc1,0x4c,0xf3	@ veor	q14, q14, q14
+	.byte	0xfe,0xe1,0x4e,0xf3	@ veor	q15, q15, q15
 .Lwipe_done:
 	mov	r0,sp
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
 .global	OPENSSL_instrument_bus
 .type	OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
 	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 
 .global	OPENSSL_instrument_bus2
 .type	OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
 	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
 	tst	lr,#1
 	moveq	pc,lr
 	.word	0xe12fff1e	@ bx	lr
+#endif
 .size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 
 .align	5
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index c66495040c..b781afbf89 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -202,7 +202,7 @@ bn_GF2m_mul_2x2:
 	veor	$r, $r, $t2
 
 	vst1.32	{$r}, [r0]
-	bx	lr
+	ret		@ bx lr
 .align	4
 .Lialu:
 #endif
@@ -273,6 +273,7 @@ foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
index fe81f9b6f6..72bad8e308 100644
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -230,9 +230,14 @@ bn_mul_mont:
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
 	mov	r0,#1
-.Labrt:	tst	lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+	ret				@ bx lr
+#else
+	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	bn_mul_mont,.-bn_mul_mont
 ___
 {
@@ -650,7 +655,7 @@ bn_mul8x_mont_neon:
 	sub	sp,ip,#96
 	vldmia	sp!,{d8-d15}
 	ldmia	sp!,{r4-r11}
-	bx	lr
+	ret				@ bx lr
 .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
 ___
@@ -665,5 +670,6 @@ ___
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
 print $code;
 close STDOUT;
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl
index 0b0dcc8a68..0023bf994b 100644
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -386,7 +386,7 @@ gcm_init_neon:
 	veor	$IN,$IN,$t0		@ twisted H
 	vstmia	r0,{$IN}
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_init_neon,.-gcm_init_neon
 
 .global	gcm_gmult_neon
@@ -470,7 +470,7 @@ $code.=<<___;
 	vst1.64	$Xl#hi,[$Xi,:64]!	@ write out Xi
 	vst1.64	$Xl#lo,[$Xi,:64]
 
-	bx	lr
+	ret					@ bx lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -484,6 +484,7 @@ foreach (split("\n",$code)) {
 	s/\`([^\`]*)\`/eval $1/geo;
 
 	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl
index 43a1b9fd7f..50bd07b331 100644
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -631,7 +631,7 @@ $code.=<<___;
 	vst1.32	{$E\[0]},[$ctx]
 
 	vldmia	sp!,{d8-d15}
-	bx	lr
+	ret					@ bx lr
 .size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
@@ -648,13 +648,18 @@ ___
 sub unsha1 {
 my ($mnemonic,$arg)=@_;
 
-    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-    &&
-    sprintf ".long\t0x%08x\t@ %s %s",
-			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-					  |(($2&7)<<17)|(($2&8)<<4)
-					  |(($3&7)<<1) |(($3&8)<<2),
+    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+				     |(($2&7)<<17)|(($2&8)<<4)
+				     |(($3&7)<<1) |(($3&8)<<2);
+	# since ARMv7 instructions are always encoded little-endian.
+	# correct solution is to use .inst directive, but older
+	# assemblers don't implement it:-(
+	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
+    }
 }
@@ -664,6 +669,7 @@ foreach (split($/,$code)) {
 	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
 
+	s/\bret\b/bx lr/o		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4
 
 	print $_,$/;
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl
index 5e5c54ec18..505ca8f350 100644
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -608,7 +608,7 @@ $code.=<<___;
 
 	vst1.32	{$ABCD,$EFGH},[$ctx]
 
-	bx	lr
+	ret		@ bx lr
 .size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
@@ -626,13 +626,18 @@ ___
 sub unsha256 {
 my ($mnemonic,$arg)=@_;
 
-    $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-    &&
-    sprintf ".long\t0x%08x\t@ %s %s",
-			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-					  |(($2&7)<<17)|(($2&8)<<4)
-					  |(($3&7)<<1) |(($3&8)<<2),
+    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+				     |(($2&7)<<17)|(($2&8)<<4)
+				     |(($3&7)<<1) |(($3&8)<<2);
+	# since ARMv7 instructions are always encoded little-endian.
+	# correct solution is to use .inst directive, but older
+	# assemblers don't implement it:-(
+	sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			$word&0xff,($word>>8)&0xff,
+			($word>>16)&0xff,($word>>24)&0xff,
 			$mnemonic,$arg;
+    }
 }
@@ -642,6 +647,7 @@ foreach (split($/,$code)) {
 	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
 
+	s/\bret\b/bx lr/go		or
 	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
 
 	print $_,"\n";
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index d3065794b3..1d5275b917 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -584,7 +584,7 @@ $code.=<<___;
 	bne	.Loop_neon
 
 	vldmia	sp!,{d8-d15}		@ epilogue
-	bx	lr
+	ret				@ bx lr
 #endif
 ___
 }
@@ -597,5 +597,6 @@ ___
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx lr/gm;
 print $code;
 close STDOUT;			# enforce flush
-- 
2.25.1
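
A note on the technique, appended for reference and not part of the patch itself: ARMv7 instruction encodings are always little-endian, even on CPUs running big-endian data, so an opcode hand-assembled with .word/.long gets byte-swapped by a big-endian assembler, while an explicit .byte sequence does not. The standalone Perl sketch below mirrors the emission logic that unsha1()/unsha256() switch to above; the helper name emit_le_insn and the driver line are illustrative, not taken from the patch.

#!/usr/bin/env perl
# Sketch: emit a fixed 32-bit ARMv7 instruction as four .byte values,
# least-significant byte first, so the bytes land in instruction-memory
# order regardless of the assembler's target data endianness.
use strict;
use warnings;

# Illustrative helper; the patch inlines this logic in unsha1()/unsha256().
sub emit_le_insn {
    my ($word, $comment) = @_;
    return sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ %s",
                    $word         & 0xff, ($word >>  8) & 0xff,
                   ($word >> 16)  & 0xff, ($word >> 24) & 0xff,
                    $comment;
}

# 0xf26ee1fe is "vorr q15,q15,q15", the old NEON probe instruction that
# the patch rewrites; ".word 0xf26ee1fe" would be byte-swapped on a
# big-endian assembler, the line printed here would not.
print emit_le_insn(0xf26ee1fe, "vorr q15,q15,q15"), "\n";
# Output: .byte 0xfe,0xe1,0x6e,0xf2 @ vorr q15,q15,q15

Note also the ordering of the ret/bx-lr substitutions the patch adds: in the whole-$code rewrites (aes-armv4.pl, armv4-mont.pl, sha512-armv4.pl) the existing "bx lr" sites are turned into ".word 0xe12fff1e" before "ret" becomes a genuine "bx lr", and in the per-line loops the "ret" match short-circuits the "bx lr" rewrite via "or". Either way, "ret" is used only on NEON/ARMv7-only paths and so may safely end up as a real "bx lr", while pre-existing return sites remain assemblable with -march=armv4.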