From 87873f4328274fc64b4089de6deabf52e5b2d481 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Sun, 17 Jul 2011 17:40:29 +0000
Subject: [PATCH] ARM assembler pack: add platform run-time detection.

---
 Configure                      |   2 +-
 TABLE                          |   4 +-
 config                         |   1 +
 crypto/arm_arch.h              |   9 +-
 crypto/armcap.c                |  77 +++++++++++++++++
 crypto/armv4cpuid.S            | 154 +++++++++++++++++++++++++++++++++
 crypto/bn/asm/armv4-gf2m.pl    |   6 +-
 crypto/modes/gcm128.c          |   7 +-
 crypto/sha/asm/sha512-armv4.pl |   6 +-
 9 files changed, 252 insertions(+), 14 deletions(-)
 create mode 100644 crypto/armcap.c
 create mode 100644 crypto/armv4cpuid.S

diff --git a/Configure b/Configure
index 46d5cb259b..8ae48e75f3 100755
--- a/Configure
+++ b/Configure
@@ -135,7 +135,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
-my $armv4_asm=":bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
+my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o:::::::";
diff --git a/TABLE b/TABLE
index 2e405ec230..b7c764a9b9 100644
--- a/TABLE
+++ b/TABLE
@@ -1032,7 +1032,7 @@ $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -ldl
 $bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
-$cpuid_obj    = 
+$cpuid_obj    = armcap.o armv4cpuid.o
 $bn_obj       = bn_asm.o armv4-mont.o armv4-gf2m.o
 $des_obj      = 
 $aes_obj      = aes_cbc.o aes-armv4.o
@@ -3688,7 +3688,7 @@ $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $lflags       = -ldl
 $bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
-$cpuid_obj    = 
+$cpuid_obj    = armcap.o armv4cpuid.o
 $bn_obj       = bn_asm.o armv4-mont.o armv4-gf2m.o
 $des_obj      = 
 $aes_obj      = aes_cbc.o aes-armv4.o
diff --git a/config b/config
index e82868721e..32bec89db2 100755
--- a/config
+++ b/config
@@ -630,6 +630,7 @@ case "$GUESSOS" in
 	options="$options -DB_ENDIAN -mschedule=$CPUSCHEDULE -march=$CPUARCH"
 	OUT="linux-generic32" ;;
   armv[1-3]*-*-linux2) OUT="linux-generic32" ;;
+  armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;;
   arm*-*-linux2) OUT="linux-armv4" ;;
   sh*b-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
   sh*-*-linux2)  OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 82401add19..15027ed3de 100644
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@@ -18,7 +18,7 @@
    */
 #  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
 	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
-	defined(__ARM_ARCH_7EM)
+	defined(__ARM_ARCH_7EM__)
 #   define __ARM_ARCH__ 7
 #  elif	defined(__ARM_ARCH_6__)	|| defined(__ARM_ARCH_6J__)	|| \
 	defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__)	|| \
@@ -39,5 +39,12 @@
 #include <openssl/fipssyms.h>
 #endif
 
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+                                     
+#define ARMV7_NEON      (1<<0)
+#define ARMV7_TICK      (1<<1)
+#endif
+
 #endif
 #endif
diff --git a/crypto/armcap.c b/crypto/armcap.c
new file mode 100644
index 0000000000..74c2c57295
--- /dev/null
+++ b/crypto/armcap.c
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <crypto.h>
+
+#include "arm_arch.h"
+
+unsigned int OPENSSL_armcap_P;
+
+static sigset_t all_masked;
+
+static sigjmp_buf ill_jmp;
+static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
+
+/*
+ * Following subroutines could have been inlined, but it's not all
+ * ARM compilers support inline assembler...
+ */
+void _armv7_neon_probe(void);
+unsigned int _armv7_tick(void);
+
+unsigned int OPENSSL_rdtsc(void)
+	{
+	if (OPENSSL_armcap_P|ARMV7_TICK)
+		return _armv7_tick();
+	else
+		return 0;
+	}
+
+void OPENSSL_cpuid_setup(void)
+	{
+	char *e;
+	struct sigaction	ill_oact,ill_act;
+	sigset_t		oset;
+	static int trigger=0;
+
+	if (trigger) return;
+	trigger=1;
+ 
+	if ((e=getenv("OPENSSL_armcap")))
+		{
+		OPENSSL_armcap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	sigfillset(&all_masked);
+	sigdelset(&all_masked,SIGILL);
+	sigdelset(&all_masked,SIGTRAP);
+	sigdelset(&all_masked,SIGFPE);
+	sigdelset(&all_masked,SIGBUS);
+	sigdelset(&all_masked,SIGSEGV);
+
+	OPENSSL_armcap_P = 0;
+
+	memset(&ill_act,0,sizeof(ill_act));
+	ill_act.sa_handler = ill_handler;
+	ill_act.sa_mask    = all_masked;
+
+	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
+	sigaction(SIGILL,&ill_act,&ill_oact);
+
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_neon_probe();
+		OPENSSL_armcap_P |= ARMV7_NEON;
+		}
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_tick();
+		OPENSSL_armcap_P |= ARMV7_TICK;
+		}
+
+	sigaction (SIGILL,&ill_oact,NULL);
+	sigprocmask(SIG_SETMASK,&oset,NULL);
+	}
diff --git a/crypto/armv4cpuid.S b/crypto/armv4cpuid.S
new file mode 100644
index 0000000000..c9102ca2a5
--- /dev/null
+++ b/crypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.align	5
+.global	_armv7_neon_probe
+.type	_armv7_neon_probe,%function
+_armv7_neon_probe:
+	.word	0xf26ee1fe	@ vorr	q15,q15,q15
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_neon_probe,.-_armv7_neon_probe
+
+.global	_armv7_tick
+.type	_armv7_tick,%function
+_armv7_tick:
+	mrc	p15,0,r0,c9,c13,0
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_tick,.-_armv7_tick
+
+.global	OPENSSL_atomic_add
+.type	OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd:	ldrex	r2,[r0]
+	add	r3,r2,r1
+	strex	r2,r3,[r0]
+	cmp	r2,#0
+	bne	.Ladd
+	mov	r0,r3
+	.word	0xe12fff1e	@ bx	lr
+#else
+	stmdb	sp!,{r4-r6,lr}
+	ldr	r2,.Lspinlock
+	adr	r3,.Lspinlock
+	mov	r4,r0
+	mov	r5,r1
+	add	r6,r3,r2	@ &spinlock
+	b	.+8
+.Lspin:	bl	sched_yield
+	mov	r0,#-1
+	swp	r0,r0,[r6]
+	cmp	r0,#0
+	bne	.Lspin
+
+	ldr	r2,[r4]
+	add	r2,r5
+	str	r2,[r4]
+	str	r0,[r6]		@ release spinlock
+	ldmia	sp!,{r4-r6,lr}
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+#endif
+.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.global	OPENSSL_cleanse
+.type	OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+	eor	ip,ip,ip
+	cmp	r1,#7
+	subhs	r1,#4
+	bhs	.Lot
+	cmp	r1,#0
+	beq	.Lcleanse_done
+.Little:
+	strb	ip,[r0],#1
+	subs	r1,#1
+	bhi	.Little
+	b	.Lcleanse_done
+
+.Lot:	tst	r0,#3
+	beq	.Laligned
+	strb	ip,[r0],#1
+	sub	r1,#1
+	b	.Lot
+.Laligned:
+	str	ip,[r0],#4
+	subs	r1,#4
+	bhs	.Laligned
+	adds	r1,#4
+	bne	.Little
+.Lcleanse_done:
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global	OPENSSL_wipe_cpu
+.type	OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+	ldr	r0,.LOPENSSL_armcap
+	adr	r1,.LOPENSSL_armcap
+	ldr	r0,[r1,r0]
+	eor	r2,r2,r2
+	eor	r3,r3,r3
+	eor	ip,ip,ip
+	tst	r0,#1
+	beq	.Lwipe_done
+	.word	0xf3000150	@ veor    q0, q0, q0
+	.word	0xf3022152	@ veor    q1, q1, q1
+	.word	0xf3044154	@ veor    q2, q2, q2
+	.word	0xf3066156	@ veor    q3, q3, q3
+	.word	0xf34001f0	@ veor    q8, q8, q8
+	.word	0xf34221f2	@ veor    q9, q9, q9
+	.word	0xf34441f4	@ veor    q10, q10, q10
+	.word	0xf34661f6	@ veor    q11, q11, q11
+	.word	0xf34881f8	@ veor    q12, q12, q12
+	.word	0xf34aa1fa	@ veor    q13, q13, q13
+	.word	0xf34cc1fc	@ veor    q14, q14, q14
+	.word	0xf34ee1fe	@ veor    q15, q15, q15
+.Lwipe_done:
+	mov	r0,sp
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global	OPENSSL_instrument_bus
+.type	OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global	OPENSSL_instrument_bus2
+.type	OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align	5
+#else
+.Lspinlock:
+.word	atomic_add_spinlock-.Lspinlock
+.align	5
+
+.data
+.align	2
+atomic_add_spinlock:
+.word	0
+#endif
+
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl
index 4fe9db9894..9928dae872 100644
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -264,12 +264,12 @@ $code.=<<___;
 #if __ARM_ARCH__>=7
 .align	5
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap-(.Lpic+8)
+.word	OPENSSL_armcap_P-(.Lpic+8)
 #endif
-.asciz	"GF2m Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	5
 
-.comm	OPENSSL_armcap,4,4
+.comm	OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/crypto/modes/gcm128.c b/crypto/modes/gcm128.c
index 8b9070a45b..2e42e71804 100644
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@@ -668,8 +668,6 @@ void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
 #  if __ARM_ARCH__>=7
 #   define GHASH_ASM_ARM
 #   define GCM_FUNCREF_4BIT
-extern unsigned int OPENSSL_armcap;
-
 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
@@ -715,7 +713,8 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 #elif	TABLE_BITS==4
 # if	defined(GHASH_ASM_X86_OR_64)
 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
-	if (OPENSSL_ia32cap_P[1]&(1<<1)) {	/* check PCLMULQDQ bit */
+	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
+	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
 		gcm_init_clmul(ctx->Htable,ctx->H.u);
 		ctx->gmult = gcm_gmult_clmul;
 		ctx->ghash = gcm_ghash_clmul;
@@ -736,7 +735,7 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 	ctx->ghash = gcm_ghash_4bit;
 #  endif
 # elif	defined(GHASH_ASM_ARM)
-	if (OPENSSL_armcap & 1) {
+	if (OPENSSL_armcap_P & ARMV7_NEON) {
 		ctx->gmult = gcm_gmult_neon;
 		ctx->ghash = gcm_ghash_neon;
 	} else {
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl
index 8ba56e3d16..7faf37b147 100644
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -221,7 +221,7 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap-sha512_block_data_order
+.word	OPENSSL_armcap_P-sha512_block_data_order
 .skip	32-4
 
 .global	sha512_block_data_order
@@ -231,7 +231,7 @@ sha512_block_data_order:
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
-	ldr	r12,[r3,r12]		@ OPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
 	tst	r12,#1
 	bne	.LNEON
 #endif
@@ -573,7 +573,7 @@ $code.=<<___;
 .size	sha512_block_data_order,.-sha512_block_data_order
 .asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
-.comm	OPENSSL_armcap,4,4
+.comm	OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
-- 
2.25.1