#endif
.text
-#if __ARM_ARCH__<7
-.code 32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
-# if defined(__thumb2__) && !defined(__APPLE__)
.thumb
-# else
+#else
.code 32
-# endif
#endif
.type AES_Te,%object
.type AES_encrypt,%function
.align 5
AES_encrypt:
-#if __ARM_ARCH__<7
+#ifndef __thumb2__
sub r3,pc,#8 @ AES_encrypt
#else
adr r3,AES_encrypt
.align 5
AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
+#ifndef __thumb2__
sub r3,pc,#8 @ AES_set_encrypt_key
#else
adr r3,AES_set_encrypt_key
#endif
teq r0,#0
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
moveq r0,#-1
teq r1,#192
beq .Lok
teq r1,#256
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
itt ne @ Thumb2 thing, sanity check in ARM
#endif
movne r0,#-1
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
subeq r2,$key,#216
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
subeq r2,$key,#256
.type AES_decrypt,%function
.align 5
AES_decrypt:
-#if __ARM_ARCH__<7
+#ifndef __thumb2__
sub r3,pc,#8 @ AES_decrypt
#else
adr r3,AES_decrypt
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
.align 5
.global OPENSSL_atomic_add
OPENSSL_cleanse:
eor ip,ip,ip
cmp r1,#7
+#ifdef __thumb2__
+ itt hs
+#endif
subhs r1,r1,#4
bhs .Lot
cmp r1,#0
.global _armv8_aes_probe
.type _armv8_aes_probe,%function
_armv8_aes_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0xb0,0xff,0x00,0x03 @ aese.8 q0,q0
+#else
.byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
+#endif
bx lr
.size _armv8_aes_probe,.-_armv8_aes_probe
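+@ Each probe encodes the same instruction twice: the A32 form as one
+@ little-endian 32-bit word, the T32 form as two little-endian halfwords
+@ with the most significant halfword first and the 0xf2/0xf3 prefix
+@ replaced by its T32 counterpart 0xef/0xff.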
.global _armv8_sha1_probe
.type _armv8_sha1_probe,%function
_armv8_sha1_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0x00,0xef,0x40,0x0c @ sha1c.32 q0,q0,q0
+#else
.byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
+#endif
bx lr
.size _armv8_sha1_probe,.-_armv8_sha1_probe
.global _armv8_sha256_probe
.type _armv8_sha256_probe,%function
_armv8_sha256_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0x00,0xff,0x40,0x0c @ sha256h.32 q0,q0,q0
+#else
.byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
+#endif
bx lr
.size _armv8_sha256_probe,.-_armv8_sha256_probe
.global _armv8_pmull_probe
.type _armv8_pmull_probe,%function
_armv8_pmull_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+ .byte 0xa0,0xef,0x00,0x0e @ vmull.p64 q0,d0,d0
+#else
.byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
+#endif
bx lr
.size _armv8_pmull_probe,.-_armv8_pmull_probe
#endif
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
___
################
# private interface to mul_1x1_ialu
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
+#ifdef __thumb2__
+ itt ne
+#endif
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
+#ifdef __thumb2__
+ itt ne
+#endif
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
+ stmdb sp!,{r10,lr}
ldr r12,.LOPENSSL_armcap
-.Lpic: ldr r12,[pc,r12]
- tst r12,#1
+ adr r10,.LOPENSSL_armcap
+ ldr r12,[r12,r10]
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV7_NEON
+ itt ne
+	ldrne	r10,[sp],#8		@ restore r10 and drop the saved lr on the NEON path
bne .LNEON
+ stmdb sp!,{r4-r9}
+#else
+ stmdb sp!,{r4-r10,lr}
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
- stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
+ sub r7,sp,#36
+ mov r8,sp
+ and r7,r7,#-32
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
- sub sp,sp,#32 @ allocate tab[8]
+ mov sp,r7 @ allocate tab[8]
+ str r8,[r7,#32]
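+@ tab[8] now occupies a 32-byte aligned block at sp, with the caller's
+@ sp stashed at [sp,#32] and restored by the "destroy tab[8]" load below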
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
+ ldr sp,[sp,#32] @ destroy tab[8]
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
- add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-(.Lpic+8)
+.word OPENSSL_armcap_P-.
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
#if __ARM_MAX_ARCH__>=7
.align 5
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
- adr r0,bn_mul_mont
+ adr r0,.Lbn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
#ifdef __APPLE__
#endif
cmp ip,#2
mov $num,ip @ load num
+#ifdef __thumb2__
+ ittt lt
+#endif
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
+ mov $tj,sp
str $nhi,[$num,#4] @ tp[num]=
\f
.Louter:
- sub $tj,$num,sp @ "original" $num-1 value
+ sub $tj,$num,$tj @ "original" $num-1 value
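+@ ($tj holds a copy of sp, since Thumb-2 does not take sp as the
+@ second operand of sub)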
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
+#ifdef __thumb2__
+ itt ne
+#endif
+ movne $tj,sp
bne .Louter
\f
ldr $rp,[$_rp] @ pull rp
+ mov $aj,sp
add $num,$num,#4 @ $num to point at &tp[num]
- sub $aj,$num,sp @ "original" num value
+ sub $aj,$num,$aj @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
cmp $tp,$num
bne .Lcopy
- add sp,$num,#4 @ skip over tp[num+1]
+ mov sp,$num
+ add sp,sp,#4 @ skip over tp[num+1]
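+@ (Thumb-2 has no "add sp,Rn,#imm" encoding, hence the mov/add pair)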
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
+	mov	ip,sp			@ save sp, restored at the end
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
+ mov r11,sp
veor q0,q0,q0
- sub r11,$bptr,sp @ this is num*4
+ sub r11,$bptr,r11 @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
+ it cc
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ it cc
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
+ it cc
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
+ itt cc
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
+ it cc
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
- sub sp,ip,#96
+ mov sp,ip
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
+#ifdef __thumb2__
+ it cs
+#endif
movcs $ff,#-1 @ $ff = carry ? -1 : 0
b .Lreduce_by_sub
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
+#ifdef __thumb2__
+ it cs
+#endif
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
ldr lr,[sp],#4 @ pop lr
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
+#ifdef __thumb2__
+ it cs
+#endif
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
subs $a0,$a0,$ff @ subtract synthesized modulus, see
adcs $a6,$a6,$t2
mov $ff,#0
adcs $a7,$a7,$t3
+#ifdef __thumb2__
+ it cs
+#endif
movcs $ff,#-1 @ $ff = carry ? -1 : 0, "broadcast" carry
ldr lr,[sp],#4 @ pop lr
cmp $index,#0
mov $mask,#0
+#ifdef __thumb2__
+ itt ne
+#endif
subne $index,$index,#1
movne $mask,#-1
add $inp,$inp,$index,lsl#2
cmp $index,#0
mov $mask,#0
+#ifdef __thumb2__
+ itt ne
+#endif
subne $index,$index,#1
movne $mask,#-1
add $inp,$inp,$index
adcs $a6,$a6,$a6
mov $ff,#0
adcs $a7,$a7,$a7
+#ifdef __thumb2__
+ it cs
+#endif
movcs $ff,#-1 @ $ff = carry ? -1 : 0
subs $a0,$a0,$ff @ subtract synthesized modulus
stmia r3!,{r4-r11}
ldmia $b_ptr,{r4-r11}
cmp r12,#0
+#ifdef __thumb2__
+ it ne
+#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*18+8] @ !in2infty
stmia r3!,{r4-r11}
ldmia $a_ptr,{r4-r11}
cmp r12,#0
+#ifdef __thumb2__
+ it ne
+#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*18+4] @ !in1infty
stmia r3!,{r4-r11}
ldmia $a_ptr,{r4-r11}
cmp r12,#0
+#ifdef __thumb2__
+ it ne
+#endif
movne r12,#-1
stmia r3,{r4-r11}
str r12,[sp,#32*15+4] @ !in1infty
orr r12,r12,r11
stmia r3!,{r4-r11}
cmp r12,#0
+#ifdef __thumb2__
+ it ne
+#endif
movne r12,#-1
str r12,[sp,#32*15+8] @ !in2infty
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
#ifdef __APPLE__
#define ldrplb ldrbpl
.type rem_4bit_get,%function
rem_4bit_get:
- sub $rem_4bit,pc,#8
- sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+#if defined(__thumb2__)
+ adr $rem_4bit,rem_4bit
+#else
+ sub $rem_4bit,pc,#8+32 @ &rem_4bit
+#endif
b .Lrem_4bit_got
nop
+ nop
.size rem_4bit_get,.-rem_4bit_get
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
+.align 4
gcm_ghash_4bit:
- sub r12,pc,#8
+#if defined(__thumb2__)
+ adr r12,rem_4bit
+#else
+ sub r12,pc,#8+48 @ &rem_4bit
+#endif
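+@ the ARM-mode "sub r12,pc,#8+48" above assumes rem_4bit sits exactly
+@ 48 bytes before gcm_ghash_4bit; the extra nop in rem_4bit_get and the
+@ .align above preserve that distance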
add $len,$inp,$len @ $len to point at the end
stmdb sp!,{r3-r11,lr} @ save $len/end too
- sub r12,r12,#48 @ &rem_4bit
ldmia r12,{r4-r11} @ copy rem_4bit ...
stmdb sp!,{r4-r11} @ ... to stack
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
+#ifdef __thumb2__
+ it pl
+#endif
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
add $inp,$inp,#16
mov $nhi,$Zll
___
- &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+ &Zsmash("cmp\t$inp,$len","\n".
+ "#ifdef __thumb2__\n".
+ " it ne\n".
+ "#endif\n".
+ " ldrneb $nlo,[$inp,#15]");
$code.=<<___;
bne .Louter
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+ it pl
+#endif
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+ itt pl
+#endif
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
#include "arm_arch.h"
.text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax unified
+.thumb
+#else
.code 32
+#endif
.global sha1_block_data_order
.type sha1_block_data_order,%function
.align 5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
- sub r3,pc,#8 @ sha1_block_data_order
+.Lsha1_block:
+ adr r3,.Lsha1_block
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap_P
#ifdef __APPLE__
&BODY_00_15(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
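+@ (Thumb-2 does not allow sp as a teq operand, hence the copy via $t3)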
bne .L_00_15 @ [((11+4)*5+2)*3]
sub sp,sp,#25*4
___
&BODY_20_39(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp @ preserve carry
+#endif
bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
&BODY_40_59(@V); unshift(@V,pop(@V));
}
$code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+ mov $t3,sp
+ teq $Xi,$t3
+#else
teq $Xi,sp
+#endif
bne .L_40_59 @ [+((12+5)*5+2)*4]
ldr $K,.LK_60_79
.LK_60_79: .word 0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha1_block_data_order
+.word OPENSSL_armcap_P-.Lsha1_block
#endif
.asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
&teq ($inp,$len);
&sub ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+ &it ("eq");
&subeq ($inp,$inp,64); # reload last block to avoid SEGV
&vld1_8 ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
eval(shift(@insns));
@ dmb @ errata #451034 on early Cortex A8
@ vstmdb sp!,{d8-d15} @ ABI specification says so
mov $saved_sp,sp
- sub sp,sp,#64 @ alloca
+ sub $Xfer,sp,#64
adr $K_XX_XX,.LK_00_19
- bic sp,sp,#15 @ align for 128-bit stores
+ bic $Xfer,$Xfer,#15 @ align for 128-bit stores
ldmia $ctx,{$a,$b,$c,$d,$e} @ load context
- mov $Xfer,sp
+ mov sp,$Xfer @ alloca
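+@ sp is assigned last with a plain mov, since "bic sp,sp,#15" is not
+@ acceptable in Thumb-2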
vld1.8 {@X[-4&7]-@X[-3&7]},[$inp]! @ handles unaligned
veor $zero,$zero,$zero
add $b,$b,$t0
add $c,$c,$t1
add $d,$d,$Xfer
+ it eq
moveq sp,$saved_sp
add $e,$e,$Ki
+ it ne
ldrne $Ki,[sp]
stmia $ctx,{$a,$b,$c,$d,$e}
+ itt ne
addne $Xfer,sp,#3*16
bne .Loop_neon
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__) && !defined(__APPLE__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
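+@ INST emits raw opcode bytes: A32 as one little-endian word, T32 as
+@ two little-endian halfwords with the most significant halfword first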
+
.type sha1_block_data_order_armv8,%function
.align 5
sha1_block_data_order_armv8:
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+
+ # this fix-up provides Thumb encoding in conjunction with INST
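+	    # (for 0xf2-prefixed words bit 28 is cleared here and put back
+	    # by the A32 INST via d|0x10; the T32 INST ORs 0xf into the top
+	    # byte instead, giving the 0xef/0xff T32 prefix, and emits the
+	    # halfwords in T32 order)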
+ $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
#endif
.text
-#if __ARM_ARCH__<7
-.code 32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
-# if defined(__thumb2__) && !defined(__APPLE__)
-# define adrl adr
.thumb
-# else
+# define adrl adr
+#else
.code 32
-# endif
#endif
.type K256,%object
.type sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
-#if __ARM_ARCH__<7
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha256_block_data_order
#else
- adr r3,sha256_block_data_order
+ adr r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap
#endif
.text
-#if __ARM_ARCH__<7 || defined(__APPLE__)
-.code 32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
-# ifdef __thumb2__
-# define adrl adr
.thumb
-# else
-.code 32
-# endif
+# define adrl adr
+#else
+.code 32
#endif
.type K512,%object
.type sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
sub r3,pc,#8 @ sha512_block_data_order
#else
- adr r3,sha512_block_data_order
+ adr r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
ldr r12,.LOPENSSL_armcap