Adapt ARM assembly pack for iOS.

author     Andy Polyakov <appro@openssl.org>
           Mon, 11 May 2015 09:43:55 +0000 (11:43 +0200)
committer  Andy Polyakov <appro@openssl.org>
           Wed, 13 May 2015 14:46:58 +0000 (16:46 +0200)

diff --git a/crypto/aes/asm/aes-armv4.pl b/crypto/aes/asm/aes-armv4.pl

index 55b6e04b676d65cc6eb992721a2d1e30230b68e5..ed5125827b6a4267ee428322adedb82cd1b2a04b 100644 (file)
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@@ -32,8 +32,20 @@
  # Profiler-assisted and platform-specific optimization resulted in 16%
  # improvement on Cortex A8 core and ~21.5 cycles per byte.
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $s0="r0";
  $s1="r1";
@@ -171,7 +183,12 @@ AES_encrypt:
         stmdb   sp!,{r1,r4-r12,lr}
         mov     $rounds,r0              @ inp
         mov     $key,r2
+#ifdef __APPLE__
+       mov     $tbl,#AES_encrypt-AES_Te
+       sub     $tbl,r3,$tbl                    @ Te
+#else
         sub     $tbl,r3,#AES_encrypt-AES_Te     @ Te
+#endif
  #if __ARM_ARCH__<7
         ldrb    $s0,[$rounds,#3]        @ load input data in endian-neutral
         ldrb    $t1,[$rounds,#2]        @ manner...
@@ -425,7 +442,12 @@ AES_set_encrypt_key:
         bne     .Labrt
  
  .Lok:  stmdb   sp!,{r4-r12,lr}
+#ifdef __APPLE__
+       mov     $tbl,#AES_set_encrypt_key-AES_Te-1024
+       sub     $tbl,r3,$tbl                                    @ Te4
+#else
         sub     $tbl,r3,#AES_set_encrypt_key-AES_Te-1024        @ Te4
+#endif
  
         mov     $rounds,r0              @ inp
         mov     lr,r1                   @ bits
@@ -886,7 +908,12 @@ AES_decrypt:
         stmdb   sp!,{r1,r4-r12,lr}
         mov     $rounds,r0              @ inp
         mov     $key,r2
+#ifdef __APPLE__
+       mov     $tbl,#AES_decrypt-AES_Td
+       sub     $tbl,r3,$tbl                            @ Td
+#else
         sub     $tbl,r3,#AES_decrypt-AES_Td             @ Td
+#endif
  #if __ARM_ARCH__<7
         ldrb    $s0,[$rounds,#3]        @ load input data in endian-neutral
         ldrb    $t1,[$rounds,#2]        @ manner...
diff --git a/crypto/bn/asm/armv4-gf2m.pl b/crypto/bn/asm/armv4-gf2m.pl

index c52e0b75b5b6231c00847ea5451d0965d527feff..737659f0db2631817f83298222add449860de759 100644 (file)
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@@ -21,8 +21,20 @@
  # runs in even less cycles, ~30, improvement is measurable only on
  # longer keys. One has to optimize code elsewhere to get NEON glow...
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
  sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
@@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
  #if __ARM_ARCH__>=7
         ldr     r12,.LOPENSSL_armcap
  .Lpic: ldr     r12,[pc,r12]
+#ifdef __APPLE__
+       ldr     r12,[r12]
+#endif
         tst     r12,#1
         beq     .Lialu
  
         veor    $A1,$A1
+#ifdef __APPLE__
+       vmov    $B1,r3,r3               @ two copies of b1
+#else
         vmov.32 $B1,r3,r3               @ two copies of b1
+#endif
         vmov.32 ${A1}[0],r1             @ a1
  
         veor    $A0,$A0
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl

index f78a8b5f0f5573141fa7f6496b7a34d0acadd0e2..aa00f38c2f0e59943475936e25e161639971ae20 100644 (file)
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -23,8 +23,20 @@
  # than 1/2KB. Windows CE port would be trivial, as it's exclusively
  # about decorations, ABI and instruction syntax are identical.
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $num="r0";     # starts as num argument, but holds &tp[num-1]
  $ap="r1";
diff --git a/crypto/modes/asm/ghash-armv4.pl b/crypto/modes/asm/ghash-armv4.pl

index d91586ee2925bb695899b17bb8a7242aa3bf9150..3799b2b5593d1229440a9319a9cc845c6b3e78de 100644 (file)
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@@ -57,8 +57,20 @@
  # *native* byte order on current platform. See gcm128.c for working
  # example...
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $Xi="r0";      # argument block
  $Htbl="r1";
@@ -112,6 +124,11 @@ $code=<<___;
  .text
  .code  32
  
+#ifdef  __APPLE__
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+
  .type  rem_4bit,%object
  .align 5
  rem_4bit:
@@ -326,9 +343,9 @@ $code.=<<___;
  .align 4
  gcm_gmult_neon:
         sub             $Htbl,#16               @ point at H in GCM128_CTX
-       vld1.64         `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+       vld1.64         `&Dhi("$IN")`,[$Xi]!    @ load Xi
         vmov.i32        $mod,#0xe1              @ our irreducible polynomial
-       vld1.64         `&Dlo("$IN")`,[$Xi,:64]!
+       vld1.64         `&Dlo("$IN")`,[$Xi]!
         vshr.u64        $mod,#32
         vldmia          $Htbl,{$Hhi-$Hlo}       @ load H
         veor            $zero,$zero
@@ -349,9 +366,9 @@ gcm_gmult_neon:
  .type  gcm_ghash_neon,%function
  .align 4
  gcm_ghash_neon:
-       vld1.64         `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
+       vld1.64         `&Dhi("$Z")`,[$Xi]!     @ load Xi
         vmov.i32        $mod,#0xe1              @ our irreducible polynomial
-       vld1.64         `&Dlo("$Z")`,[$Xi,:64]!
+       vld1.64         `&Dlo("$Z")`,[$Xi]!
         vshr.u64        $mod,#32
         vldmia          $Xi,{$Hhi-$Hlo}         @ load H
         veor            $zero,$zero
@@ -410,8 +427,8 @@ gcm_ghash_neon:
         vrev64.8        $Z,$Z
  #endif
         sub             $Xi,#16 
-       vst1.64         `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
-       vst1.64         `&Dlo("$Z")`,[$Xi,:64]
+       vst1.64         `&Dhi("$Z")`,[$Xi]!     @ write out Xi
+       vst1.64         `&Dlo("$Z")`,[$Xi]
  
         bx      lr
  .size  gcm_ghash_neon,.-gcm_ghash_neon
diff --git a/crypto/sha/asm/sha1-armv4-large.pl b/crypto/sha/asm/sha1-armv4-large.pl

index 33da3e0e3c0d50031a705b182b22b74c6b6cbdf1..6c0adb991182f14a6258b5e25c8dd2a0018faeb6 100644 (file)
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@@ -52,8 +52,20 @@
  # Profiler-assisted and platform-specific optimization resulted in 10%
  # improvement on Cortex A8 core and 12.2 cycles per byte.
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $ctx="r0";
  $inp="r1";
diff --git a/crypto/sha/asm/sha256-armv4.pl b/crypto/sha/asm/sha256-armv4.pl

index 9c84e8d93c301a35b544b112cb87a90b7ddbd573..252a583d062af9ab2d5789d016e52ceae1afd94c 100644 (file)
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@@ -23,8 +23,20 @@
  # Profiler-assisted and platform-specific optimization resulted in 16%
  # improvement on Cortex A8 core and ~17 cycles per processed byte.
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $ctx="r0";     $t0="r0";
  $inp="r1";     $t3="r1";
diff --git a/crypto/sha/asm/sha512-armv4.pl b/crypto/sha/asm/sha512-armv4.pl

index 7faf37b1479029e152ec99200b4c55b551850131..c032afdbcae8de59297a426c7a09608b1a535cd7 100644 (file)
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@@ -38,8 +38,20 @@ $hi="HI";
  $lo="LO";
  # ====================================================================
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
  
  $ctx="r0";     # parameter block
  $inp="r1";
@@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
  WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
  .size  K512,.-K512
  .LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha512_block_data_order
+.word  OPENSSL_armcap_P-.Lsha512_block_data_order
  .skip  32-4
  
  .global        sha512_block_data_order
  .type  sha512_block_data_order,%function
  sha512_block_data_order:
+.Lsha512_block_data_order:
         sub     r3,pc,#8                @ sha512_block_data_order
         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
  #if __ARM_ARCH__>=7
         ldr     r12,.LOPENSSL_armcap
         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+#ifdef __APPLE__
+       ldr     r12,[r12]
+#endif
         tst     r12,#1
         bne     .LNEON
  #endif
author	Andy Polyakov <appro@openssl.org>
	Mon, 11 May 2015 09:43:55 +0000 (11:43 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Wed, 13 May 2015 14:46:58 +0000 (16:46 +0200)
crypto/aes/asm/aes-armv4.pl		patch | blob | history
crypto/bn/asm/armv4-gf2m.pl		patch | blob | history
crypto/bn/asm/armv4-mont.pl		patch | blob | history
crypto/modes/asm/ghash-armv4.pl		patch | blob | history
crypto/sha/asm/sha1-armv4-large.pl		patch | blob | history
crypto/sha/asm/sha256-armv4.pl		patch | blob | history
crypto/sha/asm/sha512-armv4.pl		patch | blob | history