ARM64 assembly pack: make it Windows-friendly.

author Andy Polyakov <appro@openssl.org>

Fri, 15 Feb 2019 21:16:41 +0000 (22:16 +0100)

committer Richard Levitte <levitte@openssl.org>

Sat, 16 Feb 2019 16:01:15 +0000 (17:01 +0100)
author Andy Polyakov <appro@openssl.org>
Fri, 15 Feb 2019 21:16:41 +0000 (22:16 +0100)
committer Richard Levitte <levitte@openssl.org>
Sat, 16 Feb 2019 16:01:15 +0000 (17:01 +0100)
diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl

index ece9f20bec61385143ff761e5cb11f967bb391c9..f08ae583833fffe0a08a744b771d52e0f5f8f4cc 100755 (executable)
--- a/crypto/aes/asm/vpaes-armv8.pl
+++ b/crypto/aes/asm/vpaes-armv8.pl
@@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
  my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
  
  $code.=<<___;
-##
-##  _aes_preheat
-##
-##  Fills register %r10 -> .aes_consts (so you can -fPIC)
-##  and %xmm9-%xmm15 as specified below.
-##
+//
+//  _aes_preheat
+//
+//  Fills register %r10 -> .aes_consts (so you can -fPIC)
+//  and %xmm9-%xmm15 as specified below.
+//
  .type  _vpaes_encrypt_preheat,%function
  .align 4
  _vpaes_encrypt_preheat:
@@ -167,21 +167,21 @@ _vpaes_encrypt_preheat:
         ret
  .size  _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
  
-##
-##  _aes_encrypt_core
-##
-##  AES-encrypt %xmm0.
-##
-##  Inputs:
-##     %xmm0 = input
-##     %xmm9-%xmm15 as in _vpaes_preheat
-##    (%rdx) = scheduled keys
-##
-##  Output in %xmm0
-##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
-##  Preserves %xmm6 - %xmm8 so you get some local vectors
-##
-##
+//
+//  _aes_encrypt_core
+//
+//  AES-encrypt %xmm0.
+//
+//  Inputs:
+//     %xmm0 = input
+//     %xmm9-%xmm15 as in _vpaes_preheat
+//    (%rdx) = scheduled keys
+//
+//  Output in %xmm0
+//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+//  Preserves %xmm6 - %xmm8 so you get some local vectors
+//
+//
  .type  _vpaes_encrypt_core,%function
  .align 4
  _vpaes_encrypt_core:
@@ -387,11 +387,11 @@ _vpaes_decrypt_preheat:
         ret
  .size  _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
  
-##
-##  Decryption core
-##
-##  Same API as encryption core.
-##
+//
+//  Decryption core
+//
+//  Same API as encryption core.
+//
  .type  _vpaes_decrypt_core,%function
  .align 4
  _vpaes_decrypt_core:
@@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
  my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
  
  $code.=<<___;
-########################################################
-##                                                    ##
-##                  AES key schedule                  ##
-##                                                    ##
-########################################################
+////////////////////////////////////////////////////////
+//                                                    //
+//                  AES key schedule                  //
+//                                                    //
+////////////////////////////////////////////////////////
  .type  _vpaes_key_preheat,%function
  .align 4
  _vpaes_key_preheat:
@@ -703,14 +703,14 @@ _vpaes_schedule_core:
         b.eq    .Lschedule_192
         // 128: fall though
  
-##
-##  .schedule_128
-##
-##  128-bit specific part of key schedule.
-##
-##  This schedule is really simple, because all its parts
-##  are accomplished by the subroutines.
-##
+//
+//  .schedule_128
+//
+//  128-bit specific part of key schedule.
+//
+//  This schedule is really simple, because all its parts
+//  are accomplished by the subroutines.
+//
  .Lschedule_128:
         mov     $inp, #10                       // mov  \$10, %esi
  
@@ -721,21 +721,21 @@ _vpaes_schedule_core:
         bl      _vpaes_schedule_mangle          // write output
         b       .Loop_schedule_128
  
-##
-##  .aes_schedule_192
-##
-##  192-bit specific part of key schedule.
-##
-##  The main body of this schedule is the same as the 128-bit
-##  schedule, but with more smearing.  The long, high side is
-##  stored in %xmm7 as before, and the short, low side is in
-##  the high bits of %xmm6.
-##
-##  This schedule is somewhat nastier, however, because each
-##  round produces 192 bits of key material, or 1.5 round keys.
-##  Therefore, on each cycle we do 2 rounds and produce 3 round
-##  keys.
-##
+//
+//  .aes_schedule_192
+//
+//  192-bit specific part of key schedule.
+//
+//  The main body of this schedule is the same as the 128-bit
+//  schedule, but with more smearing.  The long, high side is
+//  stored in %xmm7 as before, and the short, low side is in
+//  the high bits of %xmm6.
+//
+//  This schedule is somewhat nastier, however, because each
+//  round produces 192 bits of key material, or 1.5 round keys.
+//  Therefore, on each cycle we do 2 rounds and produce 3 round
+//  keys.
+//
  .align 4
  .Lschedule_192:
         sub     $inp, $inp, #8
@@ -759,16 +759,16 @@ _vpaes_schedule_core:
         bl      _vpaes_schedule_192_smear
         b       .Loop_schedule_192
  
-##
-##  .aes_schedule_256
-##
-##  256-bit specific part of key schedule.
-##
-##  The structure here is very similar to the 128-bit
-##  schedule, but with an additional "low side" in
-##  %xmm6.  The low side's rounds are the same as the
-##  high side's, except no rcon and no rotation.
-##
+//
+//  .aes_schedule_256
+//
+//  256-bit specific part of key schedule.
+//
+//  The structure here is very similar to the 128-bit
+//  schedule, but with an additional "low side" in
+//  %xmm6.  The low side's rounds are the same as the
+//  high side's, except no rcon and no rotation.
+//
  .align 4
  .Lschedule_256:
         ld1     {v0.16b}, [$inp]                // vmovdqu      16(%rdi),%xmm0          # load key part 2 (unaligned)
@@ -795,16 +795,16 @@ _vpaes_schedule_core:
  
         b       .Loop_schedule_256
  
-##
-##  .aes_schedule_mangle_last
-##
-##  Mangler for last round of key schedule
-##  Mangles %xmm0
-##    when encrypting, outputs out(%xmm0) ^ 63
-##    when decrypting, outputs unskew(%xmm0)
-##
-##  Always called right before return... jumps to cleanup and exits
-##
+//
+//  .aes_schedule_mangle_last
+//
+//  Mangler for last round of key schedule
+//  Mangles %xmm0
+//    when encrypting, outputs out(%xmm0) ^ 63
+//    when decrypting, outputs unskew(%xmm0)
+//
+//  Always called right before return... jumps to cleanup and exits
+//
  .align 4
  .Lschedule_mangle_last:
         // schedule last round key from xmm0
@@ -838,20 +838,20 @@ _vpaes_schedule_core:
         ret
  .size  _vpaes_schedule_core,.-_vpaes_schedule_core
  
-##
-##  .aes_schedule_192_smear
-##
-##  Smear the short, low side in the 192-bit key schedule.
-##
-##  Inputs:
-##    %xmm7: high side, b  a  x  y
-##    %xmm6:  low side, d  c  0  0
-##    %xmm13: 0
-##
-##  Outputs:
-##    %xmm6: b+c+d  b+c  0  0
-##    %xmm0: b+c+d  b+c  b  a
-##
+//
+//  .aes_schedule_192_smear
+//
+//  Smear the short, low side in the 192-bit key schedule.
+//
+//  Inputs:
+//    %xmm7: high side, b  a  x  y
+//    %xmm6:  low side, d  c  0  0
+//    %xmm13: 0
+//
+//  Outputs:
+//    %xmm6: b+c+d  b+c  0  0
+//    %xmm0: b+c+d  b+c  b  a
+//
  .type  _vpaes_schedule_192_smear,%function
  .align 4
  _vpaes_schedule_192_smear:
@@ -867,24 +867,24 @@ _vpaes_schedule_192_smear:
         ret
  .size  _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
  
-##
-##  .aes_schedule_round
-##
-##  Runs one main round of the key schedule on %xmm0, %xmm7
-##
-##  Specifically, runs subbytes on the high dword of %xmm0
-##  then rotates it by one byte and xors into the low dword of
-##  %xmm7.
-##
-##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
-##  next rcon.
-##
-##  Smears the dwords of %xmm7 by xoring the low into the
-##  second low, result into third, result into highest.
-##
-##  Returns results in %xmm7 = %xmm0.
-##  Clobbers %xmm1-%xmm4, %r11.
-##
+//
+//  .aes_schedule_round
+//
+//  Runs one main round of the key schedule on %xmm0, %xmm7
+//
+//  Specifically, runs subbytes on the high dword of %xmm0
+//  then rotates it by one byte and xors into the low dword of
+//  %xmm7.
+//
+//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+//  next rcon.
+//
+//  Smears the dwords of %xmm7 by xoring the low into the
+//  second low, result into third, result into highest.
+//
+//  Returns results in %xmm7 = %xmm0.
+//  Clobbers %xmm1-%xmm4, %r11.
+//
  .type  _vpaes_schedule_round,%function
  .align 4
  _vpaes_schedule_round:
@@ -932,15 +932,15 @@ _vpaes_schedule_low_round:
         ret
  .size  _vpaes_schedule_round,.-_vpaes_schedule_round
  
-##
-##  .aes_schedule_transform
-##
-##  Linear-transform %xmm0 according to tables at (%r11)
-##
-##  Requires that %xmm9 = 0x0F0F... as in preheat
-##  Output in %xmm0
-##  Clobbers %xmm1, %xmm2
-##
+//
+//  .aes_schedule_transform
+//
+//  Linear-transform %xmm0 according to tables at (%r11)
+//
+//  Requires that %xmm9 = 0x0F0F... as in preheat
+//  Output in %xmm0
+//  Clobbers %xmm1, %xmm2
+//
  .type  _vpaes_schedule_transform,%function
  .align 4
  _vpaes_schedule_transform:
@@ -954,29 +954,29 @@ _vpaes_schedule_transform:
         ret
  .size  _vpaes_schedule_transform,.-_vpaes_schedule_transform
  
-##
-##  .aes_schedule_mangle
-##
-##  Mangle xmm0 from (basis-transformed) standard version
-##  to our version.
-##
-##  On encrypt,
-##    xor with 0x63
-##    multiply by circulant 0,1,1,1
-##    apply shiftrows transform
-##
-##  On decrypt,
-##    xor with 0x63
-##    multiply by "inverse mixcolumns" circulant E,B,D,9
-##    deskew
-##    apply shiftrows transform
-##
-##
-##  Writes out to (%rdx), and increments or decrements it
-##  Keeps track of round number mod 4 in %r8
-##  Preserves xmm0
-##  Clobbers xmm1-xmm5
-##
+//
+//  .aes_schedule_mangle
+//
+//  Mangle xmm0 from (basis-transformed) standard version
+//  to our version.
+//
+//  On encrypt,
+//    xor with 0x63
+//    multiply by circulant 0,1,1,1
+//    apply shiftrows transform
+//
+//  On decrypt,
+//    xor with 0x63
+//    multiply by "inverse mixcolumns" circulant E,B,D,9
+//    deskew
+//    apply shiftrows transform
+//
+//
+//  Writes out to (%rdx), and increments or decrements it
+//  Keeps track of round number mod 4 in %r8
+//  Preserves xmm0
+//  Clobbers xmm1-xmm5
+//
  .type  _vpaes_schedule_mangle,%function
  .align 4
  _vpaes_schedule_mangle:
diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl

index c09c783e5182bdee901df5106dc644650e8e39da..c755555d884da06c2f98cb09faa57fccc0f110de 100755 (executable)
--- a/crypto/bn/asm/armv8-mont.pl
+++ b/crypto/bn/asm/armv8-mont.pl
@@ -197,7 +197,7 @@ bn_mul_mont:
         mul     $nlo,$nj,$m1            // np[j]*m1
         adds    $lo1,$lo1,$lo0
         umulh   $nhi,$nj,$m1
-       str     $lo1,[$tp,#-16]         // tp[j-1]
+       stur    $lo1,[$tp,#-16]         // tp[j-1]
         cbnz    $j,.Linner
  
  .Linner_skip:
@@ -253,13 +253,13 @@ bn_mul_mont:
         csel    $nj,$tj,$aj,lo          // did it borrow?
         ldr     $tj,[$tp],#8
         ldr     $aj,[$rp],#8
-       str     xzr,[$tp,#-16]          // wipe tp
-       str     $nj,[$rp,#-16]
+       stur    xzr,[$tp,#-16]          // wipe tp
+       stur    $nj,[$rp,#-16]
         cbnz    $num,.Lcond_copy
  
         csel    $nj,$tj,$aj,lo
-       str     xzr,[$tp,#-8]           // wipe tp
-       str     $nj,[$rp,#-8]
+       stur    xzr,[$tp,#-8]           // wipe tp
+       stur    $nj,[$rp,#-8]
  
         ldp     x19,x20,[x29,#16]
         mov     sp,x29
@@ -596,7 +596,7 @@ __bn_sqr8x_mont:
         ldp     $a4,$a5,[$tp,#8*4]
         ldp     $a6,$a7,[$tp,#8*6]
         adds    $acc0,$acc0,$a0
-       ldr     $n0,[$rp,#-8*8]
+       ldur    $n0,[$rp,#-8*8]
         adcs    $acc1,$acc1,$a1
         ldp     $a0,$a1,[$ap,#8*0]
         adcs    $acc2,$acc2,$a2
@@ -794,7 +794,7 @@ $code.=<<___;
         //adc   $carry,xzr,xzr          // moved below
         cbz     $cnt,.Lsqr8x8_post_condition
  
-       ldr     $n0,[$tp,#-8*8]
+       ldur    $n0,[$tp,#-8*8]
         ldp     $a0,$a1,[$np,#8*0]
         ldp     $a2,$a3,[$np,#8*2]
         ldp     $a4,$a5,[$np,#8*4]
@@ -852,7 +852,7 @@ $code.=<<___;
         ldp     $a6,$a7,[$tp,#8*6]
         cbz     $cnt,.Lsqr8x_tail_break
  
-       ldr     $n0,[$rp,#-8*8]
+       ldur    $n0,[$rp,#-8*8]
         adds    $acc0,$acc0,$a0
         adcs    $acc1,$acc1,$a1
         ldp     $a0,$a1,[$np,#8*0]
diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl

index cee787d4d6514e7e56bf2a8ecda72f285556ca4f..56ba1c36ba6762efb197f51ea9da4db5a6c9452f 100755 (executable)
--- a/crypto/chacha/asm/chacha-armv8.pl
+++ b/crypto/chacha/asm/chacha-armv8.pl
@@ -131,12 +131,6 @@ $code.=<<___;
  .quad  0x3320646e61707865,0x6b20657479622d32           // endian-neutral
  .Lone:
  .long  1,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long  OPENSSL_armcap_P-.
-#else
-.quad  OPENSSL_armcap_P-.
-#endif
  .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  
  .globl ChaCha20_ctr32
@@ -144,17 +138,13 @@ $code.=<<___;
  .align 5
  ChaCha20_ctr32:
         cbz     $len,.Labort
-       adr     @x[0],.LOPENSSL_armcap_P
         cmp     $len,#192
         b.lo    .Lshort
-#ifdef __ILP32__
-       ldrsw   @x[1],[@x[0]]
-#else
-       ldr     @x[1],[@x[0]]
-#endif
-       ldr     w17,[@x[1],@x[0]]
+
+       adrp    x17,OPENSSL_armcap_P
+       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
         tst     w17,#ARMV7_NEON
-       b.ne    ChaCha20_neon
+       b.ne    .LChaCha20_neon
  
  .Lshort:
         .inst   0xd503233f                      // paciasp
@@ -380,6 +370,7 @@ $code.=<<___;
  .type  ChaCha20_neon,%function
  .align 5
  ChaCha20_neon:
+.LChaCha20_neon:
         .inst   0xd503233f                      // paciasp
         stp     x29,x30,[sp,#-96]!
         add     x29,sp,#0
diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl

index d84f523cd1b9207e9e16c8f43fe03475f72c39f9..8914f1a619dafe060f8fbd6df21b9ba738f68324 100644 (file)
--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
+++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
@@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5:
  
         ldp     x4,x5,[$inp]            // X
         ldp     x6,x7,[$inp,#16]
-       str     w4,[$out,#64*0-4]
+       stur    w4,[$out,#64*0-4]
         lsr     x4,x4,#32
         str     w5,[$out,#64*1-4]
         lsr     x5,x5,#32
@@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5:
  
         ldp     x4,x5,[$inp,#32]        // Y
         ldp     x6,x7,[$inp,#48]
-       str     w4,[$out,#64*0-4]
+       stur    w4,[$out,#64*0-4]
         lsr     x4,x4,#32
         str     w5,[$out,#64*1-4]
         lsr     x5,x5,#32
@@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5:
  
         ldp     x4,x5,[$inp,#64]        // Z
         ldp     x6,x7,[$inp,#80]
-       str     w4,[$out,#64*0-4]
+       stur    w4,[$out,#64*0-4]
         lsr     x4,x4,#32
         str     w5,[$out,#64*1-4]
         lsr     x5,x5,#32
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl

index b953f1fc19ef91bbf9141f67b01a4c31e78b6e59..af6f7d31885abe16573f16c87a47e18054f8893b 100755 (executable)
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -103,6 +103,12 @@ my $asciz = sub {
      {  "";     }
  };
  
+my $adrp = sub {
+    my ($args,$comment) = split(m|\s*//|,shift);
+    "\tadrp\t$args\@PAGE";
+} if ($flavour =~ /ios64/);
+
+
  sub range {
    my ($r,$sfx,$start,$end) = @_;
  
@@ -132,6 +138,10 @@ sub expand_line {
  
      $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
  
+    if ($flavour =~ /ios64/) {
+       $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
+    }
+
      return $line;
  }
  
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl

index 1aded5a275a8df59def3ae44dbfcb0d02bdd7b73..b7aa7dc90b2758b09d072c64033d81d38828df72 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -71,17 +71,12 @@ poly1305_init:
         csel    x0,xzr,x0,eq
         b.eq    .Lno_key
  
-#ifdef __ILP32__
-       ldrsw   $t1,.LOPENSSL_armcap_P
-#else
-       ldr     $t1,.LOPENSSL_armcap_P
-#endif
-       adr     $t0,.LOPENSSL_armcap_P
+       adrp    x17,OPENSSL_armcap_P
+       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
  
         ldp     $r0,$r1,[$inp]          // load key
         mov     $s1,#0xfffffffc0fffffff
         movk    $s1,#0x0fff,lsl#48
-       ldr     w17,[$t0,$t1]
  #ifdef __ARMEB__
         rev     $r0,$r0                 // flip bytes
         rev     $r1,$r1
@@ -93,10 +88,10 @@ poly1305_init:
  
         tst     w17,#ARMV7_NEON
  
-       adr     $d0,poly1305_blocks
-       adr     $r0,poly1305_blocks_neon
-       adr     $d1,poly1305_emit
-       adr     $r1,poly1305_emit_neon
+       adr     $d0,.Lpoly1305_blocks
+       adr     $r0,.Lpoly1305_blocks_neon
+       adr     $d1,.Lpoly1305_emit
+       adr     $r1,.Lpoly1305_emit_neon
  
         csel    $d0,$d0,$r0,eq
         csel    $d1,$d1,$r1,eq
@@ -115,6 +110,7 @@ poly1305_init:
  .type  poly1305_blocks,%function
  .align 5
  poly1305_blocks:
+.Lpoly1305_blocks:
         ands    $len,$len,#-16
         b.eq    .Lno_data
  
@@ -179,6 +175,7 @@ poly1305_blocks:
  .type  poly1305_emit,%function
  .align 5
  poly1305_emit:
+.Lpoly1305_emit:
         ldp     $h0,$h1,[$ctx]          // load hash base 2^64
         ldr     $h2,[$ctx,#16]
         ldp     $t0,$t1,[$nonce]        // load nonce
@@ -285,10 +282,11 @@ poly1305_splat:
  .type  poly1305_blocks_neon,%function
  .align 5
  poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
         ldr     $is_base2_26,[$ctx,#24]
         cmp     $len,#128
         b.hs    .Lblocks_neon
-       cbz     $is_base2_26,poly1305_blocks
+       cbz     $is_base2_26,.Lpoly1305_blocks
  
  .Lblocks_neon:
         .inst   0xd503233f              // paciasp
@@ -431,7 +429,7 @@ poly1305_blocks_neon:
         csel    $in2,$zeros,$in2,lo
  
         mov     x4,#1
-       str     x4,[$ctx,#-24]          // set is_base2_26
+       stur    x4,[$ctx,#-24]          // set is_base2_26
         sub     $ctx,$ctx,#48           // restore original $ctx
         b       .Ldo_neon
  
@@ -868,6 +866,7 @@ poly1305_blocks_neon:
  .type  poly1305_emit_neon,%function
  .align 5
  poly1305_emit_neon:
+.Lpoly1305_emit_neon:
         ldr     $is_base2_26,[$ctx,#24]
         cbz     $is_base2_26,poly1305_emit
  
@@ -920,12 +919,6 @@ poly1305_emit_neon:
  .align 5
  .Lzeros:
  .long  0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long  OPENSSL_armcap_P-.
-#else
-.quad  OPENSSL_armcap_P-.
-#endif
  .asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align 2
  ___
diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl

index a695b2c474b96a5dcfe4cef910710315a39794b0..7a0cbf539bad1c2631fe548c693d14537134a2c0 100644 (file)
--- a/crypto/sha/asm/sha1-armv8.pl
+++ b/crypto/sha/asm/sha1-armv8.pl
@@ -58,10 +58,10 @@ $code.=<<___ if ($i<15 && !($i&1));
         lsr     @Xx[$i+1],@Xx[$i],#32
  ___
  $code.=<<___ if ($i<14 && !($i&1));
-       ldr     @Xx[$i+2],[$inp,#`($i+2)*4-64`]
+       ldur    @Xx[$i+2],[$inp,#`($i+2)*4-64`]
  ___
  $code.=<<___ if ($i<14 && ($i&1));
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
         ror     @Xx[$i+1],@Xx[$i+1],#32
  #else
         rev32   @Xx[$i+1],@Xx[$i+1]
@@ -171,23 +171,19 @@ ___
  }
  
  $code.=<<___;
-#include "arm_arch.h"
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
  
  .text
  
-.extern        OPENSSL_armcap_P
  .globl sha1_block_data_order
  .type  sha1_block_data_order,%function
  .align 6
  sha1_block_data_order:
-#ifdef __ILP32__
-       ldrsw   x16,.LOPENSSL_armcap_P
-#else
-       ldr     x16,.LOPENSSL_armcap_P
-#endif
-       adr     x17,.LOPENSSL_armcap_P
-       add     x16,x16,x17
-       ldr     w16,[x16]
+       adrp    x16,OPENSSL_armcap_P
+       ldr     w16,[x16,#:lo12:OPENSSL_armcap_P]
         tst     w16,#ARMV8_SHA1
         b.ne    .Lv8_entry
  
@@ -208,7 +204,7 @@ sha1_block_data_order:
         movz    $K,#0x7999
         sub     $num,$num,#1
         movk    $K,#0x5a82,lsl#16
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
         ror     $Xx[0],@Xx[0],#32
  #else
         rev32   @Xx[0],@Xx[0]
@@ -321,15 +317,11 @@ $code.=<<___;
  .long  0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     //K_20_39
  .long  0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     //K_40_59
  .long  0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     //K_60_79
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long  OPENSSL_armcap_P-.
-#else
-.quad  OPENSSL_armcap_P-.
-#endif
  .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
  .comm  OPENSSL_armcap_P,4,4
+#endif
  ___
  }}}
  
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl

index 6540a9f3a61e0f3ae9d8ba73670a0d6979ee9645..f7c67219ed0904ae83bc68a4412639e5564d920c 100644 (file)
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -188,24 +188,18 @@ ___
  $code.=<<___;
  #ifndef        __KERNEL__
  # include "arm_arch.h"
+.extern        OPENSSL_armcap_P
  #endif
  
  .text
  
-.extern        OPENSSL_armcap_P
  .globl $func
  .type  $func,%function
  .align 6
  $func:
  #ifndef        __KERNEL__
-# ifdef        __ILP32__
-       ldrsw   x16,.LOPENSSL_armcap_P
-# else
-       ldr     x16,.LOPENSSL_armcap_P
-# endif
-       adr     x17,.LOPENSSL_armcap_P
-       add     x16,x16,x17
-       ldr     w16,[x16]
+       adrp    x16,OPENSSL_armcap_P
+       ldr     w16,[x16,#:lo12:OPENSSL_armcap_P]
  ___
  $code.=<<___   if ($SZ==4);
         tst     w16,#ARMV8_SHA256
@@ -353,15 +347,6 @@ $code.=<<___ if ($SZ==4);
  ___
  $code.=<<___;
  .size  .LK$BITS,.-.LK$BITS
-#ifndef        __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef        __ILP32__
-       .long   OPENSSL_armcap_P-.
-# else
-       .quad   OPENSSL_armcap_P-.
-# endif
-#endif
  .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align 2
  ___
@@ -841,7 +826,7 @@ ___
  }
  
  $code.=<<___;
-#ifndef        __KERNEL__
+#if !defined(__KERNEL__) && !defined(_WIN64)
  .comm  OPENSSL_armcap_P,4,4
  #endif
  ___
author	Andy Polyakov <appro@openssl.org>
	Fri, 15 Feb 2019 21:16:41 +0000 (22:16 +0100)
committer	Richard Levitte <levitte@openssl.org>
	Sat, 16 Feb 2019 16:01:15 +0000 (17:01 +0100)
crypto/aes/asm/vpaes-armv8.pl		patch \| blob \| history
crypto/bn/asm/armv8-mont.pl		patch \| blob \| history
crypto/chacha/asm/chacha-armv8.pl		patch \| blob \| history
crypto/ec/asm/ecp_nistz256-armv8.pl		patch \| blob \| history
crypto/perlasm/arm-xlate.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-armv8.pl		patch \| blob \| history
crypto/sha/asm/sha1-armv8.pl		patch \| blob \| history
crypto/sha/asm/sha512-armv8.pl		patch \| blob \| history