Fix chacha-armv4.pl with clang -fno-integrated-as.

[oweals/openssl.git] / crypto / chacha / asm / chacha-armv4.pl
diff --git a/crypto/chacha/asm/chacha-armv4.pl b/crypto/chacha/asm/chacha-armv4.pl

index 4d234b727562ffc1eb20ad2bafc53af178dc7c38..d3fadcc63d3338e25f0eadb41d2c6ce435410413 100755 (executable)
--- a/crypto/chacha/asm/chacha-armv4.pl
+++ b/crypto/chacha/asm/chacha-armv4.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -8,7 +15,7 @@
  # ====================================================================
  #
  # December 2014
-# 
+#
  # ChaCha20 for ARMv4.
  #
  # Performance in cycles per byte out of large buffer.
@@ -28,8 +35,8 @@
  #      20-25% worse;
  
  $flavour = shift;
-if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
  
  if ($flavour && $flavour ne "void") {
      $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
@@ -165,8 +172,10 @@ $code.=<<___;
  #include "arm_arch.h"
  
  .text
-#if defined(__thumb2__)
+#if defined(__thumb2__) || defined(__clang__)
  .syntax        unified
+#endif
+#if defined(__thumb2__)
  .thumb
  #else
  .code  32
@@ -200,6 +209,12 @@ ChaCha20_ctr32:
  #else
         adr     r14,.LChaCha20_ctr32
  #endif
+       cmp     r2,#0                   @ len==0?
+#ifdef __thumb2__
+       itt     eq
+#endif
+       addeq   sp,sp,#4*3
+       beq     .Lno_data
  #if __ARM_MAX_ARCH__>=7
         cmp     r2,#192                 @ test len
         bls     .Lshort
@@ -208,7 +223,7 @@ ChaCha20_ctr32:
  # ifdef        __APPLE__
         ldr     r4,[r4]
  # endif
-       tst     r4,#1
+       tst     r4,#ARMV7_NEON
         bne     .LChaCha20_neon
  .Lshort:
  #endif
@@ -434,9 +449,9 @@ $code.=<<___;
         eorhs   @x[4],@x[4],@t[0]
         eorhs   @x[5],@x[5],@t[1]
  # ifdef        __thumb2__
-       it      hi
+        it     ne
  # endif
-        ldrhi  @t[0],[sp,#4*(32+2)]    @ re-load len
+        ldrne  @t[0],[sp,#4*(32+2)]    @ re-load len
  # ifdef        __thumb2__
         itt     hs
  # endif
@@ -578,9 +593,9 @@ ___
  }
  $code.=<<___;
  # ifdef        __thumb2__
-       it      hi
+       it      ne
  # endif
-       ldrhi   @t[0],[sp,#4*(32+2)]            @ re-load len
+       ldrne   @t[0],[sp,#4*(32+2)]            @ re-load len
  # ifdef        __thumb2__
         it      hs
  # endif
@@ -592,19 +607,20 @@ $code.=<<___;
  
  .Ltail:
         ldr     r12,[sp,#4*(32+1)]      @ load inp
-       add     @t[2],sp,#4*(0)
+       add     @t[1],sp,#4*(0)
         ldr     r14,[sp,#4*(32+0)]      @ load out
  
  .Loop_tail:
-       ldrb    @t[0],[@t[2]],#1        @ read buffer on stack
-       ldrb    @t[1],[r12],#1          @ read input
-       subs    @t[3],@t[3],#1
-       eor     @t[0],@t[0],@t[1]
-       strb    @t[0],[r14],#1          @ store output
+       ldrb    @t[2],[@t[1]],#1        @ read buffer on stack
+       ldrb    @t[3],[r12],#1          @ read input
+       subs    @t[0],@t[0],#1
+       eor     @t[3],@t[3],@t[2]
+       strb    @t[3],[r14],#1          @ store output
         bne     .Loop_tail
  
  .Ldone:
         add     sp,sp,#4*(32+3)
+.Lno_data:
         ldmia   sp!,{r4-r11,pc}
  .size  ChaCha20_ctr32,.-ChaCha20_ctr32
  ___
@@ -706,7 +722,7 @@ ChaCha20_neon:
         vadd.i32        $d2,$d1,$t0             @ counter+2
         str             @t[3], [sp,#4*(16+15)]
         mov             @t[3],#10
-       add             @x[12],@x[12],#3        @ counter+3 
+       add             @x[12],@x[12],#3        @ counter+3
         b               .Loop_neon
  
  .align 4
@@ -1113,14 +1129,14 @@ $code.=<<___;
  # endif
         stmia           @t[0],{@x[0]-@x[7]}
          add            @t[2],sp,#4*(0)
-        sub            @t[3],@t[0],#64*3       @ len-=64*3
+        sub            @t[3],@t[3],#64*3       @ len-=64*3
  
  .Loop_tail_neon:
         ldrb            @t[0],[@t[2]],#1        @ read buffer on stack
         ldrb            @t[1],[r12],#1          @ read input
         subs            @t[3],@t[3],#1
         eor             @t[0],@t[0],@t[1]
-       strb            @t[0],[r14],#1          @ store ouput
+       strb            @t[0],[r14],#1          @ store output
         bne             .Loop_tail_neon
  
  .Ldone_neon: