-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# ====================================================================
#
# December 2014
-#
+#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
# 20-25% worse;
$flavour = shift;
-if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
#include "arm_arch.h"
.text
-#if defined(__thumb2__)
+#if defined(__thumb2__) || defined(__clang__)
.syntax unified
+#endif
+#if defined(__thumb2__)
.thumb
#else
.code 32
#else
adr r14,.LChaCha20_ctr32
#endif
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data
#if __ARM_MAX_ARCH__>=7
cmp r2,#192 @ test len
bls .Lshort
# ifdef __APPLE__
ldr r4,[r4]
# endif
- tst r4,#1
+ tst r4,#ARMV7_NEON
bne .LChaCha20_neon
.Lshort:
#endif
eorhs @x[4],@x[4],@t[0]
eorhs @x[5],@x[5],@t[1]
# ifdef __thumb2__
- it hi
+ it ne
# endif
- ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+ ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
itt hs
# endif
}
$code.=<<___;
# ifdef __thumb2__
- it hi
+ it ne
# endif
- ldrhi @t[0],[sp,#4*(32+2)] @ re-load len
+ ldrne @t[0],[sp,#4*(32+2)] @ re-load len
# ifdef __thumb2__
it hs
# endif
.Ltail:
ldr r12,[sp,#4*(32+1)] @ load inp
- add @t[2],sp,#4*(0)
+ add @t[1],sp,#4*(0)
ldr r14,[sp,#4*(32+0)] @ load out
.Loop_tail:
- ldrb @t[0],[@t[2]],#1 @ read buffer on stack
- ldrb @t[1],[r12],#1 @ read input
- subs @t[3],@t[3],#1
- eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store output
+ ldrb @t[2],[@t[1]],#1 @ read buffer on stack
+ ldrb @t[3],[r12],#1 @ read input
+ subs @t[0],@t[0],#1
+ eor @t[3],@t[3],@t[2]
+ strb @t[3],[r14],#1 @ store output
bne .Loop_tail
.Ldone:
add sp,sp,#4*(32+3)
+.Lno_data:
ldmia sp!,{r4-r11,pc}
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
- add @x[12],@x[12],#3 @ counter+3
+ add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4
# endif
stmia @t[0],{@x[0]-@x[7]}
add @t[2],sp,#4*(0)
- sub @t[3],@t[0],#64*3 @ len-=64*3
+ sub @t[3],@t[3],#64*3 @ len-=64*3
.Loop_tail_neon:
ldrb @t[0],[@t[2]],#1 @ read buffer on stack
ldrb @t[1],[r12],#1 @ read input
subs @t[3],@t[3],#1
eor @t[0],@t[0],@t[1]
- strb @t[0],[r14],#1 @ store ouput
+ strb @t[0],[r14],#1 @ store output
bne .Loop_tail_neon
.Ldone_neon: