# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte.
-# This module currently has dependency on byte order, namely *dword*
-# order in ctx->h[0-9]. I have to think of a way to reliably detect
-# "endianness" [and flip below two constants] or arrange given dword
-# order in C.
-$lo=0; # this denotes little-endian platform.
-$hi=4;
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in h[0-7],
+# namely with most significant dword at *lower* address, which is
+# reflected in below two parameters. *Byte* order within these dwords
+# in turn is whatever *native* byte order on current platform.
+$hi=0;
+$lo=4;
+# ====================================================================
+
+$output=shift;
+open STDOUT,">$output";
$ctx="r0";
$inp="r1";
bx lr @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
+close STDOUT; # enforce flush