# -m64 -m32
# ----------------------------------
# PPC970,gcc-4.0.0 +76% +59%
+# Power6,xlc-7 +68% +33%
$output = shift;
# gcc 3.4 32-bit asm cycles/byte
# Opteron +45% +20% 6.8
# Xeon P4 +65% +0% 9.9
-# Core2 +60% +10% 8.8
+# Core2 +60% +10% 7.0
$output=shift;
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
-# SHA256 block procedure for ARMv4.
+# SHA256 block procedure for ARMv4. May 2007.
-# Performance is ~2x better than gcc 3.4 generated code.
+# Performance is ~2x better than gcc 3.4 generated code and in
+# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
+# per byte.
$ctx="r0"; $t0="r0";
$inp="r1";
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
-.size sha256_block_data_order,.-sha1_block_data_order
+.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
# -m64 -m32 | -m64 -m32
# --------------------------------------+-----------------------
# PPC970,gcc-4.0.0 +50% +38% | +40% +410%(*)
+# Power6,xlc-7 +150% +90% | +100% +430%(*)
#
# (*) 64-bit code in 32-bit application context, which actually is
# on TODO list. It should be noted that for safe deployment in
# April 2007.
#
# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
-# generated code (must to be a bug in compiler, as improvement is
+# generated code (must be a bug in the compiler, as improvement is
# "pathologically" high, in particular in comparison to other SHA
# modules). But the real twist is that it detects if hardware support
# for SHA256 is available and in such case utilizes it. Then the