OPENSSL_memcmp is a must in GCM decrypt and general-purpose loop takes
quite a portion of execution time for short inputs, more than GHASH for
few-byte inputs according to profiler. Special 16-byte case takes it off
top five list in profiler output.
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6312)
CRYPTO_memcmp:
eor w3,w3,w3
cbz x2,.Lno_data // len==0?
+ cmp x2,#16
+ b.ne .Loop_cmp
+ ldp x8,x9,[x0]
+ ldp x10,x11,[x1]
+ eor x8,x8,x10
+ eor x9,x9,x11
+ orr x8,x8,x9
+ mov x0,#1
+ cmp x8,#0
+ csel x0,xzr,x0,eq
+ ret
+
+.align 4
.Loop_cmp:
ldrb w4,[x0],#1
ldrb w5,[x1],#1
xor %r10,%r10
cmp \$0,$arg3
je .Lno_data
+ cmp \$16,$arg3
+ jne .Loop_cmp
+ mov ($arg1),%r10
+ mov 8($arg1),%r11
+ mov \$1,$arg3
+ xor ($arg2),%r10
+ xor 8($arg2),%r11
+ or %r11,%r10
+ cmovnz $arg3,%rax
+ ret
+
+.align 16
.Loop_cmp:
mov ($arg1),%r10b
lea 1($arg1),$arg1