Why is it redundant? We're looking at the carry from the addition of a small,
11-bit number to a 256-bit one. A carry can mean only one thing: the
resulting first limb is a small number and the remaining ones are zeros.
Hence adding 38 to the first limb can't carry.
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/5476)
# P4 +22% +40%
# Sandy Bridge -3% +11%
# Haswell -1% +13%
-# Broadwell(***) +26% +30%
-# Skylake(***) +30% +47%
+# Broadwell(***) +30% +35%
+# Skylake(***) +33% +47%
# Silvermont +20% +26%
# Goldmont +40% +50%
# Bulldozer +20% +9%
-# Ryzen(***) +35% +32%
+# Ryzen(***) +43% +40%
# VIA +170% +120%
#
# (*) amd64-51 is popular assembly implementation with 2^51 radix,
and \$38,%rax
add %rax,$acc0
- adc \$0,$acc1
- mov $acc0,8*0(%rdi)
- adc \$0,$acc2
mov $acc1,8*1(%rdi)
- adc \$0,$acc3
mov $acc2,8*2(%rdi)
mov $acc3,8*3(%rdi)
+ mov $acc0,8*0(%rdi)
mov 8*3(%rsp),%r15
mov 8*4(%rsp),%r14
and \$38,%rax
add %rax,$acc0
- adc \$0,$acc1
- mov $acc0,8*0(%rdi)
- adc \$0,$acc2
mov $acc1,8*1(%rdi)
- adc \$0,$acc3
mov $acc2,8*2(%rdi)
mov $acc3,8*3(%rdi)
+ mov $acc0,8*0(%rdi)
ret
.size x25519_fe64_mul121666,.-x25519_fe64_mul121666
and \$19,%rax
add %rax,$acc0
- adc \$0,$acc1
- adc \$0,$acc2
- adc \$0,$acc3
- mov $acc0,8*0(%rdi)
mov $acc1,8*1(%rdi)
mov $acc2,8*2(%rdi)
mov $acc3,8*3(%rdi)
+ mov $acc0,8*0(%rdi)
ret
.size x25519_fe64_tobytes,.-x25519_fe64_tobytes