#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+# Copyright (c) 2015 CloudFlare, Inc.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
-# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
+# (3) CloudFlare, Inc.
#
# Reference:
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
# Further optimization by <appro@openssl.org>:
#
# this/original with/without -DECP_NISTZ256_ASM(*)
-# Opteron +12-49% +110-150%
-# Bulldozer +14-45% +175-210%
-# P4 +18-46% n/a :-(
-# Westmere +12-34% +80-87%
-# Sandy Bridge +9-35% +110-120%
-# Ivy Bridge +9-35% +110-125%
-# Haswell +8-37% +140-160%
-# Broadwell +18-58% +145-210%
-# Atom +15-50% +130-180%
-# VIA Nano +43-160% +300-480%
+# Opteron +15-49% +150-195%
+# Bulldozer +18-45% +175-240%
+# P4 +24-46% +100-150%
+# Westmere +18-34% +87-160%
+# Sandy Bridge +14-35% +120-185%
+# Ivy Bridge +11-35% +125-180%
+# Haswell +10-37% +160-200%
+# Broadwell +24-58% +210-270%
+# Atom +20-50% +180-240%
+# VIA Nano +50-160% +480-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
-# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
-# server-side operation. Keep in mind that +100% means 2x improvement.
+# on benchmark. In the "this/original" column the lower coefficient is for
+# ECDSA sign; in the "with/without" column the lower coefficient is for
+# ECDH key agreement and the higher one for ECDSA sign, the relatively
+# fastest server-side operation.
+# Keep in mind that +100% means 2x improvement.
$flavour = shift;
$output = shift;
.long 3,3,3,3,3,3,3,3
.LONE_mont:
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
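+# .Lord is the group order itself as four 64-bit little-endian limbs;
+# .LordK is the Montgomery constant -ord(p256)^-1 mod 2^64, i.e.
+# .Lord[0]*.LordK == -1 (mod 2^64), which is what makes the lowest limb
+# cancel in the word-by-word reductions below.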
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
___
{
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
my ($poly1,$poly3)=($acc6,$acc7);
+$code.=<<___;
+################################################################################
+# void ecp_nistz256_ord_mul_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# uint64_t b[4]);
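+#
+# Computes res = a*b*2^-256 mod ord(p256), i.e. Montgomery multiplication
+# modulo the group order rather than modulo the field prime.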
+
+.globl ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,\@function,3
+.align 32
+ecp_nistz256_ord_mul_mont:
+___
+$code.=<<___ if ($addx);
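+	# Use the MULX/ADCX/ADOX code path when both BMI2 and ADX are
+	# available (the 0x80100 capability mask checked below).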
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov 8*0($b_org), %rax
+ mov $b_org, $b_ptr
+ lea .Lord(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# * b[0]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ mov %rax, $acc0
+ mov $t0, %rax
+ mov %rdx, $acc1
+
+ mulq 8*1($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq 8*2($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc0, $acc5
+ imulq %r15,$acc0
+
+ mov %rdx, $acc3
+ mulq 8*3($a_ptr)
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# First reduction step
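+	# Montgomery reduction: $acc0 already holds acc0*.LordK mod 2^64, so
+	# adding $acc0*ord cancels the lowest limb.  Only ord[0] and ord[1]
+	# need a mulq: ord[2] = 2^64-1 and ord[3] = 2^64-2^32 are folded in
+	# with subtractions and 32-bit shifts instead.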
+ mulq 8*0(%r14)
+ mov $acc0, $t1
+ add %rax, $acc5 # guaranteed to be zero
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $acc0 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $t1, %rax
+ adc %rdx, $acc2
+ mov $t1, %rdx
+ adc \$0, $acc0 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+
+ ################################# * b[1]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc1, $t0
+ imulq %r15, $acc1
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ xor $acc0, $acc0
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ################################# Second reduction step
+ mulq 8*0(%r14)
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc1, %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $acc1 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc %rdx, $acc3
+ mov $t1, %rdx
+ adc \$0, $acc1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc1, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+
+ ################################## * b[2]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc2, $t0
+ imulq %r15, $acc2
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ xor $acc1, $acc1
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ################################# Third reduction step
+ mulq 8*0(%r14)
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc2, %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc4
+ sbb \$0, $acc2 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc %rdx, $acc4
+ mov $t1, %rdx
+ adc \$0, $acc2 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc2, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+
+ ################################# * b[3]
+ mov %rax, $t0
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*1($a_ptr)
+ add $t1, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq 8*2($a_ptr)
+ add $t1, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t0, %rax
+ adc \$0, %rdx
+
+ mov $acc3, $t0
+ imulq %r15, $acc3
+
+ mov %rdx, $t1
+ mulq 8*3($a_ptr)
+ add $t1, $acc0
+ adc \$0, %rdx
+ xor $acc2, $acc2
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ################################# Last reduction step
+ mulq 8*0(%r14)
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov $acc3, %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc5
+ sbb \$0, $acc3 # can't borrow
+
+ mulq 8*1(%r14)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc %rdx, $acc5
+ mov $t1, %rdx
+ adc \$0, $acc3 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ sbb %rdx, $t1 # can't borrow
+
+ add $acc3, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ################################# Subtract ord
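+	# Final conditional subtraction: subtract ord once and, if that
+	# borrowed, use cmovc to restore the pre-subtraction limbs, keeping
+	# the reduction branch-less.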
+ mov $acc4, $a_ptr
+ sub 8*0(%r14), $acc4
+ mov $acc5, $acc3
+ sbb 8*1(%r14), $acc5
+ mov $acc0, $t0
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $a_ptr, $acc4
+ cmovc $acc3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+################################################################################
+# void ecp_nistz256_ord_sqr_mont(
+# uint64_t res[4],
+# uint64_t a[4],
+# int rep);
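+#
+# Repeats the Montgomery squaring modulo ord(p256) |rep| times: if a is the
+# Montgomery form of x, the result is the Montgomery form of x^(2^rep).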
+
+.globl ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_mont:
+___
+$code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ cmp \$0x80100, %ecx
+ je .Lecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov 8*0($a_ptr), $acc0
+ mov 8*1($a_ptr), %rax
+ mov 8*2($a_ptr), $acc6
+ mov 8*3($a_ptr), $acc7
+ lea .Lord(%rip), $a_ptr # pointer to modulus
+ mov $b_org, $b_ptr
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
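+	# Schoolbook squaring: accumulate the off-diagonal products a[i]*a[j]
+	# (i < j), double them, then add the diagonal squares a[i]*a[i].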
+ ################################# a[1:] * a[0]
+ mov %rax, $t1 # put aside a[1]
+ mul $acc0 # a[1] * a[0]
+ mov %rax, $acc1
+ movq $t1, %xmm1 # offload a[1]
+ mov $acc6, %rax
+ mov %rdx, $acc2
+
+ mul $acc0 # a[2] * a[0]
+ add %rax, $acc2
+ mov $acc7, %rax
+ movq $acc6, %xmm2 # offload a[2]
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mul $acc0 # a[3] * a[0]
+ add %rax, $acc3
+ mov $acc7, %rax
+ movq $acc7, %xmm3 # offload a[3]
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ ################################# a[3] * a[2]
+ mul $acc6 # a[3] * a[2]
+ mov %rax, $acc5
+ mov $acc6, %rax
+ mov %rdx, $acc6
+
+ ################################# a[2:] * a[1]
+ mul $t1 # a[2] * a[1]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc7
+
+ mul $t1 # a[3] * a[1]
+ add %rax, $acc4
+ adc \$0, %rdx
+
+ add $acc7, $acc4
+ adc %rdx, $acc5
+ adc \$0, $acc6 # can't overflow
+
+ ################################# *2
+ xor $acc7, $acc7
+ mov $acc0, %rax
+ add $acc1, $acc1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ ################################# Missing products
+ mul %rax # a[0] * a[0]
+ mov %rax, $acc0
+ movq %xmm1, %rax
+ mov %rdx, $t1
+
+ mul %rax # a[1] * a[1]
+ add $t1, $acc1
+ adc %rax, $acc2
+ movq %xmm2, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mul %rax # a[2] * a[2]
+ add $t1, $acc3
+ adc %rax, $acc4
+ movq %xmm3, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mov $acc0, $t0
+ imulq 8*4($a_ptr), $acc0 # *= .LordK
+
+ mul %rax # a[3] * a[3]
+ add $t1, $acc5
+ adc %rax, $acc6
+ mov 8*0($a_ptr), %rax # modulus[0]
+ adc %rdx, $acc7 # can't overflow
+
+ ################################# First reduction step
+ mul $acc0
+ mov $acc0, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax # modulus[1]
+ adc %rdx, $t0
+
+ sub $acc0, $acc2
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc0
+ add $t0, $acc1
+ adc \$0, %rdx
+ add %rax, $acc1
+ mov $acc0, %rax
+ adc %rdx, $acc2
+ mov $acc0, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc1, $t0
+ imulq 8*4($a_ptr), $acc1 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc3
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc0 # can't borrow
+
+ add $t1, $acc3
+ adc \$0, $acc0 # can't overflow
+
+ ################################# Second reduction step
+ mul $acc1
+ mov $acc1, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc1, $acc3
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc1
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $acc1, %rax
+ adc %rdx, $acc3
+ mov $acc1, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc2, $t0
+ imulq 8*4($a_ptr), $acc2 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc0
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc1 # can't borrow
+
+ add $t1, $acc0
+ adc \$0, $acc1 # can't overflow
+
+ ################################# Third reduction step
+ mul $acc2
+ mov $acc2, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc2, $acc0
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc2
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ mov $acc2, %rdx
+ adc \$0, $t1 # can't overflow
+
+ mov $acc3, $t0
+ imulq 8*4($a_ptr), $acc3 # *= .LordK
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc1
+ mov 8*0($a_ptr), %rax
+ sbb %rdx, $acc2 # can't borrow
+
+ add $t1, $acc1
+ adc \$0, $acc2 # can't overflow
+
+ ################################# Last reduction step
+ mul $acc3
+ mov $acc3, $t1
+ add %rax, $t0 # guaranteed to be zero
+ mov 8*1($a_ptr), %rax
+ adc %rdx, $t0
+
+ sub $acc3, $acc1
+ sbb \$0, $t1 # can't borrow
+
+ mul $acc3
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ mov $acc3, %rdx
+ adc \$0, $t1 # can't overflow
+
+ shl \$32, %rax
+ shr \$32, %rdx
+ sub %rax, $acc2
+ sbb %rdx, $acc3 # can't borrow
+
+ add $t1, $acc2
+ adc \$0, $acc3 # can't overflow
+
+ ################################# Add bits [511:256] of the sqr result
+ xor %rdx, %rdx
+ add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc0, $acc4
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ################################# Compare to modulus
+ sub 8*0($a_ptr), $acc0
+ mov $acc2, $acc6
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc7
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rdx
+
+ cmovc $acc4, $acc0
+ cmovnc $acc1, %rax
+ cmovnc $acc2, $acc6
+ cmovnc $acc3, $acc7
+
+ dec $b_ptr
+ jnz .Loop_ord_sqr
+
+ mov $acc0, 8*0($r_ptr)
+ mov %rax, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc6, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc7, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+
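+# The *montx variants below use the MULX/ADCX/ADOX instructions.  They are
+# generated only when $addx is set and are reached at run time through the
+# OPENSSL_ia32cap_P dispatch above.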
+$code.=<<___ if ($addx);
+################################################################################
+.type ecp_nistz256_ord_mul_montx,\@function,3
+.align 32
+ecp_nistz256_ord_mul_montx:
+.Lecp_nistz256_ord_mul_montx:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+ lea .Lord-128(%rip), %r14
+ mov .LordK(%rip), %r15
+
+ ################################# Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mulx $acc3, $t1, $acc3
+ add $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ mulx %r15, %rdx, %rax
+ adc $t1, $acc2
+ adc $t0, $acc3
+ adc \$0, $acc4
+
+ ################################# reduction
+ xor $acc5, $acc5 # $acc5=0, cf=0, of=0
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adcx $t0, $acc3
+ adox $t1, $acc4
+ adcx $acc0, $acc4
+ adox $acc0, $acc5
+ adc \$0, $acc5 # cf=0, of=0
+
+ ################################# Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc1 # guaranteed to be zero
+ adox $t1, $acc2
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adcx $acc1, $acc5
+ adox $acc1, $acc0
+ adc \$0, $acc0 # cf=0, of=0
+
+ ################################# Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128(%r14), $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adcx $t0, $acc5
+ adox $t1, $acc0
+ adcx $acc2, $acc0
+ adox $acc2, $acc1
+ adc \$0, $acc1 # cf=0, of=0
+
+ ################################# Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ mulx %r15, %rdx, %rax
+ adcx $t0, $acc0
+ adox $t1, $acc1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2 # cf=0, of=0
+
+ ################################# reduction
+ mulx 8*0+128(%r14), $t0, $t1
+	adcx	$t0, $acc3		# guaranteed to be zero
+ adox $t1, $acc4
+
+ mulx 8*1+128(%r14), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128(%r14), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128(%r14), $t0, $t1
+ lea 128(%r14),%r14
+ mov $acc4, $t2
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mov $acc5, $t3
+ adcx $acc3, $acc1
+ adox $acc3, $acc2
+ adc \$0, $acc2
+
+ #################################
+	# Branch-less conditional subtraction of ord
+ mov $acc0, $t0
+ sub 8*0(%r14), $acc4
+ sbb 8*1(%r14), $acc5
+ sbb 8*2(%r14), $acc0
+ mov $acc1, $t1
+ sbb 8*3(%r14), $acc1
+ sbb \$0, $acc2
+
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+
+ mov $acc4, 8*0($r_ptr)
+ mov $acc5, 8*1($r_ptr)
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
+.type ecp_nistz256_ord_sqr_montx,\@function,3
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.Lecp_nistz256_ord_sqr_montx:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov $b_org, $b_ptr
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea .Lord(%rip), $a_ptr
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ mov %rdx, %rax # offload a[0]
+ movq $acc6, %xmm1 # offload a[1]
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ add $t0, $acc2
+ movq $acc7, %xmm2 # offload a[2]
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov %rax, %rdx
+ movq $acc0, %xmm3 # offload a[3]
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
+
+ ################################# a[i]*a[i]
+ mulx %rdx, $acc0, $t1
+ movq %xmm1, %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ movq %xmm2, %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ mulx %rdx, $t0, $t1
+ .byte 0x67
+ movq %xmm3, %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ adox $t1, $acc5
+ mulx %rdx, $t0, $t4
+ adox $t0, $acc6
+ adox $t4, $acc7
+
+ ################################# reduction
+ mov $acc0, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ xor %rax, %rax # cf=0, of=0
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc0 # guaranteed to be zero
+ adox $t1, $acc1
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0 # of=0
+ adcx %rax, $acc0 # cf=0
+
+ #################################
+ mov $acc1, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc1 # guaranteed to be zero
+ adcx $t1, $acc2
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc3
+ adcx $t1, $acc0
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1 # cf=0
+ adox %rax, $acc1 # of=0
+
+ #################################
+ mov $acc2, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adcx $t0, $acc2 # guaranteed to be zero
+ adox $t1, $acc3
+ mulx 8*1($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc0
+ mulx 8*2($a_ptr), $t0, $t1
+ adcx $t0, $acc0
+ adox $t1, $acc1
+ mulx 8*3($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2 # of=0
+ adcx %rax, $acc2 # cf=0
+
+ #################################
+ mov $acc3, %rdx
+ mulx 8*4($a_ptr), %rdx, $t0
+
+ mulx 8*0($a_ptr), $t0, $t1
+ adox $t0, $acc3 # guaranteed to be zero
+ adcx $t1, $acc0
+ mulx 8*1($a_ptr), $t0, $t1
+ adox $t0, $acc0
+ adcx $t1, $acc1
+ mulx 8*2($a_ptr), $t0, $t1
+ adox $t0, $acc1
+ adcx $t1, $acc2
+ mulx 8*3($a_ptr), $t0, $t1
+ adox $t0, $acc2
+ adcx $t1, $acc3
+ adox %rax, $acc3
+
+ ################################# accumulate upper half
+ add $acc0, $acc4 # add $acc4, $acc0
+ adc $acc5, $acc1
+ mov $acc4, %rdx
+ adc $acc6, $acc2
+ adc $acc7, $acc3
+ mov $acc1, $acc6
+ adc \$0, %rax
+
+ ################################# compare to modulus
+ sub 8*0($a_ptr), $acc4
+ mov $acc2, $acc7
+ sbb 8*1($a_ptr), $acc1
+ sbb 8*2($a_ptr), $acc2
+ mov $acc3, $acc0
+ sbb 8*3($a_ptr), $acc3
+ sbb \$0, %rax
+
+ cmovnc $acc4, %rdx
+ cmovnc $acc1, $acc6
+ cmovnc $acc2, $acc7
+ cmovnc $acc3, $acc0
+
+ dec $b_ptr
+ jnz .Loop_ord_sqrx
+
+ mov %rdx, 8*0($r_ptr)
+ mov $acc6, 8*1($r_ptr)
+ pxor %xmm1, %xmm1
+ mov $acc7, 8*2($r_ptr)
+ pxor %xmm2, %xmm2
+ mov $acc0, 8*3($r_ptr)
+ pxor %xmm3, %xmm3
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+
$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
/*
* Copyright 2014-2017 The OpenSSL Project Authors. All Rights Reserved.
* Copyright (c) 2014, Intel Corporation. All Rights Reserved.
+ * Copyright (c) 2015, CloudFlare, Inc.
*
* Licensed under the OpenSSL license (the "License"). You may not use
* this file except in compliance with the License. You can obtain a copy
* in the file LICENSE in the source distribution or at
* https://www.openssl.org/source/license.html
*
- * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
+ * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
* (1) Intel Corporation, Israel Development Center, Haifa, Israel
* (2) University of Haifa, Israel
+ * (3) CloudFlare, Inc.
*
* Reference:
* S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
*/
#if defined(ECP_NISTZ256_AVX2)
# if !(defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined(_MX64)) || \
+ defined(_M_AMD64) || defined(_M_X64)) || \
!(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
# undef ECP_NISTZ256_AVX2
# else
return HAVEPRECOMP(group, nistz256);
}
+#if defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64) || \
+    defined(__powerpc64__) || defined(_ARCH_PPC64)
+/*
+ * Montgomery mul modulo Order(P): res = a*b*2^-256 mod Order(P)
+ */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+ const BN_ULONG a[P256_LIMBS],
+ int rep);
+
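+/*
+ * Inversion modulo ord(p256) via Fermat's little theorem:
+ * r = x^(ord(p256) - 2) mod ord(p256), computed with a fixed 4-bit window
+ * for the low 128 bits of the exponent and a short addition chain for the
+ * highly structured top 128 bits (0xffffffff00000000ffffffffffffffff).
+ */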
+static int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r,
+ BIGNUM *x, BN_CTX *ctx)
+{
+ /* RR = 2^512 mod ord(p256) */
+ static const BN_ULONG RR[P256_LIMBS] = { TOBN(0x83244c95,0xbe79eea2),
+ TOBN(0x4699799c,0x49bd6fa6),
+ TOBN(0x2845b239,0x2b6bec59),
+ TOBN(0x66e12d94,0xf3d95620) };
+    /* The constant 1 (unlike ONE, which is 1 in Montgomery representation) */
+ static const BN_ULONG one[P256_LIMBS] = { TOBN(0,1),TOBN(0,0),
+ TOBN(0,0),TOBN(0,0) };
+    /* expLo - the low 128 bits of the exponent we use (ord(p256) - 2),
+     * split into 4-bit windows */
+ static const unsigned char expLo[32] = { 0xb,0xc,0xe,0x6,0xf,0xa,0xa,0xd,
+ 0xa,0x7,0x1,0x7,0x9,0xe,0x8,0x4,
+ 0xf,0x3,0xb,0x9,0xc,0xa,0xc,0x2,
+ 0xf,0xc,0x6,0x3,0x2,0x5,0x4,0xf };
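+    /*
+     * The windows are listed most-significant first; read as hex digits
+     * they spell bce6faada7179e84f3b9cac2fc63254f, the low 128 bits of
+     * ord(p256) - 2.
+     */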
+ /*
+ * We don't use entry 0 in the table, so we omit it and address
+ * with -1 offset.
+ */
+ BN_ULONG table[15][P256_LIMBS];
+ BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
+ int i, ret = 0;
+
+ /*
+ * Catch allocation failure early.
+ */
+ if (bn_wexpand(r, P256_LIMBS) == NULL) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+
+ if ((BN_num_bits(x) > 256) || BN_is_negative(x)) {
+ BIGNUM *tmp;
+
+ if ((tmp = BN_CTX_get(ctx)) == NULL
+ || !BN_nnmod(tmp, x, group->order, ctx)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, ERR_R_BN_LIB);
+ goto err;
+ }
+ x = tmp;
+ }
+
+ if (!ecp_nistz256_bignum_to_field_elem(t, x)) {
+ ECerr(EC_F_ECP_NISTZ256_INV_MOD_ORD, EC_R_COORDINATES_OUT_OF_RANGE);
+ goto err;
+ }
+
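+    /*
+     * Build the window table: table[j-1] is the Montgomery form of x^j
+     * for j = 1..15; table[0] = x*RR*2^-256 is x in Montgomery form.
+     */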
+ ecp_nistz256_ord_mul_mont(table[0], t, RR);
+ for (i = 2; i < 16; i += 2) {
+ ecp_nistz256_ord_sqr_mont(table[i-1], table[i/2-1], 1);
+ ecp_nistz256_ord_mul_mont(table[i], table[i-1], table[0]);
+ }
+
+    /*
+     * The top 128 bits of the exponent are highly redundant, so we
+     * handle them with an optimized addition chain.
+     */
+ ecp_nistz256_ord_sqr_mont(t, table[15-1], 4); /* f0 */
+ ecp_nistz256_ord_mul_mont(t, t, table[15-1]); /* ff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 8); /* ff00 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffff */
+
+ ecp_nistz256_ord_sqr_mont(t, out, 16); /* ffff0000 */
+ ecp_nistz256_ord_mul_mont(t, t, out); /* ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, t, 64); /* ffffffff0000000000000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffff */
+
+ ecp_nistz256_ord_sqr_mont(out, out, 32); /* ffffffff00000000ffffffff00000000 */
+ ecp_nistz256_ord_mul_mont(out, out, t); /* ffffffff00000000ffffffffffffffff */
+
+    /*
+     * The bottom 128 bits of the exponent are processed with the
+     * precomputed 4-bit window table.
+     */
+    for (i = 0; i < 32; i++) {
+ ecp_nistz256_ord_sqr_mont(out, out, 4);
+        /* The exponent is public, so constant-time table access is not needed */
+ ecp_nistz256_ord_mul_mont(out, out, table[expLo[i]-1]);
+ }
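+    /*
+     * A final multiplication by the plain constant 1 strips the Montgomery
+     * factor, converting out to the ordinary representation.
+     */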
+ ecp_nistz256_ord_mul_mont(out, out, one);
+
+ /*
+ * Can't fail, but check return code to be consistent anyway.
+ */
+ if (!bn_set_words(r, out, P256_LIMBS))
+ goto err;
+
+ ret = 1;
+err:
+ return ret;
+}
+#else
+# define ecp_nistz256_inv_mod_ord NULL
+#endif
+
const EC_METHOD *EC_GFp_nistz256_method(void)
{
static const EC_METHOD ret = {
ec_key_simple_generate_public_key,
0, /* keycopy */
0, /* keyfinish */
- ecdh_simple_compute_key
+ ecdh_simple_compute_key,
+    ecp_nistz256_inv_mod_ord /* can be #define-d to NULL */
};
return &ret;