From: Andy Polyakov
Date: Mon, 19 Feb 2018 14:17:30 +0000 (+0100)
Subject: ec/curve25519.c: facilitate assembly implementations.
X-Git-Tag: OpenSSL_1_1_1-pre2~53
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=c521e4392f8423af4bc64cdee57495e39e4e7a07;p=oweals%2Fopenssl.git

ec/curve25519.c: facilitate assembly implementations.

Currently it's limited to 64-bit platforms only as minimum radix
expected in assembly is 2^51.

Reviewed-by: Rich Salz
Reviewed-by: Paul Dale
(Merged from https://github.com/openssl/openssl/pull/5408)
---
diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c
index 63ec53171c..f354107c5d 100644
--- a/crypto/ec/curve25519.c
+++ b/crypto/ec/curve25519.c
@@ -11,16 +11,19 @@
 #include "ec_lcl.h"
 #include <openssl/sha.h>
 
-#if !defined(PEDANTIC) && \
-    !defined(__sparc__) && \
-    (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16)
+#if defined(X25519_ASM) \
+    || ( !defined(PEDANTIC) && \
+         !defined(__sparc__) && \
+         (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) )
 /*
  * Base 2^51 implementation.
  */
 # define BASE_2_51_IMPLEMENTED
 
 typedef uint64_t fe51[5];
+# if !defined(X25519_ASM)
 typedef unsigned __int128 u128;
+# endif
 
 static const uint64_t MASK51 = 0x7ffffffffffff;
 
@@ -132,6 +135,250 @@ static void fe51_tobytes(uint8_t *s, const fe51 h)
     s[31] = h4 >> 44;
 }
 
+# ifdef X25519_ASM
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+# define fe51_mul x25519_fe51_mul
+# define fe51_sq x25519_fe51_sqr
+# define fe51_mul121666 x25519_fe51_mul121666
+
+# if defined(__x86_64) || defined(__x86_64__) || \
+     defined(_M_AMD64) || defined(_M_X64)
+
+# define BASE_2_64_IMPLEMENTED
+
+typedef uint64_t fe64[4];
+
+int x25519_fe64_eligible();
+
+/*
+ * There are no reference C implementations for this radix.
+ */
+void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_sqr(fe64 h, const fe64 f);
+void x25519_fe64_mul121666(fe64 h, fe64 f);
+void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
+# define fe64_mul x25519_fe64_mul
+# define fe64_sqr x25519_fe64_sqr
+# define fe64_mul121666 x25519_fe64_mul121666
+# define fe64_add x25519_fe64_add
+# define fe64_sub x25519_fe64_sub
+# define fe64_tobytes x25519_fe64_tobytes
+
+static uint64_t load_8(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+    result |= ((uint64_t)in[6]) << 48;
+    result |= ((uint64_t)in[7]) << 56;
+
+    return result;
+}
+
+static void fe64_frombytes(fe64 h, const uint8_t *s)
+{
+    h[0] = load_8(s);
+    h[1] = load_8(s + 8);
+    h[2] = load_8(s + 16);
+    h[3] = load_8(s + 24) & 0x7fffffffffffffff;
+}
+
+static void fe64_0(fe64 h)
+{
+    h[0] = 0;
+    h[1] = 0;
+    h[2] = 0;
+    h[3] = 0;
+}
+
+static void fe64_1(fe64 h)
+{
+    h[0] = 1;
+    h[1] = 0;
+    h[2] = 0;
+    h[3] = 0;
+}
+
+static void fe64_copy(fe64 h, const fe64 f)
+{
+    h[0] = f[0];
+    h[1] = f[1];
+    h[2] = f[2];
+    h[3] = f[3];
+}
+
+static void fe64_cswap(fe64 f, fe64 g, unsigned int b)
+{
+    int i;
+    uint64_t mask = 0 - (uint64_t)b;
+
+    for (i = 0; i < 4; i++) {
+        uint64_t x = f[i] ^ g[i];
+        x &= mask;
+        f[i] ^= x;
+        g[i] ^= x;
+    }
+}
+
+static void fe64_invert(fe64 out, const fe64 z)
+{
+    fe64 t0;
+    fe64 t1;
+    fe64 t2;
+    fe64 t3;
+    int i;
+
+    /*
+     * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as
+     * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11.
+     */
+
+    /* t0 = z ** 2 */
+    fe64_sqr(t0, z);
+
+    /* t1 = t0 ** (2 ** 2) = z ** 8 */
+    fe64_sqr(t1, t0);
+    fe64_sqr(t1, t1);
+
+    /* t1 = z * t1 = z ** 9 */
+    fe64_mul(t1, z, t1);
+    /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */
+    fe64_mul(t0, t0, t1);
+
+    /* t2 = t0 ** 2 = z ** 22 */
+    fe64_sqr(t2, t0);
+
+    /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */
+    fe64_mul(t1, t1, t2);
+
+    /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 5; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* Continuing similarly... */
+
+    /* t2 = z ** (2 ** 20 - 1) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 10; ++i)
+        fe64_sqr(t2, t2);
+
+    fe64_mul(t2, t2, t1);
+
+    /* t2 = z ** (2 ** 40 - 1) */
+    fe64_sqr(t3, t2);
+    for (i = 1; i < 20; ++i)
+        fe64_sqr(t3, t3);
+
+    fe64_mul(t2, t3, t2);
+
+    /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */
+    for (i = 0; i < 10; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = z ** (2 ** 50 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* t2 = z ** (2 ** 100 - 1) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 50; ++i)
+        fe64_sqr(t2, t2);
+
+    fe64_mul(t2, t2, t1);
+
+    /* t2 = z ** (2 ** 200 - 1) */
+    fe64_sqr(t3, t2);
+    for (i = 1; i < 100; ++i)
+        fe64_sqr(t3, t3);
+
+    fe64_mul(t2, t3, t2);
+
+    /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */
+    for (i = 0; i < 50; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = z ** (2 ** 250 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */
+    for (i = 0; i < 5; ++i)
+        fe64_sqr(t1, t1);
+
+    /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */
+    fe64_mul(out, t1, t0);
+}
+
+/*
+ * Duplicate of original x25519_scalar_mult_generic, but using
+ * fe64_* subroutines.
+ */
+static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
+                               const uint8_t point[32])
+{
+    fe64 x1, x2, z2, x3, z3, tmp0, tmp1;
+    uint8_t e[32];
+    unsigned swap = 0;
+    int pos;
+
+    memcpy(e, scalar, 32);
+    e[0] &= 0xf8;
+    e[31] &= 0x7f;
+    e[31] |= 0x40;
+    fe64_frombytes(x1, point);
+    fe64_1(x2);
+    fe64_0(z2);
+    fe64_copy(x3, x1);
+    fe64_1(z3);
+
+    for (pos = 254; pos >= 0; --pos) {
+        unsigned int b = 1 & (e[pos / 8] >> (pos & 7));
+
+        swap ^= b;
+        fe64_cswap(x2, x3, swap);
+        fe64_cswap(z2, z3, swap);
+        swap = b;
+        fe64_sub(tmp0, x3, z3);
+        fe64_sub(tmp1, x2, z2);
+        fe64_add(x2, x2, z2);
+        fe64_add(z2, x3, z3);
+        fe64_mul(z3, x2, tmp0);
+        fe64_mul(z2, z2, tmp1);
+        fe64_sqr(tmp0, tmp1);
+        fe64_sqr(tmp1, x2);
+        fe64_add(x3, z3, z2);
+        fe64_sub(z2, z3, z2);
+        fe64_mul(x2, tmp1, tmp0);
+        fe64_sub(tmp1, tmp1, tmp0);
+        fe64_sqr(z2, z2);
+        fe64_mul121666(z3, tmp1);
+        fe64_sqr(x3, x3);
+        fe64_add(tmp0, tmp0, z3);
+        fe64_mul(z3, x1, z2);
+        fe64_mul(z2, tmp1, tmp0);
+    }
+
+    fe64_invert(z2, z2);
+    fe64_mul(x2, x2, z2);
+    fe64_tobytes(out, x2);
+
+    OPENSSL_cleanse(e, sizeof(e));
+}
+# endif
+
+# else
+
 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
     u128 h0, h1, h2, h3, h4;
@@ -192,9 +439,9 @@ static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 
 static void fe51_sq(fe51 h, const fe51 f)
 {
-# if defined(OPENSSL_SMALL_FOOTPRINT)
+#  if defined(OPENSSL_SMALL_FOOTPRINT)
     fe51_mul(h, f, f);
-# else
+#  else
     /* dedicated squaring gives 16-25% overall improvement */
    uint64_t g0 = f[0];
    uint64_t g1 = f[1];
@@ -241,9 +488,36 @@ static void fe51_sq(fe51 h, const fe51 f)
    h[2] = g2;
    h[3] = g3;
    h[4] = g4;
-# endif
+#  endif
 }
 
+static void fe51_mul121666(fe51 h, fe51 f)
+{
+    u128 h0 = f[0] * (u128)121666;
+    u128 h1 = f[1] * (u128)121666;
+    u128 h2 = f[2] * (u128)121666;
+    u128 h3 = f[3] * (u128)121666;
+    u128 h4 = f[4] * (u128)121666;
+    uint64_t g0, g1, g2, g3, g4;
+
+    h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51;
+    h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51;
+
+    h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51;
+    g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51;
+
+    g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51;
+    g3 += g2 >> 51; g2 &= MASK51;
+    g1 += g0 >> 51; g0 &= MASK51;
+
+    h[0] = g0;
+    h[1] = g1;
+    h[2] = g2;
+    h[3] = g3;
+    h[4] = g4;
+}
+# endif
+
 static void fe51_add(fe51 h, const fe51 f, const fe51 g)
 {
     h[0] = f[0] + g[0];
@@ -397,32 +671,6 @@ static void fe51_invert(fe51 out, const fe51 z)
     fe51_mul(out, t1, t0);
 }
-static void fe51_mul121666(fe51 h, fe51 f)
-{
-    u128 h0 = f[0] * (u128)121666;
-    u128 h1 = f[1] * (u128)121666;
-    u128 h2 = f[2] * (u128)121666;
-    u128 h3 = f[3] * (u128)121666;
-    u128 h4 = f[4] * (u128)121666;
-    uint64_t g0, g1, g2, g3, g4;
-
-    h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51;
-    h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51;
-
-    h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51;
-    g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51;
-
-    g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51;
-    g3 += g2 >> 51; g2 &= MASK51;
-    g1 += g0 >> 51; g0 &= MASK51;
-
-    h[0] = g0;
-    h[1] = g1;
-    h[2] = g2;
-    h[3] = g3;
-    h[4] = g4;
-}
-
 /*
  * Duplicate of original x25519_scalar_mult_generic, but using
  * fe51_* subroutines.
  */
@@ -435,6 +683,13 @@ static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
     unsigned swap = 0;
     int pos;
 
+# ifdef BASE_2_64_IMPLEMENTED
+    if (x25519_fe64_eligible()) {
+        x25519_scalar_mulx(out, scalar, point);
+        return;
+    }
+# endif
+
     memcpy(e, scalar, 32);
     e[0] &= 0xf8;
     e[31] &= 0x7f;
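
Editorial sketch, not part of the patch: the comment in the diff notes that there are no
reference C implementations for the radix-2^64 routines; they exist only as assembly,
gated by x25519_fe64_eligible(). As a rough illustration of the arithmetic such a routine
is expected to provide, here is one plausible portable sketch of an addition. It assumes
fe64 holds an element of GF(2^255-19) in four little-endian 64-bit limbs that are only
kept partially reduced (below 2^256), with full reduction deferred to fe64_tobytes. The
name fe64_add_ref and the use of unsigned __int128 are choices made here for illustration
only; they are not OpenSSL API.

#include <stdint.h>

typedef uint64_t fe64[4];
typedef unsigned __int128 u128;

/* h = f + g, congruent modulo 2^255-19, result kept partially reduced (< 2^256) */
static void fe64_add_ref(fe64 h, const fe64 f, const fe64 g)
{
    u128 t = 0;
    int i;

    /* plain 256-bit addition, remembering the carry out of the top limb */
    for (i = 0; i < 4; i++) {
        t += (u128)f[i] + g[i];
        h[i] = (uint64_t)t;
        t >>= 64;
    }

    /* 2^256 == 38 (mod 2^255-19), so fold the carry back in as +38 */
    t = (u128)h[0] + (uint64_t)t * 38;
    h[0] = (uint64_t)t;
    for (i = 1; i < 4; i++) {
        t = (t >> 64) + h[i];
        h[i] = (uint64_t)t;
    }

    /* a second wrap can only occur when h[0] is tiny, so it cannot carry further */
    h[0] += (uint64_t)(t >> 64) * 38;
}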