#include "ec_lcl.h"
#include <openssl/sha.h>
-#if defined(X25519_ASM) \
- || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
- && !defined(__sparc__) \
- && !(defined(__ANDROID__) && !defined(__clang__)) )
-/*
- * Base 2^51 implementation.
- */
-# define BASE_2_51_IMPLEMENTED
-
-typedef uint64_t fe51[5];
-# if !defined(X25519_ASM)
-typedef __uint128_t u128;
-# endif
-
-static const uint64_t MASK51 = 0x7ffffffffffff;
-
-static uint64_t load_7(const uint8_t *in)
-{
- uint64_t result;
-
- result = in[0];
- result |= ((uint64_t)in[1]) << 8;
- result |= ((uint64_t)in[2]) << 16;
- result |= ((uint64_t)in[3]) << 24;
- result |= ((uint64_t)in[4]) << 32;
- result |= ((uint64_t)in[5]) << 40;
- result |= ((uint64_t)in[6]) << 48;
-
- return result;
-}
-
-static uint64_t load_6(const uint8_t *in)
-{
- uint64_t result;
-
- result = in[0];
- result |= ((uint64_t)in[1]) << 8;
- result |= ((uint64_t)in[2]) << 16;
- result |= ((uint64_t)in[3]) << 24;
- result |= ((uint64_t)in[4]) << 32;
- result |= ((uint64_t)in[5]) << 40;
-
- return result;
-}
-
-static void fe51_frombytes(fe51 h, const uint8_t *s)
-{
- uint64_t h0 = load_7(s); /* 56 bits */
- uint64_t h1 = load_6(s + 7) << 5; /* 53 bits */
- uint64_t h2 = load_7(s + 13) << 2; /* 58 bits */
- uint64_t h3 = load_6(s + 20) << 7; /* 55 bits */
- uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4; /* 51 bits */
-
- h1 |= h0 >> 51; h0 &= MASK51;
- h2 |= h1 >> 51; h1 &= MASK51;
- h3 |= h2 >> 51; h2 &= MASK51;
- h4 |= h3 >> 51; h3 &= MASK51;
-
- h[0] = h0;
- h[1] = h1;
- h[2] = h2;
- h[3] = h3;
- h[4] = h4;
-}
-
-static void fe51_tobytes(uint8_t *s, const fe51 h)
-{
- uint64_t h0 = h[0];
- uint64_t h1 = h[1];
- uint64_t h2 = h[2];
- uint64_t h3 = h[3];
- uint64_t h4 = h[4];
- uint64_t q;
+#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
+ defined(_M_AMD64) || defined(_M_X64))
- /* compare to modulus */
- q = (h0 + 19) >> 51;
- q = (h1 + q) >> 51;
- q = (h2 + q) >> 51;
- q = (h3 + q) >> 51;
- q = (h4 + q) >> 51;
-
- /* full reduce */
- h0 += 19 * q;
- h1 += h0 >> 51; h0 &= MASK51;
- h2 += h1 >> 51; h1 &= MASK51;
- h3 += h2 >> 51; h2 &= MASK51;
- h4 += h3 >> 51; h3 &= MASK51;
- h4 &= MASK51;
-
- /* smash */
- s[0] = (uint8_t)(h0 >> 0);
- s[1] = (uint8_t)(h0 >> 8);
- s[2] = (uint8_t)(h0 >> 16);
- s[3] = (uint8_t)(h0 >> 24);
- s[4] = (uint8_t)(h0 >> 32);
- s[5] = (uint8_t)(h0 >> 40);
- s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
- s[7] = (uint8_t)(h1 >> 5);
- s[8] = (uint8_t)(h1 >> 13);
- s[9] = (uint8_t)(h1 >> 21);
- s[10] = (uint8_t)(h1 >> 29);
- s[11] = (uint8_t)(h1 >> 37);
- s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
- s[13] = (uint8_t)(h2 >> 2);
- s[14] = (uint8_t)(h2 >> 10);
- s[15] = (uint8_t)(h2 >> 18);
- s[16] = (uint8_t)(h2 >> 26);
- s[17] = (uint8_t)(h2 >> 34);
- s[18] = (uint8_t)(h2 >> 42);
- s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
- s[20] = (uint8_t)(h3 >> 7);
- s[21] = (uint8_t)(h3 >> 15);
- s[22] = (uint8_t)(h3 >> 23);
- s[23] = (uint8_t)(h3 >> 31);
- s[24] = (uint8_t)(h3 >> 39);
- s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
- s[26] = (uint8_t)(h4 >> 4);
- s[27] = (uint8_t)(h4 >> 12);
- s[28] = (uint8_t)(h4 >> 20);
- s[29] = (uint8_t)(h4 >> 28);
- s[30] = (uint8_t)(h4 >> 36);
- s[31] = (uint8_t)(h4 >> 44);
-}
-
-# ifdef X25519_ASM
-void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
-void x25519_fe51_sqr(fe51 h, const fe51 f);
-void x25519_fe51_mul121666(fe51 h, fe51 f);
-# define fe51_mul x25519_fe51_mul
-# define fe51_sq x25519_fe51_sqr
-# define fe51_mul121666 x25519_fe51_mul121666
-
-# if defined(__x86_64) || defined(__x86_64__) || \
- defined(_M_AMD64) || defined(_M_X64)
-
-# define BASE_2_64_IMPLEMENTED
+# define BASE_2_64_IMPLEMENTED
typedef uint64_t fe64[4];
int x25519_fe64_eligible(void);
/*
- * There are no reference C implementations for this radix.
+ * The following subroutines perform the corresponding operations
+ * modulo 2^256-38, i.e. double the curve modulus. However, inputs
+ * and outputs are permitted to be only partially reduced, i.e. to
+ * remain in the [0..2^256) range. Everything is tied together by the
+ * final fe64_tobytes, which performs the full reduction modulo
+ * 2^255-19.
+ *
+ * There are no reference C implementations for these.
*/
void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
void x25519_fe64_sqr(fe64 h, const fe64 f);
void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
-# define fe64_mul x25519_fe64_mul
-# define fe64_sqr x25519_fe64_sqr
-# define fe64_mul121666 x25519_fe64_mul121666
-# define fe64_add x25519_fe64_add
-# define fe64_sub x25519_fe64_sub
-# define fe64_tobytes x25519_fe64_tobytes
+# define fe64_mul x25519_fe64_mul
+# define fe64_sqr x25519_fe64_sqr
+# define fe64_mul121666 x25519_fe64_mul121666
+# define fe64_add x25519_fe64_add
+# define fe64_sub x25519_fe64_sub
+# define fe64_tobytes x25519_fe64_tobytes
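Since the comment above notes that there are no reference C implementations for these routines, here is a minimal sketch of what the addition contract means in practice. It assumes a 64-bit compiler with `unsigned __int128`; the name `fe64_add_sketch` is hypothetical and the real `x25519_fe64_add` exists only in assembly.

```c
#include <stdint.h>

typedef uint64_t fe64[4];   /* four 64-bit limbs, value in [0, 2^256) */

/*
 * Hypothetical sketch only: addition modulo 2^256-38 that accepts and
 * returns partially reduced values in [0, 2^256).  Not constant-time;
 * for illustration, not for use.
 */
static void fe64_add_sketch(fe64 h, const fe64 f, const fe64 g)
{
    unsigned __int128 t = 0;
    uint64_t carry;
    int i;

    /* plain 256-bit addition; the carry out of bit 255 ends up in t >> 64 */
    for (i = 0; i < 4; i++) {
        t = (t >> 64) + (unsigned __int128)f[i] + g[i];
        h[i] = (uint64_t)t;
    }
    carry = (uint64_t)(t >> 64);

    /*
     * 2^256 == 38 (mod 2^256-38), so a carry out of the top limb is
     * folded back in as +38.  At most two folds are needed: after the
     * first one the value is below 2^256+38, so the second settles it
     * below 2^256, which is all the partial-reduction contract asks for.
     */
    while (carry) {
        t = (unsigned __int128)h[0] + carry * 38;
        h[0] = (uint64_t)t;
        for (i = 1; i < 4; i++) {
            t = (t >> 64) + h[i];
            h[i] = (uint64_t)t;
        }
        carry = (uint64_t)(t >> 64);
    }
}
```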
static uint64_t load_8(const uint8_t *in)
{

[... unchanged fe64 code elided between hunks ...]

    OPENSSL_cleanse(e, sizeof(e));
}
-# endif
+#endif
+
+#if defined(X25519_ASM) \
+ || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+ && !defined(__sparc__) \
+ && !(defined(__ANDROID__) && !defined(__clang__)) )
+/*
+ * Base 2^51 implementation. It's virtually no different from the
+ * reference base 2^25.5 implementation with respect to the lax
+ * boundary conditions on intermediate values and even individual
+ * limbs, so whatever you know about the reference applies here too.
+ */
+# define BASE_2_51_IMPLEMENTED
+
+typedef uint64_t fe51[5];
+
+static const uint64_t MASK51 = 0x7ffffffffffff;
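To illustrate the lax limb bounds mentioned in the comment above, a multiply-by-small-constant in this radix (the shape of operation `x25519_fe51_mul121666` performs) might look like the sketch below. It assumes input limbs stay within roughly 52 bits and uses `unsigned __int128` directly; `fe51_scale_sketch` is a hypothetical name, not part of the file.

```c
#include <stdint.h>

/*
 * Hypothetical sketch of multiplying a base 2^51 element by a small
 * constant (e.g. 121666).  Input limbs are assumed to be at most ~52
 * bits; the output limb h[0] may again exceed 51 bits slightly, which
 * is fine under the lax bounds mentioned above.
 */
static void fe51_scale_sketch(uint64_t h[5], const uint64_t f[5],
                              uint32_t scalar)
{
    const uint64_t mask = 0x7ffffffffffff;   /* 2^51 - 1 */
    unsigned __int128 t;

    t = (unsigned __int128)f[0] * scalar;
    h[0] = (uint64_t)t & mask;
    t = (unsigned __int128)f[1] * scalar + (uint64_t)(t >> 51);
    h[1] = (uint64_t)t & mask;
    t = (unsigned __int128)f[2] * scalar + (uint64_t)(t >> 51);
    h[2] = (uint64_t)t & mask;
    t = (unsigned __int128)f[3] * scalar + (uint64_t)(t >> 51);
    h[3] = (uint64_t)t & mask;
    t = (unsigned __int128)f[4] * scalar + (uint64_t)(t >> 51);
    h[4] = (uint64_t)t & mask;

    /* 2^255 == 19 (mod 2^255-19): fold the final carry back into h[0];
     * h[0] may now sit slightly above 2^51, i.e. only partially reduced */
    h[0] += (uint64_t)(t >> 51) * 19;
}
```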
+
+static uint64_t load_7(const uint8_t *in)
+{
+ uint64_t result;
+
+ result = in[0];
+ result |= ((uint64_t)in[1]) << 8;
+ result |= ((uint64_t)in[2]) << 16;
+ result |= ((uint64_t)in[3]) << 24;
+ result |= ((uint64_t)in[4]) << 32;
+ result |= ((uint64_t)in[5]) << 40;
+ result |= ((uint64_t)in[6]) << 48;
+
+ return result;
+}
+
+static uint64_t load_6(const uint8_t *in)
+{
+ uint64_t result;
+
+ result = in[0];
+ result |= ((uint64_t)in[1]) << 8;
+ result |= ((uint64_t)in[2]) << 16;
+ result |= ((uint64_t)in[3]) << 24;
+ result |= ((uint64_t)in[4]) << 32;
+ result |= ((uint64_t)in[5]) << 40;
+
+ return result;
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+ uint64_t h0 = load_7(s); /* 56 bits */
+ uint64_t h1 = load_6(s + 7) << 5; /* 53 bits */
+ uint64_t h2 = load_7(s + 13) << 2; /* 58 bits */
+ uint64_t h3 = load_6(s + 20) << 7; /* 55 bits */
+ uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4; /* 51 bits */
+
+ h1 |= h0 >> 51; h0 &= MASK51;
+ h2 |= h1 >> 51; h1 &= MASK51;
+ h3 |= h2 >> 51; h2 &= MASK51;
+ h4 |= h3 >> 51; h3 &= MASK51;
+
+ h[0] = h0;
+ h[1] = h1;
+ h[2] = h2;
+ h[3] = h3;
+ h[4] = h4;
+}
+
+static void fe51_tobytes(uint8_t *s, const fe51 h)
+{
+ uint64_t h0 = h[0];
+ uint64_t h1 = h[1];
+ uint64_t h2 = h[2];
+ uint64_t h3 = h[3];
+ uint64_t h4 = h[4];
+ uint64_t q;
+ /* compare to modulus */
+ q = (h0 + 19) >> 51;
+ q = (h1 + q) >> 51;
+ q = (h2 + q) >> 51;
+ q = (h3 + q) >> 51;
+ q = (h4 + q) >> 51;
+
+ /* full reduce */
+ h0 += 19 * q;
+ h1 += h0 >> 51; h0 &= MASK51;
+ h2 += h1 >> 51; h1 &= MASK51;
+ h3 += h2 >> 51; h2 &= MASK51;
+ h4 += h3 >> 51; h3 &= MASK51;
+ h4 &= MASK51;
+
+ /* smash */
+ s[0] = (uint8_t)(h0 >> 0);
+ s[1] = (uint8_t)(h0 >> 8);
+ s[2] = (uint8_t)(h0 >> 16);
+ s[3] = (uint8_t)(h0 >> 24);
+ s[4] = (uint8_t)(h0 >> 32);
+ s[5] = (uint8_t)(h0 >> 40);
+ s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
+ s[7] = (uint8_t)(h1 >> 5);
+ s[8] = (uint8_t)(h1 >> 13);
+ s[9] = (uint8_t)(h1 >> 21);
+ s[10] = (uint8_t)(h1 >> 29);
+ s[11] = (uint8_t)(h1 >> 37);
+ s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
+ s[13] = (uint8_t)(h2 >> 2);
+ s[14] = (uint8_t)(h2 >> 10);
+ s[15] = (uint8_t)(h2 >> 18);
+ s[16] = (uint8_t)(h2 >> 26);
+ s[17] = (uint8_t)(h2 >> 34);
+ s[18] = (uint8_t)(h2 >> 42);
+ s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
+ s[20] = (uint8_t)(h3 >> 7);
+ s[21] = (uint8_t)(h3 >> 15);
+ s[22] = (uint8_t)(h3 >> 23);
+ s[23] = (uint8_t)(h3 >> 31);
+ s[24] = (uint8_t)(h3 >> 39);
+ s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
+ s[26] = (uint8_t)(h4 >> 4);
+ s[27] = (uint8_t)(h4 >> 12);
+ s[28] = (uint8_t)(h4 >> 20);
+ s[29] = (uint8_t)(h4 >> 28);
+ s[30] = (uint8_t)(h4 >> 36);
+ s[31] = (uint8_t)(h4 >> 44);
+}
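As a quick illustration of the "compare to modulus" and "full reduce" steps above, a hypothetical check (not part of the file, relying on the fe51 helpers just defined) can feed the little-endian encoding of p = 2^255-19 through both conversions; fe51_tobytes should hand back the canonical representative, i.e. 32 zero bytes.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical check: p = 2^255-19 round-trips to its canonical form 0 */
static int fe51_reduction_demo(void)
{
    static const uint8_t p_bytes[32] = {
        0xed, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f
    };
    static const uint8_t zero[32] = { 0 };
    uint8_t out[32];
    fe51 h;

    fe51_frombytes(h, p_bytes);   /* limbs now hold p, e.g. h[4] = 2^51-1 */
    fe51_tobytes(out, h);         /* q becomes 1, so p is subtracted out */

    return memcmp(out, zero, sizeof(out)) == 0;   /* expect 1 (all zero) */
}
```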
+
+# if defined(X25519_ASM)
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+# define fe51_mul x25519_fe51_mul
+# define fe51_sq x25519_fe51_sqr
+# define fe51_mul121666 x25519_fe51_mul121666
# else
+typedef __uint128_t u128;
+
static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
{
u128 h0, h1, h2, h3, h4;