From: Andy Polyakov <appro@openssl.org>
Date: Thu, 12 Jul 2018 09:53:16 +0000 (+0200)
Subject: ec/curve25519.c: reorganize for better accessibility.
X-Git-Tag: OpenSSL_1_1_1-pre9~146
X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=3c849bc901fa191fc517bc20d905783e6e428de5;p=oweals%2Fopenssl.git

ec/curve25519.c: reorganize for better accessibility.

Move the base 2^64 code to its own #if section. It was nested in the base
2^51 section, which arguably made it tricky to follow.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6699)
---

diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c
index 9666de1201..abe9b9cbf6 100644
--- a/crypto/ec/curve25519.c
+++ b/crypto/ec/curve25519.c
@@ -11,149 +11,23 @@
 #include "ec_lcl.h"
 #include <openssl/sha.h>
 
-#if defined(X25519_ASM) \
-    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
-         && !defined(__sparc__) \
-         && !(defined(__ANDROID__) && !defined(__clang__)) )
-/*
- * Base 2^51 implementation.
- */
-# define BASE_2_51_IMPLEMENTED
-
-typedef uint64_t fe51[5];
-# if !defined(X25519_ASM)
-typedef __uint128_t u128;
-# endif
-
-static const uint64_t MASK51 = 0x7ffffffffffff;
-
-static uint64_t load_7(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-    result |= ((uint64_t)in[6]) << 48;
-
-    return result;
-}
-
-static uint64_t load_6(const uint8_t *in)
-{
-    uint64_t result;
-
-    result = in[0];
-    result |= ((uint64_t)in[1]) << 8;
-    result |= ((uint64_t)in[2]) << 16;
-    result |= ((uint64_t)in[3]) << 24;
-    result |= ((uint64_t)in[4]) << 32;
-    result |= ((uint64_t)in[5]) << 40;
-
-    return result;
-}
-
-static void fe51_frombytes(fe51 h, const uint8_t *s)
-{
-    uint64_t h0 = load_7(s);                                /* 56 bits */
-    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
-    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
-    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
-    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
-
-    h1 |= h0 >> 51; h0 &= MASK51;
-    h2 |= h1 >> 51; h1 &= MASK51;
-    h3 |= h2 >> 51; h2 &= MASK51;
-    h4 |= h3 >> 51; h3 &= MASK51;
-
-    h[0] = h0;
-    h[1] = h1;
-    h[2] = h2;
-    h[3] = h3;
-    h[4] = h4;
-}
-
-static void fe51_tobytes(uint8_t *s, const fe51 h)
-{
-    uint64_t h0 = h[0];
-    uint64_t h1 = h[1];
-    uint64_t h2 = h[2];
-    uint64_t h3 = h[3];
-    uint64_t h4 = h[4];
-    uint64_t q;
+#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
+                            defined(_M_AMD64) || defined(_M_X64))
 
-    /* compare to modulus */
-    q = (h0 + 19) >> 51;
-    q = (h1 + q) >> 51;
-    q = (h2 + q) >> 51;
-    q = (h3 + q) >> 51;
-    q = (h4 + q) >> 51;
-
-    /* full reduce */
-    h0 += 19 * q;
-    h1 += h0 >> 51; h0 &= MASK51;
-    h2 += h1 >> 51; h1 &= MASK51;
-    h3 += h2 >> 51; h2 &= MASK51;
-    h4 += h3 >> 51; h3 &= MASK51;
-                    h4 &= MASK51;
-
-    /* smash */
-    s[0] = (uint8_t)(h0 >> 0);
-    s[1] = (uint8_t)(h0 >> 8);
-    s[2] = (uint8_t)(h0 >> 16);
-    s[3] = (uint8_t)(h0 >> 24);
-    s[4] = (uint8_t)(h0 >> 32);
-    s[5] = (uint8_t)(h0 >> 40);
-    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
-    s[7] = (uint8_t)(h1 >> 5);
-    s[8] = (uint8_t)(h1 >> 13);
-    s[9] = (uint8_t)(h1 >> 21);
-    s[10] = (uint8_t)(h1 >> 29);
-    s[11] = (uint8_t)(h1 >> 37);
-    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
-    s[13] = (uint8_t)(h2 >> 2);
-    s[14] = (uint8_t)(h2 >> 10);
-    s[15] = (uint8_t)(h2 >> 18);
-    s[16] = (uint8_t)(h2 >> 26);
-    s[17] = (uint8_t)(h2 >> 34);
-    s[18] = (uint8_t)(h2 >> 42);
-    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
-    s[20] = (uint8_t)(h3 >> 7);
-    s[21] = (uint8_t)(h3 >> 15);
-    s[22] = (uint8_t)(h3 >> 23);
-    s[23] = (uint8_t)(h3 >> 31);
-    s[24] = (uint8_t)(h3 >> 39);
-    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
-    s[26] = (uint8_t)(h4 >> 4);
-    s[27] = (uint8_t)(h4 >> 12);
-    s[28] = (uint8_t)(h4 >> 20);
-    s[29] = (uint8_t)(h4 >> 28);
-    s[30] = (uint8_t)(h4 >> 36);
-    s[31] = (uint8_t)(h4 >> 44);
-}
-
-# ifdef X25519_ASM
-void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
-void x25519_fe51_sqr(fe51 h, const fe51 f);
-void x25519_fe51_mul121666(fe51 h, fe51 f);
-#  define fe51_mul x25519_fe51_mul
-#  define fe51_sq  x25519_fe51_sqr
-#  define fe51_mul121666 x25519_fe51_mul121666
-
-#  if defined(__x86_64) || defined(__x86_64__) || \
-      defined(_M_AMD64) || defined(_M_X64)
-
-#   define BASE_2_64_IMPLEMENTED
+# define BASE_2_64_IMPLEMENTED
 
 typedef uint64_t fe64[4];
 
 int x25519_fe64_eligible(void);
 
 /*
- * There are no reference C implementations for this radix.
+ * The following subroutines perform the corresponding operations
+ * modulo 2^256-38, i.e. twice the curve modulus. However, inputs
+ * and outputs are permitted to be partially reduced, i.e. to remain
+ * in the [0..2^256) range. It all comes together in the final
+ * fe64_tobytes, which performs full reduction modulo 2^255-19.
+ *
+ * There are no reference C implementations for these.
  */
 void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sqr(fe64 h, const fe64 f);
@@ -161,12 +35,12 @@ void x25519_fe64_mul121666(fe64 h, fe64 f);
 void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
 void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
-#   define fe64_mul x25519_fe64_mul
-#   define fe64_sqr x25519_fe64_sqr
-#   define fe64_mul121666 x25519_fe64_mul121666
-#   define fe64_add x25519_fe64_add
-#   define fe64_sub x25519_fe64_sub
-#   define fe64_tobytes x25519_fe64_tobytes
+# define fe64_mul x25519_fe64_mul
+# define fe64_sqr x25519_fe64_sqr
+# define fe64_mul121666 x25519_fe64_mul121666
+# define fe64_add x25519_fe64_add
+# define fe64_sub x25519_fe64_sub
+# define fe64_tobytes x25519_fe64_tobytes
 
 static uint64_t load_8(const uint8_t *in)
 {
@@ -375,10 +249,143 @@ static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
 
     OPENSSL_cleanse(e, sizeof(e));
 }
-#  endif
+#endif
+
+#if defined(X25519_ASM) \
+    || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \
+         && !defined(__sparc__) \
+         && !(defined(__ANDROID__) && !defined(__clang__)) )
+/*
+ * Base 2^51 implementation. It's virtually no different from the reference
+ * base 2^25.5 implementation with respect to lax boundary conditions for
+ * intermediate values and even individual limbs. So whatever you know
+ * about the reference applies here as well...
+ */
+# define BASE_2_51_IMPLEMENTED
+
+typedef uint64_t fe51[5];
+
+static const uint64_t MASK51 = 0x7ffffffffffff;
+
+static uint64_t load_7(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+    result |= ((uint64_t)in[6]) << 48;
+
+    return result;
+}
+
+static uint64_t load_6(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+
+    return result;
+}
+
+static void fe51_frombytes(fe51 h, const uint8_t *s)
+{
+    uint64_t h0 = load_7(s);                                /* 56 bits */
+    uint64_t h1 = load_6(s + 7) << 5;                       /* 53 bits */
+    uint64_t h2 = load_7(s + 13) << 2;                      /* 58 bits */
+    uint64_t h3 = load_6(s + 20) << 7;                      /* 55 bits */
+    uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4;   /* 51 bits */
+
+    h1 |= h0 >> 51; h0 &= MASK51;
+    h2 |= h1 >> 51; h1 &= MASK51;
+    h3 |= h2 >> 51; h2 &= MASK51;
+    h4 |= h3 >> 51; h3 &= MASK51;
+
+    h[0] = h0;
+    h[1] = h1;
+    h[2] = h2;
+    h[3] = h3;
+    h[4] = h4;
+}
+
+static void fe51_tobytes(uint8_t *s, const fe51 h)
+{
+    uint64_t h0 = h[0];
+    uint64_t h1 = h[1];
+    uint64_t h2 = h[2];
+    uint64_t h3 = h[3];
+    uint64_t h4 = h[4];
+    uint64_t q;
 
+    /* compare to modulus */
+    q = (h0 + 19) >> 51;
+    q = (h1 + q) >> 51;
+    q = (h2 + q) >> 51;
+    q = (h3 + q) >> 51;
+    q = (h4 + q) >> 51;
+
+    /* full reduce */
+    h0 += 19 * q;
+    h1 += h0 >> 51; h0 &= MASK51;
+    h2 += h1 >> 51; h1 &= MASK51;
+    h3 += h2 >> 51; h2 &= MASK51;
+    h4 += h3 >> 51; h3 &= MASK51;
+                    h4 &= MASK51;
+
+    /* smash */
+    s[0] = (uint8_t)(h0 >> 0);
+    s[1] = (uint8_t)(h0 >> 8);
+    s[2] = (uint8_t)(h0 >> 16);
+    s[3] = (uint8_t)(h0 >> 24);
+    s[4] = (uint8_t)(h0 >> 32);
+    s[5] = (uint8_t)(h0 >> 40);
+    s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3));
+    s[7] = (uint8_t)(h1 >> 5);
+    s[8] = (uint8_t)(h1 >> 13);
+    s[9] = (uint8_t)(h1 >> 21);
+    s[10] = (uint8_t)(h1 >> 29);
+    s[11] = (uint8_t)(h1 >> 37);
+    s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6));
+    s[13] = (uint8_t)(h2 >> 2);
+    s[14] = (uint8_t)(h2 >> 10);
+    s[15] = (uint8_t)(h2 >> 18);
+    s[16] = (uint8_t)(h2 >> 26);
+    s[17] = (uint8_t)(h2 >> 34);
+    s[18] = (uint8_t)(h2 >> 42);
+    s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1));
+    s[20] = (uint8_t)(h3 >> 7);
+    s[21] = (uint8_t)(h3 >> 15);
+    s[22] = (uint8_t)(h3 >> 23);
+    s[23] = (uint8_t)(h3 >> 31);
+    s[24] = (uint8_t)(h3 >> 39);
+    s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4));
+    s[26] = (uint8_t)(h4 >> 4);
+    s[27] = (uint8_t)(h4 >> 12);
+    s[28] = (uint8_t)(h4 >> 20);
+    s[29] = (uint8_t)(h4 >> 28);
+    s[30] = (uint8_t)(h4 >> 36);
+    s[31] = (uint8_t)(h4 >> 44);
+}
+
+# if defined(X25519_ASM)
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+#  define fe51_mul x25519_fe51_mul
+#  define fe51_sq  x25519_fe51_sqr
+#  define fe51_mul121666 x25519_fe51_mul121666
 # else
 
+typedef __uint128_t u128;
+
 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
     u128 h0, h1, h2, h3, h4;