crypto/ec/ecp_nistp521.c

   1 /*
   2  * Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the Apache License 2.0 (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 /* Copyright 2011 Google Inc.
  11  *
  12  * Licensed under the Apache License, Version 2.0 (the "License");
  13  *
  14  * you may not use this file except in compliance with the License.
  15  * You may obtain a copy of the License at
  16  *
  17  *     http://www.apache.org/licenses/LICENSE-2.0
  18  *
  19  *  Unless required by applicable law or agreed to in writing, software
  20  *  distributed under the License is distributed on an "AS IS" BASIS,
  21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  22  *  See the License for the specific language governing permissions and
  23  *  limitations under the License.
  24  */
  25
  26 /*
  27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  28  *
  29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  31  * work which got its smarts from Daniel J. Bernstein's work on the same.
  32  */
  33
  34 #include <openssl/e_os2.h>
  35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
  36 NON_EMPTY_TRANSLATION_UNIT
  37 #else
  38
  39 # include <string.h>
  40 # include <openssl/err.h>
  41 # include "ec_lcl.h"
  42
  43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
  44   /* even with gcc, the typedef won't work for 32-bit platforms */
  45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  46                                  * platforms */
  47 # else
  48 #  error "Your compiler doesn't appear to support 128-bit integer types"
  49 # endif
  50
  51 typedef uint8_t u8;
  52 typedef uint64_t u64;
  53
  54 /*
  55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
  56  * element of this field into 66 bytes where the most significant byte
  57  * contains only a single bit. We call this an felem_bytearray.
  58  */
  59
  60 typedef u8 felem_bytearray[66];
  61
  62 /*
  63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  64  * These values are big-endian.
  65  */
  66 static const felem_bytearray nistp521_curve_params[5] = {
  67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
  68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  75      0xff, 0xff},
  76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
  77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  84      0xff, 0xfc},
  85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
  86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  93      0x3f, 0x00},
  94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
  95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
 100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
 101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 102      0xbd, 0x66},
 103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
 104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 111      0x66, 0x50}
 112 };
 113
 114 /*-
 115  * The representation of field elements.
 116  * ------------------------------------
 117  *
 118  * We represent field elements with nine values. These values are either 64 or
 119  * 128 bits and the field element represented is:
 120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 122  * 58 bits apart, but are greater than 58 bits in length, the most significant
 123  * bits of each limb overlap with the least significant bits of the next.
 124  *
 125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 126  * 'largefelem' */
 127
 128 # define NLIMBS 9
 129
 130 typedef uint64_t limb;
 131 typedef limb felem[NLIMBS];
 132 typedef uint128_t largefelem[NLIMBS];
 133
 134 static const limb bottom57bits = 0x1ffffffffffffff;
 135 static const limb bottom58bits = 0x3ffffffffffffff;
 136
 137 /*
 138  * bin66_to_felem takes a little-endian byte array and converts it into felem
 139  * form. This assumes that the CPU is little-endian.
 140  */
 141 static void bin66_to_felem(felem out, const u8 in[66])
 142 {
 143     out[0] = (*((limb *) & in[0])) & bottom58bits;
 144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
 145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
 146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
 147     out[4] = (*((limb *) & in[29])) & bottom58bits;
 148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
 149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
 150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
 151     out[8] = (*((limb *) & in[58])) & bottom57bits;
 152 }
 153
 154 /*
 155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 156  * array. This assumes that the CPU is little-endian.
 157  */
 158 static void felem_to_bin66(u8 out[66], const felem in)
 159 {
 160     memset(out, 0, 66);
 161     (*((limb *) & out[0])) = in[0];
 162     (*((limb *) & out[7])) |= in[1] << 2;
 163     (*((limb *) & out[14])) |= in[2] << 4;
 164     (*((limb *) & out[21])) |= in[3] << 6;
 165     (*((limb *) & out[29])) = in[4];
 166     (*((limb *) & out[36])) |= in[5] << 2;
 167     (*((limb *) & out[43])) |= in[6] << 4;
 168     (*((limb *) & out[50])) |= in[7] << 6;
 169     (*((limb *) & out[58])) = in[8];
 170 }
 171
 172 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 173 static int BN_to_felem(felem out, const BIGNUM *bn)
 174 {
 175     felem_bytearray b_out;
 176     int num_bytes;
 177
 178     if (BN_is_negative(bn)) {
 179         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 180         return 0;
 181     }
 182     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
 183     if (num_bytes < 0) {
 184         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 185         return 0;
 186     }
 187     bin66_to_felem(out, b_out);
 188     return 1;
 189 }
 190
 191 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 192 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 193 {
 194     felem_bytearray b_out;
 195     felem_to_bin66(b_out, in);
 196     return BN_lebin2bn(b_out, sizeof(b_out), out);
 197 }
 198
 199 /*-
 200  * Field operations
 201  * ----------------
 202  */
 203
 204 static void felem_one(felem out)
 205 {
 206     out[0] = 1;
 207     out[1] = 0;
 208     out[2] = 0;
 209     out[3] = 0;
 210     out[4] = 0;
 211     out[5] = 0;
 212     out[6] = 0;
 213     out[7] = 0;
 214     out[8] = 0;
 215 }
 216
 217 static void felem_assign(felem out, const felem in)
 218 {
 219     out[0] = in[0];
 220     out[1] = in[1];
 221     out[2] = in[2];
 222     out[3] = in[3];
 223     out[4] = in[4];
 224     out[5] = in[5];
 225     out[6] = in[6];
 226     out[7] = in[7];
 227     out[8] = in[8];
 228 }
 229
 230 /* felem_sum64 sets out = out + in. */
 231 static void felem_sum64(felem out, const felem in)
 232 {
 233     out[0] += in[0];
 234     out[1] += in[1];
 235     out[2] += in[2];
 236     out[3] += in[3];
 237     out[4] += in[4];
 238     out[5] += in[5];
 239     out[6] += in[6];
 240     out[7] += in[7];
 241     out[8] += in[8];
 242 }
 243
 244 /* felem_scalar sets out = in * scalar */
 245 static void felem_scalar(felem out, const felem in, limb scalar)
 246 {
 247     out[0] = in[0] * scalar;
 248     out[1] = in[1] * scalar;
 249     out[2] = in[2] * scalar;
 250     out[3] = in[3] * scalar;
 251     out[4] = in[4] * scalar;
 252     out[5] = in[5] * scalar;
 253     out[6] = in[6] * scalar;
 254     out[7] = in[7] * scalar;
 255     out[8] = in[8] * scalar;
 256 }
 257
 258 /* felem_scalar64 sets out = out * scalar */
 259 static void felem_scalar64(felem out, limb scalar)
 260 {
 261     out[0] *= scalar;
 262     out[1] *= scalar;
 263     out[2] *= scalar;
 264     out[3] *= scalar;
 265     out[4] *= scalar;
 266     out[5] *= scalar;
 267     out[6] *= scalar;
 268     out[7] *= scalar;
 269     out[8] *= scalar;
 270 }
 271
 272 /* felem_scalar128 sets out = out * scalar */
 273 static void felem_scalar128(largefelem out, limb scalar)
 274 {
 275     out[0] *= scalar;
 276     out[1] *= scalar;
 277     out[2] *= scalar;
 278     out[3] *= scalar;
 279     out[4] *= scalar;
 280     out[5] *= scalar;
 281     out[6] *= scalar;
 282     out[7] *= scalar;
 283     out[8] *= scalar;
 284 }
 285
 286 /*-
 287  * felem_neg sets |out| to |-in|
 288  * On entry:
 289  *   in[i] < 2^59 + 2^14
 290  * On exit:
 291  *   out[i] < 2^62
 292  */
 293 static void felem_neg(felem out, const felem in)
 294 {
 295     /* In order to prevent underflow, we subtract from 0 mod p. */
 296     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 297     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 298
 299     out[0] = two62m3 - in[0];
 300     out[1] = two62m2 - in[1];
 301     out[2] = two62m2 - in[2];
 302     out[3] = two62m2 - in[3];
 303     out[4] = two62m2 - in[4];
 304     out[5] = two62m2 - in[5];
 305     out[6] = two62m2 - in[6];
 306     out[7] = two62m2 - in[7];
 307     out[8] = two62m2 - in[8];
 308 }
 309
 310 /*-
 311  * felem_diff64 subtracts |in| from |out|
 312  * On entry:
 313  *   in[i] < 2^59 + 2^14
 314  * On exit:
 315  *   out[i] < out[i] + 2^62
 316  */
 317 static void felem_diff64(felem out, const felem in)
 318 {
 319     /*
 320      * In order to prevent underflow, we add 0 mod p before subtracting.
 321      */
 322     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
 323     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
 324
 325     out[0] += two62m3 - in[0];
 326     out[1] += two62m2 - in[1];
 327     out[2] += two62m2 - in[2];
 328     out[3] += two62m2 - in[3];
 329     out[4] += two62m2 - in[4];
 330     out[5] += two62m2 - in[5];
 331     out[6] += two62m2 - in[6];
 332     out[7] += two62m2 - in[7];
 333     out[8] += two62m2 - in[8];
 334 }
 335
 336 /*-
 337  * felem_diff_128_64 subtracts |in| from |out|
 338  * On entry:
 339  *   in[i] < 2^62 + 2^17
 340  * On exit:
 341  *   out[i] < out[i] + 2^63
 342  */
 343 static void felem_diff_128_64(largefelem out, const felem in)
 344 {
 345     /*
 346      * In order to prevent underflow, we add 64p mod p (which is equivalent
 347      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
 348      * digit number with all bits set to 1. See "The representation of field
 349      * elements" comment above for a description of how limbs are used to
 350      * represent a number. 64p is represented with 8 limbs containing a number
 351      * with 58 bits set and one limb with a number with 57 bits set.
 352      */
 353     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
 354     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
 355
 356     out[0] += two63m6 - in[0];
 357     out[1] += two63m5 - in[1];
 358     out[2] += two63m5 - in[2];
 359     out[3] += two63m5 - in[3];
 360     out[4] += two63m5 - in[4];
 361     out[5] += two63m5 - in[5];
 362     out[6] += two63m5 - in[6];
 363     out[7] += two63m5 - in[7];
 364     out[8] += two63m5 - in[8];
 365 }
 366
 367 /*-
 368  * felem_diff_128_64 subtracts |in| from |out|
 369  * On entry:
 370  *   in[i] < 2^126
 371  * On exit:
 372  *   out[i] < out[i] + 2^127 - 2^69
 373  */
 374 static void felem_diff128(largefelem out, const largefelem in)
 375 {
 376     /*
 377      * In order to prevent underflow, we add 0 mod p before subtracting.
 378      */
 379     static const uint128_t two127m70 =
 380         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
 381     static const uint128_t two127m69 =
 382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
 383
 384     out[0] += (two127m70 - in[0]);
 385     out[1] += (two127m69 - in[1]);
 386     out[2] += (two127m69 - in[2]);
 387     out[3] += (two127m69 - in[3]);
 388     out[4] += (two127m69 - in[4]);
 389     out[5] += (two127m69 - in[5]);
 390     out[6] += (two127m69 - in[6]);
 391     out[7] += (two127m69 - in[7]);
 392     out[8] += (two127m69 - in[8]);
 393 }
 394
 395 /*-
 396  * felem_square sets |out| = |in|^2
 397  * On entry:
 398  *   in[i] < 2^62
 399  * On exit:
 400  *   out[i] < 17 * max(in[i]) * max(in[i])
 401  */
 402 static void felem_square(largefelem out, const felem in)
 403 {
 404     felem inx2, inx4;
 405     felem_scalar(inx2, in, 2);
 406     felem_scalar(inx4, in, 4);
 407
 408     /*-
 409      * We have many cases were we want to do
 410      *   in[x] * in[y] +
 411      *   in[y] * in[x]
 412      * This is obviously just
 413      *   2 * in[x] * in[y]
 414      * However, rather than do the doubling on the 128 bit result, we
 415      * double one of the inputs to the multiplication by reading from
 416      * |inx2|
 417      */
 418
 419     out[0] = ((uint128_t) in[0]) * in[0];
 420     out[1] = ((uint128_t) in[0]) * inx2[1];
 421     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
 422     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
 423     out[4] = ((uint128_t) in[0]) * inx2[4] +
 424              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
 425     out[5] = ((uint128_t) in[0]) * inx2[5] +
 426              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
 427     out[6] = ((uint128_t) in[0]) * inx2[6] +
 428              ((uint128_t) in[1]) * inx2[5] +
 429              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
 430     out[7] = ((uint128_t) in[0]) * inx2[7] +
 431              ((uint128_t) in[1]) * inx2[6] +
 432              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
 433     out[8] = ((uint128_t) in[0]) * inx2[8] +
 434              ((uint128_t) in[1]) * inx2[7] +
 435              ((uint128_t) in[2]) * inx2[6] +
 436              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
 437
 438     /*
 439      * The remaining limbs fall above 2^521, with the first falling at 2^522.
 440      * They correspond to locations one bit up from the limbs produced above
 441      * so we would have to multiply by two to align them. Again, rather than
 442      * operate on the 128-bit result, we double one of the inputs to the
 443      * multiplication. If we want to double for both this reason, and the
 444      * reason above, then we end up multiplying by four.
 445      */
 446
 447     /* 9 */
 448     out[0] += ((uint128_t) in[1]) * inx4[8] +
 449               ((uint128_t) in[2]) * inx4[7] +
 450               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
 451
 452     /* 10 */
 453     out[1] += ((uint128_t) in[2]) * inx4[8] +
 454               ((uint128_t) in[3]) * inx4[7] +
 455               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
 456
 457     /* 11 */
 458     out[2] += ((uint128_t) in[3]) * inx4[8] +
 459               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
 460
 461     /* 12 */
 462     out[3] += ((uint128_t) in[4]) * inx4[8] +
 463               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
 464
 465     /* 13 */
 466     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
 467
 468     /* 14 */
 469     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
 470
 471     /* 15 */
 472     out[6] += ((uint128_t) in[7]) * inx4[8];
 473
 474     /* 16 */
 475     out[7] += ((uint128_t) in[8]) * inx2[8];
 476 }
 477
 478 /*-
 479  * felem_mul sets |out| = |in1| * |in2|
 480  * On entry:
 481  *   in1[i] < 2^64
 482  *   in2[i] < 2^63
 483  * On exit:
 484  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 485  */
 486 static void felem_mul(largefelem out, const felem in1, const felem in2)
 487 {
 488     felem in2x2;
 489     felem_scalar(in2x2, in2, 2);
 490
 491     out[0] = ((uint128_t) in1[0]) * in2[0];
 492
 493     out[1] = ((uint128_t) in1[0]) * in2[1] +
 494              ((uint128_t) in1[1]) * in2[0];
 495
 496     out[2] = ((uint128_t) in1[0]) * in2[2] +
 497              ((uint128_t) in1[1]) * in2[1] +
 498              ((uint128_t) in1[2]) * in2[0];
 499
 500     out[3] = ((uint128_t) in1[0]) * in2[3] +
 501              ((uint128_t) in1[1]) * in2[2] +
 502              ((uint128_t) in1[2]) * in2[1] +
 503              ((uint128_t) in1[3]) * in2[0];
 504
 505     out[4] = ((uint128_t) in1[0]) * in2[4] +
 506              ((uint128_t) in1[1]) * in2[3] +
 507              ((uint128_t) in1[2]) * in2[2] +
 508              ((uint128_t) in1[3]) * in2[1] +
 509              ((uint128_t) in1[4]) * in2[0];
 510
 511     out[5] = ((uint128_t) in1[0]) * in2[5] +
 512              ((uint128_t) in1[1]) * in2[4] +
 513              ((uint128_t) in1[2]) * in2[3] +
 514              ((uint128_t) in1[3]) * in2[2] +
 515              ((uint128_t) in1[4]) * in2[1] +
 516              ((uint128_t) in1[5]) * in2[0];
 517
 518     out[6] = ((uint128_t) in1[0]) * in2[6] +
 519              ((uint128_t) in1[1]) * in2[5] +
 520              ((uint128_t) in1[2]) * in2[4] +
 521              ((uint128_t) in1[3]) * in2[3] +
 522              ((uint128_t) in1[4]) * in2[2] +
 523              ((uint128_t) in1[5]) * in2[1] +
 524              ((uint128_t) in1[6]) * in2[0];
 525
 526     out[7] = ((uint128_t) in1[0]) * in2[7] +
 527              ((uint128_t) in1[1]) * in2[6] +
 528              ((uint128_t) in1[2]) * in2[5] +
 529              ((uint128_t) in1[3]) * in2[4] +
 530              ((uint128_t) in1[4]) * in2[3] +
 531              ((uint128_t) in1[5]) * in2[2] +
 532              ((uint128_t) in1[6]) * in2[1] +
 533              ((uint128_t) in1[7]) * in2[0];
 534
 535     out[8] = ((uint128_t) in1[0]) * in2[8] +
 536              ((uint128_t) in1[1]) * in2[7] +
 537              ((uint128_t) in1[2]) * in2[6] +
 538              ((uint128_t) in1[3]) * in2[5] +
 539              ((uint128_t) in1[4]) * in2[4] +
 540              ((uint128_t) in1[5]) * in2[3] +
 541              ((uint128_t) in1[6]) * in2[2] +
 542              ((uint128_t) in1[7]) * in2[1] +
 543              ((uint128_t) in1[8]) * in2[0];
 544
 545     /* See comment in felem_square about the use of in2x2 here */
 546
 547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 548               ((uint128_t) in1[2]) * in2x2[7] +
 549               ((uint128_t) in1[3]) * in2x2[6] +
 550               ((uint128_t) in1[4]) * in2x2[5] +
 551               ((uint128_t) in1[5]) * in2x2[4] +
 552               ((uint128_t) in1[6]) * in2x2[3] +
 553               ((uint128_t) in1[7]) * in2x2[2] +
 554               ((uint128_t) in1[8]) * in2x2[1];
 555
 556     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 557               ((uint128_t) in1[3]) * in2x2[7] +
 558               ((uint128_t) in1[4]) * in2x2[6] +
 559               ((uint128_t) in1[5]) * in2x2[5] +
 560               ((uint128_t) in1[6]) * in2x2[4] +
 561               ((uint128_t) in1[7]) * in2x2[3] +
 562               ((uint128_t) in1[8]) * in2x2[2];
 563
 564     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 565               ((uint128_t) in1[4]) * in2x2[7] +
 566               ((uint128_t) in1[5]) * in2x2[6] +
 567               ((uint128_t) in1[6]) * in2x2[5] +
 568               ((uint128_t) in1[7]) * in2x2[4] +
 569               ((uint128_t) in1[8]) * in2x2[3];
 570
 571     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 572               ((uint128_t) in1[5]) * in2x2[7] +
 573               ((uint128_t) in1[6]) * in2x2[6] +
 574               ((uint128_t) in1[7]) * in2x2[5] +
 575               ((uint128_t) in1[8]) * in2x2[4];
 576
 577     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 578               ((uint128_t) in1[6]) * in2x2[7] +
 579               ((uint128_t) in1[7]) * in2x2[6] +
 580               ((uint128_t) in1[8]) * in2x2[5];
 581
 582     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 583               ((uint128_t) in1[7]) * in2x2[7] +
 584               ((uint128_t) in1[8]) * in2x2[6];
 585
 586     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 587               ((uint128_t) in1[8]) * in2x2[7];
 588
 589     out[7] += ((uint128_t) in1[8]) * in2x2[8];
 590 }
 591
 592 static const limb bottom52bits = 0xfffffffffffff;
 593
 594 /*-
 595  * felem_reduce converts a largefelem to an felem.
 596  * On entry:
 597  *   in[i] < 2^128
 598  * On exit:
 599  *   out[i] < 2^59 + 2^14
 600  */
 601 static void felem_reduce(felem out, const largefelem in)
 602 {
 603     u64 overflow1, overflow2;
 604
 605     out[0] = ((limb) in[0]) & bottom58bits;
 606     out[1] = ((limb) in[1]) & bottom58bits;
 607     out[2] = ((limb) in[2]) & bottom58bits;
 608     out[3] = ((limb) in[3]) & bottom58bits;
 609     out[4] = ((limb) in[4]) & bottom58bits;
 610     out[5] = ((limb) in[5]) & bottom58bits;
 611     out[6] = ((limb) in[6]) & bottom58bits;
 612     out[7] = ((limb) in[7]) & bottom58bits;
 613     out[8] = ((limb) in[8]) & bottom58bits;
 614
 615     /* out[i] < 2^58 */
 616
 617     out[1] += ((limb) in[0]) >> 58;
 618     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 619     /*-
 620      * out[1] < 2^58 + 2^6 + 2^58
 621      *        = 2^59 + 2^6
 622      */
 623     out[2] += ((limb) (in[0] >> 64)) >> 52;
 624
 625     out[2] += ((limb) in[1]) >> 58;
 626     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 627     out[3] += ((limb) (in[1] >> 64)) >> 52;
 628
 629     out[3] += ((limb) in[2]) >> 58;
 630     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 631     out[4] += ((limb) (in[2] >> 64)) >> 52;
 632
 633     out[4] += ((limb) in[3]) >> 58;
 634     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 635     out[5] += ((limb) (in[3] >> 64)) >> 52;
 636
 637     out[5] += ((limb) in[4]) >> 58;
 638     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 639     out[6] += ((limb) (in[4] >> 64)) >> 52;
 640
 641     out[6] += ((limb) in[5]) >> 58;
 642     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 643     out[7] += ((limb) (in[5] >> 64)) >> 52;
 644
 645     out[7] += ((limb) in[6]) >> 58;
 646     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 647     out[8] += ((limb) (in[6] >> 64)) >> 52;
 648
 649     out[8] += ((limb) in[7]) >> 58;
 650     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 651     /*-
 652      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 653      *            < 2^59 + 2^13
 654      */
 655     overflow1 = ((limb) (in[7] >> 64)) >> 52;
 656
 657     overflow1 += ((limb) in[8]) >> 58;
 658     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 659     overflow2 = ((limb) (in[8] >> 64)) >> 52;
 660
 661     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
 662     overflow2 <<= 1;            /* overflow2 < 2^13 */
 663
 664     out[0] += overflow1;        /* out[0] < 2^60 */
 665     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
 666
 667     out[1] += out[0] >> 58;
 668     out[0] &= bottom58bits;
 669     /*-
 670      * out[0] < 2^58
 671      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 672      *        < 2^59 + 2^14
 673      */
 674 }
 675
 676 static void felem_square_reduce(felem out, const felem in)
 677 {
 678     largefelem tmp;
 679     felem_square(tmp, in);
 680     felem_reduce(out, tmp);
 681 }
 682
 683 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 684 {
 685     largefelem tmp;
 686     felem_mul(tmp, in1, in2);
 687     felem_reduce(out, tmp);
 688 }
 689
 690 /*-
 691  * felem_inv calculates |out| = |in|^{-1}
 692  *
 693  * Based on Fermat's Little Theorem:
 694  *   a^p = a (mod p)
 695  *   a^{p-1} = 1 (mod p)
 696  *   a^{p-2} = a^{-1} (mod p)
 697  */
 698 static void felem_inv(felem out, const felem in)
 699 {
 700     felem ftmp, ftmp2, ftmp3, ftmp4;
 701     largefelem tmp;
 702     unsigned i;
 703
 704     felem_square(tmp, in);
 705     felem_reduce(ftmp, tmp);    /* 2^1 */
 706     felem_mul(tmp, in, ftmp);
 707     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 708     felem_assign(ftmp2, ftmp);
 709     felem_square(tmp, ftmp);
 710     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
 711     felem_mul(tmp, in, ftmp);
 712     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
 713     felem_square(tmp, ftmp);
 714     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
 715
 716     felem_square(tmp, ftmp2);
 717     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
 718     felem_square(tmp, ftmp3);
 719     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
 720     felem_mul(tmp, ftmp3, ftmp2);
 721     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
 722
 723     felem_assign(ftmp2, ftmp3);
 724     felem_square(tmp, ftmp3);
 725     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
 726     felem_square(tmp, ftmp3);
 727     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
 728     felem_square(tmp, ftmp3);
 729     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
 730     felem_square(tmp, ftmp3);
 731     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
 732     felem_assign(ftmp4, ftmp3);
 733     felem_mul(tmp, ftmp3, ftmp);
 734     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
 735     felem_square(tmp, ftmp4);
 736     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
 737     felem_mul(tmp, ftmp3, ftmp2);
 738     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
 739     felem_assign(ftmp2, ftmp3);
 740
 741     for (i = 0; i < 8; i++) {
 742         felem_square(tmp, ftmp3);
 743         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
 744     }
 745     felem_mul(tmp, ftmp3, ftmp2);
 746     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
 747     felem_assign(ftmp2, ftmp3);
 748
 749     for (i = 0; i < 16; i++) {
 750         felem_square(tmp, ftmp3);
 751         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
 752     }
 753     felem_mul(tmp, ftmp3, ftmp2);
 754     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
 755     felem_assign(ftmp2, ftmp3);
 756
 757     for (i = 0; i < 32; i++) {
 758         felem_square(tmp, ftmp3);
 759         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
 760     }
 761     felem_mul(tmp, ftmp3, ftmp2);
 762     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
 763     felem_assign(ftmp2, ftmp3);
 764
 765     for (i = 0; i < 64; i++) {
 766         felem_square(tmp, ftmp3);
 767         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
 768     }
 769     felem_mul(tmp, ftmp3, ftmp2);
 770     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
 771     felem_assign(ftmp2, ftmp3);
 772
 773     for (i = 0; i < 128; i++) {
 774         felem_square(tmp, ftmp3);
 775         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
 776     }
 777     felem_mul(tmp, ftmp3, ftmp2);
 778     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
 779     felem_assign(ftmp2, ftmp3);
 780
 781     for (i = 0; i < 256; i++) {
 782         felem_square(tmp, ftmp3);
 783         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
 784     }
 785     felem_mul(tmp, ftmp3, ftmp2);
 786     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
 787
 788     for (i = 0; i < 9; i++) {
 789         felem_square(tmp, ftmp3);
 790         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
 791     }
 792     felem_mul(tmp, ftmp3, ftmp4);
 793     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
 794     felem_mul(tmp, ftmp3, in);
 795     felem_reduce(out, tmp);     /* 2^512 - 3 */
 796 }
 797
 798 /* This is 2^521-1, expressed as an felem */
 799 static const felem kPrime = {
 800     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 801     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 802     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 803 };
 804
 805 /*-
 806  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 807  * otherwise.
 808  * On entry:
 809  *   in[i] < 2^59 + 2^14
 810  */
 811 static limb felem_is_zero(const felem in)
 812 {
 813     felem ftmp;
 814     limb is_zero, is_p;
 815     felem_assign(ftmp, in);
 816
 817     ftmp[0] += ftmp[8] >> 57;
 818     ftmp[8] &= bottom57bits;
 819     /* ftmp[8] < 2^57 */
 820     ftmp[1] += ftmp[0] >> 58;
 821     ftmp[0] &= bottom58bits;
 822     ftmp[2] += ftmp[1] >> 58;
 823     ftmp[1] &= bottom58bits;
 824     ftmp[3] += ftmp[2] >> 58;
 825     ftmp[2] &= bottom58bits;
 826     ftmp[4] += ftmp[3] >> 58;
 827     ftmp[3] &= bottom58bits;
 828     ftmp[5] += ftmp[4] >> 58;
 829     ftmp[4] &= bottom58bits;
 830     ftmp[6] += ftmp[5] >> 58;
 831     ftmp[5] &= bottom58bits;
 832     ftmp[7] += ftmp[6] >> 58;
 833     ftmp[6] &= bottom58bits;
 834     ftmp[8] += ftmp[7] >> 58;
 835     ftmp[7] &= bottom58bits;
 836     /* ftmp[8] < 2^57 + 4 */
 837
 838     /*
 839      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
 840      * than our bound for ftmp[8]. Therefore we only have to check if the
 841      * zero is zero or 2^521-1.
 842      */
 843
 844     is_zero = 0;
 845     is_zero |= ftmp[0];
 846     is_zero |= ftmp[1];
 847     is_zero |= ftmp[2];
 848     is_zero |= ftmp[3];
 849     is_zero |= ftmp[4];
 850     is_zero |= ftmp[5];
 851     is_zero |= ftmp[6];
 852     is_zero |= ftmp[7];
 853     is_zero |= ftmp[8];
 854
 855     is_zero--;
 856     /*
 857      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
 858      * can be set is if is_zero was 0 before the decrement.
 859      */
 860     is_zero = 0 - (is_zero >> 63);
 861
 862     is_p = ftmp[0] ^ kPrime[0];
 863     is_p |= ftmp[1] ^ kPrime[1];
 864     is_p |= ftmp[2] ^ kPrime[2];
 865     is_p |= ftmp[3] ^ kPrime[3];
 866     is_p |= ftmp[4] ^ kPrime[4];
 867     is_p |= ftmp[5] ^ kPrime[5];
 868     is_p |= ftmp[6] ^ kPrime[6];
 869     is_p |= ftmp[7] ^ kPrime[7];
 870     is_p |= ftmp[8] ^ kPrime[8];
 871
 872     is_p--;
 873     is_p = 0 - (is_p >> 63);
 874
 875     is_zero |= is_p;
 876     return is_zero;
 877 }
 878
 879 static int felem_is_zero_int(const void *in)
 880 {
 881     return (int)(felem_is_zero(in) & ((limb) 1));
 882 }
 883
 884 /*-
 885  * felem_contract converts |in| to its unique, minimal representation.
 886  * On entry:
 887  *   in[i] < 2^59 + 2^14
 888  */
 889 static void felem_contract(felem out, const felem in)
 890 {
 891     limb is_p, is_greater, sign;
 892     static const limb two58 = ((limb) 1) << 58;
 893
 894     felem_assign(out, in);
 895
 896     out[0] += out[8] >> 57;
 897     out[8] &= bottom57bits;
 898     /* out[8] < 2^57 */
 899     out[1] += out[0] >> 58;
 900     out[0] &= bottom58bits;
 901     out[2] += out[1] >> 58;
 902     out[1] &= bottom58bits;
 903     out[3] += out[2] >> 58;
 904     out[2] &= bottom58bits;
 905     out[4] += out[3] >> 58;
 906     out[3] &= bottom58bits;
 907     out[5] += out[4] >> 58;
 908     out[4] &= bottom58bits;
 909     out[6] += out[5] >> 58;
 910     out[5] &= bottom58bits;
 911     out[7] += out[6] >> 58;
 912     out[6] &= bottom58bits;
 913     out[8] += out[7] >> 58;
 914     out[7] &= bottom58bits;
 915     /* out[8] < 2^57 + 4 */
 916
 917     /*
 918      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
 919      * out. See the comments in felem_is_zero regarding why we don't test for
 920      * other multiples of the prime.
 921      */
 922
 923     /*
 924      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
 925      */
 926
 927     is_p = out[0] ^ kPrime[0];
 928     is_p |= out[1] ^ kPrime[1];
 929     is_p |= out[2] ^ kPrime[2];
 930     is_p |= out[3] ^ kPrime[3];
 931     is_p |= out[4] ^ kPrime[4];
 932     is_p |= out[5] ^ kPrime[5];
 933     is_p |= out[6] ^ kPrime[6];
 934     is_p |= out[7] ^ kPrime[7];
 935     is_p |= out[8] ^ kPrime[8];
 936
 937     is_p--;
 938     is_p &= is_p << 32;
 939     is_p &= is_p << 16;
 940     is_p &= is_p << 8;
 941     is_p &= is_p << 4;
 942     is_p &= is_p << 2;
 943     is_p &= is_p << 1;
 944     is_p = 0 - (is_p >> 63);
 945     is_p = ~is_p;
 946
 947     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 948
 949     out[0] &= is_p;
 950     out[1] &= is_p;
 951     out[2] &= is_p;
 952     out[3] &= is_p;
 953     out[4] &= is_p;
 954     out[5] &= is_p;
 955     out[6] &= is_p;
 956     out[7] &= is_p;
 957     out[8] &= is_p;
 958
 959     /*
 960      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
 961      * 57 is greater than zero as (2^521-1) + x >= 2^522
 962      */
 963     is_greater = out[8] >> 57;
 964     is_greater |= is_greater << 32;
 965     is_greater |= is_greater << 16;
 966     is_greater |= is_greater << 8;
 967     is_greater |= is_greater << 4;
 968     is_greater |= is_greater << 2;
 969     is_greater |= is_greater << 1;
 970     is_greater = 0 - (is_greater >> 63);
 971
 972     out[0] -= kPrime[0] & is_greater;
 973     out[1] -= kPrime[1] & is_greater;
 974     out[2] -= kPrime[2] & is_greater;
 975     out[3] -= kPrime[3] & is_greater;
 976     out[4] -= kPrime[4] & is_greater;
 977     out[5] -= kPrime[5] & is_greater;
 978     out[6] -= kPrime[6] & is_greater;
 979     out[7] -= kPrime[7] & is_greater;
 980     out[8] -= kPrime[8] & is_greater;
 981
 982     /* Eliminate negative coefficients */
 983     sign = -(out[0] >> 63);
 984     out[0] += (two58 & sign);
 985     out[1] -= (1 & sign);
 986     sign = -(out[1] >> 63);
 987     out[1] += (two58 & sign);
 988     out[2] -= (1 & sign);
 989     sign = -(out[2] >> 63);
 990     out[2] += (two58 & sign);
 991     out[3] -= (1 & sign);
 992     sign = -(out[3] >> 63);
 993     out[3] += (two58 & sign);
 994     out[4] -= (1 & sign);
 995     sign = -(out[4] >> 63);
 996     out[4] += (two58 & sign);
 997     out[5] -= (1 & sign);
 998     sign = -(out[0] >> 63);
 999     out[5] += (two58 & sign);
1000     out[6] -= (1 & sign);
1001     sign = -(out[6] >> 63);
1002     out[6] += (two58 & sign);
1003     out[7] -= (1 & sign);
1004     sign = -(out[7] >> 63);
1005     out[7] += (two58 & sign);
1006     out[8] -= (1 & sign);
1007     sign = -(out[5] >> 63);
1008     out[5] += (two58 & sign);
1009     out[6] -= (1 & sign);
1010     sign = -(out[6] >> 63);
1011     out[6] += (two58 & sign);
1012     out[7] -= (1 & sign);
1013     sign = -(out[7] >> 63);
1014     out[7] += (two58 & sign);
1015     out[8] -= (1 & sign);
1016 }
1017
1018 /*-
1019  * Group operations
1020  * ----------------
1021  *
1022  * Building on top of the field operations we have the operations on the
1023  * elliptic curve group itself. Points on the curve are represented in Jacobian
1024  * coordinates */
1025
1026 /*-
1027  * point_double calculates 2*(x_in, y_in, z_in)
1028  *
1029  * The method is taken from:
1030  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031  *
1032  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1033  * while x_out == y_in is not (maybe this works, but it's not tested). */
1034 static void
1035 point_double(felem x_out, felem y_out, felem z_out,
1036              const felem x_in, const felem y_in, const felem z_in)
1037 {
1038     largefelem tmp, tmp2;
1039     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1040
1041     felem_assign(ftmp, x_in);
1042     felem_assign(ftmp2, x_in);
1043
1044     /* delta = z^2 */
1045     felem_square(tmp, z_in);
1046     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1047
1048     /* gamma = y^2 */
1049     felem_square(tmp, y_in);
1050     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1051
1052     /* beta = x*gamma */
1053     felem_mul(tmp, x_in, gamma);
1054     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1055
1056     /* alpha = 3*(x-delta)*(x+delta) */
1057     felem_diff64(ftmp, delta);
1058     /* ftmp[i] < 2^61 */
1059     felem_sum64(ftmp2, delta);
1060     /* ftmp2[i] < 2^60 + 2^15 */
1061     felem_scalar64(ftmp2, 3);
1062     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1063     felem_mul(tmp, ftmp, ftmp2);
1064     /*-
1065      * tmp[i] < 17(3*2^121 + 3*2^76)
1066      *        = 61*2^121 + 61*2^76
1067      *        < 64*2^121 + 64*2^76
1068      *        = 2^127 + 2^82
1069      *        < 2^128
1070      */
1071     felem_reduce(alpha, tmp);
1072
1073     /* x' = alpha^2 - 8*beta */
1074     felem_square(tmp, alpha);
1075     /*
1076      * tmp[i] < 17*2^120 < 2^125
1077      */
1078     felem_assign(ftmp, beta);
1079     felem_scalar64(ftmp, 8);
1080     /* ftmp[i] < 2^62 + 2^17 */
1081     felem_diff_128_64(tmp, ftmp);
1082     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1083     felem_reduce(x_out, tmp);
1084
1085     /* z' = (y + z)^2 - gamma - delta */
1086     felem_sum64(delta, gamma);
1087     /* delta[i] < 2^60 + 2^15 */
1088     felem_assign(ftmp, y_in);
1089     felem_sum64(ftmp, z_in);
1090     /* ftmp[i] < 2^60 + 2^15 */
1091     felem_square(tmp, ftmp);
1092     /*
1093      * tmp[i] < 17(2^122) < 2^127
1094      */
1095     felem_diff_128_64(tmp, delta);
1096     /* tmp[i] < 2^127 + 2^63 */
1097     felem_reduce(z_out, tmp);
1098
1099     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1100     felem_scalar64(beta, 4);
1101     /* beta[i] < 2^61 + 2^16 */
1102     felem_diff64(beta, x_out);
1103     /* beta[i] < 2^61 + 2^60 + 2^16 */
1104     felem_mul(tmp, alpha, beta);
1105     /*-
1106      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1107      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1108      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1109      *        < 2^128
1110      */
1111     felem_square(tmp2, gamma);
1112     /*-
1113      * tmp2[i] < 17*(2^59 + 2^14)^2
1114      *         = 17*(2^118 + 2^74 + 2^28)
1115      */
1116     felem_scalar128(tmp2, 8);
1117     /*-
1118      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1119      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1120      *         < 2^126
1121      */
1122     felem_diff128(tmp, tmp2);
1123     /*-
1124      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1126      *          2^74 + 2^69 + 2^34 + 2^30
1127      *        < 2^128
1128      */
1129     felem_reduce(y_out, tmp);
1130 }
1131
1132 /* copy_conditional copies in to out iff mask is all ones. */
1133 static void copy_conditional(felem out, const felem in, limb mask)
1134 {
1135     unsigned i;
1136     for (i = 0; i < NLIMBS; ++i) {
1137         const limb tmp = mask & (in[i] ^ out[i]);
1138         out[i] ^= tmp;
1139     }
1140 }
1141
1142 /*-
1143  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144  *
1145  * The method is taken from
1146  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148  *
1149  * This function includes a branch for checking whether the two input points
1150  * are equal (while not equal to the point at infinity). See comment below
1151  * on constant-time.
1152  */
1153 static void point_add(felem x3, felem y3, felem z3,
1154                       const felem x1, const felem y1, const felem z1,
1155                       const int mixed, const felem x2, const felem y2,
1156                       const felem z2)
1157 {
1158     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1159     largefelem tmp, tmp2;
1160     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1161
1162     z1_is_zero = felem_is_zero(z1);
1163     z2_is_zero = felem_is_zero(z2);
1164
1165     /* ftmp = z1z1 = z1**2 */
1166     felem_square(tmp, z1);
1167     felem_reduce(ftmp, tmp);
1168
1169     if (!mixed) {
1170         /* ftmp2 = z2z2 = z2**2 */
1171         felem_square(tmp, z2);
1172         felem_reduce(ftmp2, tmp);
1173
1174         /* u1 = ftmp3 = x1*z2z2 */
1175         felem_mul(tmp, x1, ftmp2);
1176         felem_reduce(ftmp3, tmp);
1177
1178         /* ftmp5 = z1 + z2 */
1179         felem_assign(ftmp5, z1);
1180         felem_sum64(ftmp5, z2);
1181         /* ftmp5[i] < 2^61 */
1182
1183         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1184         felem_square(tmp, ftmp5);
1185         /* tmp[i] < 17*2^122 */
1186         felem_diff_128_64(tmp, ftmp);
1187         /* tmp[i] < 17*2^122 + 2^63 */
1188         felem_diff_128_64(tmp, ftmp2);
1189         /* tmp[i] < 17*2^122 + 2^64 */
1190         felem_reduce(ftmp5, tmp);
1191
1192         /* ftmp2 = z2 * z2z2 */
1193         felem_mul(tmp, ftmp2, z2);
1194         felem_reduce(ftmp2, tmp);
1195
1196         /* s1 = ftmp6 = y1 * z2**3 */
1197         felem_mul(tmp, y1, ftmp2);
1198         felem_reduce(ftmp6, tmp);
1199     } else {
1200         /*
1201          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1202          */
1203
1204         /* u1 = ftmp3 = x1*z2z2 */
1205         felem_assign(ftmp3, x1);
1206
1207         /* ftmp5 = 2*z1z2 */
1208         felem_scalar(ftmp5, z1, 2);
1209
1210         /* s1 = ftmp6 = y1 * z2**3 */
1211         felem_assign(ftmp6, y1);
1212     }
1213
1214     /* u2 = x2*z1z1 */
1215     felem_mul(tmp, x2, ftmp);
1216     /* tmp[i] < 17*2^120 */
1217
1218     /* h = ftmp4 = u2 - u1 */
1219     felem_diff_128_64(tmp, ftmp3);
1220     /* tmp[i] < 17*2^120 + 2^63 */
1221     felem_reduce(ftmp4, tmp);
1222
1223     x_equal = felem_is_zero(ftmp4);
1224
1225     /* z_out = ftmp5 * h */
1226     felem_mul(tmp, ftmp5, ftmp4);
1227     felem_reduce(z_out, tmp);
1228
1229     /* ftmp = z1 * z1z1 */
1230     felem_mul(tmp, ftmp, z1);
1231     felem_reduce(ftmp, tmp);
1232
1233     /* s2 = tmp = y2 * z1**3 */
1234     felem_mul(tmp, y2, ftmp);
1235     /* tmp[i] < 17*2^120 */
1236
1237     /* r = ftmp5 = (s2 - s1)*2 */
1238     felem_diff_128_64(tmp, ftmp6);
1239     /* tmp[i] < 17*2^120 + 2^63 */
1240     felem_reduce(ftmp5, tmp);
1241     y_equal = felem_is_zero(ftmp5);
1242     felem_scalar64(ftmp5, 2);
1243     /* ftmp5[i] < 2^61 */
1244
1245     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1246         /*
1247          * This is obviously not constant-time but it will almost-never happen
1248          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1249          * where the intermediate value gets very close to the group order.
1250          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1251          * the scalar, it's possible for the intermediate value to be a small
1252          * negative multiple of the base point, and for the final signed digit
1253          * to be the same value. We believe that this only occurs for the scalar
1254          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1255          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1256          * 71e913863f7, in that case the penultimate intermediate is -9G and
1257          * the final digit is also -9G. Since this only happens for a single
1258          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1259          * check whether a secret scalar was that exact value, can already do
1260          * so.)
1261          */
1262         point_double(x3, y3, z3, x1, y1, z1);
1263         return;
1264     }
1265
1266     /* I = ftmp = (2h)**2 */
1267     felem_assign(ftmp, ftmp4);
1268     felem_scalar64(ftmp, 2);
1269     /* ftmp[i] < 2^61 */
1270     felem_square(tmp, ftmp);
1271     /* tmp[i] < 17*2^122 */
1272     felem_reduce(ftmp, tmp);
1273
1274     /* J = ftmp2 = h * I */
1275     felem_mul(tmp, ftmp4, ftmp);
1276     felem_reduce(ftmp2, tmp);
1277
1278     /* V = ftmp4 = U1 * I */
1279     felem_mul(tmp, ftmp3, ftmp);
1280     felem_reduce(ftmp4, tmp);
1281
1282     /* x_out = r**2 - J - 2V */
1283     felem_square(tmp, ftmp5);
1284     /* tmp[i] < 17*2^122 */
1285     felem_diff_128_64(tmp, ftmp2);
1286     /* tmp[i] < 17*2^122 + 2^63 */
1287     felem_assign(ftmp3, ftmp4);
1288     felem_scalar64(ftmp4, 2);
1289     /* ftmp4[i] < 2^61 */
1290     felem_diff_128_64(tmp, ftmp4);
1291     /* tmp[i] < 17*2^122 + 2^64 */
1292     felem_reduce(x_out, tmp);
1293
1294     /* y_out = r(V-x_out) - 2 * s1 * J */
1295     felem_diff64(ftmp3, x_out);
1296     /*
1297      * ftmp3[i] < 2^60 + 2^60 = 2^61
1298      */
1299     felem_mul(tmp, ftmp5, ftmp3);
1300     /* tmp[i] < 17*2^122 */
1301     felem_mul(tmp2, ftmp6, ftmp2);
1302     /* tmp2[i] < 17*2^120 */
1303     felem_scalar128(tmp2, 2);
1304     /* tmp2[i] < 17*2^121 */
1305     felem_diff128(tmp, tmp2);
1306         /*-
1307          * tmp[i] < 2^127 - 2^69 + 17*2^122
1308          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1309          *        < 2^127
1310          */
1311     felem_reduce(y_out, tmp);
1312
1313     copy_conditional(x_out, x2, z1_is_zero);
1314     copy_conditional(x_out, x1, z2_is_zero);
1315     copy_conditional(y_out, y2, z1_is_zero);
1316     copy_conditional(y_out, y1, z2_is_zero);
1317     copy_conditional(z_out, z2, z1_is_zero);
1318     copy_conditional(z_out, z1, z2_is_zero);
1319     felem_assign(x3, x_out);
1320     felem_assign(y3, y_out);
1321     felem_assign(z3, z_out);
1322 }
1323
1324 /*-
1325  * Base point pre computation
1326  * --------------------------
1327  *
1328  * Two different sorts of precomputed tables are used in the following code.
1329  * Each contain various points on the curve, where each point is three field
1330  * elements (x, y, z).
1331  *
1332  * For the base point table, z is usually 1 (0 for the point at infinity).
1333  * This table has 16 elements:
1334  * index | bits    | point
1335  * ------+---------+------------------------------
1336  *     0 | 0 0 0 0 | 0G
1337  *     1 | 0 0 0 1 | 1G
1338  *     2 | 0 0 1 0 | 2^130G
1339  *     3 | 0 0 1 1 | (2^130 + 1)G
1340  *     4 | 0 1 0 0 | 2^260G
1341  *     5 | 0 1 0 1 | (2^260 + 1)G
1342  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1343  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1344  *     8 | 1 0 0 0 | 2^390G
1345  *     9 | 1 0 0 1 | (2^390 + 1)G
1346  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1347  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1348  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1349  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1350  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1351  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1352  *
1353  * The reason for this is so that we can clock bits into four different
1354  * locations when doing simple scalar multiplies against the base point.
1355  *
1356  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1357
1358 /* gmul is the table of precomputed base points */
1359 static const felem gmul[16][3] = {
1360 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1361  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1362  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1363 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1364   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1365   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1366  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1367   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1368   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1369  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1370 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1371   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1372   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1373  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1374   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1375   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1376  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1377 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1378   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1379   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1380  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1381   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1382   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1383  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1384 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1385   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1386   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1387  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1388   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1389   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1390  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1391 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1392   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1393   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1394  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1395   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1396   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1397  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1398 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1399   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1400   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1401  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1402   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1403   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1404  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1405 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1406   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1407   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1408  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1409   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1410   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1411  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1412 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1413   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1414   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1415  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1416   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1417   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1418  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1419 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1420   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1421   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1422  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1423   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1424   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1425  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1426 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1427   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1428   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1429  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1430   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1431   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1432  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1433 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1434   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1435   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1436  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1437   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1438   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1439  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1440 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1441   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1442   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1443  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1444   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1445   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1446  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1447 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1448   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1449   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1450  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1451   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1452   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1453  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1454 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1455   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1456   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1457  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1458   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1459   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1460  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1461 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1462   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1463   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1464  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1465   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1466   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1467  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1468 };
1469
1470 /*
1471  * select_point selects the |idx|th point from a precomputation table and
1472  * copies it to out.
1473  */
1474  /* pre_comp below is of the size provided in |size| */
1475 static void select_point(const limb idx, unsigned int size,
1476                          const felem pre_comp[][3], felem out[3])
1477 {
1478     unsigned i, j;
1479     limb *outlimbs = &out[0][0];
1480
1481     memset(out, 0, sizeof(*out) * 3);
1482
1483     for (i = 0; i < size; i++) {
1484         const limb *inlimbs = &pre_comp[i][0][0];
1485         limb mask = i ^ idx;
1486         mask |= mask >> 4;
1487         mask |= mask >> 2;
1488         mask |= mask >> 1;
1489         mask &= 1;
1490         mask--;
1491         for (j = 0; j < NLIMBS * 3; j++)
1492             outlimbs[j] |= inlimbs[j] & mask;
1493     }
1494 }
1495
1496 /* get_bit returns the |i|th bit in |in| */
1497 static char get_bit(const felem_bytearray in, int i)
1498 {
1499     if (i < 0)
1500         return 0;
1501     return (in[i >> 3] >> (i & 7)) & 1;
1502 }
1503
1504 /*
1505  * Interleaved point multiplication using precomputed point multiples: The
1506  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1507  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1508  * generator, using certain (large) precomputed multiples in g_pre_comp.
1509  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1510  */
1511 static void batch_mul(felem x_out, felem y_out, felem z_out,
1512                       const felem_bytearray scalars[],
1513                       const unsigned num_points, const u8 *g_scalar,
1514                       const int mixed, const felem pre_comp[][17][3],
1515                       const felem g_pre_comp[16][3])
1516 {
1517     int i, skip;
1518     unsigned num, gen_mul = (g_scalar != NULL);
1519     felem nq[3], tmp[4];
1520     limb bits;
1521     u8 sign, digit;
1522
1523     /* set nq to the point at infinity */
1524     memset(nq, 0, sizeof(nq));
1525
1526     /*
1527      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1528      * of the generator (last quarter of rounds) and additions of other
1529      * points multiples (every 5th round).
1530      */
1531     skip = 1;                   /* save two point operations in the first
1532                                  * round */
1533     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1534         /* double */
1535         if (!skip)
1536             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1537
1538         /* add multiples of the generator */
1539         if (gen_mul && (i <= 130)) {
1540             bits = get_bit(g_scalar, i + 390) << 3;
1541             if (i < 130) {
1542                 bits |= get_bit(g_scalar, i + 260) << 2;
1543                 bits |= get_bit(g_scalar, i + 130) << 1;
1544                 bits |= get_bit(g_scalar, i);
1545             }
1546             /* select the point to add, in constant time */
1547             select_point(bits, 16, g_pre_comp, tmp);
1548             if (!skip) {
1549                 /* The 1 argument below is for "mixed" */
1550                 point_add(nq[0], nq[1], nq[2],
1551                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1552             } else {
1553                 memcpy(nq, tmp, 3 * sizeof(felem));
1554                 skip = 0;
1555             }
1556         }
1557
1558         /* do other additions every 5 doublings */
1559         if (num_points && (i % 5 == 0)) {
1560             /* loop over all scalars */
1561             for (num = 0; num < num_points; ++num) {
1562                 bits = get_bit(scalars[num], i + 4) << 5;
1563                 bits |= get_bit(scalars[num], i + 3) << 4;
1564                 bits |= get_bit(scalars[num], i + 2) << 3;
1565                 bits |= get_bit(scalars[num], i + 1) << 2;
1566                 bits |= get_bit(scalars[num], i) << 1;
1567                 bits |= get_bit(scalars[num], i - 1);
1568                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1569
1570                 /*
1571                  * select the point to add or subtract, in constant time
1572                  */
1573                 select_point(digit, 17, pre_comp[num], tmp);
1574                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1575                                             * point */
1576                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1577
1578                 if (!skip) {
1579                     point_add(nq[0], nq[1], nq[2],
1580                               nq[0], nq[1], nq[2],
1581                               mixed, tmp[0], tmp[1], tmp[2]);
1582                 } else {
1583                     memcpy(nq, tmp, 3 * sizeof(felem));
1584                     skip = 0;
1585                 }
1586             }
1587         }
1588     }
1589     felem_assign(x_out, nq[0]);
1590     felem_assign(y_out, nq[1]);
1591     felem_assign(z_out, nq[2]);
1592 }
1593
1594 /* Precomputation for the group generator. */
1595 struct nistp521_pre_comp_st {
1596     felem g_pre_comp[16][3];
1597     CRYPTO_REF_COUNT references;
1598     CRYPTO_RWLOCK *lock;
1599 };
1600
1601 const EC_METHOD *EC_GFp_nistp521_method(void)
1602 {
1603     static const EC_METHOD ret = {
1604         EC_FLAGS_DEFAULT_OCT,
1605         NID_X9_62_prime_field,
1606         ec_GFp_nistp521_group_init,
1607         ec_GFp_simple_group_finish,
1608         ec_GFp_simple_group_clear_finish,
1609         ec_GFp_nist_group_copy,
1610         ec_GFp_nistp521_group_set_curve,
1611         ec_GFp_simple_group_get_curve,
1612         ec_GFp_simple_group_get_degree,
1613         ec_group_simple_order_bits,
1614         ec_GFp_simple_group_check_discriminant,
1615         ec_GFp_simple_point_init,
1616         ec_GFp_simple_point_finish,
1617         ec_GFp_simple_point_clear_finish,
1618         ec_GFp_simple_point_copy,
1619         ec_GFp_simple_point_set_to_infinity,
1620         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1621         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1622         ec_GFp_simple_point_set_affine_coordinates,
1623         ec_GFp_nistp521_point_get_affine_coordinates,
1624         0 /* point_set_compressed_coordinates */ ,
1625         0 /* point2oct */ ,
1626         0 /* oct2point */ ,
1627         ec_GFp_simple_add,
1628         ec_GFp_simple_dbl,
1629         ec_GFp_simple_invert,
1630         ec_GFp_simple_is_at_infinity,
1631         ec_GFp_simple_is_on_curve,
1632         ec_GFp_simple_cmp,
1633         ec_GFp_simple_make_affine,
1634         ec_GFp_simple_points_make_affine,
1635         ec_GFp_nistp521_points_mul,
1636         ec_GFp_nistp521_precompute_mult,
1637         ec_GFp_nistp521_have_precompute_mult,
1638         ec_GFp_nist_field_mul,
1639         ec_GFp_nist_field_sqr,
1640         0 /* field_div */ ,
1641         ec_GFp_simple_field_inv,
1642         0 /* field_encode */ ,
1643         0 /* field_decode */ ,
1644         0,                      /* field_set_to_one */
1645         ec_key_simple_priv2oct,
1646         ec_key_simple_oct2priv,
1647         0, /* set private */
1648         ec_key_simple_generate_key,
1649         ec_key_simple_check_key,
1650         ec_key_simple_generate_public_key,
1651         0, /* keycopy */
1652         0, /* keyfinish */
1653         ecdh_simple_compute_key,
1654         ecdsa_simple_sign_setup,
1655         ecdsa_simple_sign_sig,
1656         ecdsa_simple_verify_sig,
1657         0, /* field_inverse_mod_ord */
1658         0, /* blind_coordinates */
1659         0, /* ladder_pre */
1660         0, /* ladder_step */
1661         0  /* ladder_post */
1662     };
1663
1664     return &ret;
1665 }
1666
1667 /******************************************************************************/
1668 /*
1669  * FUNCTIONS TO MANAGE PRECOMPUTATION
1670  */
1671
1672 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1673 {
1674     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1675
1676     if (ret == NULL) {
1677         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1678         return ret;
1679     }
1680
1681     ret->references = 1;
1682
1683     ret->lock = CRYPTO_THREAD_lock_new();
1684     if (ret->lock == NULL) {
1685         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1686         OPENSSL_free(ret);
1687         return NULL;
1688     }
1689     return ret;
1690 }
1691
1692 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1693 {
1694     int i;
1695     if (p != NULL)
1696         CRYPTO_UP_REF(&p->references, &i, p->lock);
1697     return p;
1698 }
1699
1700 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1701 {
1702     int i;
1703
1704     if (p == NULL)
1705         return;
1706
1707     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1708     REF_PRINT_COUNT("EC_nistp521", x);
1709     if (i > 0)
1710         return;
1711     REF_ASSERT_ISNT(i < 0);
1712
1713     CRYPTO_THREAD_lock_free(p->lock);
1714     OPENSSL_free(p);
1715 }
1716
1717 /******************************************************************************/
1718 /*
1719  * OPENSSL EC_METHOD FUNCTIONS
1720  */
1721
1722 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1723 {
1724     int ret;
1725     ret = ec_GFp_simple_group_init(group);
1726     group->a_is_minus3 = 1;
1727     return ret;
1728 }
1729
1730 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1731                                     const BIGNUM *a, const BIGNUM *b,
1732                                     BN_CTX *ctx)
1733 {
1734     int ret = 0;
1735     BIGNUM *curve_p, *curve_a, *curve_b;
1736 #ifndef FIPS_MODE
1737     BN_CTX *new_ctx = NULL;
1738
1739     if (ctx == NULL)
1740         ctx = new_ctx = BN_CTX_new();
1741 #endif
1742     if (ctx == NULL)
1743         return 0;
1744
1745     BN_CTX_start(ctx);
1746     curve_p = BN_CTX_get(ctx);
1747     curve_a = BN_CTX_get(ctx);
1748     curve_b = BN_CTX_get(ctx);
1749     if (curve_b == NULL)
1750         goto err;
1751     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1752     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1753     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1754     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1755         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1756               EC_R_WRONG_CURVE_PARAMETERS);
1757         goto err;
1758     }
1759     group->field_mod_func = BN_nist_mod_521;
1760     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1761  err:
1762     BN_CTX_end(ctx);
1763 #ifndef FIPS_MODE
1764     BN_CTX_free(new_ctx);
1765 #endif
1766     return ret;
1767 }
1768
1769 /*
1770  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1771  * (X/Z^2, Y/Z^3)
1772  */
1773 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1774                                                  const EC_POINT *point,
1775                                                  BIGNUM *x, BIGNUM *y,
1776                                                  BN_CTX *ctx)
1777 {
1778     felem z1, z2, x_in, y_in, x_out, y_out;
1779     largefelem tmp;
1780
1781     if (EC_POINT_is_at_infinity(group, point)) {
1782         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1783               EC_R_POINT_AT_INFINITY);
1784         return 0;
1785     }
1786     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1787         (!BN_to_felem(z1, point->Z)))
1788         return 0;
1789     felem_inv(z2, z1);
1790     felem_square(tmp, z2);
1791     felem_reduce(z1, tmp);
1792     felem_mul(tmp, x_in, z1);
1793     felem_reduce(x_in, tmp);
1794     felem_contract(x_out, x_in);
1795     if (x != NULL) {
1796         if (!felem_to_BN(x, x_out)) {
1797             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1798                   ERR_R_BN_LIB);
1799             return 0;
1800         }
1801     }
1802     felem_mul(tmp, z1, z2);
1803     felem_reduce(z1, tmp);
1804     felem_mul(tmp, y_in, z1);
1805     felem_reduce(y_in, tmp);
1806     felem_contract(y_out, y_in);
1807     if (y != NULL) {
1808         if (!felem_to_BN(y, y_out)) {
1809             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1810                   ERR_R_BN_LIB);
1811             return 0;
1812         }
1813     }
1814     return 1;
1815 }
1816
1817 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1818 static void make_points_affine(size_t num, felem points[][3],
1819                                felem tmp_felems[])
1820 {
1821     /*
1822      * Runs in constant time, unless an input is the point at infinity (which
1823      * normally shouldn't happen).
1824      */
1825     ec_GFp_nistp_points_make_affine_internal(num,
1826                                              points,
1827                                              sizeof(felem),
1828                                              tmp_felems,
1829                                              (void (*)(void *))felem_one,
1830                                              felem_is_zero_int,
1831                                              (void (*)(void *, const void *))
1832                                              felem_assign,
1833                                              (void (*)(void *, const void *))
1834                                              felem_square_reduce, (void (*)
1835                                                                    (void *,
1836                                                                     const void
1837                                                                     *,
1838                                                                     const void
1839                                                                     *))
1840                                              felem_mul_reduce,
1841                                              (void (*)(void *, const void *))
1842                                              felem_inv,
1843                                              (void (*)(void *, const void *))
1844                                              felem_contract);
1845 }
1846
1847 /*
1848  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1849  * values Result is stored in r (r can equal one of the inputs).
1850  */
1851 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1852                                const BIGNUM *scalar, size_t num,
1853                                const EC_POINT *points[],
1854                                const BIGNUM *scalars[], BN_CTX *ctx)
1855 {
1856     int ret = 0;
1857     int j;
1858     int mixed = 0;
1859     BIGNUM *x, *y, *z, *tmp_scalar;
1860     felem_bytearray g_secret;
1861     felem_bytearray *secrets = NULL;
1862     felem (*pre_comp)[17][3] = NULL;
1863     felem *tmp_felems = NULL;
1864     unsigned i;
1865     int num_bytes;
1866     int have_pre_comp = 0;
1867     size_t num_points = num;
1868     felem x_in, y_in, z_in, x_out, y_out, z_out;
1869     NISTP521_PRE_COMP *pre = NULL;
1870     felem(*g_pre_comp)[3] = NULL;
1871     EC_POINT *generator = NULL;
1872     const EC_POINT *p = NULL;
1873     const BIGNUM *p_scalar = NULL;
1874
1875     BN_CTX_start(ctx);
1876     x = BN_CTX_get(ctx);
1877     y = BN_CTX_get(ctx);
1878     z = BN_CTX_get(ctx);
1879     tmp_scalar = BN_CTX_get(ctx);
1880     if (tmp_scalar == NULL)
1881         goto err;
1882
1883     if (scalar != NULL) {
1884         pre = group->pre_comp.nistp521;
1885         if (pre)
1886             /* we have precomputation, try to use it */
1887             g_pre_comp = &pre->g_pre_comp[0];
1888         else
1889             /* try to use the standard precomputation */
1890             g_pre_comp = (felem(*)[3]) gmul;
1891         generator = EC_POINT_new(group);
1892         if (generator == NULL)
1893             goto err;
1894         /* get the generator from precomputation */
1895         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1896             !felem_to_BN(y, g_pre_comp[1][1]) ||
1897             !felem_to_BN(z, g_pre_comp[1][2])) {
1898             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1899             goto err;
1900         }
1901         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1902                                                       generator, x, y, z,
1903                                                       ctx))
1904             goto err;
1905         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1906             /* precomputation matches generator */
1907             have_pre_comp = 1;
1908         else
1909             /*
1910              * we don't have valid precomputation: treat the generator as a
1911              * random point
1912              */
1913             num_points++;
1914     }
1915
1916     if (num_points > 0) {
1917         if (num_points >= 2) {
1918             /*
1919              * unless we precompute multiples for just one point, converting
1920              * those into affine form is time well spent
1921              */
1922             mixed = 1;
1923         }
1924         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1925         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1926         if (mixed)
1927             tmp_felems =
1928                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1929         if ((secrets == NULL) || (pre_comp == NULL)
1930             || (mixed && (tmp_felems == NULL))) {
1931             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1932             goto err;
1933         }
1934
1935         /*
1936          * we treat NULL scalars as 0, and NULL points as points at infinity,
1937          * i.e., they contribute nothing to the linear combination
1938          */
1939         for (i = 0; i < num_points; ++i) {
1940             if (i == num) {
1941                 /*
1942                  * we didn't have a valid precomputation, so we pick the
1943                  * generator
1944                  */
1945                 p = EC_GROUP_get0_generator(group);
1946                 p_scalar = scalar;
1947             } else {
1948                 /* the i^th point */
1949                 p = points[i];
1950                 p_scalar = scalars[i];
1951             }
1952             if ((p_scalar != NULL) && (p != NULL)) {
1953                 /* reduce scalar to 0 <= scalar < 2^521 */
1954                 if ((BN_num_bits(p_scalar) > 521)
1955                     || (BN_is_negative(p_scalar))) {
1956                     /*
1957                      * this is an unusual input, and we don't guarantee
1958                      * constant-timeness
1959                      */
1960                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1961                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1962                         goto err;
1963                     }
1964                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1965                                                secrets[i], sizeof(secrets[i]));
1966                 } else {
1967                     num_bytes = BN_bn2lebinpad(p_scalar,
1968                                                secrets[i], sizeof(secrets[i]));
1969                 }
1970                 if (num_bytes < 0) {
1971                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1972                     goto err;
1973                 }
1974                 /* precompute multiples */
1975                 if ((!BN_to_felem(x_out, p->X)) ||
1976                     (!BN_to_felem(y_out, p->Y)) ||
1977                     (!BN_to_felem(z_out, p->Z)))
1978                     goto err;
1979                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1980                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1981                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1982                 for (j = 2; j <= 16; ++j) {
1983                     if (j & 1) {
1984                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1985                                   pre_comp[i][j][2], pre_comp[i][1][0],
1986                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1987                                   pre_comp[i][j - 1][0],
1988                                   pre_comp[i][j - 1][1],
1989                                   pre_comp[i][j - 1][2]);
1990                     } else {
1991                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1992                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1993                                      pre_comp[i][j / 2][1],
1994                                      pre_comp[i][j / 2][2]);
1995                     }
1996                 }
1997             }
1998         }
1999         if (mixed)
2000             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2001     }
2002
2003     /* the scalar for the generator */
2004     if ((scalar != NULL) && (have_pre_comp)) {
2005         memset(g_secret, 0, sizeof(g_secret));
2006         /* reduce scalar to 0 <= scalar < 2^521 */
2007         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2008             /*
2009              * this is an unusual input, and we don't guarantee
2010              * constant-timeness
2011              */
2012             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2013                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2014                 goto err;
2015             }
2016             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2017         } else {
2018             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2019         }
2020         /* do the multiplication with generator precomputation */
2021         batch_mul(x_out, y_out, z_out,
2022                   (const felem_bytearray(*))secrets, num_points,
2023                   g_secret,
2024                   mixed, (const felem(*)[17][3])pre_comp,
2025                   (const felem(*)[3])g_pre_comp);
2026     } else {
2027         /* do the multiplication without generator precomputation */
2028         batch_mul(x_out, y_out, z_out,
2029                   (const felem_bytearray(*))secrets, num_points,
2030                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2031     }
2032     /* reduce the output to its unique minimal representation */
2033     felem_contract(x_in, x_out);
2034     felem_contract(y_in, y_out);
2035     felem_contract(z_in, z_out);
2036     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2037         (!felem_to_BN(z, z_in))) {
2038         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2039         goto err;
2040     }
2041     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2042
2043  err:
2044     BN_CTX_end(ctx);
2045     EC_POINT_free(generator);
2046     OPENSSL_free(secrets);
2047     OPENSSL_free(pre_comp);
2048     OPENSSL_free(tmp_felems);
2049     return ret;
2050 }
2051
2052 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2053 {
2054     int ret = 0;
2055     NISTP521_PRE_COMP *pre = NULL;
2056     int i, j;
2057     BIGNUM *x, *y;
2058     EC_POINT *generator = NULL;
2059     felem tmp_felems[16];
2060 #ifndef FIPS_MODE
2061     BN_CTX *new_ctx = NULL;
2062 #endif
2063
2064     /* throw away old precomputation */
2065     EC_pre_comp_free(group);
2066
2067 #ifndef FIPS_MODE
2068     if (ctx == NULL)
2069         ctx = new_ctx = BN_CTX_new();
2070 #endif
2071     if (ctx == NULL)
2072         return 0;
2073
2074     BN_CTX_start(ctx);
2075     x = BN_CTX_get(ctx);
2076     y = BN_CTX_get(ctx);
2077     if (y == NULL)
2078         goto err;
2079     /* get the generator */
2080     if (group->generator == NULL)
2081         goto err;
2082     generator = EC_POINT_new(group);
2083     if (generator == NULL)
2084         goto err;
2085     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2086     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2087     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2088         goto err;
2089     if ((pre = nistp521_pre_comp_new()) == NULL)
2090         goto err;
2091     /*
2092      * if the generator is the standard one, use built-in precomputation
2093      */
2094     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2095         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2096         goto done;
2097     }
2098     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2099         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2100         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2101         goto err;
2102     /* compute 2^130*G, 2^260*G, 2^390*G */
2103     for (i = 1; i <= 4; i <<= 1) {
2104         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2105                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2106                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2107         for (j = 0; j < 129; ++j) {
2108             point_double(pre->g_pre_comp[2 * i][0],
2109                          pre->g_pre_comp[2 * i][1],
2110                          pre->g_pre_comp[2 * i][2],
2111                          pre->g_pre_comp[2 * i][0],
2112                          pre->g_pre_comp[2 * i][1],
2113                          pre->g_pre_comp[2 * i][2]);
2114         }
2115     }
2116     /* g_pre_comp[0] is the point at infinity */
2117     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2118     /* the remaining multiples */
2119     /* 2^130*G + 2^260*G */
2120     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2121               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2122               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2123               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2124               pre->g_pre_comp[2][2]);
2125     /* 2^130*G + 2^390*G */
2126     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2127               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2128               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2129               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2130               pre->g_pre_comp[2][2]);
2131     /* 2^260*G + 2^390*G */
2132     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2133               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2134               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2135               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2136               pre->g_pre_comp[4][2]);
2137     /* 2^130*G + 2^260*G + 2^390*G */
2138     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2139               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2140               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2141               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2142               pre->g_pre_comp[2][2]);
2143     for (i = 1; i < 8; ++i) {
2144         /* odd multiples: add G */
2145         point_add(pre->g_pre_comp[2 * i + 1][0],
2146                   pre->g_pre_comp[2 * i + 1][1],
2147                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2148                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2149                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2150                   pre->g_pre_comp[1][2]);
2151     }
2152     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2153
2154  done:
2155     SETPRECOMP(group, nistp521, pre);
2156     ret = 1;
2157     pre = NULL;
2158  err:
2159     BN_CTX_end(ctx);
2160     EC_POINT_free(generator);
2161 #ifndef FIPS_MODE
2162     BN_CTX_free(new_ctx);
2163 #endif
2164     EC_nistp521_pre_comp_free(pre);
2165     return ret;
2166 }
2167
2168 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2169 {
2170     return HAVEPRECOMP(group, nistp521);
2171 }
2172
2173 #endif