crypto/ec/ecp_nistp256.c

   1 /* crypto/ec/ecp_nistp256.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 # include <stdint.h>
  33 # include <string.h>
  34 # include <openssl/err.h>
  35 # include "ec_lcl.h"
  36
  37 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  38   /* even with gcc, the typedef won't work for 32-bit platforms */
  39 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  40                                  * platforms */
  41 typedef __int128_t int128_t;
  42 # else
  43 #  error "Need GCC 3.1 or later to define type uint128_t"
  44 # endif
  45
  46 typedef uint8_t u8;
  47 typedef uint32_t u32;
  48 typedef uint64_t u64;
  49 typedef int64_t s64;
  50
  51 /*
  52  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  53  * can serialise an element of this field into 32 bytes. We call this an
  54  * felem_bytearray.
  55  */
  56
  57 typedef u8 felem_bytearray[32];
  58
  59 /*
  60  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  61  * values are big-endian.
  62  */
  63 static const felem_bytearray nistp256_curve_params[5] = {
  64     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  65      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  66      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  67      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  68     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  69      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  70      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  72     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  73      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  74      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  75      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  76     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  77      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  78      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  79      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  80     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  81      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  82      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  83      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  84 };
  85
  86 /*-
  87  * The representation of field elements.
  88  * ------------------------------------
  89  *
  90  * We represent field elements with either four 128-bit values, eight 128-bit
  91  * values, or four 64-bit values. The field element represented is:
  92  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  93  * or:
  94  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
  95  *
  96  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  97  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  98  * with the least significant bits of the next.
  99  *
 100  * A field element with four limbs is an 'felem'. One with eight limbs is a
 101  * 'longfelem'
 102  *
 103  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 104  * values are used as intermediate values before multiplication.
 105  */
 106
 107 # define NLIMBS 4
 108
 109 typedef uint128_t limb;
 110 typedef limb felem[NLIMBS];
 111 typedef limb longfelem[NLIMBS * 2];
 112 typedef u64 smallfelem[NLIMBS];
 113
 114 /* This is the value of the prime as four 64-bit words, little-endian. */
 115 static const u64 kPrime[4] =
 116     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 117 static const u64 bottom63bits = 0x7ffffffffffffffful;
 118
 119 /*
 120  * bin32_to_felem takes a little-endian byte array and converts it into felem
 121  * form. This assumes that the CPU is little-endian.
 122  */
 123 static void bin32_to_felem(felem out, const u8 in[32])
 124 {
 125     out[0] = *((u64 *)&in[0]);
 126     out[1] = *((u64 *)&in[8]);
 127     out[2] = *((u64 *)&in[16]);
 128     out[3] = *((u64 *)&in[24]);
 129 }
 130
 131 /*
 132  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 133  * endian, 32 byte array. This assumes that the CPU is little-endian.
 134  */
 135 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 136 {
 137     *((u64 *)&out[0]) = in[0];
 138     *((u64 *)&out[8]) = in[1];
 139     *((u64 *)&out[16]) = in[2];
 140     *((u64 *)&out[24]) = in[3];
 141 }
 142
 143 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 144 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 145 {
 146     unsigned i;
 147     for (i = 0; i < len; ++i)
 148         out[i] = in[len - 1 - i];
 149 }
 150
 151 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 152 static int BN_to_felem(felem out, const BIGNUM *bn)
 153 {
 154     felem_bytearray b_in;
 155     felem_bytearray b_out;
 156     unsigned num_bytes;
 157
 158     /* BN_bn2bin eats leading zeroes */
 159     memset(b_out, 0, sizeof b_out);
 160     num_bytes = BN_num_bytes(bn);
 161     if (num_bytes > sizeof b_out) {
 162         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 163         return 0;
 164     }
 165     if (BN_is_negative(bn)) {
 166         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 167         return 0;
 168     }
 169     num_bytes = BN_bn2bin(bn, b_in);
 170     flip_endian(b_out, b_in, num_bytes);
 171     bin32_to_felem(out, b_out);
 172     return 1;
 173 }
 174
 175 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 176 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 177 {
 178     felem_bytearray b_in, b_out;
 179     smallfelem_to_bin32(b_in, in);
 180     flip_endian(b_out, b_in, sizeof b_out);
 181     return BN_bin2bn(b_out, sizeof b_out, out);
 182 }
 183
 184 /*-
 185  * Field operations
 186  * ----------------
 187  */
 188
 189 static void smallfelem_one(smallfelem out)
 190 {
 191     out[0] = 1;
 192     out[1] = 0;
 193     out[2] = 0;
 194     out[3] = 0;
 195 }
 196
 197 static void smallfelem_assign(smallfelem out, const smallfelem in)
 198 {
 199     out[0] = in[0];
 200     out[1] = in[1];
 201     out[2] = in[2];
 202     out[3] = in[3];
 203 }
 204
 205 static void felem_assign(felem out, const felem in)
 206 {
 207     out[0] = in[0];
 208     out[1] = in[1];
 209     out[2] = in[2];
 210     out[3] = in[3];
 211 }
 212
 213 /* felem_sum sets out = out + in. */
 214 static void felem_sum(felem out, const felem in)
 215 {
 216     out[0] += in[0];
 217     out[1] += in[1];
 218     out[2] += in[2];
 219     out[3] += in[3];
 220 }
 221
 222 /* felem_small_sum sets out = out + in. */
 223 static void felem_small_sum(felem out, const smallfelem in)
 224 {
 225     out[0] += in[0];
 226     out[1] += in[1];
 227     out[2] += in[2];
 228     out[3] += in[3];
 229 }
 230
 231 /* felem_scalar sets out = out * scalar */
 232 static void felem_scalar(felem out, const u64 scalar)
 233 {
 234     out[0] *= scalar;
 235     out[1] *= scalar;
 236     out[2] *= scalar;
 237     out[3] *= scalar;
 238 }
 239
 240 /* longfelem_scalar sets out = out * scalar */
 241 static void longfelem_scalar(longfelem out, const u64 scalar)
 242 {
 243     out[0] *= scalar;
 244     out[1] *= scalar;
 245     out[2] *= scalar;
 246     out[3] *= scalar;
 247     out[4] *= scalar;
 248     out[5] *= scalar;
 249     out[6] *= scalar;
 250     out[7] *= scalar;
 251 }
 252
 253 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 254 # define two105 (((limb)1) << 105)
 255 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 256
 257 /* zero105 is 0 mod p */
 258 static const felem zero105 =
 259     { two105m41m9, two105, two105m41p9, two105m41p9 };
 260
 261 /*-
 262  * smallfelem_neg sets |out| to |-small|
 263  * On exit:
 264  *   out[i] < out[i] + 2^105
 265  */
 266 static void smallfelem_neg(felem out, const smallfelem small)
 267 {
 268     /* In order to prevent underflow, we subtract from 0 mod p. */
 269     out[0] = zero105[0] - small[0];
 270     out[1] = zero105[1] - small[1];
 271     out[2] = zero105[2] - small[2];
 272     out[3] = zero105[3] - small[3];
 273 }
 274
 275 /*-
 276  * felem_diff subtracts |in| from |out|
 277  * On entry:
 278  *   in[i] < 2^104
 279  * On exit:
 280  *   out[i] < out[i] + 2^105
 281  */
 282 static void felem_diff(felem out, const felem in)
 283 {
 284     /*
 285      * In order to prevent underflow, we add 0 mod p before subtracting.
 286      */
 287     out[0] += zero105[0];
 288     out[1] += zero105[1];
 289     out[2] += zero105[2];
 290     out[3] += zero105[3];
 291
 292     out[0] -= in[0];
 293     out[1] -= in[1];
 294     out[2] -= in[2];
 295     out[3] -= in[3];
 296 }
 297
 298 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 299 # define two107 (((limb)1) << 107)
 300 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 301
 302 /* zero107 is 0 mod p */
 303 static const felem zero107 =
 304     { two107m43m11, two107, two107m43p11, two107m43p11 };
 305
 306 /*-
 307  * An alternative felem_diff for larger inputs |in|
 308  * felem_diff_zero107 subtracts |in| from |out|
 309  * On entry:
 310  *   in[i] < 2^106
 311  * On exit:
 312  *   out[i] < out[i] + 2^107
 313  */
 314 static void felem_diff_zero107(felem out, const felem in)
 315 {
 316     /*
 317      * In order to prevent underflow, we add 0 mod p before subtracting.
 318      */
 319     out[0] += zero107[0];
 320     out[1] += zero107[1];
 321     out[2] += zero107[2];
 322     out[3] += zero107[3];
 323
 324     out[0] -= in[0];
 325     out[1] -= in[1];
 326     out[2] -= in[2];
 327     out[3] -= in[3];
 328 }
 329
 330 /*-
 331  * longfelem_diff subtracts |in| from |out|
 332  * On entry:
 333  *   in[i] < 7*2^67
 334  * On exit:
 335  *   out[i] < out[i] + 2^70 + 2^40
 336  */
 337 static void longfelem_diff(longfelem out, const longfelem in)
 338 {
 339     static const limb two70m8p6 =
 340         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 341     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 342     static const limb two70 = (((limb) 1) << 70);
 343     static const limb two70m40m38p6 =
 344         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 345         (((limb) 1) << 6);
 346     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 347
 348     /* add 0 mod p to avoid underflow */
 349     out[0] += two70m8p6;
 350     out[1] += two70p40;
 351     out[2] += two70;
 352     out[3] += two70m40m38p6;
 353     out[4] += two70m6;
 354     out[5] += two70m6;
 355     out[6] += two70m6;
 356     out[7] += two70m6;
 357
 358     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 359     out[0] -= in[0];
 360     out[1] -= in[1];
 361     out[2] -= in[2];
 362     out[3] -= in[3];
 363     out[4] -= in[4];
 364     out[5] -= in[5];
 365     out[6] -= in[6];
 366     out[7] -= in[7];
 367 }
 368
 369 # define two64m0 (((limb)1) << 64) - 1
 370 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 371 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 372 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 373
 374 /* zero110 is 0 mod p */
 375 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 376
 377 /*-
 378  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 379  * minimal as the value may be greater than p.
 380  *
 381  * On entry:
 382  *   in[i] < 2^109
 383  * On exit:
 384  *   out[i] < 2^64
 385  */
 386 static void felem_shrink(smallfelem out, const felem in)
 387 {
 388     felem tmp;
 389     u64 a, b, mask;
 390     s64 high, low;
 391     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 392
 393     /* Carry 2->3 */
 394     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 395     /* tmp[3] < 2^110 */
 396
 397     tmp[2] = zero110[2] + (u64)in[2];
 398     tmp[0] = zero110[0] + in[0];
 399     tmp[1] = zero110[1] + in[1];
 400     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 401
 402     /*
 403      * We perform two partial reductions where we eliminate the high-word of
 404      * tmp[3]. We don't update the other words till the end.
 405      */
 406     a = tmp[3] >> 64;           /* a < 2^46 */
 407     tmp[3] = (u64)tmp[3];
 408     tmp[3] -= a;
 409     tmp[3] += ((limb) a) << 32;
 410     /* tmp[3] < 2^79 */
 411
 412     b = a;
 413     a = tmp[3] >> 64;           /* a < 2^15 */
 414     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 415     tmp[3] = (u64)tmp[3];
 416     tmp[3] -= a;
 417     tmp[3] += ((limb) a) << 32;
 418     /* tmp[3] < 2^64 + 2^47 */
 419
 420     /*
 421      * This adjusts the other two words to complete the two partial
 422      * reductions.
 423      */
 424     tmp[0] += b;
 425     tmp[1] -= (((limb) b) << 32);
 426
 427     /*
 428      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 429      * conditionally subtract kPrime if tmp[3] is large enough.
 430      */
 431     high = tmp[3] >> 64;
 432     /* As tmp[3] < 2^65, high is either 1 or 0 */
 433     high <<= 63;
 434     high >>= 63;
 435     /*-
 436      * high is:
 437      *   all ones   if the high word of tmp[3] is 1
 438      *   all zeros  if the high word of tmp[3] if 0 */
 439     low = tmp[3];
 440     mask = low >> 63;
 441     /*-
 442      * mask is:
 443      *   all ones   if the MSB of low is 1
 444      *   all zeros  if the MSB of low if 0 */
 445     low &= bottom63bits;
 446     low -= kPrime3Test;
 447     /* if low was greater than kPrime3Test then the MSB is zero */
 448     low = ~low;
 449     low >>= 63;
 450     /*-
 451      * low is:
 452      *   all ones   if low was > kPrime3Test
 453      *   all zeros  if low was <= kPrime3Test */
 454     mask = (mask & low) | high;
 455     tmp[0] -= mask & kPrime[0];
 456     tmp[1] -= mask & kPrime[1];
 457     /* kPrime[2] is zero, so omitted */
 458     tmp[3] -= mask & kPrime[3];
 459     /* tmp[3] < 2**64 - 2**32 + 1 */
 460
 461     tmp[1] += ((u64)(tmp[0] >> 64));
 462     tmp[0] = (u64)tmp[0];
 463     tmp[2] += ((u64)(tmp[1] >> 64));
 464     tmp[1] = (u64)tmp[1];
 465     tmp[3] += ((u64)(tmp[2] >> 64));
 466     tmp[2] = (u64)tmp[2];
 467     /* tmp[i] < 2^64 */
 468
 469     out[0] = tmp[0];
 470     out[1] = tmp[1];
 471     out[2] = tmp[2];
 472     out[3] = tmp[3];
 473 }
 474
 475 /* smallfelem_expand converts a smallfelem to an felem */
 476 static void smallfelem_expand(felem out, const smallfelem in)
 477 {
 478     out[0] = in[0];
 479     out[1] = in[1];
 480     out[2] = in[2];
 481     out[3] = in[3];
 482 }
 483
 484 /*-
 485  * smallfelem_square sets |out| = |small|^2
 486  * On entry:
 487  *   small[i] < 2^64
 488  * On exit:
 489  *   out[i] < 7 * 2^64 < 2^67
 490  */
 491 static void smallfelem_square(longfelem out, const smallfelem small)
 492 {
 493     limb a;
 494     u64 high, low;
 495
 496     a = ((uint128_t) small[0]) * small[0];
 497     low = a;
 498     high = a >> 64;
 499     out[0] = low;
 500     out[1] = high;
 501
 502     a = ((uint128_t) small[0]) * small[1];
 503     low = a;
 504     high = a >> 64;
 505     out[1] += low;
 506     out[1] += low;
 507     out[2] = high;
 508
 509     a = ((uint128_t) small[0]) * small[2];
 510     low = a;
 511     high = a >> 64;
 512     out[2] += low;
 513     out[2] *= 2;
 514     out[3] = high;
 515
 516     a = ((uint128_t) small[0]) * small[3];
 517     low = a;
 518     high = a >> 64;
 519     out[3] += low;
 520     out[4] = high;
 521
 522     a = ((uint128_t) small[1]) * small[2];
 523     low = a;
 524     high = a >> 64;
 525     out[3] += low;
 526     out[3] *= 2;
 527     out[4] += high;
 528
 529     a = ((uint128_t) small[1]) * small[1];
 530     low = a;
 531     high = a >> 64;
 532     out[2] += low;
 533     out[3] += high;
 534
 535     a = ((uint128_t) small[1]) * small[3];
 536     low = a;
 537     high = a >> 64;
 538     out[4] += low;
 539     out[4] *= 2;
 540     out[5] = high;
 541
 542     a = ((uint128_t) small[2]) * small[3];
 543     low = a;
 544     high = a >> 64;
 545     out[5] += low;
 546     out[5] *= 2;
 547     out[6] = high;
 548     out[6] += high;
 549
 550     a = ((uint128_t) small[2]) * small[2];
 551     low = a;
 552     high = a >> 64;
 553     out[4] += low;
 554     out[5] += high;
 555
 556     a = ((uint128_t) small[3]) * small[3];
 557     low = a;
 558     high = a >> 64;
 559     out[6] += low;
 560     out[7] = high;
 561 }
 562
 563 /*-
 564  * felem_square sets |out| = |in|^2
 565  * On entry:
 566  *   in[i] < 2^109
 567  * On exit:
 568  *   out[i] < 7 * 2^64 < 2^67
 569  */
 570 static void felem_square(longfelem out, const felem in)
 571 {
 572     u64 small[4];
 573     felem_shrink(small, in);
 574     smallfelem_square(out, small);
 575 }
 576
 577 /*-
 578  * smallfelem_mul sets |out| = |small1| * |small2|
 579  * On entry:
 580  *   small1[i] < 2^64
 581  *   small2[i] < 2^64
 582  * On exit:
 583  *   out[i] < 7 * 2^64 < 2^67
 584  */
 585 static void smallfelem_mul(longfelem out, const smallfelem small1,
 586                            const smallfelem small2)
 587 {
 588     limb a;
 589     u64 high, low;
 590
 591     a = ((uint128_t) small1[0]) * small2[0];
 592     low = a;
 593     high = a >> 64;
 594     out[0] = low;
 595     out[1] = high;
 596
 597     a = ((uint128_t) small1[0]) * small2[1];
 598     low = a;
 599     high = a >> 64;
 600     out[1] += low;
 601     out[2] = high;
 602
 603     a = ((uint128_t) small1[1]) * small2[0];
 604     low = a;
 605     high = a >> 64;
 606     out[1] += low;
 607     out[2] += high;
 608
 609     a = ((uint128_t) small1[0]) * small2[2];
 610     low = a;
 611     high = a >> 64;
 612     out[2] += low;
 613     out[3] = high;
 614
 615     a = ((uint128_t) small1[1]) * small2[1];
 616     low = a;
 617     high = a >> 64;
 618     out[2] += low;
 619     out[3] += high;
 620
 621     a = ((uint128_t) small1[2]) * small2[0];
 622     low = a;
 623     high = a >> 64;
 624     out[2] += low;
 625     out[3] += high;
 626
 627     a = ((uint128_t) small1[0]) * small2[3];
 628     low = a;
 629     high = a >> 64;
 630     out[3] += low;
 631     out[4] = high;
 632
 633     a = ((uint128_t) small1[1]) * small2[2];
 634     low = a;
 635     high = a >> 64;
 636     out[3] += low;
 637     out[4] += high;
 638
 639     a = ((uint128_t) small1[2]) * small2[1];
 640     low = a;
 641     high = a >> 64;
 642     out[3] += low;
 643     out[4] += high;
 644
 645     a = ((uint128_t) small1[3]) * small2[0];
 646     low = a;
 647     high = a >> 64;
 648     out[3] += low;
 649     out[4] += high;
 650
 651     a = ((uint128_t) small1[1]) * small2[3];
 652     low = a;
 653     high = a >> 64;
 654     out[4] += low;
 655     out[5] = high;
 656
 657     a = ((uint128_t) small1[2]) * small2[2];
 658     low = a;
 659     high = a >> 64;
 660     out[4] += low;
 661     out[5] += high;
 662
 663     a = ((uint128_t) small1[3]) * small2[1];
 664     low = a;
 665     high = a >> 64;
 666     out[4] += low;
 667     out[5] += high;
 668
 669     a = ((uint128_t) small1[2]) * small2[3];
 670     low = a;
 671     high = a >> 64;
 672     out[5] += low;
 673     out[6] = high;
 674
 675     a = ((uint128_t) small1[3]) * small2[2];
 676     low = a;
 677     high = a >> 64;
 678     out[5] += low;
 679     out[6] += high;
 680
 681     a = ((uint128_t) small1[3]) * small2[3];
 682     low = a;
 683     high = a >> 64;
 684     out[6] += low;
 685     out[7] = high;
 686 }
 687
 688 /*-
 689  * felem_mul sets |out| = |in1| * |in2|
 690  * On entry:
 691  *   in1[i] < 2^109
 692  *   in2[i] < 2^109
 693  * On exit:
 694  *   out[i] < 7 * 2^64 < 2^67
 695  */
 696 static void felem_mul(longfelem out, const felem in1, const felem in2)
 697 {
 698     smallfelem small1, small2;
 699     felem_shrink(small1, in1);
 700     felem_shrink(small2, in2);
 701     smallfelem_mul(out, small1, small2);
 702 }
 703
 704 /*-
 705  * felem_small_mul sets |out| = |small1| * |in2|
 706  * On entry:
 707  *   small1[i] < 2^64
 708  *   in2[i] < 2^109
 709  * On exit:
 710  *   out[i] < 7 * 2^64 < 2^67
 711  */
 712 static void felem_small_mul(longfelem out, const smallfelem small1,
 713                             const felem in2)
 714 {
 715     smallfelem small2;
 716     felem_shrink(small2, in2);
 717     smallfelem_mul(out, small1, small2);
 718 }
 719
 720 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 721 # define two100 (((limb)1) << 100)
 722 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 723 /* zero100 is 0 mod p */
 724 static const felem zero100 =
 725     { two100m36m4, two100, two100m36p4, two100m36p4 };
 726
 727 /*-
 728  * Internal function for the different flavours of felem_reduce.
 729  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 730  * On entry:
 731  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 732  *   out[1] >= in[7] + 2^32*in[4]
 733  *   out[2] >= in[5] + 2^32*in[5]
 734  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 735  * On exit:
 736  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 737  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 738  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 739  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 740  */
 741 static void felem_reduce_(felem out, const longfelem in)
 742 {
 743     int128_t c;
 744     /* combine common terms from below */
 745     c = in[4] + (in[5] << 32);
 746     out[0] += c;
 747     out[3] -= c;
 748
 749     c = in[5] - in[7];
 750     out[1] += c;
 751     out[2] -= c;
 752
 753     /* the remaining terms */
 754     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 755     out[1] -= (in[4] << 32);
 756     out[3] += (in[4] << 32);
 757
 758     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 759     out[2] -= (in[5] << 32);
 760
 761     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 762     out[0] -= in[6];
 763     out[0] -= (in[6] << 32);
 764     out[1] += (in[6] << 33);
 765     out[2] += (in[6] * 2);
 766     out[3] -= (in[6] << 32);
 767
 768     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 769     out[0] -= in[7];
 770     out[0] -= (in[7] << 32);
 771     out[2] += (in[7] << 33);
 772     out[3] += (in[7] * 3);
 773 }
 774
 775 /*-
 776  * felem_reduce converts a longfelem into an felem.
 777  * To be called directly after felem_square or felem_mul.
 778  * On entry:
 779  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 780  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 781  * On exit:
 782  *   out[i] < 2^101
 783  */
 784 static void felem_reduce(felem out, const longfelem in)
 785 {
 786     out[0] = zero100[0] + in[0];
 787     out[1] = zero100[1] + in[1];
 788     out[2] = zero100[2] + in[2];
 789     out[3] = zero100[3] + in[3];
 790
 791     felem_reduce_(out, in);
 792
 793     /*-
 794      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 795      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 796      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 797      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 798      *
 799      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 800      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 801      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 802      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 803      */
 804 }
 805
 806 /*-
 807  * felem_reduce_zero105 converts a larger longfelem into an felem.
 808  * On entry:
 809  *   in[0] < 2^71
 810  * On exit:
 811  *   out[i] < 2^106
 812  */
 813 static void felem_reduce_zero105(felem out, const longfelem in)
 814 {
 815     out[0] = zero105[0] + in[0];
 816     out[1] = zero105[1] + in[1];
 817     out[2] = zero105[2] + in[2];
 818     out[3] = zero105[3] + in[3];
 819
 820     felem_reduce_(out, in);
 821
 822     /*-
 823      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 824      * out[1] > 2^105 - 2^71 - 2^103 > 0
 825      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 826      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 827      *
 828      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 831      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 832      */
 833 }
 834
 835 /*
 836  * subtract_u64 sets *result = *result - v and *carry to one if the
 837  * subtraction underflowed.
 838  */
 839 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 840 {
 841     uint128_t r = *result;
 842     r -= v;
 843     *carry = (r >> 64) & 1;
 844     *result = (u64)r;
 845 }
 846
 847 /*
 848  * felem_contract converts |in| to its unique, minimal representation. On
 849  * entry: in[i] < 2^109
 850  */
 851 static void felem_contract(smallfelem out, const felem in)
 852 {
 853     unsigned i;
 854     u64 all_equal_so_far = 0, result = 0, carry;
 855
 856     felem_shrink(out, in);
 857     /* small is minimal except that the value might be > p */
 858
 859     all_equal_so_far--;
 860     /*
 861      * We are doing a constant time test if out >= kPrime. We need to compare
 862      * each u64, from most-significant to least significant. For each one, if
 863      * all words so far have been equal (m is all ones) then a non-equal
 864      * result is the answer. Otherwise we continue.
 865      */
 866     for (i = 3; i < 4; i--) {
 867         u64 equal;
 868         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 869         /*
 870          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 871          * will all be set.
 872          */
 873         result |= all_equal_so_far & ((u64)(a >> 64));
 874
 875         /*
 876          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 877          * decrement will make it all ones.
 878          */
 879         equal = kPrime[i] ^ out[i];
 880         equal--;
 881         equal &= equal << 32;
 882         equal &= equal << 16;
 883         equal &= equal << 8;
 884         equal &= equal << 4;
 885         equal &= equal << 2;
 886         equal &= equal << 1;
 887         equal = ((s64) equal) >> 63;
 888
 889         all_equal_so_far &= equal;
 890     }
 891
 892     /*
 893      * if all_equal_so_far is still all ones then the two values are equal
 894      * and so out >= kPrime is true.
 895      */
 896     result |= all_equal_so_far;
 897
 898     /* if out >= kPrime then we subtract kPrime. */
 899     subtract_u64(&out[0], &carry, result & kPrime[0]);
 900     subtract_u64(&out[1], &carry, carry);
 901     subtract_u64(&out[2], &carry, carry);
 902     subtract_u64(&out[3], &carry, carry);
 903
 904     subtract_u64(&out[1], &carry, result & kPrime[1]);
 905     subtract_u64(&out[2], &carry, carry);
 906     subtract_u64(&out[3], &carry, carry);
 907
 908     subtract_u64(&out[2], &carry, result & kPrime[2]);
 909     subtract_u64(&out[3], &carry, carry);
 910
 911     subtract_u64(&out[3], &carry, result & kPrime[3]);
 912 }
 913
 914 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 915 {
 916     longfelem longtmp;
 917     felem tmp;
 918
 919     smallfelem_square(longtmp, in);
 920     felem_reduce(tmp, longtmp);
 921     felem_contract(out, tmp);
 922 }
 923
 924 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 925                                     const smallfelem in2)
 926 {
 927     longfelem longtmp;
 928     felem tmp;
 929
 930     smallfelem_mul(longtmp, in1, in2);
 931     felem_reduce(tmp, longtmp);
 932     felem_contract(out, tmp);
 933 }
 934
 935 /*-
 936  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 937  * otherwise.
 938  * On entry:
 939  *   small[i] < 2^64
 940  */
 941 static limb smallfelem_is_zero(const smallfelem small)
 942 {
 943     limb result;
 944     u64 is_p;
 945
 946     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 947     is_zero--;
 948     is_zero &= is_zero << 32;
 949     is_zero &= is_zero << 16;
 950     is_zero &= is_zero << 8;
 951     is_zero &= is_zero << 4;
 952     is_zero &= is_zero << 2;
 953     is_zero &= is_zero << 1;
 954     is_zero = ((s64) is_zero) >> 63;
 955
 956     is_p = (small[0] ^ kPrime[0]) |
 957         (small[1] ^ kPrime[1]) |
 958         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 959     is_p--;
 960     is_p &= is_p << 32;
 961     is_p &= is_p << 16;
 962     is_p &= is_p << 8;
 963     is_p &= is_p << 4;
 964     is_p &= is_p << 2;
 965     is_p &= is_p << 1;
 966     is_p = ((s64) is_p) >> 63;
 967
 968     is_zero |= is_p;
 969
 970     result = is_zero;
 971     result |= ((limb) is_zero) << 64;
 972     return result;
 973 }
 974
 975 static int smallfelem_is_zero_int(const smallfelem small)
 976 {
 977     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 978 }
 979
 980 /*-
 981  * felem_inv calculates |out| = |in|^{-1}
 982  *
 983  * Based on Fermat's Little Theorem:
 984  *   a^p = a (mod p)
 985  *   a^{p-1} = 1 (mod p)
 986  *   a^{p-2} = a^{-1} (mod p)
 987  */
 988 static void felem_inv(felem out, const felem in)
 989 {
 990     felem ftmp, ftmp2;
 991     /* each e_I will hold |in|^{2^I - 1} */
 992     felem e2, e4, e8, e16, e32, e64;
 993     longfelem tmp;
 994     unsigned i;
 995
 996     felem_square(tmp, in);
 997     felem_reduce(ftmp, tmp);    /* 2^1 */
 998     felem_mul(tmp, in, ftmp);
 999     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
1000     felem_assign(e2, ftmp);
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1005     felem_mul(tmp, ftmp, e2);
1006     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1007     felem_assign(e4, ftmp);
1008     felem_square(tmp, ftmp);
1009     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1010     felem_square(tmp, ftmp);
1011     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1012     felem_square(tmp, ftmp);
1013     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1014     felem_square(tmp, ftmp);
1015     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1016     felem_mul(tmp, ftmp, e4);
1017     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1018     felem_assign(e8, ftmp);
1019     for (i = 0; i < 8; i++) {
1020         felem_square(tmp, ftmp);
1021         felem_reduce(ftmp, tmp);
1022     }                           /* 2^16 - 2^8 */
1023     felem_mul(tmp, ftmp, e8);
1024     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1025     felem_assign(e16, ftmp);
1026     for (i = 0; i < 16; i++) {
1027         felem_square(tmp, ftmp);
1028         felem_reduce(ftmp, tmp);
1029     }                           /* 2^32 - 2^16 */
1030     felem_mul(tmp, ftmp, e16);
1031     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1032     felem_assign(e32, ftmp);
1033     for (i = 0; i < 32; i++) {
1034         felem_square(tmp, ftmp);
1035         felem_reduce(ftmp, tmp);
1036     }                           /* 2^64 - 2^32 */
1037     felem_assign(e64, ftmp);
1038     felem_mul(tmp, ftmp, in);
1039     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1040     for (i = 0; i < 192; i++) {
1041         felem_square(tmp, ftmp);
1042         felem_reduce(ftmp, tmp);
1043     }                           /* 2^256 - 2^224 + 2^192 */
1044
1045     felem_mul(tmp, e64, e32);
1046     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1047     for (i = 0; i < 16; i++) {
1048         felem_square(tmp, ftmp2);
1049         felem_reduce(ftmp2, tmp);
1050     }                           /* 2^80 - 2^16 */
1051     felem_mul(tmp, ftmp2, e16);
1052     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1053     for (i = 0; i < 8; i++) {
1054         felem_square(tmp, ftmp2);
1055         felem_reduce(ftmp2, tmp);
1056     }                           /* 2^88 - 2^8 */
1057     felem_mul(tmp, ftmp2, e8);
1058     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1059     for (i = 0; i < 4; i++) {
1060         felem_square(tmp, ftmp2);
1061         felem_reduce(ftmp2, tmp);
1062     }                           /* 2^92 - 2^4 */
1063     felem_mul(tmp, ftmp2, e4);
1064     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1065     felem_square(tmp, ftmp2);
1066     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1067     felem_square(tmp, ftmp2);
1068     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1069     felem_mul(tmp, ftmp2, e2);
1070     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1071     felem_square(tmp, ftmp2);
1072     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1073     felem_square(tmp, ftmp2);
1074     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1075     felem_mul(tmp, ftmp2, in);
1076     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1077
1078     felem_mul(tmp, ftmp2, ftmp);
1079     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1080 }
1081
1082 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1083 {
1084     felem tmp;
1085
1086     smallfelem_expand(tmp, in);
1087     felem_inv(tmp, tmp);
1088     felem_contract(out, tmp);
1089 }
1090
1091 /*-
1092  * Group operations
1093  * ----------------
1094  *
1095  * Building on top of the field operations we have the operations on the
1096  * elliptic curve group itself. Points on the curve are represented in Jacobian
1097  * coordinates
1098  */
1099
1100 /*-
1101  * point_double calculates 2*(x_in, y_in, z_in)
1102  *
1103  * The method is taken from:
1104  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1105  *
1106  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1107  * while x_out == y_in is not (maybe this works, but it's not tested).
1108  */
1109 static void
1110 point_double(felem x_out, felem y_out, felem z_out,
1111              const felem x_in, const felem y_in, const felem z_in)
1112 {
1113     longfelem tmp, tmp2;
1114     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1115     smallfelem small1, small2;
1116
1117     felem_assign(ftmp, x_in);
1118     /* ftmp[i] < 2^106 */
1119     felem_assign(ftmp2, x_in);
1120     /* ftmp2[i] < 2^106 */
1121
1122     /* delta = z^2 */
1123     felem_square(tmp, z_in);
1124     felem_reduce(delta, tmp);
1125     /* delta[i] < 2^101 */
1126
1127     /* gamma = y^2 */
1128     felem_square(tmp, y_in);
1129     felem_reduce(gamma, tmp);
1130     /* gamma[i] < 2^101 */
1131     felem_shrink(small1, gamma);
1132
1133     /* beta = x*gamma */
1134     felem_small_mul(tmp, small1, x_in);
1135     felem_reduce(beta, tmp);
1136     /* beta[i] < 2^101 */
1137
1138     /* alpha = 3*(x-delta)*(x+delta) */
1139     felem_diff(ftmp, delta);
1140     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1141     felem_sum(ftmp2, delta);
1142     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1143     felem_scalar(ftmp2, 3);
1144     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1145     felem_mul(tmp, ftmp, ftmp2);
1146     felem_reduce(alpha, tmp);
1147     /* alpha[i] < 2^101 */
1148     felem_shrink(small2, alpha);
1149
1150     /* x' = alpha^2 - 8*beta */
1151     smallfelem_square(tmp, small2);
1152     felem_reduce(x_out, tmp);
1153     felem_assign(ftmp, beta);
1154     felem_scalar(ftmp, 8);
1155     /* ftmp[i] < 8 * 2^101 = 2^104 */
1156     felem_diff(x_out, ftmp);
1157     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1158
1159     /* z' = (y + z)^2 - gamma - delta */
1160     felem_sum(delta, gamma);
1161     /* delta[i] < 2^101 + 2^101 = 2^102 */
1162     felem_assign(ftmp, y_in);
1163     felem_sum(ftmp, z_in);
1164     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1165     felem_square(tmp, ftmp);
1166     felem_reduce(z_out, tmp);
1167     felem_diff(z_out, delta);
1168     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1169
1170     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1171     felem_scalar(beta, 4);
1172     /* beta[i] < 4 * 2^101 = 2^103 */
1173     felem_diff_zero107(beta, x_out);
1174     /* beta[i] < 2^107 + 2^103 < 2^108 */
1175     felem_small_mul(tmp, small2, beta);
1176     /* tmp[i] < 7 * 2^64 < 2^67 */
1177     smallfelem_square(tmp2, small1);
1178     /* tmp2[i] < 7 * 2^64 */
1179     longfelem_scalar(tmp2, 8);
1180     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1181     longfelem_diff(tmp, tmp2);
1182     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1183     felem_reduce_zero105(y_out, tmp);
1184     /* y_out[i] < 2^106 */
1185 }
1186
1187 /*
1188  * point_double_small is the same as point_double, except that it operates on
1189  * smallfelems
1190  */
1191 static void
1192 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1193                    const smallfelem x_in, const smallfelem y_in,
1194                    const smallfelem z_in)
1195 {
1196     felem felem_x_out, felem_y_out, felem_z_out;
1197     felem felem_x_in, felem_y_in, felem_z_in;
1198
1199     smallfelem_expand(felem_x_in, x_in);
1200     smallfelem_expand(felem_y_in, y_in);
1201     smallfelem_expand(felem_z_in, z_in);
1202     point_double(felem_x_out, felem_y_out, felem_z_out,
1203                  felem_x_in, felem_y_in, felem_z_in);
1204     felem_shrink(x_out, felem_x_out);
1205     felem_shrink(y_out, felem_y_out);
1206     felem_shrink(z_out, felem_z_out);
1207 }
1208
1209 /* copy_conditional copies in to out iff mask is all ones. */
1210 static void copy_conditional(felem out, const felem in, limb mask)
1211 {
1212     unsigned i;
1213     for (i = 0; i < NLIMBS; ++i) {
1214         const limb tmp = mask & (in[i] ^ out[i]);
1215         out[i] ^= tmp;
1216     }
1217 }
1218
1219 /* copy_small_conditional copies in to out iff mask is all ones. */
1220 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1221 {
1222     unsigned i;
1223     const u64 mask64 = mask;
1224     for (i = 0; i < NLIMBS; ++i) {
1225         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1226     }
1227 }
1228
1229 /*-
1230  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1231  *
1232  * The method is taken from:
1233  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1234  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1235  *
1236  * This function includes a branch for checking whether the two input points
1237  * are equal, (while not equal to the point at infinity). This case never
1238  * happens during single point multiplication, so there is no timing leak for
1239  * ECDH or ECDSA signing.
1240  */
1241 static void point_add(felem x3, felem y3, felem z3,
1242                       const felem x1, const felem y1, const felem z1,
1243                       const int mixed, const smallfelem x2,
1244                       const smallfelem y2, const smallfelem z2)
1245 {
1246     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1247     longfelem tmp, tmp2;
1248     smallfelem small1, small2, small3, small4, small5;
1249     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1250
1251     felem_shrink(small3, z1);
1252
1253     z1_is_zero = smallfelem_is_zero(small3);
1254     z2_is_zero = smallfelem_is_zero(z2);
1255
1256     /* ftmp = z1z1 = z1**2 */
1257     smallfelem_square(tmp, small3);
1258     felem_reduce(ftmp, tmp);
1259     /* ftmp[i] < 2^101 */
1260     felem_shrink(small1, ftmp);
1261
1262     if (!mixed) {
1263         /* ftmp2 = z2z2 = z2**2 */
1264         smallfelem_square(tmp, z2);
1265         felem_reduce(ftmp2, tmp);
1266         /* ftmp2[i] < 2^101 */
1267         felem_shrink(small2, ftmp2);
1268
1269         felem_shrink(small5, x1);
1270
1271         /* u1 = ftmp3 = x1*z2z2 */
1272         smallfelem_mul(tmp, small5, small2);
1273         felem_reduce(ftmp3, tmp);
1274         /* ftmp3[i] < 2^101 */
1275
1276         /* ftmp5 = z1 + z2 */
1277         felem_assign(ftmp5, z1);
1278         felem_small_sum(ftmp5, z2);
1279         /* ftmp5[i] < 2^107 */
1280
1281         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1282         felem_square(tmp, ftmp5);
1283         felem_reduce(ftmp5, tmp);
1284         /* ftmp2 = z2z2 + z1z1 */
1285         felem_sum(ftmp2, ftmp);
1286         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1287         felem_diff(ftmp5, ftmp2);
1288         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1289
1290         /* ftmp2 = z2 * z2z2 */
1291         smallfelem_mul(tmp, small2, z2);
1292         felem_reduce(ftmp2, tmp);
1293
1294         /* s1 = ftmp2 = y1 * z2**3 */
1295         felem_mul(tmp, y1, ftmp2);
1296         felem_reduce(ftmp6, tmp);
1297         /* ftmp6[i] < 2^101 */
1298     } else {
1299         /*
1300          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1301          */
1302
1303         /* u1 = ftmp3 = x1*z2z2 */
1304         felem_assign(ftmp3, x1);
1305         /* ftmp3[i] < 2^106 */
1306
1307         /* ftmp5 = 2z1z2 */
1308         felem_assign(ftmp5, z1);
1309         felem_scalar(ftmp5, 2);
1310         /* ftmp5[i] < 2*2^106 = 2^107 */
1311
1312         /* s1 = ftmp2 = y1 * z2**3 */
1313         felem_assign(ftmp6, y1);
1314         /* ftmp6[i] < 2^106 */
1315     }
1316
1317     /* u2 = x2*z1z1 */
1318     smallfelem_mul(tmp, x2, small1);
1319     felem_reduce(ftmp4, tmp);
1320
1321     /* h = ftmp4 = u2 - u1 */
1322     felem_diff_zero107(ftmp4, ftmp3);
1323     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1324     felem_shrink(small4, ftmp4);
1325
1326     x_equal = smallfelem_is_zero(small4);
1327
1328     /* z_out = ftmp5 * h */
1329     felem_small_mul(tmp, small4, ftmp5);
1330     felem_reduce(z_out, tmp);
1331     /* z_out[i] < 2^101 */
1332
1333     /* ftmp = z1 * z1z1 */
1334     smallfelem_mul(tmp, small1, small3);
1335     felem_reduce(ftmp, tmp);
1336
1337     /* s2 = tmp = y2 * z1**3 */
1338     felem_small_mul(tmp, y2, ftmp);
1339     felem_reduce(ftmp5, tmp);
1340
1341     /* r = ftmp5 = (s2 - s1)*2 */
1342     felem_diff_zero107(ftmp5, ftmp6);
1343     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1344     felem_scalar(ftmp5, 2);
1345     /* ftmp5[i] < 2^109 */
1346     felem_shrink(small1, ftmp5);
1347     y_equal = smallfelem_is_zero(small1);
1348
1349     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1350         point_double(x3, y3, z3, x1, y1, z1);
1351         return;
1352     }
1353
1354     /* I = ftmp = (2h)**2 */
1355     felem_assign(ftmp, ftmp4);
1356     felem_scalar(ftmp, 2);
1357     /* ftmp[i] < 2*2^108 = 2^109 */
1358     felem_square(tmp, ftmp);
1359     felem_reduce(ftmp, tmp);
1360
1361     /* J = ftmp2 = h * I */
1362     felem_mul(tmp, ftmp4, ftmp);
1363     felem_reduce(ftmp2, tmp);
1364
1365     /* V = ftmp4 = U1 * I */
1366     felem_mul(tmp, ftmp3, ftmp);
1367     felem_reduce(ftmp4, tmp);
1368
1369     /* x_out = r**2 - J - 2V */
1370     smallfelem_square(tmp, small1);
1371     felem_reduce(x_out, tmp);
1372     felem_assign(ftmp3, ftmp4);
1373     felem_scalar(ftmp4, 2);
1374     felem_sum(ftmp4, ftmp2);
1375     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1376     felem_diff(x_out, ftmp4);
1377     /* x_out[i] < 2^105 + 2^101 */
1378
1379     /* y_out = r(V-x_out) - 2 * s1 * J */
1380     felem_diff_zero107(ftmp3, x_out);
1381     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1382     felem_small_mul(tmp, small1, ftmp3);
1383     felem_mul(tmp2, ftmp6, ftmp2);
1384     longfelem_scalar(tmp2, 2);
1385     /* tmp2[i] < 2*2^67 = 2^68 */
1386     longfelem_diff(tmp, tmp2);
1387     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1388     felem_reduce_zero105(y_out, tmp);
1389     /* y_out[i] < 2^106 */
1390
1391     copy_small_conditional(x_out, x2, z1_is_zero);
1392     copy_conditional(x_out, x1, z2_is_zero);
1393     copy_small_conditional(y_out, y2, z1_is_zero);
1394     copy_conditional(y_out, y1, z2_is_zero);
1395     copy_small_conditional(z_out, z2, z1_is_zero);
1396     copy_conditional(z_out, z1, z2_is_zero);
1397     felem_assign(x3, x_out);
1398     felem_assign(y3, y_out);
1399     felem_assign(z3, z_out);
1400 }
1401
1402 /*
1403  * point_add_small is the same as point_add, except that it operates on
1404  * smallfelems
1405  */
1406 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1407                             smallfelem x1, smallfelem y1, smallfelem z1,
1408                             smallfelem x2, smallfelem y2, smallfelem z2)
1409 {
1410     felem felem_x3, felem_y3, felem_z3;
1411     felem felem_x1, felem_y1, felem_z1;
1412     smallfelem_expand(felem_x1, x1);
1413     smallfelem_expand(felem_y1, y1);
1414     smallfelem_expand(felem_z1, z1);
1415     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1416               x2, y2, z2);
1417     felem_shrink(x3, felem_x3);
1418     felem_shrink(y3, felem_y3);
1419     felem_shrink(z3, felem_z3);
1420 }
1421
1422 /*-
1423  * Base point pre computation
1424  * --------------------------
1425  *
1426  * Two different sorts of precomputed tables are used in the following code.
1427  * Each contain various points on the curve, where each point is three field
1428  * elements (x, y, z).
1429  *
1430  * For the base point table, z is usually 1 (0 for the point at infinity).
1431  * This table has 2 * 16 elements, starting with the following:
1432  * index | bits    | point
1433  * ------+---------+------------------------------
1434  *     0 | 0 0 0 0 | 0G
1435  *     1 | 0 0 0 1 | 1G
1436  *     2 | 0 0 1 0 | 2^64G
1437  *     3 | 0 0 1 1 | (2^64 + 1)G
1438  *     4 | 0 1 0 0 | 2^128G
1439  *     5 | 0 1 0 1 | (2^128 + 1)G
1440  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1441  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1442  *     8 | 1 0 0 0 | 2^192G
1443  *     9 | 1 0 0 1 | (2^192 + 1)G
1444  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1445  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1446  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1447  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1448  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1449  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1450  * followed by a copy of this with each element multiplied by 2^32.
1451  *
1452  * The reason for this is so that we can clock bits into four different
1453  * locations when doing simple scalar multiplies against the base point,
1454  * and then another four locations using the second 16 elements.
1455  *
1456  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1457
1458 /* gmul is the table of precomputed base points */
1459 static const smallfelem gmul[2][16][3] = {
1460     {{{0, 0, 0, 0},
1461       {0, 0, 0, 0},
1462       {0, 0, 0, 0}},
1463      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1464        0x6b17d1f2e12c4247},
1465       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1466        0x4fe342e2fe1a7f9b},
1467       {1, 0, 0, 0}},
1468      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1469        0x0fa822bc2811aaa5},
1470       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1471        0xbff44ae8f5dba80d},
1472       {1, 0, 0, 0}},
1473      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1474        0x300a4bbc89d6726f},
1475       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1476        0x72aac7e0d09b4644},
1477       {1, 0, 0, 0}},
1478      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1479        0x447d739beedb5e67},
1480       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1481        0x2d4825ab834131ee},
1482       {1, 0, 0, 0}},
1483      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1484        0xef9519328a9c72ff},
1485       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1486        0x611e9fc37dbb2c9b},
1487       {1, 0, 0, 0}},
1488      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1489        0x550663797b51f5d8},
1490       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1491        0x157164848aecb851},
1492       {1, 0, 0, 0}},
1493      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1494        0xeb5d7745b21141ea},
1495       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1496        0xeafd72ebdbecc17b},
1497       {1, 0, 0, 0}},
1498      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1499        0xa6d39677a7849276},
1500       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1501        0x674f84749b0b8816},
1502       {1, 0, 0, 0}},
1503      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1504        0x4e769e7672c9ddad},
1505       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1506        0x42b99082de830663},
1507       {1, 0, 0, 0}},
1508      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1509        0x78878ef61c6ce04d},
1510       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1511        0xb6cb3f5d7b72c321},
1512       {1, 0, 0, 0}},
1513      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1514        0x0c88bc4d716b1287},
1515       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1516        0xdd5ddea3f3901dc6},
1517       {1, 0, 0, 0}},
1518      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1519        0x68f344af6b317466},
1520       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1521        0x31b9c405f8540a20},
1522       {1, 0, 0, 0}},
1523      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1524        0x4052bf4b6f461db9},
1525       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1526        0xfecf4d5190b0fc61},
1527       {1, 0, 0, 0}},
1528      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1529        0x1eddbae2c802e41a},
1530       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1531        0x43104d86560ebcfc},
1532       {1, 0, 0, 0}},
1533      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1534        0xb48e26b484f7a21c},
1535       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1536        0xfac015404d4d3dab},
1537       {1, 0, 0, 0}}},
1538     {{{0, 0, 0, 0},
1539       {0, 0, 0, 0},
1540       {0, 0, 0, 0}},
1541      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1542        0x7fe36b40af22af89},
1543       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1544        0xe697d45825b63624},
1545       {1, 0, 0, 0}},
1546      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1547        0x4a5b506612a677a6},
1548       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1549        0xeb13461ceac089f1},
1550       {1, 0, 0, 0}},
1551      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1552        0x0781b8291c6a220a},
1553       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1554        0x690cde8df0151593},
1555       {1, 0, 0, 0}},
1556      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1557        0x8a535f566ec73617},
1558       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1559        0x0455c08468b08bd7},
1560       {1, 0, 0, 0}},
1561      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1562        0x06bada7ab77f8276},
1563       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1564        0x5b476dfd0e6cb18a},
1565       {1, 0, 0, 0}},
1566      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1567        0x3e29864e8a2ec908},
1568       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1569        0x239b90ea3dc31e7e},
1570       {1, 0, 0, 0}},
1571      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1572        0x820f4dd949f72ff7},
1573       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1574        0x140406ec783a05ec},
1575       {1, 0, 0, 0}},
1576      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1577        0x68f6b8542783dfee},
1578       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1579        0xcbe1feba92e40ce6},
1580       {1, 0, 0, 0}},
1581      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1582        0xd0b2f94d2f420109},
1583       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1584        0x971459828b0719e5},
1585       {1, 0, 0, 0}},
1586      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1587        0x961610004a866aba},
1588       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1589        0x7acb9fadcee75e44},
1590       {1, 0, 0, 0}},
1591      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1592        0x24eb9acca333bf5b},
1593       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1594        0x69f891c5acd079cc},
1595       {1, 0, 0, 0}},
1596      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1597        0xe51f547c5972a107},
1598       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1599        0x1c309a2b25bb1387},
1600       {1, 0, 0, 0}},
1601      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1602        0x20b87b8aa2c4e503},
1603       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1604        0xf5c6fa49919776be},
1605       {1, 0, 0, 0}},
1606      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1607        0x1ed7d1b9332010b9},
1608       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1609        0x3a2b03f03217257a},
1610       {1, 0, 0, 0}},
1611      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1612        0x15fee545c78dd9f6},
1613       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1614        0x4ab5b6b2b8753f81},
1615       {1, 0, 0, 0}}}
1616 };
1617
1618 /*
1619  * select_point selects the |idx|th point from a precomputation table and
1620  * copies it to out.
1621  */
1622 static void select_point(const u64 idx, unsigned int size,
1623                          const smallfelem pre_comp[16][3], smallfelem out[3])
1624 {
1625     unsigned i, j;
1626     u64 *outlimbs = &out[0][0];
1627     memset(outlimbs, 0, 3 * sizeof(smallfelem));
1628
1629     for (i = 0; i < size; i++) {
1630         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1631         u64 mask = i ^ idx;
1632         mask |= mask >> 4;
1633         mask |= mask >> 2;
1634         mask |= mask >> 1;
1635         mask &= 1;
1636         mask--;
1637         for (j = 0; j < NLIMBS * 3; j++)
1638             outlimbs[j] |= inlimbs[j] & mask;
1639     }
1640 }
1641
1642 /* get_bit returns the |i|th bit in |in| */
1643 static char get_bit(const felem_bytearray in, int i)
1644 {
1645     if ((i < 0) || (i >= 256))
1646         return 0;
1647     return (in[i >> 3] >> (i & 7)) & 1;
1648 }
1649
1650 /*
1651  * Interleaved point multiplication using precomputed point multiples: The
1652  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1653  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1654  * generator, using certain (large) precomputed multiples in g_pre_comp.
1655  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1656  */
1657 static void batch_mul(felem x_out, felem y_out, felem z_out,
1658                       const felem_bytearray scalars[],
1659                       const unsigned num_points, const u8 *g_scalar,
1660                       const int mixed, const smallfelem pre_comp[][17][3],
1661                       const smallfelem g_pre_comp[2][16][3])
1662 {
1663     int i, skip;
1664     unsigned num, gen_mul = (g_scalar != NULL);
1665     felem nq[3], ftmp;
1666     smallfelem tmp[3];
1667     u64 bits;
1668     u8 sign, digit;
1669
1670     /* set nq to the point at infinity */
1671     memset(nq, 0, 3 * sizeof(felem));
1672
1673     /*
1674      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1675      * of the generator (two in each of the last 32 rounds) and additions of
1676      * other points multiples (every 5th round).
1677      */
1678     skip = 1;                   /* save two point operations in the first
1679                                  * round */
1680     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1681         /* double */
1682         if (!skip)
1683             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1684
1685         /* add multiples of the generator */
1686         if (gen_mul && (i <= 31)) {
1687             /* first, look 32 bits upwards */
1688             bits = get_bit(g_scalar, i + 224) << 3;
1689             bits |= get_bit(g_scalar, i + 160) << 2;
1690             bits |= get_bit(g_scalar, i + 96) << 1;
1691             bits |= get_bit(g_scalar, i + 32);
1692             /* select the point to add, in constant time */
1693             select_point(bits, 16, g_pre_comp[1], tmp);
1694
1695             if (!skip) {
1696                 /* Arg 1 below is for "mixed" */
1697                 point_add(nq[0], nq[1], nq[2],
1698                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1699             } else {
1700                 smallfelem_expand(nq[0], tmp[0]);
1701                 smallfelem_expand(nq[1], tmp[1]);
1702                 smallfelem_expand(nq[2], tmp[2]);
1703                 skip = 0;
1704             }
1705
1706             /* second, look at the current position */
1707             bits = get_bit(g_scalar, i + 192) << 3;
1708             bits |= get_bit(g_scalar, i + 128) << 2;
1709             bits |= get_bit(g_scalar, i + 64) << 1;
1710             bits |= get_bit(g_scalar, i);
1711             /* select the point to add, in constant time */
1712             select_point(bits, 16, g_pre_comp[0], tmp);
1713             /* Arg 1 below is for "mixed" */
1714             point_add(nq[0], nq[1], nq[2],
1715                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1716         }
1717
1718         /* do other additions every 5 doublings */
1719         if (num_points && (i % 5 == 0)) {
1720             /* loop over all scalars */
1721             for (num = 0; num < num_points; ++num) {
1722                 bits = get_bit(scalars[num], i + 4) << 5;
1723                 bits |= get_bit(scalars[num], i + 3) << 4;
1724                 bits |= get_bit(scalars[num], i + 2) << 3;
1725                 bits |= get_bit(scalars[num], i + 1) << 2;
1726                 bits |= get_bit(scalars[num], i) << 1;
1727                 bits |= get_bit(scalars[num], i - 1);
1728                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1729
1730                 /*
1731                  * select the point to add or subtract, in constant time
1732                  */
1733                 select_point(digit, 17, pre_comp[num], tmp);
1734                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1735                                                * point */
1736                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1737                 felem_contract(tmp[1], ftmp);
1738
1739                 if (!skip) {
1740                     point_add(nq[0], nq[1], nq[2],
1741                               nq[0], nq[1], nq[2],
1742                               mixed, tmp[0], tmp[1], tmp[2]);
1743                 } else {
1744                     smallfelem_expand(nq[0], tmp[0]);
1745                     smallfelem_expand(nq[1], tmp[1]);
1746                     smallfelem_expand(nq[2], tmp[2]);
1747                     skip = 0;
1748                 }
1749             }
1750         }
1751     }
1752     felem_assign(x_out, nq[0]);
1753     felem_assign(y_out, nq[1]);
1754     felem_assign(z_out, nq[2]);
1755 }
1756
1757 /* Precomputation for the group generator. */
1758 typedef struct {
1759     smallfelem g_pre_comp[2][16][3];
1760     int references;
1761 } NISTP256_PRE_COMP;
1762
1763 const EC_METHOD *EC_GFp_nistp256_method(void)
1764 {
1765     static const EC_METHOD ret = {
1766         EC_FLAGS_DEFAULT_OCT,
1767         NID_X9_62_prime_field,
1768         ec_GFp_nistp256_group_init,
1769         ec_GFp_simple_group_finish,
1770         ec_GFp_simple_group_clear_finish,
1771         ec_GFp_nist_group_copy,
1772         ec_GFp_nistp256_group_set_curve,
1773         ec_GFp_simple_group_get_curve,
1774         ec_GFp_simple_group_get_degree,
1775         ec_GFp_simple_group_check_discriminant,
1776         ec_GFp_simple_point_init,
1777         ec_GFp_simple_point_finish,
1778         ec_GFp_simple_point_clear_finish,
1779         ec_GFp_simple_point_copy,
1780         ec_GFp_simple_point_set_to_infinity,
1781         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1782         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1783         ec_GFp_simple_point_set_affine_coordinates,
1784         ec_GFp_nistp256_point_get_affine_coordinates,
1785         0 /* point_set_compressed_coordinates */ ,
1786         0 /* point2oct */ ,
1787         0 /* oct2point */ ,
1788         ec_GFp_simple_add,
1789         ec_GFp_simple_dbl,
1790         ec_GFp_simple_invert,
1791         ec_GFp_simple_is_at_infinity,
1792         ec_GFp_simple_is_on_curve,
1793         ec_GFp_simple_cmp,
1794         ec_GFp_simple_make_affine,
1795         ec_GFp_simple_points_make_affine,
1796         ec_GFp_nistp256_points_mul,
1797         ec_GFp_nistp256_precompute_mult,
1798         ec_GFp_nistp256_have_precompute_mult,
1799         ec_GFp_nist_field_mul,
1800         ec_GFp_nist_field_sqr,
1801         0 /* field_div */ ,
1802         0 /* field_encode */ ,
1803         0 /* field_decode */ ,
1804         0                       /* field_set_to_one */
1805     };
1806
1807     return &ret;
1808 }
1809
1810 /******************************************************************************/
1811 /*
1812  * FUNCTIONS TO MANAGE PRECOMPUTATION
1813  */
1814
1815 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1816 {
1817     NISTP256_PRE_COMP *ret = NULL;
1818     ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
1819     if (!ret) {
1820         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1821         return ret;
1822     }
1823     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1824     ret->references = 1;
1825     return ret;
1826 }
1827
1828 static void *nistp256_pre_comp_dup(void *src_)
1829 {
1830     NISTP256_PRE_COMP *src = src_;
1831
1832     /* no need to actually copy, these objects never change! */
1833     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1834
1835     return src_;
1836 }
1837
1838 static void nistp256_pre_comp_free(void *pre_)
1839 {
1840     int i;
1841     NISTP256_PRE_COMP *pre = pre_;
1842
1843     if (!pre)
1844         return;
1845
1846     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1847     if (i > 0)
1848         return;
1849
1850     OPENSSL_free(pre);
1851 }
1852
1853 static void nistp256_pre_comp_clear_free(void *pre_)
1854 {
1855     int i;
1856     NISTP256_PRE_COMP *pre = pre_;
1857
1858     if (!pre)
1859         return;
1860
1861     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1862     if (i > 0)
1863         return;
1864
1865     OPENSSL_cleanse(pre, sizeof *pre);
1866     OPENSSL_free(pre);
1867 }
1868
1869 /******************************************************************************/
1870 /*
1871  * OPENSSL EC_METHOD FUNCTIONS
1872  */
1873
1874 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1875 {
1876     int ret;
1877     ret = ec_GFp_simple_group_init(group);
1878     group->a_is_minus3 = 1;
1879     return ret;
1880 }
1881
1882 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1883                                     const BIGNUM *a, const BIGNUM *b,
1884                                     BN_CTX *ctx)
1885 {
1886     int ret = 0;
1887     BN_CTX *new_ctx = NULL;
1888     BIGNUM *curve_p, *curve_a, *curve_b;
1889
1890     if (ctx == NULL)
1891         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1892             return 0;
1893     BN_CTX_start(ctx);
1894     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1895         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1896         ((curve_b = BN_CTX_get(ctx)) == NULL))
1897         goto err;
1898     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1899     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1900     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1901     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1902         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1903               EC_R_WRONG_CURVE_PARAMETERS);
1904         goto err;
1905     }
1906     group->field_mod_func = BN_nist_mod_256;
1907     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1908  err:
1909     BN_CTX_end(ctx);
1910     if (new_ctx != NULL)
1911         BN_CTX_free(new_ctx);
1912     return ret;
1913 }
1914
1915 /*
1916  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1917  * (X/Z^2, Y/Z^3)
1918  */
1919 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1920                                                  const EC_POINT *point,
1921                                                  BIGNUM *x, BIGNUM *y,
1922                                                  BN_CTX *ctx)
1923 {
1924     felem z1, z2, x_in, y_in;
1925     smallfelem x_out, y_out;
1926     longfelem tmp;
1927
1928     if (EC_POINT_is_at_infinity(group, point)) {
1929         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1930               EC_R_POINT_AT_INFINITY);
1931         return 0;
1932     }
1933     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1934         (!BN_to_felem(z1, &point->Z)))
1935         return 0;
1936     felem_inv(z2, z1);
1937     felem_square(tmp, z2);
1938     felem_reduce(z1, tmp);
1939     felem_mul(tmp, x_in, z1);
1940     felem_reduce(x_in, tmp);
1941     felem_contract(x_out, x_in);
1942     if (x != NULL) {
1943         if (!smallfelem_to_BN(x, x_out)) {
1944             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1945                   ERR_R_BN_LIB);
1946             return 0;
1947         }
1948     }
1949     felem_mul(tmp, z1, z2);
1950     felem_reduce(z1, tmp);
1951     felem_mul(tmp, y_in, z1);
1952     felem_reduce(y_in, tmp);
1953     felem_contract(y_out, y_in);
1954     if (y != NULL) {
1955         if (!smallfelem_to_BN(y, y_out)) {
1956             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1957                   ERR_R_BN_LIB);
1958             return 0;
1959         }
1960     }
1961     return 1;
1962 }
1963
1964 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1965 static void make_points_affine(size_t num, smallfelem points[][3],
1966                                smallfelem tmp_smallfelems[])
1967 {
1968     /*
1969      * Runs in constant time, unless an input is the point at infinity (which
1970      * normally shouldn't happen).
1971      */
1972     ec_GFp_nistp_points_make_affine_internal(num,
1973                                              points,
1974                                              sizeof(smallfelem),
1975                                              tmp_smallfelems,
1976                                              (void (*)(void *))smallfelem_one,
1977                                              (int (*)(const void *))
1978                                              smallfelem_is_zero_int,
1979                                              (void (*)(void *, const void *))
1980                                              smallfelem_assign,
1981                                              (void (*)(void *, const void *))
1982                                              smallfelem_square_contract,
1983                                              (void (*)
1984                                               (void *, const void *,
1985                                                const void *))
1986                                              smallfelem_mul_contract,
1987                                              (void (*)(void *, const void *))
1988                                              smallfelem_inv_contract,
1989                                              /* nothing to contract */
1990                                              (void (*)(void *, const void *))
1991                                              smallfelem_assign);
1992 }
1993
1994 /*
1995  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1996  * values Result is stored in r (r can equal one of the inputs).
1997  */
1998 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1999                                const BIGNUM *scalar, size_t num,
2000                                const EC_POINT *points[],
2001                                const BIGNUM *scalars[], BN_CTX *ctx)
2002 {
2003     int ret = 0;
2004     int j;
2005     int mixed = 0;
2006     BN_CTX *new_ctx = NULL;
2007     BIGNUM *x, *y, *z, *tmp_scalar;
2008     felem_bytearray g_secret;
2009     felem_bytearray *secrets = NULL;
2010     smallfelem(*pre_comp)[17][3] = NULL;
2011     smallfelem *tmp_smallfelems = NULL;
2012     felem_bytearray tmp;
2013     unsigned i, num_bytes;
2014     int have_pre_comp = 0;
2015     size_t num_points = num;
2016     smallfelem x_in, y_in, z_in;
2017     felem x_out, y_out, z_out;
2018     NISTP256_PRE_COMP *pre = NULL;
2019     const smallfelem(*g_pre_comp)[16][3] = NULL;
2020     EC_POINT *generator = NULL;
2021     const EC_POINT *p = NULL;
2022     const BIGNUM *p_scalar = NULL;
2023
2024     if (ctx == NULL)
2025         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2026             return 0;
2027     BN_CTX_start(ctx);
2028     if (((x = BN_CTX_get(ctx)) == NULL) ||
2029         ((y = BN_CTX_get(ctx)) == NULL) ||
2030         ((z = BN_CTX_get(ctx)) == NULL) ||
2031         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2032         goto err;
2033
2034     if (scalar != NULL) {
2035         pre = EC_EX_DATA_get_data(group->extra_data,
2036                                   nistp256_pre_comp_dup,
2037                                   nistp256_pre_comp_free,
2038                                   nistp256_pre_comp_clear_free);
2039         if (pre)
2040             /* we have precomputation, try to use it */
2041             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2042         else
2043             /* try to use the standard precomputation */
2044             g_pre_comp = &gmul[0];
2045         generator = EC_POINT_new(group);
2046         if (generator == NULL)
2047             goto err;
2048         /* get the generator from precomputation */
2049         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2050             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2051             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2052             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2053             goto err;
2054         }
2055         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2056                                                       generator, x, y, z,
2057                                                       ctx))
2058             goto err;
2059         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2060             /* precomputation matches generator */
2061             have_pre_comp = 1;
2062         else
2063             /*
2064              * we don't have valid precomputation: treat the generator as a
2065              * random point
2066              */
2067             num_points++;
2068     }
2069     if (num_points > 0) {
2070         if (num_points >= 3) {
2071             /*
2072              * unless we precompute multiples for just one or two points,
2073              * converting those into affine form is time well spent
2074              */
2075             mixed = 1;
2076         }
2077         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
2078         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
2079         if (mixed)
2080             tmp_smallfelems =
2081                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
2082         if ((secrets == NULL) || (pre_comp == NULL)
2083             || (mixed && (tmp_smallfelems == NULL))) {
2084             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2085             goto err;
2086         }
2087
2088         /*
2089          * we treat NULL scalars as 0, and NULL points as points at infinity,
2090          * i.e., they contribute nothing to the linear combination
2091          */
2092         memset(secrets, 0, num_points * sizeof(felem_bytearray));
2093         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
2094         for (i = 0; i < num_points; ++i) {
2095             if (i == num)
2096                 /*
2097                  * we didn't have a valid precomputation, so we pick the
2098                  * generator
2099                  */
2100             {
2101                 p = EC_GROUP_get0_generator(group);
2102                 p_scalar = scalar;
2103             } else
2104                 /* the i^th point */
2105             {
2106                 p = points[i];
2107                 p_scalar = scalars[i];
2108             }
2109             if ((p_scalar != NULL) && (p != NULL)) {
2110                 /* reduce scalar to 0 <= scalar < 2^256 */
2111                 if ((BN_num_bits(p_scalar) > 256)
2112                     || (BN_is_negative(p_scalar))) {
2113                     /*
2114                      * this is an unusual input, and we don't guarantee
2115                      * constant-timeness
2116                      */
2117                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
2118                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2119                         goto err;
2120                     }
2121                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2122                 } else
2123                     num_bytes = BN_bn2bin(p_scalar, tmp);
2124                 flip_endian(secrets[i], tmp, num_bytes);
2125                 /* precompute multiples */
2126                 if ((!BN_to_felem(x_out, &p->X)) ||
2127                     (!BN_to_felem(y_out, &p->Y)) ||
2128                     (!BN_to_felem(z_out, &p->Z)))
2129                     goto err;
2130                 felem_shrink(pre_comp[i][1][0], x_out);
2131                 felem_shrink(pre_comp[i][1][1], y_out);
2132                 felem_shrink(pre_comp[i][1][2], z_out);
2133                 for (j = 2; j <= 16; ++j) {
2134                     if (j & 1) {
2135                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2136                                         pre_comp[i][j][2], pre_comp[i][1][0],
2137                                         pre_comp[i][1][1], pre_comp[i][1][2],
2138                                         pre_comp[i][j - 1][0],
2139                                         pre_comp[i][j - 1][1],
2140                                         pre_comp[i][j - 1][2]);
2141                     } else {
2142                         point_double_small(pre_comp[i][j][0],
2143                                            pre_comp[i][j][1],
2144                                            pre_comp[i][j][2],
2145                                            pre_comp[i][j / 2][0],
2146                                            pre_comp[i][j / 2][1],
2147                                            pre_comp[i][j / 2][2]);
2148                     }
2149                 }
2150             }
2151         }
2152         if (mixed)
2153             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2154     }
2155
2156     /* the scalar for the generator */
2157     if ((scalar != NULL) && (have_pre_comp)) {
2158         memset(g_secret, 0, sizeof(g_secret));
2159         /* reduce scalar to 0 <= scalar < 2^256 */
2160         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2161             /*
2162              * this is an unusual input, and we don't guarantee
2163              * constant-timeness
2164              */
2165             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
2166                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2167                 goto err;
2168             }
2169             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2170         } else
2171             num_bytes = BN_bn2bin(scalar, tmp);
2172         flip_endian(g_secret, tmp, num_bytes);
2173         /* do the multiplication with generator precomputation */
2174         batch_mul(x_out, y_out, z_out,
2175                   (const felem_bytearray(*))secrets, num_points,
2176                   g_secret,
2177                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2178     } else
2179         /* do the multiplication without generator precomputation */
2180         batch_mul(x_out, y_out, z_out,
2181                   (const felem_bytearray(*))secrets, num_points,
2182                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2183     /* reduce the output to its unique minimal representation */
2184     felem_contract(x_in, x_out);
2185     felem_contract(y_in, y_out);
2186     felem_contract(z_in, z_out);
2187     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2188         (!smallfelem_to_BN(z, z_in))) {
2189         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2190         goto err;
2191     }
2192     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2193
2194  err:
2195     BN_CTX_end(ctx);
2196     if (generator != NULL)
2197         EC_POINT_free(generator);
2198     if (new_ctx != NULL)
2199         BN_CTX_free(new_ctx);
2200     if (secrets != NULL)
2201         OPENSSL_free(secrets);
2202     if (pre_comp != NULL)
2203         OPENSSL_free(pre_comp);
2204     if (tmp_smallfelems != NULL)
2205         OPENSSL_free(tmp_smallfelems);
2206     return ret;
2207 }
2208
2209 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2210 {
2211     int ret = 0;
2212     NISTP256_PRE_COMP *pre = NULL;
2213     int i, j;
2214     BN_CTX *new_ctx = NULL;
2215     BIGNUM *x, *y;
2216     EC_POINT *generator = NULL;
2217     smallfelem tmp_smallfelems[32];
2218     felem x_tmp, y_tmp, z_tmp;
2219
2220     /* throw away old precomputation */
2221     EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
2222                          nistp256_pre_comp_free,
2223                          nistp256_pre_comp_clear_free);
2224     if (ctx == NULL)
2225         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2226             return 0;
2227     BN_CTX_start(ctx);
2228     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2229         goto err;
2230     /* get the generator */
2231     if (group->generator == NULL)
2232         goto err;
2233     generator = EC_POINT_new(group);
2234     if (generator == NULL)
2235         goto err;
2236     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2237     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2238     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2239         goto err;
2240     if ((pre = nistp256_pre_comp_new()) == NULL)
2241         goto err;
2242     /*
2243      * if the generator is the standard one, use built-in precomputation
2244      */
2245     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2246         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2247         ret = 1;
2248         goto err;
2249     }
2250     if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
2251         (!BN_to_felem(y_tmp, &group->generator->Y)) ||
2252         (!BN_to_felem(z_tmp, &group->generator->Z)))
2253         goto err;
2254     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2255     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2256     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2257     /*
2258      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2259      * 2^160*G, 2^224*G for the second one
2260      */
2261     for (i = 1; i <= 8; i <<= 1) {
2262         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2263                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2264                            pre->g_pre_comp[0][i][1],
2265                            pre->g_pre_comp[0][i][2]);
2266         for (j = 0; j < 31; ++j) {
2267             point_double_small(pre->g_pre_comp[1][i][0],
2268                                pre->g_pre_comp[1][i][1],
2269                                pre->g_pre_comp[1][i][2],
2270                                pre->g_pre_comp[1][i][0],
2271                                pre->g_pre_comp[1][i][1],
2272                                pre->g_pre_comp[1][i][2]);
2273         }
2274         if (i == 8)
2275             break;
2276         point_double_small(pre->g_pre_comp[0][2 * i][0],
2277                            pre->g_pre_comp[0][2 * i][1],
2278                            pre->g_pre_comp[0][2 * i][2],
2279                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2280                            pre->g_pre_comp[1][i][2]);
2281         for (j = 0; j < 31; ++j) {
2282             point_double_small(pre->g_pre_comp[0][2 * i][0],
2283                                pre->g_pre_comp[0][2 * i][1],
2284                                pre->g_pre_comp[0][2 * i][2],
2285                                pre->g_pre_comp[0][2 * i][0],
2286                                pre->g_pre_comp[0][2 * i][1],
2287                                pre->g_pre_comp[0][2 * i][2]);
2288         }
2289     }
2290     for (i = 0; i < 2; i++) {
2291         /* g_pre_comp[i][0] is the point at infinity */
2292         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2293         /* the remaining multiples */
2294         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2295         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2296                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2297                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2298                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2299                         pre->g_pre_comp[i][2][2]);
2300         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2301         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2302                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2303                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2304                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2305                         pre->g_pre_comp[i][2][2]);
2306         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2307         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2308                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2309                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2310                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2311                         pre->g_pre_comp[i][4][2]);
2312         /*
2313          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2314          */
2315         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2316                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2317                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2318                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2319                         pre->g_pre_comp[i][2][2]);
2320         for (j = 1; j < 8; ++j) {
2321             /* odd multiples: add G resp. 2^32*G */
2322             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2323                             pre->g_pre_comp[i][2 * j + 1][1],
2324                             pre->g_pre_comp[i][2 * j + 1][2],
2325                             pre->g_pre_comp[i][2 * j][0],
2326                             pre->g_pre_comp[i][2 * j][1],
2327                             pre->g_pre_comp[i][2 * j][2],
2328                             pre->g_pre_comp[i][1][0],
2329                             pre->g_pre_comp[i][1][1],
2330                             pre->g_pre_comp[i][1][2]);
2331         }
2332     }
2333     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2334
2335     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
2336                              nistp256_pre_comp_free,
2337                              nistp256_pre_comp_clear_free))
2338         goto err;
2339     ret = 1;
2340     pre = NULL;
2341  err:
2342     BN_CTX_end(ctx);
2343     if (generator != NULL)
2344         EC_POINT_free(generator);
2345     if (new_ctx != NULL)
2346         BN_CTX_free(new_ctx);
2347     if (pre)
2348         nistp256_pre_comp_free(pre);
2349     return ret;
2350 }
2351
2352 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2353 {
2354     if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
2355                             nistp256_pre_comp_free,
2356                             nistp256_pre_comp_clear_free)
2357         != NULL)
2358         return 1;
2359     else
2360         return 0;
2361 }
2362 #else
2363 static void *dummy = &dummy;
2364 #endif