crypto/ec/ecp_nistp256.c

   1 /*
   2  * Written by Adam Langley (Google) for the OpenSSL project
   3  */
   4 /* Copyright 2011 Google Inc.
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  *
   8  * you may not use this file except in compliance with the License.
   9  * You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  *  Unless required by applicable law or agreed to in writing, software
  14  *  distributed under the License is distributed on an "AS IS" BASIS,
  15  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  *  See the License for the specific language governing permissions and
  17  *  limitations under the License.
  18  */
  19
  20 /*
  21  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
  22  *
  23  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  24  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  25  * work which got its smarts from Daniel J. Bernstein's work on the same.
  26  */
  27
  28 #include <openssl/opensslconf.h>
  29 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  30
  31 # include <stdint.h>
  32 # include <string.h>
  33 # include <openssl/err.h>
  34 # include "ec_lcl.h"
  35
  36 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  37   /* even with gcc, the typedef won't work for 32-bit platforms */
  38 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
  39                                  * platforms */
  40 typedef __int128_t int128_t;
  41 # else
  42 #  error "Need GCC 3.1 or later to define type uint128_t"
  43 # endif
  44
  45 typedef uint8_t u8;
  46 typedef uint32_t u32;
  47 typedef uint64_t u64;
  48 typedef int64_t s64;
  49
  50 /*
  51  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
  52  * can serialise an element of this field into 32 bytes. We call this an
  53  * felem_bytearray.
  54  */
  55
  56 typedef u8 felem_bytearray[32];
  57
  58 /*
  59  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
  60  * values are big-endian.
  61  */
  62 static const felem_bytearray nistp256_curve_params[5] = {
  63     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
  64      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  65      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  66      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
  67     {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
  68      0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  69      0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
  70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */
  71     {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
  72      0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
  73      0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
  74      0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
  75     {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
  76      0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
  77      0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
  78      0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
  79     {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
  80      0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
  81      0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
  82      0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
  83 };
  84
  85 /*-
  86  * The representation of field elements.
  87  * ------------------------------------
  88  *
  89  * We represent field elements with either four 128-bit values, eight 128-bit
  90  * values, or four 64-bit values. The field element represented is:
  91  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
  92  * or:
  93  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
  94  *
  95  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
  96  * apart, but are 128-bits wide, the most significant bits of each limb overlap
  97  * with the least significant bits of the next.
  98  *
  99  * A field element with four limbs is an 'felem'. One with eight limbs is a
 100  * 'longfelem'
 101  *
 102  * A field element with four, 64-bit values is called a 'smallfelem'. Small
 103  * values are used as intermediate values before multiplication.
 104  */
 105
 106 # define NLIMBS 4
 107
 108 typedef uint128_t limb;
 109 typedef limb felem[NLIMBS];
 110 typedef limb longfelem[NLIMBS * 2];
 111 typedef u64 smallfelem[NLIMBS];
 112
 113 /* This is the value of the prime as four 64-bit words, little-endian. */
 114 static const u64 kPrime[4] =
 115     { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
 116 static const u64 bottom63bits = 0x7ffffffffffffffful;
 117
 118 /*
 119  * bin32_to_felem takes a little-endian byte array and converts it into felem
 120  * form. This assumes that the CPU is little-endian.
 121  */
 122 static void bin32_to_felem(felem out, const u8 in[32])
 123 {
 124     out[0] = *((u64 *)&in[0]);
 125     out[1] = *((u64 *)&in[8]);
 126     out[2] = *((u64 *)&in[16]);
 127     out[3] = *((u64 *)&in[24]);
 128 }
 129
 130 /*
 131  * smallfelem_to_bin32 takes a smallfelem and serialises into a little
 132  * endian, 32 byte array. This assumes that the CPU is little-endian.
 133  */
 134 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
 135 {
 136     *((u64 *)&out[0]) = in[0];
 137     *((u64 *)&out[8]) = in[1];
 138     *((u64 *)&out[16]) = in[2];
 139     *((u64 *)&out[24]) = in[3];
 140 }
 141
 142 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 143 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 144 {
 145     unsigned i;
 146     for (i = 0; i < len; ++i)
 147         out[i] = in[len - 1 - i];
 148 }
 149
 150 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 151 static int BN_to_felem(felem out, const BIGNUM *bn)
 152 {
 153     felem_bytearray b_in;
 154     felem_bytearray b_out;
 155     unsigned num_bytes;
 156
 157     /* BN_bn2bin eats leading zeroes */
 158     memset(b_out, 0, sizeof(b_out));
 159     num_bytes = BN_num_bytes(bn);
 160     if (num_bytes > sizeof b_out) {
 161         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 162         return 0;
 163     }
 164     if (BN_is_negative(bn)) {
 165         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 166         return 0;
 167     }
 168     num_bytes = BN_bn2bin(bn, b_in);
 169     flip_endian(b_out, b_in, num_bytes);
 170     bin32_to_felem(out, b_out);
 171     return 1;
 172 }
 173
 174 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 175 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
 176 {
 177     felem_bytearray b_in, b_out;
 178     smallfelem_to_bin32(b_in, in);
 179     flip_endian(b_out, b_in, sizeof b_out);
 180     return BN_bin2bn(b_out, sizeof b_out, out);
 181 }
 182
 183 /*-
 184  * Field operations
 185  * ----------------
 186  */
 187
 188 static void smallfelem_one(smallfelem out)
 189 {
 190     out[0] = 1;
 191     out[1] = 0;
 192     out[2] = 0;
 193     out[3] = 0;
 194 }
 195
 196 static void smallfelem_assign(smallfelem out, const smallfelem in)
 197 {
 198     out[0] = in[0];
 199     out[1] = in[1];
 200     out[2] = in[2];
 201     out[3] = in[3];
 202 }
 203
 204 static void felem_assign(felem out, const felem in)
 205 {
 206     out[0] = in[0];
 207     out[1] = in[1];
 208     out[2] = in[2];
 209     out[3] = in[3];
 210 }
 211
 212 /* felem_sum sets out = out + in. */
 213 static void felem_sum(felem out, const felem in)
 214 {
 215     out[0] += in[0];
 216     out[1] += in[1];
 217     out[2] += in[2];
 218     out[3] += in[3];
 219 }
 220
 221 /* felem_small_sum sets out = out + in. */
 222 static void felem_small_sum(felem out, const smallfelem in)
 223 {
 224     out[0] += in[0];
 225     out[1] += in[1];
 226     out[2] += in[2];
 227     out[3] += in[3];
 228 }
 229
 230 /* felem_scalar sets out = out * scalar */
 231 static void felem_scalar(felem out, const u64 scalar)
 232 {
 233     out[0] *= scalar;
 234     out[1] *= scalar;
 235     out[2] *= scalar;
 236     out[3] *= scalar;
 237 }
 238
 239 /* longfelem_scalar sets out = out * scalar */
 240 static void longfelem_scalar(longfelem out, const u64 scalar)
 241 {
 242     out[0] *= scalar;
 243     out[1] *= scalar;
 244     out[2] *= scalar;
 245     out[3] *= scalar;
 246     out[4] *= scalar;
 247     out[5] *= scalar;
 248     out[6] *= scalar;
 249     out[7] *= scalar;
 250 }
 251
 252 # define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
 253 # define two105 (((limb)1) << 105)
 254 # define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
 255
 256 /* zero105 is 0 mod p */
 257 static const felem zero105 =
 258     { two105m41m9, two105, two105m41p9, two105m41p9 };
 259
 260 /*-
 261  * smallfelem_neg sets |out| to |-small|
 262  * On exit:
 263  *   out[i] < out[i] + 2^105
 264  */
 265 static void smallfelem_neg(felem out, const smallfelem small)
 266 {
 267     /* In order to prevent underflow, we subtract from 0 mod p. */
 268     out[0] = zero105[0] - small[0];
 269     out[1] = zero105[1] - small[1];
 270     out[2] = zero105[2] - small[2];
 271     out[3] = zero105[3] - small[3];
 272 }
 273
 274 /*-
 275  * felem_diff subtracts |in| from |out|
 276  * On entry:
 277  *   in[i] < 2^104
 278  * On exit:
 279  *   out[i] < out[i] + 2^105
 280  */
 281 static void felem_diff(felem out, const felem in)
 282 {
 283     /*
 284      * In order to prevent underflow, we add 0 mod p before subtracting.
 285      */
 286     out[0] += zero105[0];
 287     out[1] += zero105[1];
 288     out[2] += zero105[2];
 289     out[3] += zero105[3];
 290
 291     out[0] -= in[0];
 292     out[1] -= in[1];
 293     out[2] -= in[2];
 294     out[3] -= in[3];
 295 }
 296
 297 # define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
 298 # define two107 (((limb)1) << 107)
 299 # define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
 300
 301 /* zero107 is 0 mod p */
 302 static const felem zero107 =
 303     { two107m43m11, two107, two107m43p11, two107m43p11 };
 304
 305 /*-
 306  * An alternative felem_diff for larger inputs |in|
 307  * felem_diff_zero107 subtracts |in| from |out|
 308  * On entry:
 309  *   in[i] < 2^106
 310  * On exit:
 311  *   out[i] < out[i] + 2^107
 312  */
 313 static void felem_diff_zero107(felem out, const felem in)
 314 {
 315     /*
 316      * In order to prevent underflow, we add 0 mod p before subtracting.
 317      */
 318     out[0] += zero107[0];
 319     out[1] += zero107[1];
 320     out[2] += zero107[2];
 321     out[3] += zero107[3];
 322
 323     out[0] -= in[0];
 324     out[1] -= in[1];
 325     out[2] -= in[2];
 326     out[3] -= in[3];
 327 }
 328
 329 /*-
 330  * longfelem_diff subtracts |in| from |out|
 331  * On entry:
 332  *   in[i] < 7*2^67
 333  * On exit:
 334  *   out[i] < out[i] + 2^70 + 2^40
 335  */
 336 static void longfelem_diff(longfelem out, const longfelem in)
 337 {
 338     static const limb two70m8p6 =
 339         (((limb) 1) << 70) - (((limb) 1) << 8) + (((limb) 1) << 6);
 340     static const limb two70p40 = (((limb) 1) << 70) + (((limb) 1) << 40);
 341     static const limb two70 = (((limb) 1) << 70);
 342     static const limb two70m40m38p6 =
 343         (((limb) 1) << 70) - (((limb) 1) << 40) - (((limb) 1) << 38) +
 344         (((limb) 1) << 6);
 345     static const limb two70m6 = (((limb) 1) << 70) - (((limb) 1) << 6);
 346
 347     /* add 0 mod p to avoid underflow */
 348     out[0] += two70m8p6;
 349     out[1] += two70p40;
 350     out[2] += two70;
 351     out[3] += two70m40m38p6;
 352     out[4] += two70m6;
 353     out[5] += two70m6;
 354     out[6] += two70m6;
 355     out[7] += two70m6;
 356
 357     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
 358     out[0] -= in[0];
 359     out[1] -= in[1];
 360     out[2] -= in[2];
 361     out[3] -= in[3];
 362     out[4] -= in[4];
 363     out[5] -= in[5];
 364     out[6] -= in[6];
 365     out[7] -= in[7];
 366 }
 367
 368 # define two64m0 (((limb)1) << 64) - 1
 369 # define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
 370 # define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
 371 # define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
 372
 373 /* zero110 is 0 mod p */
 374 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
 375
 376 /*-
 377  * felem_shrink converts an felem into a smallfelem. The result isn't quite
 378  * minimal as the value may be greater than p.
 379  *
 380  * On entry:
 381  *   in[i] < 2^109
 382  * On exit:
 383  *   out[i] < 2^64
 384  */
 385 static void felem_shrink(smallfelem out, const felem in)
 386 {
 387     felem tmp;
 388     u64 a, b, mask;
 389     s64 high, low;
 390     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
 391
 392     /* Carry 2->3 */
 393     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
 394     /* tmp[3] < 2^110 */
 395
 396     tmp[2] = zero110[2] + (u64)in[2];
 397     tmp[0] = zero110[0] + in[0];
 398     tmp[1] = zero110[1] + in[1];
 399     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
 400
 401     /*
 402      * We perform two partial reductions where we eliminate the high-word of
 403      * tmp[3]. We don't update the other words till the end.
 404      */
 405     a = tmp[3] >> 64;           /* a < 2^46 */
 406     tmp[3] = (u64)tmp[3];
 407     tmp[3] -= a;
 408     tmp[3] += ((limb) a) << 32;
 409     /* tmp[3] < 2^79 */
 410
 411     b = a;
 412     a = tmp[3] >> 64;           /* a < 2^15 */
 413     b += a;                     /* b < 2^46 + 2^15 < 2^47 */
 414     tmp[3] = (u64)tmp[3];
 415     tmp[3] -= a;
 416     tmp[3] += ((limb) a) << 32;
 417     /* tmp[3] < 2^64 + 2^47 */
 418
 419     /*
 420      * This adjusts the other two words to complete the two partial
 421      * reductions.
 422      */
 423     tmp[0] += b;
 424     tmp[1] -= (((limb) b) << 32);
 425
 426     /*
 427      * In order to make space in tmp[3] for the carry from 2 -> 3, we
 428      * conditionally subtract kPrime if tmp[3] is large enough.
 429      */
 430     high = tmp[3] >> 64;
 431     /* As tmp[3] < 2^65, high is either 1 or 0 */
 432     high <<= 63;
 433     high >>= 63;
 434     /*-
 435      * high is:
 436      *   all ones   if the high word of tmp[3] is 1
 437      *   all zeros  if the high word of tmp[3] if 0 */
 438     low = tmp[3];
 439     mask = low >> 63;
 440     /*-
 441      * mask is:
 442      *   all ones   if the MSB of low is 1
 443      *   all zeros  if the MSB of low if 0 */
 444     low &= bottom63bits;
 445     low -= kPrime3Test;
 446     /* if low was greater than kPrime3Test then the MSB is zero */
 447     low = ~low;
 448     low >>= 63;
 449     /*-
 450      * low is:
 451      *   all ones   if low was > kPrime3Test
 452      *   all zeros  if low was <= kPrime3Test */
 453     mask = (mask & low) | high;
 454     tmp[0] -= mask & kPrime[0];
 455     tmp[1] -= mask & kPrime[1];
 456     /* kPrime[2] is zero, so omitted */
 457     tmp[3] -= mask & kPrime[3];
 458     /* tmp[3] < 2**64 - 2**32 + 1 */
 459
 460     tmp[1] += ((u64)(tmp[0] >> 64));
 461     tmp[0] = (u64)tmp[0];
 462     tmp[2] += ((u64)(tmp[1] >> 64));
 463     tmp[1] = (u64)tmp[1];
 464     tmp[3] += ((u64)(tmp[2] >> 64));
 465     tmp[2] = (u64)tmp[2];
 466     /* tmp[i] < 2^64 */
 467
 468     out[0] = tmp[0];
 469     out[1] = tmp[1];
 470     out[2] = tmp[2];
 471     out[3] = tmp[3];
 472 }
 473
 474 /* smallfelem_expand converts a smallfelem to an felem */
 475 static void smallfelem_expand(felem out, const smallfelem in)
 476 {
 477     out[0] = in[0];
 478     out[1] = in[1];
 479     out[2] = in[2];
 480     out[3] = in[3];
 481 }
 482
 483 /*-
 484  * smallfelem_square sets |out| = |small|^2
 485  * On entry:
 486  *   small[i] < 2^64
 487  * On exit:
 488  *   out[i] < 7 * 2^64 < 2^67
 489  */
 490 static void smallfelem_square(longfelem out, const smallfelem small)
 491 {
 492     limb a;
 493     u64 high, low;
 494
 495     a = ((uint128_t) small[0]) * small[0];
 496     low = a;
 497     high = a >> 64;
 498     out[0] = low;
 499     out[1] = high;
 500
 501     a = ((uint128_t) small[0]) * small[1];
 502     low = a;
 503     high = a >> 64;
 504     out[1] += low;
 505     out[1] += low;
 506     out[2] = high;
 507
 508     a = ((uint128_t) small[0]) * small[2];
 509     low = a;
 510     high = a >> 64;
 511     out[2] += low;
 512     out[2] *= 2;
 513     out[3] = high;
 514
 515     a = ((uint128_t) small[0]) * small[3];
 516     low = a;
 517     high = a >> 64;
 518     out[3] += low;
 519     out[4] = high;
 520
 521     a = ((uint128_t) small[1]) * small[2];
 522     low = a;
 523     high = a >> 64;
 524     out[3] += low;
 525     out[3] *= 2;
 526     out[4] += high;
 527
 528     a = ((uint128_t) small[1]) * small[1];
 529     low = a;
 530     high = a >> 64;
 531     out[2] += low;
 532     out[3] += high;
 533
 534     a = ((uint128_t) small[1]) * small[3];
 535     low = a;
 536     high = a >> 64;
 537     out[4] += low;
 538     out[4] *= 2;
 539     out[5] = high;
 540
 541     a = ((uint128_t) small[2]) * small[3];
 542     low = a;
 543     high = a >> 64;
 544     out[5] += low;
 545     out[5] *= 2;
 546     out[6] = high;
 547     out[6] += high;
 548
 549     a = ((uint128_t) small[2]) * small[2];
 550     low = a;
 551     high = a >> 64;
 552     out[4] += low;
 553     out[5] += high;
 554
 555     a = ((uint128_t) small[3]) * small[3];
 556     low = a;
 557     high = a >> 64;
 558     out[6] += low;
 559     out[7] = high;
 560 }
 561
 562 /*-
 563  * felem_square sets |out| = |in|^2
 564  * On entry:
 565  *   in[i] < 2^109
 566  * On exit:
 567  *   out[i] < 7 * 2^64 < 2^67
 568  */
 569 static void felem_square(longfelem out, const felem in)
 570 {
 571     u64 small[4];
 572     felem_shrink(small, in);
 573     smallfelem_square(out, small);
 574 }
 575
 576 /*-
 577  * smallfelem_mul sets |out| = |small1| * |small2|
 578  * On entry:
 579  *   small1[i] < 2^64
 580  *   small2[i] < 2^64
 581  * On exit:
 582  *   out[i] < 7 * 2^64 < 2^67
 583  */
 584 static void smallfelem_mul(longfelem out, const smallfelem small1,
 585                            const smallfelem small2)
 586 {
 587     limb a;
 588     u64 high, low;
 589
 590     a = ((uint128_t) small1[0]) * small2[0];
 591     low = a;
 592     high = a >> 64;
 593     out[0] = low;
 594     out[1] = high;
 595
 596     a = ((uint128_t) small1[0]) * small2[1];
 597     low = a;
 598     high = a >> 64;
 599     out[1] += low;
 600     out[2] = high;
 601
 602     a = ((uint128_t) small1[1]) * small2[0];
 603     low = a;
 604     high = a >> 64;
 605     out[1] += low;
 606     out[2] += high;
 607
 608     a = ((uint128_t) small1[0]) * small2[2];
 609     low = a;
 610     high = a >> 64;
 611     out[2] += low;
 612     out[3] = high;
 613
 614     a = ((uint128_t) small1[1]) * small2[1];
 615     low = a;
 616     high = a >> 64;
 617     out[2] += low;
 618     out[3] += high;
 619
 620     a = ((uint128_t) small1[2]) * small2[0];
 621     low = a;
 622     high = a >> 64;
 623     out[2] += low;
 624     out[3] += high;
 625
 626     a = ((uint128_t) small1[0]) * small2[3];
 627     low = a;
 628     high = a >> 64;
 629     out[3] += low;
 630     out[4] = high;
 631
 632     a = ((uint128_t) small1[1]) * small2[2];
 633     low = a;
 634     high = a >> 64;
 635     out[3] += low;
 636     out[4] += high;
 637
 638     a = ((uint128_t) small1[2]) * small2[1];
 639     low = a;
 640     high = a >> 64;
 641     out[3] += low;
 642     out[4] += high;
 643
 644     a = ((uint128_t) small1[3]) * small2[0];
 645     low = a;
 646     high = a >> 64;
 647     out[3] += low;
 648     out[4] += high;
 649
 650     a = ((uint128_t) small1[1]) * small2[3];
 651     low = a;
 652     high = a >> 64;
 653     out[4] += low;
 654     out[5] = high;
 655
 656     a = ((uint128_t) small1[2]) * small2[2];
 657     low = a;
 658     high = a >> 64;
 659     out[4] += low;
 660     out[5] += high;
 661
 662     a = ((uint128_t) small1[3]) * small2[1];
 663     low = a;
 664     high = a >> 64;
 665     out[4] += low;
 666     out[5] += high;
 667
 668     a = ((uint128_t) small1[2]) * small2[3];
 669     low = a;
 670     high = a >> 64;
 671     out[5] += low;
 672     out[6] = high;
 673
 674     a = ((uint128_t) small1[3]) * small2[2];
 675     low = a;
 676     high = a >> 64;
 677     out[5] += low;
 678     out[6] += high;
 679
 680     a = ((uint128_t) small1[3]) * small2[3];
 681     low = a;
 682     high = a >> 64;
 683     out[6] += low;
 684     out[7] = high;
 685 }
 686
 687 /*-
 688  * felem_mul sets |out| = |in1| * |in2|
 689  * On entry:
 690  *   in1[i] < 2^109
 691  *   in2[i] < 2^109
 692  * On exit:
 693  *   out[i] < 7 * 2^64 < 2^67
 694  */
 695 static void felem_mul(longfelem out, const felem in1, const felem in2)
 696 {
 697     smallfelem small1, small2;
 698     felem_shrink(small1, in1);
 699     felem_shrink(small2, in2);
 700     smallfelem_mul(out, small1, small2);
 701 }
 702
 703 /*-
 704  * felem_small_mul sets |out| = |small1| * |in2|
 705  * On entry:
 706  *   small1[i] < 2^64
 707  *   in2[i] < 2^109
 708  * On exit:
 709  *   out[i] < 7 * 2^64 < 2^67
 710  */
 711 static void felem_small_mul(longfelem out, const smallfelem small1,
 712                             const felem in2)
 713 {
 714     smallfelem small2;
 715     felem_shrink(small2, in2);
 716     smallfelem_mul(out, small1, small2);
 717 }
 718
 719 # define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
 720 # define two100 (((limb)1) << 100)
 721 # define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
 722 /* zero100 is 0 mod p */
 723 static const felem zero100 =
 724     { two100m36m4, two100, two100m36p4, two100m36p4 };
 725
 726 /*-
 727  * Internal function for the different flavours of felem_reduce.
 728  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
 729  * On entry:
 730  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
 731  *   out[1] >= in[7] + 2^32*in[4]
 732  *   out[2] >= in[5] + 2^32*in[5]
 733  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
 734  * On exit:
 735  *   out[0] <= out[0] + in[4] + 2^32*in[5]
 736  *   out[1] <= out[1] + in[5] + 2^33*in[6]
 737  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
 738  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
 739  */
 740 static void felem_reduce_(felem out, const longfelem in)
 741 {
 742     int128_t c;
 743     /* combine common terms from below */
 744     c = in[4] + (in[5] << 32);
 745     out[0] += c;
 746     out[3] -= c;
 747
 748     c = in[5] - in[7];
 749     out[1] += c;
 750     out[2] -= c;
 751
 752     /* the remaining terms */
 753     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
 754     out[1] -= (in[4] << 32);
 755     out[3] += (in[4] << 32);
 756
 757     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
 758     out[2] -= (in[5] << 32);
 759
 760     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
 761     out[0] -= in[6];
 762     out[0] -= (in[6] << 32);
 763     out[1] += (in[6] << 33);
 764     out[2] += (in[6] * 2);
 765     out[3] -= (in[6] << 32);
 766
 767     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
 768     out[0] -= in[7];
 769     out[0] -= (in[7] << 32);
 770     out[2] += (in[7] << 33);
 771     out[3] += (in[7] * 3);
 772 }
 773
 774 /*-
 775  * felem_reduce converts a longfelem into an felem.
 776  * To be called directly after felem_square or felem_mul.
 777  * On entry:
 778  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
 779  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
 780  * On exit:
 781  *   out[i] < 2^101
 782  */
 783 static void felem_reduce(felem out, const longfelem in)
 784 {
 785     out[0] = zero100[0] + in[0];
 786     out[1] = zero100[1] + in[1];
 787     out[2] = zero100[2] + in[2];
 788     out[3] = zero100[3] + in[3];
 789
 790     felem_reduce_(out, in);
 791
 792     /*-
 793      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
 794      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
 795      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
 796      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
 797      *
 798      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
 799      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
 800      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
 801      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
 802      */
 803 }
 804
 805 /*-
 806  * felem_reduce_zero105 converts a larger longfelem into an felem.
 807  * On entry:
 808  *   in[0] < 2^71
 809  * On exit:
 810  *   out[i] < 2^106
 811  */
 812 static void felem_reduce_zero105(felem out, const longfelem in)
 813 {
 814     out[0] = zero105[0] + in[0];
 815     out[1] = zero105[1] + in[1];
 816     out[2] = zero105[2] + in[2];
 817     out[3] = zero105[3] + in[3];
 818
 819     felem_reduce_(out, in);
 820
 821     /*-
 822      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
 823      * out[1] > 2^105 - 2^71 - 2^103 > 0
 824      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
 825      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
 826      *
 827      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 828      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
 829      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
 830      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
 831      */
 832 }
 833
 834 /*
 835  * subtract_u64 sets *result = *result - v and *carry to one if the
 836  * subtraction underflowed.
 837  */
 838 static void subtract_u64(u64 *result, u64 *carry, u64 v)
 839 {
 840     uint128_t r = *result;
 841     r -= v;
 842     *carry = (r >> 64) & 1;
 843     *result = (u64)r;
 844 }
 845
 846 /*
 847  * felem_contract converts |in| to its unique, minimal representation. On
 848  * entry: in[i] < 2^109
 849  */
 850 static void felem_contract(smallfelem out, const felem in)
 851 {
 852     unsigned i;
 853     u64 all_equal_so_far = 0, result = 0, carry;
 854
 855     felem_shrink(out, in);
 856     /* small is minimal except that the value might be > p */
 857
 858     all_equal_so_far--;
 859     /*
 860      * We are doing a constant time test if out >= kPrime. We need to compare
 861      * each u64, from most-significant to least significant. For each one, if
 862      * all words so far have been equal (m is all ones) then a non-equal
 863      * result is the answer. Otherwise we continue.
 864      */
 865     for (i = 3; i < 4; i--) {
 866         u64 equal;
 867         uint128_t a = ((uint128_t) kPrime[i]) - out[i];
 868         /*
 869          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
 870          * will all be set.
 871          */
 872         result |= all_equal_so_far & ((u64)(a >> 64));
 873
 874         /*
 875          * if kPrime[i] == out[i] then |equal| will be all zeros and the
 876          * decrement will make it all ones.
 877          */
 878         equal = kPrime[i] ^ out[i];
 879         equal--;
 880         equal &= equal << 32;
 881         equal &= equal << 16;
 882         equal &= equal << 8;
 883         equal &= equal << 4;
 884         equal &= equal << 2;
 885         equal &= equal << 1;
 886         equal = ((s64) equal) >> 63;
 887
 888         all_equal_so_far &= equal;
 889     }
 890
 891     /*
 892      * if all_equal_so_far is still all ones then the two values are equal
 893      * and so out >= kPrime is true.
 894      */
 895     result |= all_equal_so_far;
 896
 897     /* if out >= kPrime then we subtract kPrime. */
 898     subtract_u64(&out[0], &carry, result & kPrime[0]);
 899     subtract_u64(&out[1], &carry, carry);
 900     subtract_u64(&out[2], &carry, carry);
 901     subtract_u64(&out[3], &carry, carry);
 902
 903     subtract_u64(&out[1], &carry, result & kPrime[1]);
 904     subtract_u64(&out[2], &carry, carry);
 905     subtract_u64(&out[3], &carry, carry);
 906
 907     subtract_u64(&out[2], &carry, result & kPrime[2]);
 908     subtract_u64(&out[3], &carry, carry);
 909
 910     subtract_u64(&out[3], &carry, result & kPrime[3]);
 911 }
 912
 913 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
 914 {
 915     longfelem longtmp;
 916     felem tmp;
 917
 918     smallfelem_square(longtmp, in);
 919     felem_reduce(tmp, longtmp);
 920     felem_contract(out, tmp);
 921 }
 922
 923 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
 924                                     const smallfelem in2)
 925 {
 926     longfelem longtmp;
 927     felem tmp;
 928
 929     smallfelem_mul(longtmp, in1, in2);
 930     felem_reduce(tmp, longtmp);
 931     felem_contract(out, tmp);
 932 }
 933
 934 /*-
 935  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 936  * otherwise.
 937  * On entry:
 938  *   small[i] < 2^64
 939  */
 940 static limb smallfelem_is_zero(const smallfelem small)
 941 {
 942     limb result;
 943     u64 is_p;
 944
 945     u64 is_zero = small[0] | small[1] | small[2] | small[3];
 946     is_zero--;
 947     is_zero &= is_zero << 32;
 948     is_zero &= is_zero << 16;
 949     is_zero &= is_zero << 8;
 950     is_zero &= is_zero << 4;
 951     is_zero &= is_zero << 2;
 952     is_zero &= is_zero << 1;
 953     is_zero = ((s64) is_zero) >> 63;
 954
 955     is_p = (small[0] ^ kPrime[0]) |
 956         (small[1] ^ kPrime[1]) |
 957         (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
 958     is_p--;
 959     is_p &= is_p << 32;
 960     is_p &= is_p << 16;
 961     is_p &= is_p << 8;
 962     is_p &= is_p << 4;
 963     is_p &= is_p << 2;
 964     is_p &= is_p << 1;
 965     is_p = ((s64) is_p) >> 63;
 966
 967     is_zero |= is_p;
 968
 969     result = is_zero;
 970     result |= ((limb) is_zero) << 64;
 971     return result;
 972 }
 973
 974 static int smallfelem_is_zero_int(const smallfelem small)
 975 {
 976     return (int)(smallfelem_is_zero(small) & ((limb) 1));
 977 }
 978
 979 /*-
 980  * felem_inv calculates |out| = |in|^{-1}
 981  *
 982  * Based on Fermat's Little Theorem:
 983  *   a^p = a (mod p)
 984  *   a^{p-1} = 1 (mod p)
 985  *   a^{p-2} = a^{-1} (mod p)
 986  */
 987 static void felem_inv(felem out, const felem in)
 988 {
 989     felem ftmp, ftmp2;
 990     /* each e_I will hold |in|^{2^I - 1} */
 991     felem e2, e4, e8, e16, e32, e64;
 992     longfelem tmp;
 993     unsigned i;
 994
 995     felem_square(tmp, in);
 996     felem_reduce(ftmp, tmp);    /* 2^1 */
 997     felem_mul(tmp, in, ftmp);
 998     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
 999     felem_assign(e2, ftmp);
1000     felem_square(tmp, ftmp);
1001     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
1002     felem_square(tmp, ftmp);
1003     felem_reduce(ftmp, tmp);    /* 2^4 - 2^2 */
1004     felem_mul(tmp, ftmp, e2);
1005     felem_reduce(ftmp, tmp);    /* 2^4 - 2^0 */
1006     felem_assign(e4, ftmp);
1007     felem_square(tmp, ftmp);
1008     felem_reduce(ftmp, tmp);    /* 2^5 - 2^1 */
1009     felem_square(tmp, ftmp);
1010     felem_reduce(ftmp, tmp);    /* 2^6 - 2^2 */
1011     felem_square(tmp, ftmp);
1012     felem_reduce(ftmp, tmp);    /* 2^7 - 2^3 */
1013     felem_square(tmp, ftmp);
1014     felem_reduce(ftmp, tmp);    /* 2^8 - 2^4 */
1015     felem_mul(tmp, ftmp, e4);
1016     felem_reduce(ftmp, tmp);    /* 2^8 - 2^0 */
1017     felem_assign(e8, ftmp);
1018     for (i = 0; i < 8; i++) {
1019         felem_square(tmp, ftmp);
1020         felem_reduce(ftmp, tmp);
1021     }                           /* 2^16 - 2^8 */
1022     felem_mul(tmp, ftmp, e8);
1023     felem_reduce(ftmp, tmp);    /* 2^16 - 2^0 */
1024     felem_assign(e16, ftmp);
1025     for (i = 0; i < 16; i++) {
1026         felem_square(tmp, ftmp);
1027         felem_reduce(ftmp, tmp);
1028     }                           /* 2^32 - 2^16 */
1029     felem_mul(tmp, ftmp, e16);
1030     felem_reduce(ftmp, tmp);    /* 2^32 - 2^0 */
1031     felem_assign(e32, ftmp);
1032     for (i = 0; i < 32; i++) {
1033         felem_square(tmp, ftmp);
1034         felem_reduce(ftmp, tmp);
1035     }                           /* 2^64 - 2^32 */
1036     felem_assign(e64, ftmp);
1037     felem_mul(tmp, ftmp, in);
1038     felem_reduce(ftmp, tmp);    /* 2^64 - 2^32 + 2^0 */
1039     for (i = 0; i < 192; i++) {
1040         felem_square(tmp, ftmp);
1041         felem_reduce(ftmp, tmp);
1042     }                           /* 2^256 - 2^224 + 2^192 */
1043
1044     felem_mul(tmp, e64, e32);
1045     felem_reduce(ftmp2, tmp);   /* 2^64 - 2^0 */
1046     for (i = 0; i < 16; i++) {
1047         felem_square(tmp, ftmp2);
1048         felem_reduce(ftmp2, tmp);
1049     }                           /* 2^80 - 2^16 */
1050     felem_mul(tmp, ftmp2, e16);
1051     felem_reduce(ftmp2, tmp);   /* 2^80 - 2^0 */
1052     for (i = 0; i < 8; i++) {
1053         felem_square(tmp, ftmp2);
1054         felem_reduce(ftmp2, tmp);
1055     }                           /* 2^88 - 2^8 */
1056     felem_mul(tmp, ftmp2, e8);
1057     felem_reduce(ftmp2, tmp);   /* 2^88 - 2^0 */
1058     for (i = 0; i < 4; i++) {
1059         felem_square(tmp, ftmp2);
1060         felem_reduce(ftmp2, tmp);
1061     }                           /* 2^92 - 2^4 */
1062     felem_mul(tmp, ftmp2, e4);
1063     felem_reduce(ftmp2, tmp);   /* 2^92 - 2^0 */
1064     felem_square(tmp, ftmp2);
1065     felem_reduce(ftmp2, tmp);   /* 2^93 - 2^1 */
1066     felem_square(tmp, ftmp2);
1067     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^2 */
1068     felem_mul(tmp, ftmp2, e2);
1069     felem_reduce(ftmp2, tmp);   /* 2^94 - 2^0 */
1070     felem_square(tmp, ftmp2);
1071     felem_reduce(ftmp2, tmp);   /* 2^95 - 2^1 */
1072     felem_square(tmp, ftmp2);
1073     felem_reduce(ftmp2, tmp);   /* 2^96 - 2^2 */
1074     felem_mul(tmp, ftmp2, in);
1075     felem_reduce(ftmp2, tmp);   /* 2^96 - 3 */
1076
1077     felem_mul(tmp, ftmp2, ftmp);
1078     felem_reduce(out, tmp);     /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1079 }
1080
1081 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1082 {
1083     felem tmp;
1084
1085     smallfelem_expand(tmp, in);
1086     felem_inv(tmp, tmp);
1087     felem_contract(out, tmp);
1088 }
1089
1090 /*-
1091  * Group operations
1092  * ----------------
1093  *
1094  * Building on top of the field operations we have the operations on the
1095  * elliptic curve group itself. Points on the curve are represented in Jacobian
1096  * coordinates
1097  */
1098
1099 /*-
1100  * point_double calculates 2*(x_in, y_in, z_in)
1101  *
1102  * The method is taken from:
1103  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1104  *
1105  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1106  * while x_out == y_in is not (maybe this works, but it's not tested).
1107  */
1108 static void
1109 point_double(felem x_out, felem y_out, felem z_out,
1110              const felem x_in, const felem y_in, const felem z_in)
1111 {
1112     longfelem tmp, tmp2;
1113     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1114     smallfelem small1, small2;
1115
1116     felem_assign(ftmp, x_in);
1117     /* ftmp[i] < 2^106 */
1118     felem_assign(ftmp2, x_in);
1119     /* ftmp2[i] < 2^106 */
1120
1121     /* delta = z^2 */
1122     felem_square(tmp, z_in);
1123     felem_reduce(delta, tmp);
1124     /* delta[i] < 2^101 */
1125
1126     /* gamma = y^2 */
1127     felem_square(tmp, y_in);
1128     felem_reduce(gamma, tmp);
1129     /* gamma[i] < 2^101 */
1130     felem_shrink(small1, gamma);
1131
1132     /* beta = x*gamma */
1133     felem_small_mul(tmp, small1, x_in);
1134     felem_reduce(beta, tmp);
1135     /* beta[i] < 2^101 */
1136
1137     /* alpha = 3*(x-delta)*(x+delta) */
1138     felem_diff(ftmp, delta);
1139     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1140     felem_sum(ftmp2, delta);
1141     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1142     felem_scalar(ftmp2, 3);
1143     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1144     felem_mul(tmp, ftmp, ftmp2);
1145     felem_reduce(alpha, tmp);
1146     /* alpha[i] < 2^101 */
1147     felem_shrink(small2, alpha);
1148
1149     /* x' = alpha^2 - 8*beta */
1150     smallfelem_square(tmp, small2);
1151     felem_reduce(x_out, tmp);
1152     felem_assign(ftmp, beta);
1153     felem_scalar(ftmp, 8);
1154     /* ftmp[i] < 8 * 2^101 = 2^104 */
1155     felem_diff(x_out, ftmp);
1156     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1157
1158     /* z' = (y + z)^2 - gamma - delta */
1159     felem_sum(delta, gamma);
1160     /* delta[i] < 2^101 + 2^101 = 2^102 */
1161     felem_assign(ftmp, y_in);
1162     felem_sum(ftmp, z_in);
1163     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1164     felem_square(tmp, ftmp);
1165     felem_reduce(z_out, tmp);
1166     felem_diff(z_out, delta);
1167     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1168
1169     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1170     felem_scalar(beta, 4);
1171     /* beta[i] < 4 * 2^101 = 2^103 */
1172     felem_diff_zero107(beta, x_out);
1173     /* beta[i] < 2^107 + 2^103 < 2^108 */
1174     felem_small_mul(tmp, small2, beta);
1175     /* tmp[i] < 7 * 2^64 < 2^67 */
1176     smallfelem_square(tmp2, small1);
1177     /* tmp2[i] < 7 * 2^64 */
1178     longfelem_scalar(tmp2, 8);
1179     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1180     longfelem_diff(tmp, tmp2);
1181     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1182     felem_reduce_zero105(y_out, tmp);
1183     /* y_out[i] < 2^106 */
1184 }
1185
1186 /*
1187  * point_double_small is the same as point_double, except that it operates on
1188  * smallfelems
1189  */
1190 static void
1191 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1192                    const smallfelem x_in, const smallfelem y_in,
1193                    const smallfelem z_in)
1194 {
1195     felem felem_x_out, felem_y_out, felem_z_out;
1196     felem felem_x_in, felem_y_in, felem_z_in;
1197
1198     smallfelem_expand(felem_x_in, x_in);
1199     smallfelem_expand(felem_y_in, y_in);
1200     smallfelem_expand(felem_z_in, z_in);
1201     point_double(felem_x_out, felem_y_out, felem_z_out,
1202                  felem_x_in, felem_y_in, felem_z_in);
1203     felem_shrink(x_out, felem_x_out);
1204     felem_shrink(y_out, felem_y_out);
1205     felem_shrink(z_out, felem_z_out);
1206 }
1207
1208 /* copy_conditional copies in to out iff mask is all ones. */
1209 static void copy_conditional(felem out, const felem in, limb mask)
1210 {
1211     unsigned i;
1212     for (i = 0; i < NLIMBS; ++i) {
1213         const limb tmp = mask & (in[i] ^ out[i]);
1214         out[i] ^= tmp;
1215     }
1216 }
1217
1218 /* copy_small_conditional copies in to out iff mask is all ones. */
1219 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1220 {
1221     unsigned i;
1222     const u64 mask64 = mask;
1223     for (i = 0; i < NLIMBS; ++i) {
1224         out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
1225     }
1226 }
1227
1228 /*-
1229  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1230  *
1231  * The method is taken from:
1232  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1233  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1234  *
1235  * This function includes a branch for checking whether the two input points
1236  * are equal, (while not equal to the point at infinity). This case never
1237  * happens during single point multiplication, so there is no timing leak for
1238  * ECDH or ECDSA signing.
1239  */
1240 static void point_add(felem x3, felem y3, felem z3,
1241                       const felem x1, const felem y1, const felem z1,
1242                       const int mixed, const smallfelem x2,
1243                       const smallfelem y2, const smallfelem z2)
1244 {
1245     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1246     longfelem tmp, tmp2;
1247     smallfelem small1, small2, small3, small4, small5;
1248     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1249
1250     felem_shrink(small3, z1);
1251
1252     z1_is_zero = smallfelem_is_zero(small3);
1253     z2_is_zero = smallfelem_is_zero(z2);
1254
1255     /* ftmp = z1z1 = z1**2 */
1256     smallfelem_square(tmp, small3);
1257     felem_reduce(ftmp, tmp);
1258     /* ftmp[i] < 2^101 */
1259     felem_shrink(small1, ftmp);
1260
1261     if (!mixed) {
1262         /* ftmp2 = z2z2 = z2**2 */
1263         smallfelem_square(tmp, z2);
1264         felem_reduce(ftmp2, tmp);
1265         /* ftmp2[i] < 2^101 */
1266         felem_shrink(small2, ftmp2);
1267
1268         felem_shrink(small5, x1);
1269
1270         /* u1 = ftmp3 = x1*z2z2 */
1271         smallfelem_mul(tmp, small5, small2);
1272         felem_reduce(ftmp3, tmp);
1273         /* ftmp3[i] < 2^101 */
1274
1275         /* ftmp5 = z1 + z2 */
1276         felem_assign(ftmp5, z1);
1277         felem_small_sum(ftmp5, z2);
1278         /* ftmp5[i] < 2^107 */
1279
1280         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1281         felem_square(tmp, ftmp5);
1282         felem_reduce(ftmp5, tmp);
1283         /* ftmp2 = z2z2 + z1z1 */
1284         felem_sum(ftmp2, ftmp);
1285         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1286         felem_diff(ftmp5, ftmp2);
1287         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1288
1289         /* ftmp2 = z2 * z2z2 */
1290         smallfelem_mul(tmp, small2, z2);
1291         felem_reduce(ftmp2, tmp);
1292
1293         /* s1 = ftmp2 = y1 * z2**3 */
1294         felem_mul(tmp, y1, ftmp2);
1295         felem_reduce(ftmp6, tmp);
1296         /* ftmp6[i] < 2^101 */
1297     } else {
1298         /*
1299          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1300          */
1301
1302         /* u1 = ftmp3 = x1*z2z2 */
1303         felem_assign(ftmp3, x1);
1304         /* ftmp3[i] < 2^106 */
1305
1306         /* ftmp5 = 2z1z2 */
1307         felem_assign(ftmp5, z1);
1308         felem_scalar(ftmp5, 2);
1309         /* ftmp5[i] < 2*2^106 = 2^107 */
1310
1311         /* s1 = ftmp2 = y1 * z2**3 */
1312         felem_assign(ftmp6, y1);
1313         /* ftmp6[i] < 2^106 */
1314     }
1315
1316     /* u2 = x2*z1z1 */
1317     smallfelem_mul(tmp, x2, small1);
1318     felem_reduce(ftmp4, tmp);
1319
1320     /* h = ftmp4 = u2 - u1 */
1321     felem_diff_zero107(ftmp4, ftmp3);
1322     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1323     felem_shrink(small4, ftmp4);
1324
1325     x_equal = smallfelem_is_zero(small4);
1326
1327     /* z_out = ftmp5 * h */
1328     felem_small_mul(tmp, small4, ftmp5);
1329     felem_reduce(z_out, tmp);
1330     /* z_out[i] < 2^101 */
1331
1332     /* ftmp = z1 * z1z1 */
1333     smallfelem_mul(tmp, small1, small3);
1334     felem_reduce(ftmp, tmp);
1335
1336     /* s2 = tmp = y2 * z1**3 */
1337     felem_small_mul(tmp, y2, ftmp);
1338     felem_reduce(ftmp5, tmp);
1339
1340     /* r = ftmp5 = (s2 - s1)*2 */
1341     felem_diff_zero107(ftmp5, ftmp6);
1342     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1343     felem_scalar(ftmp5, 2);
1344     /* ftmp5[i] < 2^109 */
1345     felem_shrink(small1, ftmp5);
1346     y_equal = smallfelem_is_zero(small1);
1347
1348     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1349         point_double(x3, y3, z3, x1, y1, z1);
1350         return;
1351     }
1352
1353     /* I = ftmp = (2h)**2 */
1354     felem_assign(ftmp, ftmp4);
1355     felem_scalar(ftmp, 2);
1356     /* ftmp[i] < 2*2^108 = 2^109 */
1357     felem_square(tmp, ftmp);
1358     felem_reduce(ftmp, tmp);
1359
1360     /* J = ftmp2 = h * I */
1361     felem_mul(tmp, ftmp4, ftmp);
1362     felem_reduce(ftmp2, tmp);
1363
1364     /* V = ftmp4 = U1 * I */
1365     felem_mul(tmp, ftmp3, ftmp);
1366     felem_reduce(ftmp4, tmp);
1367
1368     /* x_out = r**2 - J - 2V */
1369     smallfelem_square(tmp, small1);
1370     felem_reduce(x_out, tmp);
1371     felem_assign(ftmp3, ftmp4);
1372     felem_scalar(ftmp4, 2);
1373     felem_sum(ftmp4, ftmp2);
1374     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1375     felem_diff(x_out, ftmp4);
1376     /* x_out[i] < 2^105 + 2^101 */
1377
1378     /* y_out = r(V-x_out) - 2 * s1 * J */
1379     felem_diff_zero107(ftmp3, x_out);
1380     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1381     felem_small_mul(tmp, small1, ftmp3);
1382     felem_mul(tmp2, ftmp6, ftmp2);
1383     longfelem_scalar(tmp2, 2);
1384     /* tmp2[i] < 2*2^67 = 2^68 */
1385     longfelem_diff(tmp, tmp2);
1386     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1387     felem_reduce_zero105(y_out, tmp);
1388     /* y_out[i] < 2^106 */
1389
1390     copy_small_conditional(x_out, x2, z1_is_zero);
1391     copy_conditional(x_out, x1, z2_is_zero);
1392     copy_small_conditional(y_out, y2, z1_is_zero);
1393     copy_conditional(y_out, y1, z2_is_zero);
1394     copy_small_conditional(z_out, z2, z1_is_zero);
1395     copy_conditional(z_out, z1, z2_is_zero);
1396     felem_assign(x3, x_out);
1397     felem_assign(y3, y_out);
1398     felem_assign(z3, z_out);
1399 }
1400
1401 /*
1402  * point_add_small is the same as point_add, except that it operates on
1403  * smallfelems
1404  */
1405 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1406                             smallfelem x1, smallfelem y1, smallfelem z1,
1407                             smallfelem x2, smallfelem y2, smallfelem z2)
1408 {
1409     felem felem_x3, felem_y3, felem_z3;
1410     felem felem_x1, felem_y1, felem_z1;
1411     smallfelem_expand(felem_x1, x1);
1412     smallfelem_expand(felem_y1, y1);
1413     smallfelem_expand(felem_z1, z1);
1414     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1415               x2, y2, z2);
1416     felem_shrink(x3, felem_x3);
1417     felem_shrink(y3, felem_y3);
1418     felem_shrink(z3, felem_z3);
1419 }
1420
1421 /*-
1422  * Base point pre computation
1423  * --------------------------
1424  *
1425  * Two different sorts of precomputed tables are used in the following code.
1426  * Each contain various points on the curve, where each point is three field
1427  * elements (x, y, z).
1428  *
1429  * For the base point table, z is usually 1 (0 for the point at infinity).
1430  * This table has 2 * 16 elements, starting with the following:
1431  * index | bits    | point
1432  * ------+---------+------------------------------
1433  *     0 | 0 0 0 0 | 0G
1434  *     1 | 0 0 0 1 | 1G
1435  *     2 | 0 0 1 0 | 2^64G
1436  *     3 | 0 0 1 1 | (2^64 + 1)G
1437  *     4 | 0 1 0 0 | 2^128G
1438  *     5 | 0 1 0 1 | (2^128 + 1)G
1439  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1440  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1441  *     8 | 1 0 0 0 | 2^192G
1442  *     9 | 1 0 0 1 | (2^192 + 1)G
1443  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1444  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1445  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1446  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1447  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1448  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1449  * followed by a copy of this with each element multiplied by 2^32.
1450  *
1451  * The reason for this is so that we can clock bits into four different
1452  * locations when doing simple scalar multiplies against the base point,
1453  * and then another four locations using the second 16 elements.
1454  *
1455  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1456
1457 /* gmul is the table of precomputed base points */
1458 static const smallfelem gmul[2][16][3] = {
1459     {{{0, 0, 0, 0},
1460       {0, 0, 0, 0},
1461       {0, 0, 0, 0}},
1462      {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1463        0x6b17d1f2e12c4247},
1464       {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1465        0x4fe342e2fe1a7f9b},
1466       {1, 0, 0, 0}},
1467      {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1468        0x0fa822bc2811aaa5},
1469       {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1470        0xbff44ae8f5dba80d},
1471       {1, 0, 0, 0}},
1472      {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1473        0x300a4bbc89d6726f},
1474       {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1475        0x72aac7e0d09b4644},
1476       {1, 0, 0, 0}},
1477      {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1478        0x447d739beedb5e67},
1479       {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1480        0x2d4825ab834131ee},
1481       {1, 0, 0, 0}},
1482      {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1483        0xef9519328a9c72ff},
1484       {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1485        0x611e9fc37dbb2c9b},
1486       {1, 0, 0, 0}},
1487      {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1488        0x550663797b51f5d8},
1489       {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1490        0x157164848aecb851},
1491       {1, 0, 0, 0}},
1492      {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1493        0xeb5d7745b21141ea},
1494       {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1495        0xeafd72ebdbecc17b},
1496       {1, 0, 0, 0}},
1497      {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1498        0xa6d39677a7849276},
1499       {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1500        0x674f84749b0b8816},
1501       {1, 0, 0, 0}},
1502      {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1503        0x4e769e7672c9ddad},
1504       {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1505        0x42b99082de830663},
1506       {1, 0, 0, 0}},
1507      {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1508        0x78878ef61c6ce04d},
1509       {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1510        0xb6cb3f5d7b72c321},
1511       {1, 0, 0, 0}},
1512      {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1513        0x0c88bc4d716b1287},
1514       {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1515        0xdd5ddea3f3901dc6},
1516       {1, 0, 0, 0}},
1517      {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1518        0x68f344af6b317466},
1519       {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1520        0x31b9c405f8540a20},
1521       {1, 0, 0, 0}},
1522      {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1523        0x4052bf4b6f461db9},
1524       {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1525        0xfecf4d5190b0fc61},
1526       {1, 0, 0, 0}},
1527      {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1528        0x1eddbae2c802e41a},
1529       {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1530        0x43104d86560ebcfc},
1531       {1, 0, 0, 0}},
1532      {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1533        0xb48e26b484f7a21c},
1534       {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1535        0xfac015404d4d3dab},
1536       {1, 0, 0, 0}}},
1537     {{{0, 0, 0, 0},
1538       {0, 0, 0, 0},
1539       {0, 0, 0, 0}},
1540      {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1541        0x7fe36b40af22af89},
1542       {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1543        0xe697d45825b63624},
1544       {1, 0, 0, 0}},
1545      {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1546        0x4a5b506612a677a6},
1547       {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1548        0xeb13461ceac089f1},
1549       {1, 0, 0, 0}},
1550      {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1551        0x0781b8291c6a220a},
1552       {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1553        0x690cde8df0151593},
1554       {1, 0, 0, 0}},
1555      {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1556        0x8a535f566ec73617},
1557       {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1558        0x0455c08468b08bd7},
1559       {1, 0, 0, 0}},
1560      {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1561        0x06bada7ab77f8276},
1562       {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1563        0x5b476dfd0e6cb18a},
1564       {1, 0, 0, 0}},
1565      {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1566        0x3e29864e8a2ec908},
1567       {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1568        0x239b90ea3dc31e7e},
1569       {1, 0, 0, 0}},
1570      {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1571        0x820f4dd949f72ff7},
1572       {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1573        0x140406ec783a05ec},
1574       {1, 0, 0, 0}},
1575      {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1576        0x68f6b8542783dfee},
1577       {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1578        0xcbe1feba92e40ce6},
1579       {1, 0, 0, 0}},
1580      {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1581        0xd0b2f94d2f420109},
1582       {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1583        0x971459828b0719e5},
1584       {1, 0, 0, 0}},
1585      {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1586        0x961610004a866aba},
1587       {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1588        0x7acb9fadcee75e44},
1589       {1, 0, 0, 0}},
1590      {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1591        0x24eb9acca333bf5b},
1592       {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1593        0x69f891c5acd079cc},
1594       {1, 0, 0, 0}},
1595      {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1596        0xe51f547c5972a107},
1597       {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1598        0x1c309a2b25bb1387},
1599       {1, 0, 0, 0}},
1600      {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1601        0x20b87b8aa2c4e503},
1602       {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1603        0xf5c6fa49919776be},
1604       {1, 0, 0, 0}},
1605      {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1606        0x1ed7d1b9332010b9},
1607       {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1608        0x3a2b03f03217257a},
1609       {1, 0, 0, 0}},
1610      {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1611        0x15fee545c78dd9f6},
1612       {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1613        0x4ab5b6b2b8753f81},
1614       {1, 0, 0, 0}}}
1615 };
1616
1617 /*
1618  * select_point selects the |idx|th point from a precomputation table and
1619  * copies it to out.
1620  */
1621 static void select_point(const u64 idx, unsigned int size,
1622                          const smallfelem pre_comp[16][3], smallfelem out[3])
1623 {
1624     unsigned i, j;
1625     u64 *outlimbs = &out[0][0];
1626
1627     memset(out, 0, sizeof(*out) * 3);
1628
1629     for (i = 0; i < size; i++) {
1630         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1631         u64 mask = i ^ idx;
1632         mask |= mask >> 4;
1633         mask |= mask >> 2;
1634         mask |= mask >> 1;
1635         mask &= 1;
1636         mask--;
1637         for (j = 0; j < NLIMBS * 3; j++)
1638             outlimbs[j] |= inlimbs[j] & mask;
1639     }
1640 }
1641
1642 /* get_bit returns the |i|th bit in |in| */
1643 static char get_bit(const felem_bytearray in, int i)
1644 {
1645     if ((i < 0) || (i >= 256))
1646         return 0;
1647     return (in[i >> 3] >> (i & 7)) & 1;
1648 }
1649
1650 /*
1651  * Interleaved point multiplication using precomputed point multiples: The
1652  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1653  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1654  * generator, using certain (large) precomputed multiples in g_pre_comp.
1655  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1656  */
1657 static void batch_mul(felem x_out, felem y_out, felem z_out,
1658                       const felem_bytearray scalars[],
1659                       const unsigned num_points, const u8 *g_scalar,
1660                       const int mixed, const smallfelem pre_comp[][17][3],
1661                       const smallfelem g_pre_comp[2][16][3])
1662 {
1663     int i, skip;
1664     unsigned num, gen_mul = (g_scalar != NULL);
1665     felem nq[3], ftmp;
1666     smallfelem tmp[3];
1667     u64 bits;
1668     u8 sign, digit;
1669
1670     /* set nq to the point at infinity */
1671     memset(nq, 0, sizeof(nq));
1672
1673     /*
1674      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1675      * of the generator (two in each of the last 32 rounds) and additions of
1676      * other points multiples (every 5th round).
1677      */
1678     skip = 1;                   /* save two point operations in the first
1679                                  * round */
1680     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1681         /* double */
1682         if (!skip)
1683             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1684
1685         /* add multiples of the generator */
1686         if (gen_mul && (i <= 31)) {
1687             /* first, look 32 bits upwards */
1688             bits = get_bit(g_scalar, i + 224) << 3;
1689             bits |= get_bit(g_scalar, i + 160) << 2;
1690             bits |= get_bit(g_scalar, i + 96) << 1;
1691             bits |= get_bit(g_scalar, i + 32);
1692             /* select the point to add, in constant time */
1693             select_point(bits, 16, g_pre_comp[1], tmp);
1694
1695             if (!skip) {
1696                 /* Arg 1 below is for "mixed" */
1697                 point_add(nq[0], nq[1], nq[2],
1698                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1699             } else {
1700                 smallfelem_expand(nq[0], tmp[0]);
1701                 smallfelem_expand(nq[1], tmp[1]);
1702                 smallfelem_expand(nq[2], tmp[2]);
1703                 skip = 0;
1704             }
1705
1706             /* second, look at the current position */
1707             bits = get_bit(g_scalar, i + 192) << 3;
1708             bits |= get_bit(g_scalar, i + 128) << 2;
1709             bits |= get_bit(g_scalar, i + 64) << 1;
1710             bits |= get_bit(g_scalar, i);
1711             /* select the point to add, in constant time */
1712             select_point(bits, 16, g_pre_comp[0], tmp);
1713             /* Arg 1 below is for "mixed" */
1714             point_add(nq[0], nq[1], nq[2],
1715                       nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1716         }
1717
1718         /* do other additions every 5 doublings */
1719         if (num_points && (i % 5 == 0)) {
1720             /* loop over all scalars */
1721             for (num = 0; num < num_points; ++num) {
1722                 bits = get_bit(scalars[num], i + 4) << 5;
1723                 bits |= get_bit(scalars[num], i + 3) << 4;
1724                 bits |= get_bit(scalars[num], i + 2) << 3;
1725                 bits |= get_bit(scalars[num], i + 1) << 2;
1726                 bits |= get_bit(scalars[num], i) << 1;
1727                 bits |= get_bit(scalars[num], i - 1);
1728                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1729
1730                 /*
1731                  * select the point to add or subtract, in constant time
1732                  */
1733                 select_point(digit, 17, pre_comp[num], tmp);
1734                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1735                                                * point */
1736                 copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
1737                 felem_contract(tmp[1], ftmp);
1738
1739                 if (!skip) {
1740                     point_add(nq[0], nq[1], nq[2],
1741                               nq[0], nq[1], nq[2],
1742                               mixed, tmp[0], tmp[1], tmp[2]);
1743                 } else {
1744                     smallfelem_expand(nq[0], tmp[0]);
1745                     smallfelem_expand(nq[1], tmp[1]);
1746                     smallfelem_expand(nq[2], tmp[2]);
1747                     skip = 0;
1748                 }
1749             }
1750         }
1751     }
1752     felem_assign(x_out, nq[0]);
1753     felem_assign(y_out, nq[1]);
1754     felem_assign(z_out, nq[2]);
1755 }
1756
1757 /* Precomputation for the group generator. */
1758 struct nistp256_pre_comp_st {
1759     smallfelem g_pre_comp[2][16][3];
1760     int references;
1761 };
1762
1763 const EC_METHOD *EC_GFp_nistp256_method(void)
1764 {
1765     static const EC_METHOD ret = {
1766         EC_FLAGS_DEFAULT_OCT,
1767         NID_X9_62_prime_field,
1768         ec_GFp_nistp256_group_init,
1769         ec_GFp_simple_group_finish,
1770         ec_GFp_simple_group_clear_finish,
1771         ec_GFp_nist_group_copy,
1772         ec_GFp_nistp256_group_set_curve,
1773         ec_GFp_simple_group_get_curve,
1774         ec_GFp_simple_group_get_degree,
1775         ec_GFp_simple_group_check_discriminant,
1776         ec_GFp_simple_point_init,
1777         ec_GFp_simple_point_finish,
1778         ec_GFp_simple_point_clear_finish,
1779         ec_GFp_simple_point_copy,
1780         ec_GFp_simple_point_set_to_infinity,
1781         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1782         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1783         ec_GFp_simple_point_set_affine_coordinates,
1784         ec_GFp_nistp256_point_get_affine_coordinates,
1785         0 /* point_set_compressed_coordinates */ ,
1786         0 /* point2oct */ ,
1787         0 /* oct2point */ ,
1788         ec_GFp_simple_add,
1789         ec_GFp_simple_dbl,
1790         ec_GFp_simple_invert,
1791         ec_GFp_simple_is_at_infinity,
1792         ec_GFp_simple_is_on_curve,
1793         ec_GFp_simple_cmp,
1794         ec_GFp_simple_make_affine,
1795         ec_GFp_simple_points_make_affine,
1796         ec_GFp_nistp256_points_mul,
1797         ec_GFp_nistp256_precompute_mult,
1798         ec_GFp_nistp256_have_precompute_mult,
1799         ec_GFp_nist_field_mul,
1800         ec_GFp_nist_field_sqr,
1801         0 /* field_div */ ,
1802         0 /* field_encode */ ,
1803         0 /* field_decode */ ,
1804         0                       /* field_set_to_one */
1805     };
1806
1807     return &ret;
1808 }
1809
1810 /******************************************************************************/
1811 /*
1812  * FUNCTIONS TO MANAGE PRECOMPUTATION
1813  */
1814
1815 static NISTP256_PRE_COMP *nistp256_pre_comp_new()
1816 {
1817     NISTP256_PRE_COMP *ret = NULL;
1818     ret = OPENSSL_malloc(sizeof(*ret));
1819     if (ret == NULL) {
1820         ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1821         return ret;
1822     }
1823     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1824     ret->references = 1;
1825     return ret;
1826 }
1827
1828 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1829 {
1830     if (p != NULL)
1831         CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1832     return p;
1833 }
1834
1835 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1836 {
1837     if (pre == NULL
1838             || CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
1839         return;
1840     OPENSSL_free(pre);
1841 }
1842
1843 /******************************************************************************/
1844 /*
1845  * OPENSSL EC_METHOD FUNCTIONS
1846  */
1847
1848 int ec_GFp_nistp256_group_init(EC_GROUP *group)
1849 {
1850     int ret;
1851     ret = ec_GFp_simple_group_init(group);
1852     group->a_is_minus3 = 1;
1853     return ret;
1854 }
1855
1856 int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1857                                     const BIGNUM *a, const BIGNUM *b,
1858                                     BN_CTX *ctx)
1859 {
1860     int ret = 0;
1861     BN_CTX *new_ctx = NULL;
1862     BIGNUM *curve_p, *curve_a, *curve_b;
1863
1864     if (ctx == NULL)
1865         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1866             return 0;
1867     BN_CTX_start(ctx);
1868     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1869         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1870         ((curve_b = BN_CTX_get(ctx)) == NULL))
1871         goto err;
1872     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1873     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1874     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1875     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1876         ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
1877               EC_R_WRONG_CURVE_PARAMETERS);
1878         goto err;
1879     }
1880     group->field_mod_func = BN_nist_mod_256;
1881     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1882  err:
1883     BN_CTX_end(ctx);
1884     BN_CTX_free(new_ctx);
1885     return ret;
1886 }
1887
1888 /*
1889  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1890  * (X/Z^2, Y/Z^3)
1891  */
1892 int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1893                                                  const EC_POINT *point,
1894                                                  BIGNUM *x, BIGNUM *y,
1895                                                  BN_CTX *ctx)
1896 {
1897     felem z1, z2, x_in, y_in;
1898     smallfelem x_out, y_out;
1899     longfelem tmp;
1900
1901     if (EC_POINT_is_at_infinity(group, point)) {
1902         ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1903               EC_R_POINT_AT_INFINITY);
1904         return 0;
1905     }
1906     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1907         (!BN_to_felem(z1, point->Z)))
1908         return 0;
1909     felem_inv(z2, z1);
1910     felem_square(tmp, z2);
1911     felem_reduce(z1, tmp);
1912     felem_mul(tmp, x_in, z1);
1913     felem_reduce(x_in, tmp);
1914     felem_contract(x_out, x_in);
1915     if (x != NULL) {
1916         if (!smallfelem_to_BN(x, x_out)) {
1917             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1918                   ERR_R_BN_LIB);
1919             return 0;
1920         }
1921     }
1922     felem_mul(tmp, z1, z2);
1923     felem_reduce(z1, tmp);
1924     felem_mul(tmp, y_in, z1);
1925     felem_reduce(y_in, tmp);
1926     felem_contract(y_out, y_in);
1927     if (y != NULL) {
1928         if (!smallfelem_to_BN(y, y_out)) {
1929             ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
1930                   ERR_R_BN_LIB);
1931             return 0;
1932         }
1933     }
1934     return 1;
1935 }
1936
1937 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
1938 static void make_points_affine(size_t num, smallfelem points[][3],
1939                                smallfelem tmp_smallfelems[])
1940 {
1941     /*
1942      * Runs in constant time, unless an input is the point at infinity (which
1943      * normally shouldn't happen).
1944      */
1945     ec_GFp_nistp_points_make_affine_internal(num,
1946                                              points,
1947                                              sizeof(smallfelem),
1948                                              tmp_smallfelems,
1949                                              (void (*)(void *))smallfelem_one,
1950                                              (int (*)(const void *))
1951                                              smallfelem_is_zero_int,
1952                                              (void (*)(void *, const void *))
1953                                              smallfelem_assign,
1954                                              (void (*)(void *, const void *))
1955                                              smallfelem_square_contract,
1956                                              (void (*)
1957                                               (void *, const void *,
1958                                                const void *))
1959                                              smallfelem_mul_contract,
1960                                              (void (*)(void *, const void *))
1961                                              smallfelem_inv_contract,
1962                                              /* nothing to contract */
1963                                              (void (*)(void *, const void *))
1964                                              smallfelem_assign);
1965 }
1966
1967 /*
1968  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1969  * values Result is stored in r (r can equal one of the inputs).
1970  */
1971 int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
1972                                const BIGNUM *scalar, size_t num,
1973                                const EC_POINT *points[],
1974                                const BIGNUM *scalars[], BN_CTX *ctx)
1975 {
1976     int ret = 0;
1977     int j;
1978     int mixed = 0;
1979     BN_CTX *new_ctx = NULL;
1980     BIGNUM *x, *y, *z, *tmp_scalar;
1981     felem_bytearray g_secret;
1982     felem_bytearray *secrets = NULL;
1983     smallfelem (*pre_comp)[17][3] = NULL;
1984     smallfelem *tmp_smallfelems = NULL;
1985     felem_bytearray tmp;
1986     unsigned i, num_bytes;
1987     int have_pre_comp = 0;
1988     size_t num_points = num;
1989     smallfelem x_in, y_in, z_in;
1990     felem x_out, y_out, z_out;
1991     NISTP256_PRE_COMP *pre = NULL;
1992     const smallfelem(*g_pre_comp)[16][3] = NULL;
1993     EC_POINT *generator = NULL;
1994     const EC_POINT *p = NULL;
1995     const BIGNUM *p_scalar = NULL;
1996
1997     if (ctx == NULL)
1998         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1999             return 0;
2000     BN_CTX_start(ctx);
2001     if (((x = BN_CTX_get(ctx)) == NULL) ||
2002         ((y = BN_CTX_get(ctx)) == NULL) ||
2003         ((z = BN_CTX_get(ctx)) == NULL) ||
2004         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
2005         goto err;
2006
2007     if (scalar != NULL) {
2008         pre = group->pre_comp.nistp256;
2009         if (pre)
2010             /* we have precomputation, try to use it */
2011             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2012         else
2013             /* try to use the standard precomputation */
2014             g_pre_comp = &gmul[0];
2015         generator = EC_POINT_new(group);
2016         if (generator == NULL)
2017             goto err;
2018         /* get the generator from precomputation */
2019         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
2020             !smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
2021             !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2022             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2023             goto err;
2024         }
2025         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
2026                                                       generator, x, y, z,
2027                                                       ctx))
2028             goto err;
2029         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2030             /* precomputation matches generator */
2031             have_pre_comp = 1;
2032         else
2033             /*
2034              * we don't have valid precomputation: treat the generator as a
2035              * random point
2036              */
2037             num_points++;
2038     }
2039     if (num_points > 0) {
2040         if (num_points >= 3) {
2041             /*
2042              * unless we precompute multiples for just one or two points,
2043              * converting those into affine form is time well spent
2044              */
2045             mixed = 1;
2046         }
2047         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2048         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2049         if (mixed)
2050             tmp_smallfelems =
2051               OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2052         if ((secrets == NULL) || (pre_comp == NULL)
2053             || (mixed && (tmp_smallfelems == NULL))) {
2054             ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
2055             goto err;
2056         }
2057
2058         /*
2059          * we treat NULL scalars as 0, and NULL points as points at infinity,
2060          * i.e., they contribute nothing to the linear combination
2061          */
2062         memset(secrets, 0, sizeof(*secrets) * num_points);
2063         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2064         for (i = 0; i < num_points; ++i) {
2065             if (i == num)
2066                 /*
2067                  * we didn't have a valid precomputation, so we pick the
2068                  * generator
2069                  */
2070             {
2071                 p = EC_GROUP_get0_generator(group);
2072                 p_scalar = scalar;
2073             } else
2074                 /* the i^th point */
2075             {
2076                 p = points[i];
2077                 p_scalar = scalars[i];
2078             }
2079             if ((p_scalar != NULL) && (p != NULL)) {
2080                 /* reduce scalar to 0 <= scalar < 2^256 */
2081                 if ((BN_num_bits(p_scalar) > 256)
2082                     || (BN_is_negative(p_scalar))) {
2083                     /*
2084                      * this is an unusual input, and we don't guarantee
2085                      * constant-timeness
2086                      */
2087                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2088                         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2089                         goto err;
2090                     }
2091                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
2092                 } else
2093                     num_bytes = BN_bn2bin(p_scalar, tmp);
2094                 flip_endian(secrets[i], tmp, num_bytes);
2095                 /* precompute multiples */
2096                 if ((!BN_to_felem(x_out, p->X)) ||
2097                     (!BN_to_felem(y_out, p->Y)) ||
2098                     (!BN_to_felem(z_out, p->Z)))
2099                     goto err;
2100                 felem_shrink(pre_comp[i][1][0], x_out);
2101                 felem_shrink(pre_comp[i][1][1], y_out);
2102                 felem_shrink(pre_comp[i][1][2], z_out);
2103                 for (j = 2; j <= 16; ++j) {
2104                     if (j & 1) {
2105                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2106                                         pre_comp[i][j][2], pre_comp[i][1][0],
2107                                         pre_comp[i][1][1], pre_comp[i][1][2],
2108                                         pre_comp[i][j - 1][0],
2109                                         pre_comp[i][j - 1][1],
2110                                         pre_comp[i][j - 1][2]);
2111                     } else {
2112                         point_double_small(pre_comp[i][j][0],
2113                                            pre_comp[i][j][1],
2114                                            pre_comp[i][j][2],
2115                                            pre_comp[i][j / 2][0],
2116                                            pre_comp[i][j / 2][1],
2117                                            pre_comp[i][j / 2][2]);
2118                     }
2119                 }
2120             }
2121         }
2122         if (mixed)
2123             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2124     }
2125
2126     /* the scalar for the generator */
2127     if ((scalar != NULL) && (have_pre_comp)) {
2128         memset(g_secret, 0, sizeof(g_secret));
2129         /* reduce scalar to 0 <= scalar < 2^256 */
2130         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2131             /*
2132              * this is an unusual input, and we don't guarantee
2133              * constant-timeness
2134              */
2135             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2136                 ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2137                 goto err;
2138             }
2139             num_bytes = BN_bn2bin(tmp_scalar, tmp);
2140         } else
2141             num_bytes = BN_bn2bin(scalar, tmp);
2142         flip_endian(g_secret, tmp, num_bytes);
2143         /* do the multiplication with generator precomputation */
2144         batch_mul(x_out, y_out, z_out,
2145                   (const felem_bytearray(*))secrets, num_points,
2146                   g_secret,
2147                   mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2148     } else
2149         /* do the multiplication without generator precomputation */
2150         batch_mul(x_out, y_out, z_out,
2151                   (const felem_bytearray(*))secrets, num_points,
2152                   NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2153     /* reduce the output to its unique minimal representation */
2154     felem_contract(x_in, x_out);
2155     felem_contract(y_in, y_out);
2156     felem_contract(z_in, z_out);
2157     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
2158         (!smallfelem_to_BN(z, z_in))) {
2159         ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
2160         goto err;
2161     }
2162     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2163
2164  err:
2165     BN_CTX_end(ctx);
2166     EC_POINT_free(generator);
2167     BN_CTX_free(new_ctx);
2168     OPENSSL_free(secrets);
2169     OPENSSL_free(pre_comp);
2170     OPENSSL_free(tmp_smallfelems);
2171     return ret;
2172 }
2173
2174 int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2175 {
2176     int ret = 0;
2177     NISTP256_PRE_COMP *pre = NULL;
2178     int i, j;
2179     BN_CTX *new_ctx = NULL;
2180     BIGNUM *x, *y;
2181     EC_POINT *generator = NULL;
2182     smallfelem tmp_smallfelems[32];
2183     felem x_tmp, y_tmp, z_tmp;
2184
2185     /* throw away old precomputation */
2186     EC_pre_comp_free(group);
2187     if (ctx == NULL)
2188         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2189             return 0;
2190     BN_CTX_start(ctx);
2191     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2192         goto err;
2193     /* get the generator */
2194     if (group->generator == NULL)
2195         goto err;
2196     generator = EC_POINT_new(group);
2197     if (generator == NULL)
2198         goto err;
2199     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2200     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2201     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2202         goto err;
2203     if ((pre = nistp256_pre_comp_new()) == NULL)
2204         goto err;
2205     /*
2206      * if the generator is the standard one, use built-in precomputation
2207      */
2208     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2209         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2210         goto done;
2211     }
2212     if ((!BN_to_felem(x_tmp, group->generator->X)) ||
2213         (!BN_to_felem(y_tmp, group->generator->Y)) ||
2214         (!BN_to_felem(z_tmp, group->generator->Z)))
2215         goto err;
2216     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2217     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2218     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2219     /*
2220      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2221      * 2^160*G, 2^224*G for the second one
2222      */
2223     for (i = 1; i <= 8; i <<= 1) {
2224         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2225                            pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2226                            pre->g_pre_comp[0][i][1],
2227                            pre->g_pre_comp[0][i][2]);
2228         for (j = 0; j < 31; ++j) {
2229             point_double_small(pre->g_pre_comp[1][i][0],
2230                                pre->g_pre_comp[1][i][1],
2231                                pre->g_pre_comp[1][i][2],
2232                                pre->g_pre_comp[1][i][0],
2233                                pre->g_pre_comp[1][i][1],
2234                                pre->g_pre_comp[1][i][2]);
2235         }
2236         if (i == 8)
2237             break;
2238         point_double_small(pre->g_pre_comp[0][2 * i][0],
2239                            pre->g_pre_comp[0][2 * i][1],
2240                            pre->g_pre_comp[0][2 * i][2],
2241                            pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2242                            pre->g_pre_comp[1][i][2]);
2243         for (j = 0; j < 31; ++j) {
2244             point_double_small(pre->g_pre_comp[0][2 * i][0],
2245                                pre->g_pre_comp[0][2 * i][1],
2246                                pre->g_pre_comp[0][2 * i][2],
2247                                pre->g_pre_comp[0][2 * i][0],
2248                                pre->g_pre_comp[0][2 * i][1],
2249                                pre->g_pre_comp[0][2 * i][2]);
2250         }
2251     }
2252     for (i = 0; i < 2; i++) {
2253         /* g_pre_comp[i][0] is the point at infinity */
2254         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2255         /* the remaining multiples */
2256         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2257         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2258                         pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2259                         pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2260                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2261                         pre->g_pre_comp[i][2][2]);
2262         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2263         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2264                         pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2265                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2266                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2267                         pre->g_pre_comp[i][2][2]);
2268         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2269         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2270                         pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2271                         pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2272                         pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2273                         pre->g_pre_comp[i][4][2]);
2274         /*
2275          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2276          */
2277         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2278                         pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2279                         pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2280                         pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2281                         pre->g_pre_comp[i][2][2]);
2282         for (j = 1; j < 8; ++j) {
2283             /* odd multiples: add G resp. 2^32*G */
2284             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2285                             pre->g_pre_comp[i][2 * j + 1][1],
2286                             pre->g_pre_comp[i][2 * j + 1][2],
2287                             pre->g_pre_comp[i][2 * j][0],
2288                             pre->g_pre_comp[i][2 * j][1],
2289                             pre->g_pre_comp[i][2 * j][2],
2290                             pre->g_pre_comp[i][1][0],
2291                             pre->g_pre_comp[i][1][1],
2292                             pre->g_pre_comp[i][1][2]);
2293         }
2294     }
2295     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2296
2297  done:
2298     SETPRECOMP(group, nistp256, pre);
2299     pre = NULL;
2300     ret = 1;
2301
2302  err:
2303     BN_CTX_end(ctx);
2304     EC_POINT_free(generator);
2305     BN_CTX_free(new_ctx);
2306     EC_nistp256_pre_comp_free(pre);
2307     return ret;
2308 }
2309
2310 int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2311 {
2312     return HAVEPRECOMP(group, nistp256);
2313 }
2314 #else
2315 static void *dummy = &dummy;
2316 #endif