crypto/ec/ecp_nistp521.c

   1 /* crypto/ec/ecp_nistp521.c */
   2 /*
   3  * Written by Adam Langley (Google) for the OpenSSL project
   4  */
   5 /* Copyright 2011 Google Inc.
   6  *
   7  * Licensed under the Apache License, Version 2.0 (the "License");
   8  *
   9  * you may not use this file except in compliance with the License.
  10  * You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  *  Unless required by applicable law or agreed to in writing, software
  15  *  distributed under the License is distributed on an "AS IS" BASIS,
  16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  *  See the License for the specific language governing permissions and
  18  *  limitations under the License.
  19  */
  20
  21 /*
  22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
  23  *
  24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
  25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
  26  * work which got its smarts from Daniel J. Bernstein's work on the same.
  27  */
  28
  29 #include <openssl/opensslconf.h>
  30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
  31
  32 #ifndef OPENSSL_SYS_VMS
  33 #include <stdint.h>
  34 #else
  35 #include <inttypes.h>
  36 #endif
  37
  38 #include <string.h>
  39 #include <openssl/err.h>
  40 #include "ec_lcl.h"
  41
  42 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  43   /* even with gcc, the typedef won't work for 32-bit platforms */
  44   typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
  45 #else
  46   #error "Need GCC 3.1 or later to define type uint128_t"
  47 #endif
  48
  49 typedef uint8_t u8;
  50 typedef uint64_t u64;
  51 typedef int64_t s64;
  52
  53 /* The underlying field.
  54  *
  55  * P521 operates over GF(2^521-1). We can serialise an element of this field
  56  * into 66 bytes where the most significant byte contains only a single bit. We
  57  * call this an felem_bytearray. */
  58
  59 typedef u8 felem_bytearray[66];
  60
  61 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
  62  * These values are big-endian. */
  63 static const felem_bytearray nistp521_curve_params[5] =
  64         {
  65         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
  66          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  67          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  68          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  69          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  70          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  71          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  72          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  73          0xff, 0xff},
  74         {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
  75          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  76          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  77          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  78          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  79          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  80          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  81          0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  82          0xff, 0xfc},
  83         {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
  84          0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
  85          0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
  86          0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
  87          0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
  88          0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
  89          0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
  90          0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
  91          0x3f, 0x00},
  92         {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
  93          0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
  94          0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
  95          0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
  96          0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
  97          0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
  98          0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
  99          0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
 100          0xbd, 0x66},
 101         {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
 102          0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
 103          0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
 104          0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
 105          0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
 106          0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
 107          0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
 108          0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
 109          0x66, 0x50}
 110         };
 111
 112 /*-
 113  * The representation of field elements.
 114  * ------------------------------------
 115  *
 116  * We represent field elements with nine values. These values are either 64 or
 117  * 128 bits and the field element represented is:
 118  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 119  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 120  * 58 bits apart, but are greater than 58 bits in length, the most significant
 121  * bits of each limb overlap with the least significant bits of the next.
 122  *
 123  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 124  * 'largefelem' */
 125
 126 #define NLIMBS 9
 127
 128 typedef uint64_t limb;
 129 typedef limb felem[NLIMBS];
 130 typedef uint128_t largefelem[NLIMBS];
 131
 132 static const limb bottom57bits = 0x1ffffffffffffff;
 133 static const limb bottom58bits = 0x3ffffffffffffff;
 134
 135 /* bin66_to_felem takes a little-endian byte array and converts it into felem
 136  * form. This assumes that the CPU is little-endian. */
 137 static void bin66_to_felem(felem out, const u8 in[66])
 138         {
 139         out[0] = (*((limb*) &in[0])) & bottom58bits;
 140         out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
 141         out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
 142         out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
 143         out[4] = (*((limb*) &in[29])) & bottom58bits;
 144         out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
 145         out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
 146         out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
 147         out[8] = (*((limb*) &in[58])) & bottom57bits;
 148         }
 149
 150 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
 151  * array. This assumes that the CPU is little-endian. */
 152 static void felem_to_bin66(u8 out[66], const felem in)
 153         {
 154         memset(out, 0, 66);
 155         (*((limb*) &out[0])) = in[0];
 156         (*((limb*) &out[7])) |= in[1] << 2;
 157         (*((limb*) &out[14])) |= in[2] << 4;
 158         (*((limb*) &out[21])) |= in[3] << 6;
 159         (*((limb*) &out[29])) = in[4];
 160         (*((limb*) &out[36])) |= in[5] << 2;
 161         (*((limb*) &out[43])) |= in[6] << 4;
 162         (*((limb*) &out[50])) |= in[7] << 6;
 163         (*((limb*) &out[58])) = in[8];
 164         }
 165
 166 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
 167 static void flip_endian(u8 *out, const u8 *in, unsigned len)
 168         {
 169         unsigned i;
 170         for (i = 0; i < len; ++i)
 171                 out[i] = in[len-1-i];
 172         }
 173
 174 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
 175 static int BN_to_felem(felem out, const BIGNUM *bn)
 176         {
 177         felem_bytearray b_in;
 178         felem_bytearray b_out;
 179         unsigned num_bytes;
 180
 181         /* BN_bn2bin eats leading zeroes */
 182         memset(b_out, 0, sizeof b_out);
 183         num_bytes = BN_num_bytes(bn);
 184         if (num_bytes > sizeof b_out)
 185                 {
 186                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 187                 return 0;
 188                 }
 189         if (BN_is_negative(bn))
 190                 {
 191                 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
 192                 return 0;
 193                 }
 194         num_bytes = BN_bn2bin(bn, b_in);
 195         flip_endian(b_out, b_in, num_bytes);
 196         bin66_to_felem(out, b_out);
 197         return 1;
 198         }
 199
 200 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
 201 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
 202         {
 203         felem_bytearray b_in, b_out;
 204         felem_to_bin66(b_in, in);
 205         flip_endian(b_out, b_in, sizeof b_out);
 206         return BN_bin2bn(b_out, sizeof b_out, out);
 207         }
 208
 209
 210 /*-
 211  * Field operations
 212  * ----------------
 213  */
 214
 215 static void felem_one(felem out)
 216         {
 217         out[0] = 1;
 218         out[1] = 0;
 219         out[2] = 0;
 220         out[3] = 0;
 221         out[4] = 0;
 222         out[5] = 0;
 223         out[6] = 0;
 224         out[7] = 0;
 225         out[8] = 0;
 226         }
 227
 228 static void felem_assign(felem out, const felem in)
 229         {
 230         out[0] = in[0];
 231         out[1] = in[1];
 232         out[2] = in[2];
 233         out[3] = in[3];
 234         out[4] = in[4];
 235         out[5] = in[5];
 236         out[6] = in[6];
 237         out[7] = in[7];
 238         out[8] = in[8];
 239         }
 240
 241 /* felem_sum64 sets out = out + in. */
 242 static void felem_sum64(felem out, const felem in)
 243         {
 244         out[0] += in[0];
 245         out[1] += in[1];
 246         out[2] += in[2];
 247         out[3] += in[3];
 248         out[4] += in[4];
 249         out[5] += in[5];
 250         out[6] += in[6];
 251         out[7] += in[7];
 252         out[8] += in[8];
 253         }
 254
 255 /* felem_scalar sets out = in * scalar */
 256 static void felem_scalar(felem out, const felem in, limb scalar)
 257         {
 258         out[0] = in[0] * scalar;
 259         out[1] = in[1] * scalar;
 260         out[2] = in[2] * scalar;
 261         out[3] = in[3] * scalar;
 262         out[4] = in[4] * scalar;
 263         out[5] = in[5] * scalar;
 264         out[6] = in[6] * scalar;
 265         out[7] = in[7] * scalar;
 266         out[8] = in[8] * scalar;
 267         }
 268
 269 /* felem_scalar64 sets out = out * scalar */
 270 static void felem_scalar64(felem out, limb scalar)
 271         {
 272         out[0] *= scalar;
 273         out[1] *= scalar;
 274         out[2] *= scalar;
 275         out[3] *= scalar;
 276         out[4] *= scalar;
 277         out[5] *= scalar;
 278         out[6] *= scalar;
 279         out[7] *= scalar;
 280         out[8] *= scalar;
 281         }
 282
 283 /* felem_scalar128 sets out = out * scalar */
 284 static void felem_scalar128(largefelem out, limb scalar)
 285         {
 286         out[0] *= scalar;
 287         out[1] *= scalar;
 288         out[2] *= scalar;
 289         out[3] *= scalar;
 290         out[4] *= scalar;
 291         out[5] *= scalar;
 292         out[6] *= scalar;
 293         out[7] *= scalar;
 294         out[8] *= scalar;
 295         }
 296
 297 /*-
 298  * felem_neg sets |out| to |-in|
 299  * On entry:
 300  *   in[i] < 2^59 + 2^14
 301  * On exit:
 302  *   out[i] < 2^62
 303  */
 304 static void felem_neg(felem out, const felem in)
 305         {
 306         /* In order to prevent underflow, we subtract from 0 mod p. */
 307         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 308         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 309
 310         out[0] = two62m3 - in[0];
 311         out[1] = two62m2 - in[1];
 312         out[2] = two62m2 - in[2];
 313         out[3] = two62m2 - in[3];
 314         out[4] = two62m2 - in[4];
 315         out[5] = two62m2 - in[5];
 316         out[6] = two62m2 - in[6];
 317         out[7] = two62m2 - in[7];
 318         out[8] = two62m2 - in[8];
 319         }
 320
 321 /*-
 322  * felem_diff64 subtracts |in| from |out|
 323  * On entry:
 324  *   in[i] < 2^59 + 2^14
 325  * On exit:
 326  *   out[i] < out[i] + 2^62
 327  */
 328 static void felem_diff64(felem out, const felem in)
 329         {
 330         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 331         static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
 332         static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
 333
 334         out[0] += two62m3 - in[0];
 335         out[1] += two62m2 - in[1];
 336         out[2] += two62m2 - in[2];
 337         out[3] += two62m2 - in[3];
 338         out[4] += two62m2 - in[4];
 339         out[5] += two62m2 - in[5];
 340         out[6] += two62m2 - in[6];
 341         out[7] += two62m2 - in[7];
 342         out[8] += two62m2 - in[8];
 343         }
 344
 345 /*-
 346  * felem_diff_128_64 subtracts |in| from |out|
 347  * On entry:
 348  *   in[i] < 2^62 + 2^17
 349  * On exit:
 350  *   out[i] < out[i] + 2^63
 351  */
 352 static void felem_diff_128_64(largefelem out, const felem in)
 353         {
 354         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 355         static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
 356         static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
 357
 358         out[0] += two63m6 - in[0];
 359         out[1] += two63m5 - in[1];
 360         out[2] += two63m5 - in[2];
 361         out[3] += two63m5 - in[3];
 362         out[4] += two63m5 - in[4];
 363         out[5] += two63m5 - in[5];
 364         out[6] += two63m5 - in[6];
 365         out[7] += two63m5 - in[7];
 366         out[8] += two63m5 - in[8];
 367         }
 368
 369 /*-
 370  * felem_diff_128_64 subtracts |in| from |out|
 371  * On entry:
 372  *   in[i] < 2^126
 373  * On exit:
 374  *   out[i] < out[i] + 2^127 - 2^69
 375  */
 376 static void felem_diff128(largefelem out, const largefelem in)
 377         {
 378         /* In order to prevent underflow, we add 0 mod p before subtracting. */
 379         static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
 380         static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
 381
 382         out[0] += (two127m70 - in[0]);
 383         out[1] += (two127m69 - in[1]);
 384         out[2] += (two127m69 - in[2]);
 385         out[3] += (two127m69 - in[3]);
 386         out[4] += (two127m69 - in[4]);
 387         out[5] += (two127m69 - in[5]);
 388         out[6] += (two127m69 - in[6]);
 389         out[7] += (two127m69 - in[7]);
 390         out[8] += (two127m69 - in[8]);
 391         }
 392
 393 /*-
 394  * felem_square sets |out| = |in|^2
 395  * On entry:
 396  *   in[i] < 2^62
 397  * On exit:
 398  *   out[i] < 17 * max(in[i]) * max(in[i])
 399  */
 400 static void felem_square(largefelem out, const felem in)
 401         {
 402         felem inx2, inx4;
 403         felem_scalar(inx2, in, 2);
 404         felem_scalar(inx4, in, 4);
 405
 406         /*-
 407          * We have many cases were we want to do
 408          *   in[x] * in[y] +
 409          *   in[y] * in[x]
 410          * This is obviously just
 411          *   2 * in[x] * in[y]
 412          * However, rather than do the doubling on the 128 bit result, we
 413          * double one of the inputs to the multiplication by reading from
 414          * |inx2| */
 415
 416         out[0] = ((uint128_t) in[0]) * in[0];
 417         out[1] = ((uint128_t) in[0]) * inx2[1];
 418         out[2] = ((uint128_t) in[0]) * inx2[2] +
 419                  ((uint128_t) in[1]) * in[1];
 420         out[3] = ((uint128_t) in[0]) * inx2[3] +
 421                  ((uint128_t) in[1]) * inx2[2];
 422         out[4] = ((uint128_t) in[0]) * inx2[4] +
 423                  ((uint128_t) in[1]) * inx2[3] +
 424                  ((uint128_t) in[2]) * in[2];
 425         out[5] = ((uint128_t) in[0]) * inx2[5] +
 426                  ((uint128_t) in[1]) * inx2[4] +
 427                  ((uint128_t) in[2]) * inx2[3];
 428         out[6] = ((uint128_t) in[0]) * inx2[6] +
 429                  ((uint128_t) in[1]) * inx2[5] +
 430                  ((uint128_t) in[2]) * inx2[4] +
 431                  ((uint128_t) in[3]) * in[3];
 432         out[7] = ((uint128_t) in[0]) * inx2[7] +
 433                  ((uint128_t) in[1]) * inx2[6] +
 434                  ((uint128_t) in[2]) * inx2[5] +
 435                  ((uint128_t) in[3]) * inx2[4];
 436         out[8] = ((uint128_t) in[0]) * inx2[8] +
 437                  ((uint128_t) in[1]) * inx2[7] +
 438                  ((uint128_t) in[2]) * inx2[6] +
 439                  ((uint128_t) in[3]) * inx2[5] +
 440                  ((uint128_t) in[4]) * in[4];
 441
 442         /* The remaining limbs fall above 2^521, with the first falling at
 443          * 2^522. They correspond to locations one bit up from the limbs
 444          * produced above so we would have to multiply by two to align them.
 445          * Again, rather than operate on the 128-bit result, we double one of
 446          * the inputs to the multiplication. If we want to double for both this
 447          * reason, and the reason above, then we end up multiplying by four. */
 448
 449         /* 9 */
 450         out[0] += ((uint128_t) in[1]) * inx4[8] +
 451                   ((uint128_t) in[2]) * inx4[7] +
 452                   ((uint128_t) in[3]) * inx4[6] +
 453                   ((uint128_t) in[4]) * inx4[5];
 454
 455         /* 10 */
 456         out[1] += ((uint128_t) in[2]) * inx4[8] +
 457                   ((uint128_t) in[3]) * inx4[7] +
 458                   ((uint128_t) in[4]) * inx4[6] +
 459                   ((uint128_t) in[5]) * inx2[5];
 460
 461         /* 11 */
 462         out[2] += ((uint128_t) in[3]) * inx4[8] +
 463                   ((uint128_t) in[4]) * inx4[7] +
 464                   ((uint128_t) in[5]) * inx4[6];
 465
 466         /* 12 */
 467         out[3] += ((uint128_t) in[4]) * inx4[8] +
 468                   ((uint128_t) in[5]) * inx4[7] +
 469                   ((uint128_t) in[6]) * inx2[6];
 470
 471         /* 13 */
 472         out[4] += ((uint128_t) in[5]) * inx4[8] +
 473                   ((uint128_t) in[6]) * inx4[7];
 474
 475         /* 14 */
 476         out[5] += ((uint128_t) in[6]) * inx4[8] +
 477                   ((uint128_t) in[7]) * inx2[7];
 478
 479         /* 15 */
 480         out[6] += ((uint128_t) in[7]) * inx4[8];
 481
 482         /* 16 */
 483         out[7] += ((uint128_t) in[8]) * inx2[8];
 484         }
 485
 486 /*-
 487  * felem_mul sets |out| = |in1| * |in2|
 488  * On entry:
 489  *   in1[i] < 2^64
 490  *   in2[i] < 2^63
 491  * On exit:
 492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
 493  */
 494 static void felem_mul(largefelem out, const felem in1, const felem in2)
 495         {
 496         felem in2x2;
 497         felem_scalar(in2x2, in2, 2);
 498
 499         out[0] = ((uint128_t) in1[0]) * in2[0];
 500
 501         out[1] = ((uint128_t) in1[0]) * in2[1] +
 502                  ((uint128_t) in1[1]) * in2[0];
 503
 504         out[2] = ((uint128_t) in1[0]) * in2[2] +
 505                  ((uint128_t) in1[1]) * in2[1] +
 506                  ((uint128_t) in1[2]) * in2[0];
 507
 508         out[3] = ((uint128_t) in1[0]) * in2[3] +
 509                  ((uint128_t) in1[1]) * in2[2] +
 510                  ((uint128_t) in1[2]) * in2[1] +
 511                  ((uint128_t) in1[3]) * in2[0];
 512
 513         out[4] = ((uint128_t) in1[0]) * in2[4] +
 514                  ((uint128_t) in1[1]) * in2[3] +
 515                  ((uint128_t) in1[2]) * in2[2] +
 516                  ((uint128_t) in1[3]) * in2[1] +
 517                  ((uint128_t) in1[4]) * in2[0];
 518
 519         out[5] = ((uint128_t) in1[0]) * in2[5] +
 520                  ((uint128_t) in1[1]) * in2[4] +
 521                  ((uint128_t) in1[2]) * in2[3] +
 522                  ((uint128_t) in1[3]) * in2[2] +
 523                  ((uint128_t) in1[4]) * in2[1] +
 524                  ((uint128_t) in1[5]) * in2[0];
 525
 526         out[6] = ((uint128_t) in1[0]) * in2[6] +
 527                  ((uint128_t) in1[1]) * in2[5] +
 528                  ((uint128_t) in1[2]) * in2[4] +
 529                  ((uint128_t) in1[3]) * in2[3] +
 530                  ((uint128_t) in1[4]) * in2[2] +
 531                  ((uint128_t) in1[5]) * in2[1] +
 532                  ((uint128_t) in1[6]) * in2[0];
 533
 534         out[7] = ((uint128_t) in1[0]) * in2[7] +
 535                  ((uint128_t) in1[1]) * in2[6] +
 536                  ((uint128_t) in1[2]) * in2[5] +
 537                  ((uint128_t) in1[3]) * in2[4] +
 538                  ((uint128_t) in1[4]) * in2[3] +
 539                  ((uint128_t) in1[5]) * in2[2] +
 540                  ((uint128_t) in1[6]) * in2[1] +
 541                  ((uint128_t) in1[7]) * in2[0];
 542
 543         out[8] = ((uint128_t) in1[0]) * in2[8] +
 544                  ((uint128_t) in1[1]) * in2[7] +
 545                  ((uint128_t) in1[2]) * in2[6] +
 546                  ((uint128_t) in1[3]) * in2[5] +
 547                  ((uint128_t) in1[4]) * in2[4] +
 548                  ((uint128_t) in1[5]) * in2[3] +
 549                  ((uint128_t) in1[6]) * in2[2] +
 550                  ((uint128_t) in1[7]) * in2[1] +
 551                  ((uint128_t) in1[8]) * in2[0];
 552
 553         /* See comment in felem_square about the use of in2x2 here */
 554
 555         out[0] += ((uint128_t) in1[1]) * in2x2[8] +
 556                   ((uint128_t) in1[2]) * in2x2[7] +
 557                   ((uint128_t) in1[3]) * in2x2[6] +
 558                   ((uint128_t) in1[4]) * in2x2[5] +
 559                   ((uint128_t) in1[5]) * in2x2[4] +
 560                   ((uint128_t) in1[6]) * in2x2[3] +
 561                   ((uint128_t) in1[7]) * in2x2[2] +
 562                   ((uint128_t) in1[8]) * in2x2[1];
 563
 564         out[1] += ((uint128_t) in1[2]) * in2x2[8] +
 565                   ((uint128_t) in1[3]) * in2x2[7] +
 566                   ((uint128_t) in1[4]) * in2x2[6] +
 567                   ((uint128_t) in1[5]) * in2x2[5] +
 568                   ((uint128_t) in1[6]) * in2x2[4] +
 569                   ((uint128_t) in1[7]) * in2x2[3] +
 570                   ((uint128_t) in1[8]) * in2x2[2];
 571
 572         out[2] += ((uint128_t) in1[3]) * in2x2[8] +
 573                   ((uint128_t) in1[4]) * in2x2[7] +
 574                   ((uint128_t) in1[5]) * in2x2[6] +
 575                   ((uint128_t) in1[6]) * in2x2[5] +
 576                   ((uint128_t) in1[7]) * in2x2[4] +
 577                   ((uint128_t) in1[8]) * in2x2[3];
 578
 579         out[3] += ((uint128_t) in1[4]) * in2x2[8] +
 580                   ((uint128_t) in1[5]) * in2x2[7] +
 581                   ((uint128_t) in1[6]) * in2x2[6] +
 582                   ((uint128_t) in1[7]) * in2x2[5] +
 583                   ((uint128_t) in1[8]) * in2x2[4];
 584
 585         out[4] += ((uint128_t) in1[5]) * in2x2[8] +
 586                   ((uint128_t) in1[6]) * in2x2[7] +
 587                   ((uint128_t) in1[7]) * in2x2[6] +
 588                   ((uint128_t) in1[8]) * in2x2[5];
 589
 590         out[5] += ((uint128_t) in1[6]) * in2x2[8] +
 591                   ((uint128_t) in1[7]) * in2x2[7] +
 592                   ((uint128_t) in1[8]) * in2x2[6];
 593
 594         out[6] += ((uint128_t) in1[7]) * in2x2[8] +
 595                   ((uint128_t) in1[8]) * in2x2[7];
 596
 597         out[7] += ((uint128_t) in1[8]) * in2x2[8];
 598         }
 599
 600 static const limb bottom52bits = 0xfffffffffffff;
 601
 602 /*-
 603  * felem_reduce converts a largefelem to an felem.
 604  * On entry:
 605  *   in[i] < 2^128
 606  * On exit:
 607  *   out[i] < 2^59 + 2^14
 608  */
 609 static void felem_reduce(felem out, const largefelem in)
 610         {
 611         u64 overflow1, overflow2;
 612
 613         out[0] = ((limb) in[0]) & bottom58bits;
 614         out[1] = ((limb) in[1]) & bottom58bits;
 615         out[2] = ((limb) in[2]) & bottom58bits;
 616         out[3] = ((limb) in[3]) & bottom58bits;
 617         out[4] = ((limb) in[4]) & bottom58bits;
 618         out[5] = ((limb) in[5]) & bottom58bits;
 619         out[6] = ((limb) in[6]) & bottom58bits;
 620         out[7] = ((limb) in[7]) & bottom58bits;
 621         out[8] = ((limb) in[8]) & bottom58bits;
 622
 623         /* out[i] < 2^58 */
 624
 625         out[1] += ((limb) in[0]) >> 58;
 626         out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
 627         /*-
 628          * out[1] < 2^58 + 2^6 + 2^58
 629          *        = 2^59 + 2^6
 630          */
 631         out[2] += ((limb) (in[0] >> 64)) >> 52;
 632
 633         out[2] += ((limb) in[1]) >> 58;
 634         out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
 635         out[3] += ((limb) (in[1] >> 64)) >> 52;
 636
 637         out[3] += ((limb) in[2]) >> 58;
 638         out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
 639         out[4] += ((limb) (in[2] >> 64)) >> 52;
 640
 641         out[4] += ((limb) in[3]) >> 58;
 642         out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
 643         out[5] += ((limb) (in[3] >> 64)) >> 52;
 644
 645         out[5] += ((limb) in[4]) >> 58;
 646         out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
 647         out[6] += ((limb) (in[4] >> 64)) >> 52;
 648
 649         out[6] += ((limb) in[5]) >> 58;
 650         out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
 651         out[7] += ((limb) (in[5] >> 64)) >> 52;
 652
 653         out[7] += ((limb) in[6]) >> 58;
 654         out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
 655         out[8] += ((limb) (in[6] >> 64)) >> 52;
 656
 657         out[8] += ((limb) in[7]) >> 58;
 658         out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
 659         /*-
 660          * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
 661          *            < 2^59 + 2^13
 662          */
 663         overflow1 = ((limb) (in[7] >> 64)) >> 52;
 664
 665         overflow1 += ((limb) in[8]) >> 58;
 666         overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
 667         overflow2 = ((limb) (in[8] >> 64)) >> 52;
 668
 669         overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
 670         overflow2 <<= 1;  /* overflow2 < 2^13 */
 671
 672         out[0] += overflow1;  /* out[0] < 2^60 */
 673         out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
 674
 675         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 676         /*-
 677          * out[0] < 2^58
 678          * out[1] < 2^59 + 2^6 + 2^13 + 2^2
 679          *        < 2^59 + 2^14
 680          */
 681         }
 682
 683 static void felem_square_reduce(felem out, const felem in)
 684         {
 685         largefelem tmp;
 686         felem_square(tmp, in);
 687         felem_reduce(out, tmp);
 688         }
 689
 690 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
 691         {
 692         largefelem tmp;
 693         felem_mul(tmp, in1, in2);
 694         felem_reduce(out, tmp);
 695         }
 696
 697 /*-
 698  * felem_inv calculates |out| = |in|^{-1}
 699  *
 700  * Based on Fermat's Little Theorem:
 701  *   a^p = a (mod p)
 702  *   a^{p-1} = 1 (mod p)
 703  *   a^{p-2} = a^{-1} (mod p)
 704  */
 705 static void felem_inv(felem out, const felem in)
 706         {
 707         felem ftmp, ftmp2, ftmp3, ftmp4;
 708         largefelem tmp;
 709         unsigned i;
 710
 711         felem_square(tmp, in); felem_reduce(ftmp, tmp);         /* 2^1 */
 712         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^2 - 2^0 */
 713         felem_assign(ftmp2, ftmp);
 714         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^3 - 2^1 */
 715         felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);      /* 2^3 - 2^0 */
 716         felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);       /* 2^4 - 2^1 */
 717
 718         felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);     /* 2^3 - 2^1 */
 719         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^4 - 2^2 */
 720         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
 721
 722         felem_assign(ftmp2, ftmp3);
 723         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^5 - 2^1 */
 724         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^6 - 2^2 */
 725         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^7 - 2^3 */
 726         felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^8 - 2^4 */
 727         felem_assign(ftmp4, ftmp3);
 728         felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);  /* 2^8 - 2^1 */
 729         felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);     /* 2^9 - 2^2 */
 730         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
 731         felem_assign(ftmp2, ftmp3);
 732
 733         for (i = 0; i < 8; i++)
 734                 {
 735                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^16 - 2^8 */
 736                 }
 737         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
 738         felem_assign(ftmp2, ftmp3);
 739
 740         for (i = 0; i < 16; i++)
 741                 {
 742                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^32 - 2^16 */
 743                 }
 744         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
 745         felem_assign(ftmp2, ftmp3);
 746
 747         for (i = 0; i < 32; i++)
 748                 {
 749                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^64 - 2^32 */
 750                 }
 751         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
 752         felem_assign(ftmp2, ftmp3);
 753
 754         for (i = 0; i < 64; i++)
 755                 {
 756                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^128 - 2^64 */
 757                 }
 758         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
 759         felem_assign(ftmp2, ftmp3);
 760
 761         for (i = 0; i < 128; i++)
 762                 {
 763                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^256 - 2^128 */
 764                 }
 765         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
 766         felem_assign(ftmp2, ftmp3);
 767
 768         for (i = 0; i < 256; i++)
 769                 {
 770                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^512 - 2^256 */
 771                 }
 772         felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
 773
 774         for (i = 0; i < 9; i++)
 775                 {
 776                 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);     /* 2^521 - 2^9 */
 777                 }
 778         felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
 779         felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);      /* 2^512 - 3 */
 780 }
 781
 782 /* This is 2^521-1, expressed as an felem */
 783 static const felem kPrime =
 784         {
 785         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 786         0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
 787         0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
 788         };
 789
 790 /*-
 791  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 792  * otherwise.
 793  * On entry:
 794  *   in[i] < 2^59 + 2^14
 795  */
 796 static limb felem_is_zero(const felem in)
 797         {
 798         felem ftmp;
 799         limb is_zero, is_p;
 800         felem_assign(ftmp, in);
 801
 802         ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
 803         /* ftmp[8] < 2^57 */
 804         ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
 805         ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
 806         ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
 807         ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
 808         ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
 809         ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
 810         ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
 811         ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
 812         /* ftmp[8] < 2^57 + 4 */
 813
 814         /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
 815          * greater than our bound for ftmp[8]. Therefore we only have to check
 816          * if the zero is zero or 2^521-1. */
 817
 818         is_zero = 0;
 819         is_zero |= ftmp[0];
 820         is_zero |= ftmp[1];
 821         is_zero |= ftmp[2];
 822         is_zero |= ftmp[3];
 823         is_zero |= ftmp[4];
 824         is_zero |= ftmp[5];
 825         is_zero |= ftmp[6];
 826         is_zero |= ftmp[7];
 827         is_zero |= ftmp[8];
 828
 829         is_zero--;
 830         /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
 831          * can be set is if is_zero was 0 before the decrement. */
 832         is_zero = ((s64) is_zero) >> 63;
 833
 834         is_p = ftmp[0] ^ kPrime[0];
 835         is_p |= ftmp[1] ^ kPrime[1];
 836         is_p |= ftmp[2] ^ kPrime[2];
 837         is_p |= ftmp[3] ^ kPrime[3];
 838         is_p |= ftmp[4] ^ kPrime[4];
 839         is_p |= ftmp[5] ^ kPrime[5];
 840         is_p |= ftmp[6] ^ kPrime[6];
 841         is_p |= ftmp[7] ^ kPrime[7];
 842         is_p |= ftmp[8] ^ kPrime[8];
 843
 844         is_p--;
 845         is_p = ((s64) is_p) >> 63;
 846
 847         is_zero |= is_p;
 848         return is_zero;
 849         }
 850
 851 static int felem_is_zero_int(const felem in)
 852         {
 853         return (int) (felem_is_zero(in) & ((limb)1));
 854         }
 855
 856 /*-
 857  * felem_contract converts |in| to its unique, minimal representation.
 858  * On entry:
 859  *   in[i] < 2^59 + 2^14
 860  */
 861 static void felem_contract(felem out, const felem in)
 862         {
 863         limb is_p, is_greater, sign;
 864         static const limb two58 = ((limb)1) << 58;
 865
 866         felem_assign(out, in);
 867
 868         out[0] += out[8] >> 57; out[8] &= bottom57bits;
 869         /* out[8] < 2^57 */
 870         out[1] += out[0] >> 58; out[0] &= bottom58bits;
 871         out[2] += out[1] >> 58; out[1] &= bottom58bits;
 872         out[3] += out[2] >> 58; out[2] &= bottom58bits;
 873         out[4] += out[3] >> 58; out[3] &= bottom58bits;
 874         out[5] += out[4] >> 58; out[4] &= bottom58bits;
 875         out[6] += out[5] >> 58; out[5] &= bottom58bits;
 876         out[7] += out[6] >> 58; out[6] &= bottom58bits;
 877         out[8] += out[7] >> 58; out[7] &= bottom58bits;
 878         /* out[8] < 2^57 + 4 */
 879
 880         /* If the value is greater than 2^521-1 then we have to subtract
 881          * 2^521-1 out. See the comments in felem_is_zero regarding why we
 882          * don't test for other multiples of the prime. */
 883
 884         /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
 885
 886         is_p = out[0] ^ kPrime[0];
 887         is_p |= out[1] ^ kPrime[1];
 888         is_p |= out[2] ^ kPrime[2];
 889         is_p |= out[3] ^ kPrime[3];
 890         is_p |= out[4] ^ kPrime[4];
 891         is_p |= out[5] ^ kPrime[5];
 892         is_p |= out[6] ^ kPrime[6];
 893         is_p |= out[7] ^ kPrime[7];
 894         is_p |= out[8] ^ kPrime[8];
 895
 896         is_p--;
 897         is_p &= is_p << 32;
 898         is_p &= is_p << 16;
 899         is_p &= is_p << 8;
 900         is_p &= is_p << 4;
 901         is_p &= is_p << 2;
 902         is_p &= is_p << 1;
 903         is_p = ((s64) is_p) >> 63;
 904         is_p = ~is_p;
 905
 906         /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
 907
 908         out[0] &= is_p;
 909         out[1] &= is_p;
 910         out[2] &= is_p;
 911         out[3] &= is_p;
 912         out[4] &= is_p;
 913         out[5] &= is_p;
 914         out[6] &= is_p;
 915         out[7] &= is_p;
 916         out[8] &= is_p;
 917
 918         /* In order to test that |out| >= 2^521-1 we need only test if out[8]
 919          * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
 920         is_greater = out[8] >> 57;
 921         is_greater |= is_greater << 32;
 922         is_greater |= is_greater << 16;
 923         is_greater |= is_greater << 8;
 924         is_greater |= is_greater << 4;
 925         is_greater |= is_greater << 2;
 926         is_greater |= is_greater << 1;
 927         is_greater = ((s64) is_greater) >> 63;
 928
 929         out[0] -= kPrime[0] & is_greater;
 930         out[1] -= kPrime[1] & is_greater;
 931         out[2] -= kPrime[2] & is_greater;
 932         out[3] -= kPrime[3] & is_greater;
 933         out[4] -= kPrime[4] & is_greater;
 934         out[5] -= kPrime[5] & is_greater;
 935         out[6] -= kPrime[6] & is_greater;
 936         out[7] -= kPrime[7] & is_greater;
 937         out[8] -= kPrime[8] & is_greater;
 938
 939         /* Eliminate negative coefficients */
 940         sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
 941         sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
 942         sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
 943         sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
 944         sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
 945         sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 946         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 947         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 948         sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
 949         sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
 950         sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
 951         }
 952
 953 /*-
 954  * Group operations
 955  * ----------------
 956  *
 957  * Building on top of the field operations we have the operations on the
 958  * elliptic curve group itself. Points on the curve are represented in Jacobian
 959  * coordinates */
 960
 961 /*-
 962  * point_double calcuates 2*(x_in, y_in, z_in)
 963  *
 964  * The method is taken from:
 965  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 966  *
 967  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 968  * while x_out == y_in is not (maybe this works, but it's not tested). */
 969 static void
 970 point_double(felem x_out, felem y_out, felem z_out,
 971              const felem x_in, const felem y_in, const felem z_in)
 972         {
 973         largefelem tmp, tmp2;
 974         felem delta, gamma, beta, alpha, ftmp, ftmp2;
 975
 976         felem_assign(ftmp, x_in);
 977         felem_assign(ftmp2, x_in);
 978
 979         /* delta = z^2 */
 980         felem_square(tmp, z_in);
 981         felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
 982
 983         /* gamma = y^2 */
 984         felem_square(tmp, y_in);
 985         felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
 986
 987         /* beta = x*gamma */
 988         felem_mul(tmp, x_in, gamma);
 989         felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
 990
 991         /* alpha = 3*(x-delta)*(x+delta) */
 992         felem_diff64(ftmp, delta);
 993         /* ftmp[i] < 2^61 */
 994         felem_sum64(ftmp2, delta);
 995         /* ftmp2[i] < 2^60 + 2^15 */
 996         felem_scalar64(ftmp2, 3);
 997         /* ftmp2[i] < 3*2^60 + 3*2^15 */
 998         felem_mul(tmp, ftmp, ftmp2);
 999         /*-
1000          * tmp[i] < 17(3*2^121 + 3*2^76)
1001          *        = 61*2^121 + 61*2^76
1002          *        < 64*2^121 + 64*2^76
1003          *        = 2^127 + 2^82
1004          *        < 2^128
1005          */
1006         felem_reduce(alpha, tmp);
1007
1008         /* x' = alpha^2 - 8*beta */
1009         felem_square(tmp, alpha);
1010         /* tmp[i] < 17*2^120
1011          *        < 2^125 */
1012         felem_assign(ftmp, beta);
1013         felem_scalar64(ftmp, 8);
1014         /* ftmp[i] < 2^62 + 2^17 */
1015         felem_diff_128_64(tmp, ftmp);
1016         /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1017         felem_reduce(x_out, tmp);
1018
1019         /* z' = (y + z)^2 - gamma - delta */
1020         felem_sum64(delta, gamma);
1021         /* delta[i] < 2^60 + 2^15 */
1022         felem_assign(ftmp, y_in);
1023         felem_sum64(ftmp, z_in);
1024         /* ftmp[i] < 2^60 + 2^15 */
1025         felem_square(tmp, ftmp);
1026         /* tmp[i] < 17(2^122)
1027          *        < 2^127 */
1028         felem_diff_128_64(tmp, delta);
1029         /* tmp[i] < 2^127 + 2^63 */
1030         felem_reduce(z_out, tmp);
1031
1032         /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1033         felem_scalar64(beta, 4);
1034         /* beta[i] < 2^61 + 2^16 */
1035         felem_diff64(beta, x_out);
1036         /* beta[i] < 2^61 + 2^60 + 2^16 */
1037         felem_mul(tmp, alpha, beta);
1038         /*-
1039          * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1040          *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1041          *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1042          *        < 2^128
1043          */
1044         felem_square(tmp2, gamma);
1045         /*-
1046          * tmp2[i] < 17*(2^59 + 2^14)^2
1047          *         = 17*(2^118 + 2^74 + 2^28)
1048          */
1049         felem_scalar128(tmp2, 8);
1050         /*-
1051          * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1052          *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1053          *         < 2^126
1054          */
1055         felem_diff128(tmp, tmp2);
1056         /*-
1057          * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1058          *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1059          *          2^74 + 2^69 + 2^34 + 2^30
1060          *        < 2^128
1061          */
1062         felem_reduce(y_out, tmp);
1063         }
1064
1065 /* copy_conditional copies in to out iff mask is all ones. */
1066 static void
1067 copy_conditional(felem out, const felem in, limb mask)
1068         {
1069         unsigned i;
1070         for (i = 0; i < NLIMBS; ++i)
1071                 {
1072                 const limb tmp = mask & (in[i] ^ out[i]);
1073                 out[i] ^= tmp;
1074                 }
1075         }
1076
1077 /*-
1078  * point_add calcuates (x1, y1, z1) + (x2, y2, z2)
1079  *
1080  * The method is taken from
1081  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1082  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1083  *
1084  * This function includes a branch for checking whether the two input points
1085  * are equal (while not equal to the point at infinity). This case never
1086  * happens during single point multiplication, so there is no timing leak for
1087  * ECDH or ECDSA signing. */
1088 static void point_add(felem x3, felem y3, felem z3,
1089         const felem x1, const felem y1, const felem z1,
1090         const int mixed, const felem x2, const felem y2, const felem z2)
1091         {
1092         felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1093         largefelem tmp, tmp2;
1094         limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1095
1096         z1_is_zero = felem_is_zero(z1);
1097         z2_is_zero = felem_is_zero(z2);
1098
1099         /* ftmp = z1z1 = z1**2 */
1100         felem_square(tmp, z1);
1101         felem_reduce(ftmp, tmp);
1102
1103         if (!mixed)
1104                 {
1105                 /* ftmp2 = z2z2 = z2**2 */
1106                 felem_square(tmp, z2);
1107                 felem_reduce(ftmp2, tmp);
1108
1109                 /* u1 = ftmp3 = x1*z2z2 */
1110                 felem_mul(tmp, x1, ftmp2);
1111                 felem_reduce(ftmp3, tmp);
1112
1113                 /* ftmp5 = z1 + z2 */
1114                 felem_assign(ftmp5, z1);
1115                 felem_sum64(ftmp5, z2);
1116                 /* ftmp5[i] < 2^61 */
1117
1118                 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1119                 felem_square(tmp, ftmp5);
1120                 /* tmp[i] < 17*2^122 */
1121                 felem_diff_128_64(tmp, ftmp);
1122                 /* tmp[i] < 17*2^122 + 2^63 */
1123                 felem_diff_128_64(tmp, ftmp2);
1124                 /* tmp[i] < 17*2^122 + 2^64 */
1125                 felem_reduce(ftmp5, tmp);
1126
1127                 /* ftmp2 = z2 * z2z2 */
1128                 felem_mul(tmp, ftmp2, z2);
1129                 felem_reduce(ftmp2, tmp);
1130
1131                 /* s1 = ftmp6 = y1 * z2**3 */
1132                 felem_mul(tmp, y1, ftmp2);
1133                 felem_reduce(ftmp6, tmp);
1134                 }
1135         else
1136                 {
1137                 /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1138
1139                 /* u1 = ftmp3 = x1*z2z2 */
1140                 felem_assign(ftmp3, x1);
1141
1142                 /* ftmp5 = 2*z1z2 */
1143                 felem_scalar(ftmp5, z1, 2);
1144
1145                 /* s1 = ftmp6 = y1 * z2**3 */
1146                 felem_assign(ftmp6, y1);
1147                 }
1148
1149         /* u2 = x2*z1z1 */
1150         felem_mul(tmp, x2, ftmp);
1151         /* tmp[i] < 17*2^120 */
1152
1153         /* h = ftmp4 = u2 - u1 */
1154         felem_diff_128_64(tmp, ftmp3);
1155         /* tmp[i] < 17*2^120 + 2^63 */
1156         felem_reduce(ftmp4, tmp);
1157
1158         x_equal = felem_is_zero(ftmp4);
1159
1160         /* z_out = ftmp5 * h */
1161         felem_mul(tmp, ftmp5, ftmp4);
1162         felem_reduce(z_out, tmp);
1163
1164         /* ftmp = z1 * z1z1 */
1165         felem_mul(tmp, ftmp, z1);
1166         felem_reduce(ftmp, tmp);
1167
1168         /* s2 = tmp = y2 * z1**3 */
1169         felem_mul(tmp, y2, ftmp);
1170         /* tmp[i] < 17*2^120 */
1171
1172         /* r = ftmp5 = (s2 - s1)*2 */
1173         felem_diff_128_64(tmp, ftmp6);
1174         /* tmp[i] < 17*2^120 + 2^63 */
1175         felem_reduce(ftmp5, tmp);
1176         y_equal = felem_is_zero(ftmp5);
1177         felem_scalar64(ftmp5, 2);
1178         /* ftmp5[i] < 2^61 */
1179
1180         if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1181                 {
1182                 point_double(x3, y3, z3, x1, y1, z1);
1183                 return;
1184                 }
1185
1186         /* I = ftmp = (2h)**2 */
1187         felem_assign(ftmp, ftmp4);
1188         felem_scalar64(ftmp, 2);
1189         /* ftmp[i] < 2^61 */
1190         felem_square(tmp, ftmp);
1191         /* tmp[i] < 17*2^122 */
1192         felem_reduce(ftmp, tmp);
1193
1194         /* J = ftmp2 = h * I */
1195         felem_mul(tmp, ftmp4, ftmp);
1196         felem_reduce(ftmp2, tmp);
1197
1198         /* V = ftmp4 = U1 * I */
1199         felem_mul(tmp, ftmp3, ftmp);
1200         felem_reduce(ftmp4, tmp);
1201
1202         /* x_out = r**2 - J - 2V */
1203         felem_square(tmp, ftmp5);
1204         /* tmp[i] < 17*2^122 */
1205         felem_diff_128_64(tmp, ftmp2);
1206         /* tmp[i] < 17*2^122 + 2^63 */
1207         felem_assign(ftmp3, ftmp4);
1208         felem_scalar64(ftmp4, 2);
1209         /* ftmp4[i] < 2^61 */
1210         felem_diff_128_64(tmp, ftmp4);
1211         /* tmp[i] < 17*2^122 + 2^64 */
1212         felem_reduce(x_out, tmp);
1213
1214         /* y_out = r(V-x_out) - 2 * s1 * J */
1215         felem_diff64(ftmp3, x_out);
1216         /* ftmp3[i] < 2^60 + 2^60
1217          *          = 2^61 */
1218         felem_mul(tmp, ftmp5, ftmp3);
1219         /* tmp[i] < 17*2^122 */
1220         felem_mul(tmp2, ftmp6, ftmp2);
1221         /* tmp2[i] < 17*2^120 */
1222         felem_scalar128(tmp2, 2);
1223         /* tmp2[i] < 17*2^121 */
1224         felem_diff128(tmp, tmp2);
1225         /*-
1226          * tmp[i] < 2^127 - 2^69 + 17*2^122
1227          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1228          *        < 2^127
1229          */
1230         felem_reduce(y_out, tmp);
1231
1232         copy_conditional(x_out, x2, z1_is_zero);
1233         copy_conditional(x_out, x1, z2_is_zero);
1234         copy_conditional(y_out, y2, z1_is_zero);
1235         copy_conditional(y_out, y1, z2_is_zero);
1236         copy_conditional(z_out, z2, z1_is_zero);
1237         copy_conditional(z_out, z1, z2_is_zero);
1238         felem_assign(x3, x_out);
1239         felem_assign(y3, y_out);
1240         felem_assign(z3, z_out);
1241         }
1242
1243 /*-
1244  * Base point pre computation
1245  * --------------------------
1246  *
1247  * Two different sorts of precomputed tables are used in the following code.
1248  * Each contain various points on the curve, where each point is three field
1249  * elements (x, y, z).
1250  *
1251  * For the base point table, z is usually 1 (0 for the point at infinity).
1252  * This table has 16 elements:
1253  * index | bits    | point
1254  * ------+---------+------------------------------
1255  *     0 | 0 0 0 0 | 0G
1256  *     1 | 0 0 0 1 | 1G
1257  *     2 | 0 0 1 0 | 2^130G
1258  *     3 | 0 0 1 1 | (2^130 + 1)G
1259  *     4 | 0 1 0 0 | 2^260G
1260  *     5 | 0 1 0 1 | (2^260 + 1)G
1261  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1262  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1263  *     8 | 1 0 0 0 | 2^390G
1264  *     9 | 1 0 0 1 | (2^390 + 1)G
1265  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1266  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1267  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1268  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1269  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1270  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1271  *
1272  * The reason for this is so that we can clock bits into four different
1273  * locations when doing simple scalar multiplies against the base point.
1274  *
1275  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1276
1277 /* gmul is the table of precomputed base points */
1278 static const felem gmul[16][3] =
1279         {{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1280           {0, 0, 0, 0, 0, 0, 0, 0, 0},
1281           {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1282          {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1283            0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1284            0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1285           {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1286            0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1287            0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1288           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1289          {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1290            0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1291            0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1292           {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1293            0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1294            0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1295           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1296          {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1297            0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1298            0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1299           {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1300            0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1301            0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1302           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1303          {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1304            0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1305            0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1306           {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1307            0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1308            0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1309           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1310          {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1311            0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1312            0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1313           {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1314            0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1315            0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1316           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1317          {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1318            0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1319            0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1320           {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1321            0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1322            0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1323           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1324          {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1325            0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1326            0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1327           {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1328            0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1329            0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1330           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1331          {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1332            0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1333            0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1334           {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1335            0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1336            0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1337           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1338          {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1339            0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1340            0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1341           {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1342            0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1343            0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1344           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1345          {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1346            0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1347            0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1348           {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1349            0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1350            0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1351           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1352          {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1353            0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1354            0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1355           {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1356            0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1357            0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1358           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1359          {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1360            0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1361            0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1362           {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1363            0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1364            0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1365           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1366          {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1367            0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1368            0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1369           {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1370            0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1371            0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1372           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1373          {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1374            0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1375            0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1376           {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1377            0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1378            0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1379           {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1380          {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1381            0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1382            0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1383           {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1384            0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1385            0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1386          {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1387
1388 /* select_point selects the |idx|th point from a precomputation table and
1389  * copies it to out. */
1390  /* pre_comp below is of the size provided in |size| */
1391 static void select_point(const limb idx, unsigned int size, const felem pre_comp[][3],
1392                          felem out[3])
1393         {
1394         unsigned i, j;
1395         limb *outlimbs = &out[0][0];
1396         memset(outlimbs, 0, 3 * sizeof(felem));
1397
1398         for (i = 0; i < size; i++)
1399                 {
1400                 const limb *inlimbs = &pre_comp[i][0][0];
1401                 limb mask = i ^ idx;
1402                 mask |= mask >> 4;
1403                 mask |= mask >> 2;
1404                 mask |= mask >> 1;
1405                 mask &= 1;
1406                 mask--;
1407                 for (j = 0; j < NLIMBS * 3; j++)
1408                         outlimbs[j] |= inlimbs[j] & mask;
1409                 }
1410         }
1411
1412 /* get_bit returns the |i|th bit in |in| */
1413 static char get_bit(const felem_bytearray in, int i)
1414         {
1415         if (i < 0)
1416                 return 0;
1417         return (in[i >> 3] >> (i & 7)) & 1;
1418         }
1419
1420 /* Interleaved point multiplication using precomputed point multiples:
1421  * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1422  * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1423  * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1424  * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
1425 static void batch_mul(felem x_out, felem y_out, felem z_out,
1426         const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1427         const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1428         {
1429         int i, skip;
1430         unsigned num, gen_mul = (g_scalar != NULL);
1431         felem nq[3], tmp[4];
1432         limb bits;
1433         u8 sign, digit;
1434
1435         /* set nq to the point at infinity */
1436         memset(nq, 0, 3 * sizeof(felem));
1437
1438         /* Loop over all scalars msb-to-lsb, interleaving additions
1439          * of multiples of the generator (last quarter of rounds)
1440          * and additions of other points multiples (every 5th round).
1441          */
1442         skip = 1; /* save two point operations in the first round */
1443         for (i = (num_points ? 520 : 130); i >= 0; --i)
1444                 {
1445                 /* double */
1446                 if (!skip)
1447                         point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1448
1449                 /* add multiples of the generator */
1450                 if (gen_mul && (i <= 130))
1451                         {
1452                         bits = get_bit(g_scalar, i + 390) << 3;
1453                         if (i < 130)
1454                                 {
1455                                 bits |= get_bit(g_scalar, i + 260) << 2;
1456                                 bits |= get_bit(g_scalar, i + 130) << 1;
1457                                 bits |= get_bit(g_scalar, i);
1458                                 }
1459                         /* select the point to add, in constant time */
1460                         select_point(bits, 16, g_pre_comp, tmp);
1461                         if (!skip)
1462                                 {
1463                                 /* The 1 argument below is for "mixed" */
1464                                 point_add(nq[0], nq[1], nq[2],
1465                                         nq[0], nq[1], nq[2],
1466                                         1, tmp[0], tmp[1], tmp[2]);
1467                                 }
1468                         else
1469                                 {
1470                                 memcpy(nq, tmp, 3 * sizeof(felem));
1471                                 skip = 0;
1472                                 }
1473                         }
1474
1475                 /* do other additions every 5 doublings */
1476                 if (num_points && (i % 5 == 0))
1477                         {
1478                         /* loop over all scalars */
1479                         for (num = 0; num < num_points; ++num)
1480                                 {
1481                                 bits = get_bit(scalars[num], i + 4) << 5;
1482                                 bits |= get_bit(scalars[num], i + 3) << 4;
1483                                 bits |= get_bit(scalars[num], i + 2) << 3;
1484                                 bits |= get_bit(scalars[num], i + 1) << 2;
1485                                 bits |= get_bit(scalars[num], i) << 1;
1486                                 bits |= get_bit(scalars[num], i - 1);
1487                                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1488
1489                                 /* select the point to add or subtract, in constant time */
1490                                 select_point(digit, 17, pre_comp[num], tmp);
1491                                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1492                                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1493
1494                                 if (!skip)
1495                                         {
1496                                         point_add(nq[0], nq[1], nq[2],
1497                                                 nq[0], nq[1], nq[2],
1498                                                 mixed, tmp[0], tmp[1], tmp[2]);
1499                                         }
1500                                 else
1501                                         {
1502                                         memcpy(nq, tmp, 3 * sizeof(felem));
1503                                         skip = 0;
1504                                         }
1505                                 }
1506                         }
1507                 }
1508         felem_assign(x_out, nq[0]);
1509         felem_assign(y_out, nq[1]);
1510         felem_assign(z_out, nq[2]);
1511         }
1512
1513
1514 /* Precomputation for the group generator. */
1515 typedef struct {
1516         felem g_pre_comp[16][3];
1517         int references;
1518 } NISTP521_PRE_COMP;
1519
1520 const EC_METHOD *EC_GFp_nistp521_method(void)
1521         {
1522         static const EC_METHOD ret = {
1523                 EC_FLAGS_DEFAULT_OCT,
1524                 NID_X9_62_prime_field,
1525                 ec_GFp_nistp521_group_init,
1526                 ec_GFp_simple_group_finish,
1527                 ec_GFp_simple_group_clear_finish,
1528                 ec_GFp_nist_group_copy,
1529                 ec_GFp_nistp521_group_set_curve,
1530                 ec_GFp_simple_group_get_curve,
1531                 ec_GFp_simple_group_get_degree,
1532                 ec_GFp_simple_group_check_discriminant,
1533                 ec_GFp_simple_point_init,
1534                 ec_GFp_simple_point_finish,
1535                 ec_GFp_simple_point_clear_finish,
1536                 ec_GFp_simple_point_copy,
1537                 ec_GFp_simple_point_set_to_infinity,
1538                 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1539                 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1540                 ec_GFp_simple_point_set_affine_coordinates,
1541                 ec_GFp_nistp521_point_get_affine_coordinates,
1542                 0 /* point_set_compressed_coordinates */,
1543                 0 /* point2oct */,
1544                 0 /* oct2point */,
1545                 ec_GFp_simple_add,
1546                 ec_GFp_simple_dbl,
1547                 ec_GFp_simple_invert,
1548                 ec_GFp_simple_is_at_infinity,
1549                 ec_GFp_simple_is_on_curve,
1550                 ec_GFp_simple_cmp,
1551                 ec_GFp_simple_make_affine,
1552                 ec_GFp_simple_points_make_affine,
1553                 ec_GFp_nistp521_points_mul,
1554                 ec_GFp_nistp521_precompute_mult,
1555                 ec_GFp_nistp521_have_precompute_mult,
1556                 ec_GFp_nist_field_mul,
1557                 ec_GFp_nist_field_sqr,
1558                 0 /* field_div */,
1559                 0 /* field_encode */,
1560                 0 /* field_decode */,
1561                 0 /* field_set_to_one */ };
1562
1563         return &ret;
1564         }
1565
1566
1567 /******************************************************************************/
1568 /*                     FUNCTIONS TO MANAGE PRECOMPUTATION
1569  */
1570
1571 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1572         {
1573         NISTP521_PRE_COMP *ret = NULL;
1574         ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1575         if (!ret)
1576                 {
1577                 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1578                 return ret;
1579                 }
1580         memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1581         ret->references = 1;
1582         return ret;
1583         }
1584
1585 static void *nistp521_pre_comp_dup(void *src_)
1586         {
1587         NISTP521_PRE_COMP *src = src_;
1588
1589         /* no need to actually copy, these objects never change! */
1590         CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1591
1592         return src_;
1593         }
1594
1595 static void nistp521_pre_comp_free(void *pre_)
1596         {
1597         int i;
1598         NISTP521_PRE_COMP *pre = pre_;
1599
1600         if (!pre)
1601                 return;
1602
1603         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1604         if (i > 0)
1605                 return;
1606
1607         OPENSSL_free(pre);
1608         }
1609
1610 static void nistp521_pre_comp_clear_free(void *pre_)
1611         {
1612         int i;
1613         NISTP521_PRE_COMP *pre = pre_;
1614
1615         if (!pre)
1616                 return;
1617
1618         i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1619         if (i > 0)
1620                 return;
1621
1622         OPENSSL_cleanse(pre, sizeof(*pre));
1623         OPENSSL_free(pre);
1624         }
1625
1626 /******************************************************************************/
1627 /*                         OPENSSL EC_METHOD FUNCTIONS
1628  */
1629
1630 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1631         {
1632         int ret;
1633         ret = ec_GFp_simple_group_init(group);
1634         group->a_is_minus3 = 1;
1635         return ret;
1636         }
1637
1638 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1639         const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1640         {
1641         int ret = 0;
1642         BN_CTX *new_ctx = NULL;
1643         BIGNUM *curve_p, *curve_a, *curve_b;
1644
1645         if (ctx == NULL)
1646                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1647         BN_CTX_start(ctx);
1648         if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1649                 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1650                 ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1651         BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1652         BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1653         BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1654         if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1655                 (BN_cmp(curve_b, b)))
1656                 {
1657                 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1658                         EC_R_WRONG_CURVE_PARAMETERS);
1659                 goto err;
1660                 }
1661         group->field_mod_func = BN_nist_mod_521;
1662         ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1663 err:
1664         BN_CTX_end(ctx);
1665         if (new_ctx != NULL)
1666                 BN_CTX_free(new_ctx);
1667         return ret;
1668         }
1669
1670 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1671  * (X', Y') = (X/Z^2, Y/Z^3) */
1672 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1673         const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1674         {
1675         felem z1, z2, x_in, y_in, x_out, y_out;
1676         largefelem tmp;
1677
1678         if (EC_POINT_is_at_infinity(group, point))
1679                 {
1680                 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1681                         EC_R_POINT_AT_INFINITY);
1682                 return 0;
1683                 }
1684         if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1685                 (!BN_to_felem(z1, &point->Z))) return 0;
1686         felem_inv(z2, z1);
1687         felem_square(tmp, z2); felem_reduce(z1, tmp);
1688         felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1689         felem_contract(x_out, x_in);
1690         if (x != NULL)
1691                 {
1692                 if (!felem_to_BN(x, x_out))
1693                         {
1694                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1695                         return 0;
1696                         }
1697                 }
1698         felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1699         felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1700         felem_contract(y_out, y_in);
1701         if (y != NULL)
1702                 {
1703                 if (!felem_to_BN(y, y_out))
1704                         {
1705                         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1706                         return 0;
1707                         }
1708                 }
1709         return 1;
1710         }
1711
1712 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1713 static void make_points_affine(size_t num, felem points[][3], felem tmp_felems[])
1714         {
1715         /* Runs in constant time, unless an input is the point at infinity
1716          * (which normally shouldn't happen). */
1717         ec_GFp_nistp_points_make_affine_internal(
1718                 num,
1719                 points,
1720                 sizeof(felem),
1721                 tmp_felems,
1722                 (void (*)(void *)) felem_one,
1723                 (int (*)(const void *)) felem_is_zero_int,
1724                 (void (*)(void *, const void *)) felem_assign,
1725                 (void (*)(void *, const void *)) felem_square_reduce,
1726                 (void (*)(void *, const void *, const void *)) felem_mul_reduce,
1727                 (void (*)(void *, const void *)) felem_inv,
1728                 (void (*)(void *, const void *)) felem_contract);
1729         }
1730
1731 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
1732  * Result is stored in r (r can equal one of the inputs). */
1733 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1734         const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1735         const BIGNUM *scalars[], BN_CTX *ctx)
1736         {
1737         int ret = 0;
1738         int j;
1739         int mixed = 0;
1740         BN_CTX *new_ctx = NULL;
1741         BIGNUM *x, *y, *z, *tmp_scalar;
1742         felem_bytearray g_secret;
1743         felem_bytearray *secrets = NULL;
1744         felem (*pre_comp)[17][3] = NULL;
1745         felem *tmp_felems = NULL;
1746         felem_bytearray tmp;
1747         unsigned i, num_bytes;
1748         int have_pre_comp = 0;
1749         size_t num_points = num;
1750         felem x_in, y_in, z_in, x_out, y_out, z_out;
1751         NISTP521_PRE_COMP *pre = NULL;
1752         felem (*g_pre_comp)[3] = NULL;
1753         EC_POINT *generator = NULL;
1754         const EC_POINT *p = NULL;
1755         const BIGNUM *p_scalar = NULL;
1756
1757         if (ctx == NULL)
1758                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1759         BN_CTX_start(ctx);
1760         if (((x = BN_CTX_get(ctx)) == NULL) ||
1761                 ((y = BN_CTX_get(ctx)) == NULL) ||
1762                 ((z = BN_CTX_get(ctx)) == NULL) ||
1763                 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1764                 goto err;
1765
1766         if (scalar != NULL)
1767                 {
1768                 pre = EC_EX_DATA_get_data(group->extra_data,
1769                         nistp521_pre_comp_dup, nistp521_pre_comp_free,
1770                         nistp521_pre_comp_clear_free);
1771                 if (pre)
1772                         /* we have precomputation, try to use it */
1773                         g_pre_comp = &pre->g_pre_comp[0];
1774                 else
1775                         /* try to use the standard precomputation */
1776                         g_pre_comp = (felem (*)[3]) gmul;
1777                 generator = EC_POINT_new(group);
1778                 if (generator == NULL)
1779                         goto err;
1780                 /* get the generator from precomputation */
1781                 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1782                         !felem_to_BN(y, g_pre_comp[1][1]) ||
1783                         !felem_to_BN(z, g_pre_comp[1][2]))
1784                         {
1785                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1786                         goto err;
1787                         }
1788                 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1789                                 generator, x, y, z, ctx))
1790                         goto err;
1791                 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1792                         /* precomputation matches generator */
1793                         have_pre_comp = 1;
1794                 else
1795                         /* we don't have valid precomputation:
1796                          * treat the generator as a random point */
1797                         num_points++;
1798                 }
1799
1800         if (num_points > 0)
1801                 {
1802                 if (num_points >= 2)
1803                         {
1804                         /* unless we precompute multiples for just one point,
1805                          * converting those into affine form is time well spent  */
1806                         mixed = 1;
1807                         }
1808                 secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1809                 pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1810                 if (mixed)
1811                         tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1812                 if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1813                         {
1814                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1815                         goto err;
1816                         }
1817
1818                 /* we treat NULL scalars as 0, and NULL points as points at infinity,
1819                  * i.e., they contribute nothing to the linear combination */
1820                 memset(secrets, 0, num_points * sizeof(felem_bytearray));
1821                 memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1822                 for (i = 0; i < num_points; ++i)
1823                         {
1824                         if (i == num)
1825                                 /* we didn't have a valid precomputation, so we pick
1826                                  * the generator */
1827                                 {
1828                                 p = EC_GROUP_get0_generator(group);
1829                                 p_scalar = scalar;
1830                                 }
1831                         else
1832                                 /* the i^th point */
1833                                 {
1834                                 p = points[i];
1835                                 p_scalar = scalars[i];
1836                                 }
1837                         if ((p_scalar != NULL) && (p != NULL))
1838                                 {
1839                                 /* reduce scalar to 0 <= scalar < 2^521 */
1840                                 if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1841                                         {
1842                                         /* this is an unusual input, and we don't guarantee
1843                                          * constant-timeness */
1844                                         if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1845                                                 {
1846                                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1847                                                 goto err;
1848                                                 }
1849                                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1850                                         }
1851                                 else
1852                                         num_bytes = BN_bn2bin(p_scalar, tmp);
1853                                 flip_endian(secrets[i], tmp, num_bytes);
1854                                 /* precompute multiples */
1855                                 if ((!BN_to_felem(x_out, &p->X)) ||
1856                                         (!BN_to_felem(y_out, &p->Y)) ||
1857                                         (!BN_to_felem(z_out, &p->Z))) goto err;
1858                                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1859                                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1860                                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1861                                 for (j = 2; j <= 16; ++j)
1862                                         {
1863                                         if (j & 1)
1864                                                 {
1865                                                 point_add(
1866                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1867                                                         pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1868                                                         0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1869                                                 }
1870                                         else
1871                                                 {
1872                                                 point_double(
1873                                                         pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1874                                                         pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1875                                                 }
1876                                         }
1877                                 }
1878                         }
1879                 if (mixed)
1880                         make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1881                 }
1882
1883         /* the scalar for the generator */
1884         if ((scalar != NULL) && (have_pre_comp))
1885                 {
1886                 memset(g_secret, 0, sizeof(g_secret));
1887                 /* reduce scalar to 0 <= scalar < 2^521 */
1888                 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1889                         {
1890                         /* this is an unusual input, and we don't guarantee
1891                          * constant-timeness */
1892                         if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1893                                 {
1894                                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1895                                 goto err;
1896                                 }
1897                         num_bytes = BN_bn2bin(tmp_scalar, tmp);
1898                         }
1899                 else
1900                         num_bytes = BN_bn2bin(scalar, tmp);
1901                 flip_endian(g_secret, tmp, num_bytes);
1902                 /* do the multiplication with generator precomputation*/
1903                 batch_mul(x_out, y_out, z_out,
1904                         (const felem_bytearray (*)) secrets, num_points,
1905                         g_secret,
1906                         mixed, (const felem (*)[17][3]) pre_comp,
1907                         (const felem (*)[3]) g_pre_comp);
1908                 }
1909         else
1910                 /* do the multiplication without generator precomputation */
1911                 batch_mul(x_out, y_out, z_out,
1912                         (const felem_bytearray (*)) secrets, num_points,
1913                         NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1914         /* reduce the output to its unique minimal representation */
1915         felem_contract(x_in, x_out);
1916         felem_contract(y_in, y_out);
1917         felem_contract(z_in, z_out);
1918         if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1919                 (!felem_to_BN(z, z_in)))
1920                 {
1921                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1922                 goto err;
1923                 }
1924         ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1925
1926 err:
1927         BN_CTX_end(ctx);
1928         if (generator != NULL)
1929                 EC_POINT_free(generator);
1930         if (new_ctx != NULL)
1931                 BN_CTX_free(new_ctx);
1932         if (secrets != NULL)
1933                 OPENSSL_free(secrets);
1934         if (pre_comp != NULL)
1935                 OPENSSL_free(pre_comp);
1936         if (tmp_felems != NULL)
1937                 OPENSSL_free(tmp_felems);
1938         return ret;
1939         }
1940
1941 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1942         {
1943         int ret = 0;
1944         NISTP521_PRE_COMP *pre = NULL;
1945         int i, j;
1946         BN_CTX *new_ctx = NULL;
1947         BIGNUM *x, *y;
1948         EC_POINT *generator = NULL;
1949         felem tmp_felems[16];
1950
1951         /* throw away old precomputation */
1952         EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1953                 nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1954         if (ctx == NULL)
1955                 if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1956         BN_CTX_start(ctx);
1957         if (((x = BN_CTX_get(ctx)) == NULL) ||
1958                 ((y = BN_CTX_get(ctx)) == NULL))
1959                 goto err;
1960         /* get the generator */
1961         if (group->generator == NULL) goto err;
1962         generator = EC_POINT_new(group);
1963         if (generator == NULL)
1964                 goto err;
1965         BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
1966         BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
1967         if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1968                 goto err;
1969         if ((pre = nistp521_pre_comp_new()) == NULL)
1970                 goto err;
1971         /* if the generator is the standard one, use built-in precomputation */
1972         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1973                 {
1974                 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1975                 ret = 1;
1976                 goto err;
1977                 }
1978         if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1979                 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1980                 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1981                 goto err;
1982         /* compute 2^130*G, 2^260*G, 2^390*G */
1983         for (i = 1; i <= 4; i <<= 1)
1984                 {
1985                 point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1986                         pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1987                         pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1988                 for (j = 0; j < 129; ++j)
1989                         {
1990                         point_double(pre->g_pre_comp[2*i][0],
1991                                 pre->g_pre_comp[2*i][1],
1992                                 pre->g_pre_comp[2*i][2],
1993                                 pre->g_pre_comp[2*i][0],
1994                                 pre->g_pre_comp[2*i][1],
1995                                 pre->g_pre_comp[2*i][2]);
1996                         }
1997                 }
1998         /* g_pre_comp[0] is the point at infinity */
1999         memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2000         /* the remaining multiples */
2001         /* 2^130*G + 2^260*G */
2002         point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2003                 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2004                 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2005                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2006                 pre->g_pre_comp[2][2]);
2007         /* 2^130*G + 2^390*G */
2008         point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2009                 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2010                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2011                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2012                 pre->g_pre_comp[2][2]);
2013         /* 2^260*G + 2^390*G */
2014         point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2015                 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2016                 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2017                 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2018                 pre->g_pre_comp[4][2]);
2019         /* 2^130*G + 2^260*G + 2^390*G */
2020         point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2021                 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2022                 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2023                 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2024                 pre->g_pre_comp[2][2]);
2025         for (i = 1; i < 8; ++i)
2026                 {
2027                 /* odd multiples: add G */
2028                 point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
2029                         pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
2030                         pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
2031                         0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2032                         pre->g_pre_comp[1][2]);
2033                 }
2034         make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2035
2036         if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2037                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
2038                 goto err;
2039         ret = 1;
2040         pre = NULL;
2041  err:
2042         BN_CTX_end(ctx);
2043         if (generator != NULL)
2044                 EC_POINT_free(generator);
2045         if (new_ctx != NULL)
2046                 BN_CTX_free(new_ctx);
2047         if (pre)
2048                 nistp521_pre_comp_free(pre);
2049         return ret;
2050         }
2051
2052 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2053         {
2054         if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2055                         nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2056                 != NULL)
2057                 return 1;
2058         else
2059                 return 0;
2060         }
2061
2062 #else
2063 static void *dummy=&dummy;
2064 #endif