1 /* crypto/ec/ecp_nistp521.c */
3 * Written by Adam Langley (Google) for the OpenSSL project
5 /* Copyright 2011 Google Inc.
7 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
34 #include <openssl/err.h>
37 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
38 /* even with gcc, the typedef won't work for 32-bit platforms */
39 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
41 #error "Need GCC 3.1 or later to define type uint128_t"
48 /* The underlying field.
50 * P521 operates over GF(2^521-1). We can serialise an element of this field
51 * into 66 bytes where the most significant byte contains only a single bit. We
52 * call this an felem_bytearray. */
54 typedef u8 felem_bytearray[66];
56 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
57 * These values are big-endian. */
58 static const felem_bytearray nistp521_curve_params[5] =
60 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
61 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
62 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
63 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
65 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
70 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
79 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
80 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
81 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
82 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
83 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
84 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
85 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
87 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
88 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
89 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
90 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
91 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
92 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
93 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
94 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
96 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
97 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
98 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
99 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
100 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
101 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
102 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
103 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
107 /* The representation of field elements.
108 * ------------------------------------
110 * We represent field elements with nine values. These values are either 64 or
111 * 128 bits and the field element represented is:
112 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
113 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
114 * 58 bits apart, but are greater than 58 bits in length, the most significant
115 * bits of each limb overlap with the least significant bits of the next.
117 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
122 typedef uint64_t limb;
123 typedef limb felem[NLIMBS];
124 typedef uint128_t largefelem[NLIMBS];
126 static const limb bottom57bits = 0x1ffffffffffffff;
127 static const limb bottom58bits = 0x3ffffffffffffff;
129 /* bin66_to_felem takes a little-endian byte array and converts it into felem
130 * form. This assumes that the CPU is little-endian. */
131 static void bin66_to_felem(felem out, const u8 in[66])
133 out[0] = (*((limb*) &in[0])) & bottom58bits;
134 out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
135 out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
136 out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
137 out[4] = (*((limb*) &in[29])) & bottom58bits;
138 out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
139 out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
140 out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
141 out[8] = (*((limb*) &in[58])) & bottom57bits;
144 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
145 * array. This assumes that the CPU is little-endian. */
146 static void felem_to_bin66(u8 out[66], const felem in)
149 (*((limb*) &out[0])) = in[0];
150 (*((limb*) &out[7])) |= in[1] << 2;
151 (*((limb*) &out[14])) |= in[2] << 4;
152 (*((limb*) &out[21])) |= in[3] << 6;
153 (*((limb*) &out[29])) = in[4];
154 (*((limb*) &out[36])) |= in[5] << 2;
155 (*((limb*) &out[43])) |= in[6] << 4;
156 (*((limb*) &out[50])) |= in[7] << 6;
157 (*((limb*) &out[58])) = in[8];
160 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
161 static void flip_endian(u8 *out, const u8 *in, unsigned len)
164 for (i = 0; i < len; ++i)
165 out[i] = in[len-1-i];
168 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
169 static int BN_to_felem(felem out, const BIGNUM *bn)
171 felem_bytearray b_in;
172 felem_bytearray b_out;
175 /* BN_bn2bin eats leading zeroes */
176 memset(b_out, 0, sizeof b_out);
177 num_bytes = BN_num_bytes(bn);
178 if (num_bytes > sizeof b_out)
180 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
183 if (BN_is_negative(bn))
185 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
188 num_bytes = BN_bn2bin(bn, b_in);
189 flip_endian(b_out, b_in, num_bytes);
190 bin66_to_felem(out, b_out);
194 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
195 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
197 felem_bytearray b_in, b_out;
198 felem_to_bin66(b_in, in);
199 flip_endian(b_out, b_in, sizeof b_out);
200 return BN_bin2bn(b_out, sizeof b_out, out);
205 * ---------------- */
207 static void felem_one(felem out)
220 static void felem_assign(felem out, const felem in)
233 /* felem_sum64 sets out = out + in. */
234 static void felem_sum64(felem out, const felem in)
247 /* felem_scalar sets out = in * scalar */
248 static void felem_scalar(felem out, const felem in, limb scalar)
250 out[0] = in[0] * scalar;
251 out[1] = in[1] * scalar;
252 out[2] = in[2] * scalar;
253 out[3] = in[3] * scalar;
254 out[4] = in[4] * scalar;
255 out[5] = in[5] * scalar;
256 out[6] = in[6] * scalar;
257 out[7] = in[7] * scalar;
258 out[8] = in[8] * scalar;
261 /* felem_scalar64 sets out = out * scalar */
262 static void felem_scalar64(felem out, limb scalar)
275 /* felem_scalar128 sets out = out * scalar */
276 static void felem_scalar128(largefelem out, limb scalar)
289 /* felem_neg sets |out| to |-in|
291 * in[i] < 2^59 + 2^14
295 static void felem_neg(felem out, const felem in)
297 /* In order to prevent underflow, we subtract from 0 mod p. */
298 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
299 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
301 out[0] = two62m3 - in[0];
302 out[1] = two62m2 - in[1];
303 out[2] = two62m2 - in[2];
304 out[3] = two62m2 - in[3];
305 out[4] = two62m2 - in[4];
306 out[5] = two62m2 - in[5];
307 out[6] = two62m2 - in[6];
308 out[7] = two62m2 - in[7];
309 out[8] = two62m2 - in[8];
312 /* felem_diff64 subtracts |in| from |out|
314 * in[i] < 2^59 + 2^14
316 * out[i] < out[i] + 2^62
318 static void felem_diff64(felem out, const felem in)
320 /* In order to prevent underflow, we add 0 mod p before subtracting. */
321 static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
322 static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
324 out[0] += two62m3 - in[0];
325 out[1] += two62m2 - in[1];
326 out[2] += two62m2 - in[2];
327 out[3] += two62m2 - in[3];
328 out[4] += two62m2 - in[4];
329 out[5] += two62m2 - in[5];
330 out[6] += two62m2 - in[6];
331 out[7] += two62m2 - in[7];
332 out[8] += two62m2 - in[8];
335 /* felem_diff_128_64 subtracts |in| from |out|
337 * in[i] < 2^62 + 2^17
339 * out[i] < out[i] + 2^63
341 static void felem_diff_128_64(largefelem out, const felem in)
343 /* In order to prevent underflow, we add 0 mod p before subtracting. */
344 static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
345 static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
347 out[0] += two63m6 - in[0];
348 out[1] += two63m5 - in[1];
349 out[2] += two63m5 - in[2];
350 out[3] += two63m5 - in[3];
351 out[4] += two63m5 - in[4];
352 out[5] += two63m5 - in[5];
353 out[6] += two63m5 - in[6];
354 out[7] += two63m5 - in[7];
355 out[8] += two63m5 - in[8];
358 /* felem_diff_128_64 subtracts |in| from |out|
362 * out[i] < out[i] + 2^127 - 2^69
364 static void felem_diff128(largefelem out, const largefelem in)
366 /* In order to prevent underflow, we add 0 mod p before subtracting. */
367 static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
368 static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
370 out[0] += (two127m70 - in[0]);
371 out[1] += (two127m69 - in[1]);
372 out[2] += (two127m69 - in[2]);
373 out[3] += (two127m69 - in[3]);
374 out[4] += (two127m69 - in[4]);
375 out[5] += (two127m69 - in[5]);
376 out[6] += (two127m69 - in[6]);
377 out[7] += (two127m69 - in[7]);
378 out[8] += (two127m69 - in[8]);
381 /* felem_square sets |out| = |in|^2
385 * out[i] < 17 * max(in[i]) * max(in[i])
387 static void felem_square(largefelem out, const felem in)
390 felem_scalar(inx2, in, 2);
391 felem_scalar(inx4, in, 4);
393 /* We have many cases were we want to do
396 * This is obviously just
398 * However, rather than do the doubling on the 128 bit result, we
399 * double one of the inputs to the multiplication by reading from
402 out[0] = ((uint128_t) in[0]) * in[0];
403 out[1] = ((uint128_t) in[0]) * inx2[1];
404 out[2] = ((uint128_t) in[0]) * inx2[2] +
405 ((uint128_t) in[1]) * in[1];
406 out[3] = ((uint128_t) in[0]) * inx2[3] +
407 ((uint128_t) in[1]) * inx2[2];
408 out[4] = ((uint128_t) in[0]) * inx2[4] +
409 ((uint128_t) in[1]) * inx2[3] +
410 ((uint128_t) in[2]) * in[2];
411 out[5] = ((uint128_t) in[0]) * inx2[5] +
412 ((uint128_t) in[1]) * inx2[4] +
413 ((uint128_t) in[2]) * inx2[3];
414 out[6] = ((uint128_t) in[0]) * inx2[6] +
415 ((uint128_t) in[1]) * inx2[5] +
416 ((uint128_t) in[2]) * inx2[4] +
417 ((uint128_t) in[3]) * in[3];
418 out[7] = ((uint128_t) in[0]) * inx2[7] +
419 ((uint128_t) in[1]) * inx2[6] +
420 ((uint128_t) in[2]) * inx2[5] +
421 ((uint128_t) in[3]) * inx2[4];
422 out[8] = ((uint128_t) in[0]) * inx2[8] +
423 ((uint128_t) in[1]) * inx2[7] +
424 ((uint128_t) in[2]) * inx2[6] +
425 ((uint128_t) in[3]) * inx2[5] +
426 ((uint128_t) in[4]) * in[4];
428 /* The remaining limbs fall above 2^521, with the first falling at
429 * 2^522. They correspond to locations one bit up from the limbs
430 * produced above so we would have to multiply by two to align them.
431 * Again, rather than operate on the 128-bit result, we double one of
432 * the inputs to the multiplication. If we want to double for both this
433 * reason, and the reason above, then we end up multiplying by four. */
436 out[0] += ((uint128_t) in[1]) * inx4[8] +
437 ((uint128_t) in[2]) * inx4[7] +
438 ((uint128_t) in[3]) * inx4[6] +
439 ((uint128_t) in[4]) * inx4[5];
442 out[1] += ((uint128_t) in[2]) * inx4[8] +
443 ((uint128_t) in[3]) * inx4[7] +
444 ((uint128_t) in[4]) * inx4[6] +
445 ((uint128_t) in[5]) * inx2[5];
448 out[2] += ((uint128_t) in[3]) * inx4[8] +
449 ((uint128_t) in[4]) * inx4[7] +
450 ((uint128_t) in[5]) * inx4[6];
453 out[3] += ((uint128_t) in[4]) * inx4[8] +
454 ((uint128_t) in[5]) * inx4[7] +
455 ((uint128_t) in[6]) * inx2[6];
458 out[4] += ((uint128_t) in[5]) * inx4[8] +
459 ((uint128_t) in[6]) * inx4[7];
462 out[5] += ((uint128_t) in[6]) * inx4[8] +
463 ((uint128_t) in[7]) * inx2[7];
466 out[6] += ((uint128_t) in[7]) * inx4[8];
469 out[7] += ((uint128_t) in[8]) * inx2[8];
472 /* felem_mul sets |out| = |in1| * |in2|
477 * out[i] < 17 * max(in1[i]) * max(in2[i])
479 static void felem_mul(largefelem out, const felem in1, const felem in2)
482 felem_scalar(in2x2, in2, 2);
484 out[0] = ((uint128_t) in1[0]) * in2[0];
486 out[1] = ((uint128_t) in1[0]) * in2[1] +
487 ((uint128_t) in1[1]) * in2[0];
489 out[2] = ((uint128_t) in1[0]) * in2[2] +
490 ((uint128_t) in1[1]) * in2[1] +
491 ((uint128_t) in1[2]) * in2[0];
493 out[3] = ((uint128_t) in1[0]) * in2[3] +
494 ((uint128_t) in1[1]) * in2[2] +
495 ((uint128_t) in1[2]) * in2[1] +
496 ((uint128_t) in1[3]) * in2[0];
498 out[4] = ((uint128_t) in1[0]) * in2[4] +
499 ((uint128_t) in1[1]) * in2[3] +
500 ((uint128_t) in1[2]) * in2[2] +
501 ((uint128_t) in1[3]) * in2[1] +
502 ((uint128_t) in1[4]) * in2[0];
504 out[5] = ((uint128_t) in1[0]) * in2[5] +
505 ((uint128_t) in1[1]) * in2[4] +
506 ((uint128_t) in1[2]) * in2[3] +
507 ((uint128_t) in1[3]) * in2[2] +
508 ((uint128_t) in1[4]) * in2[1] +
509 ((uint128_t) in1[5]) * in2[0];
511 out[6] = ((uint128_t) in1[0]) * in2[6] +
512 ((uint128_t) in1[1]) * in2[5] +
513 ((uint128_t) in1[2]) * in2[4] +
514 ((uint128_t) in1[3]) * in2[3] +
515 ((uint128_t) in1[4]) * in2[2] +
516 ((uint128_t) in1[5]) * in2[1] +
517 ((uint128_t) in1[6]) * in2[0];
519 out[7] = ((uint128_t) in1[0]) * in2[7] +
520 ((uint128_t) in1[1]) * in2[6] +
521 ((uint128_t) in1[2]) * in2[5] +
522 ((uint128_t) in1[3]) * in2[4] +
523 ((uint128_t) in1[4]) * in2[3] +
524 ((uint128_t) in1[5]) * in2[2] +
525 ((uint128_t) in1[6]) * in2[1] +
526 ((uint128_t) in1[7]) * in2[0];
528 out[8] = ((uint128_t) in1[0]) * in2[8] +
529 ((uint128_t) in1[1]) * in2[7] +
530 ((uint128_t) in1[2]) * in2[6] +
531 ((uint128_t) in1[3]) * in2[5] +
532 ((uint128_t) in1[4]) * in2[4] +
533 ((uint128_t) in1[5]) * in2[3] +
534 ((uint128_t) in1[6]) * in2[2] +
535 ((uint128_t) in1[7]) * in2[1] +
536 ((uint128_t) in1[8]) * in2[0];
538 /* See comment in felem_square about the use of in2x2 here */
540 out[0] += ((uint128_t) in1[1]) * in2x2[8] +
541 ((uint128_t) in1[2]) * in2x2[7] +
542 ((uint128_t) in1[3]) * in2x2[6] +
543 ((uint128_t) in1[4]) * in2x2[5] +
544 ((uint128_t) in1[5]) * in2x2[4] +
545 ((uint128_t) in1[6]) * in2x2[3] +
546 ((uint128_t) in1[7]) * in2x2[2] +
547 ((uint128_t) in1[8]) * in2x2[1];
549 out[1] += ((uint128_t) in1[2]) * in2x2[8] +
550 ((uint128_t) in1[3]) * in2x2[7] +
551 ((uint128_t) in1[4]) * in2x2[6] +
552 ((uint128_t) in1[5]) * in2x2[5] +
553 ((uint128_t) in1[6]) * in2x2[4] +
554 ((uint128_t) in1[7]) * in2x2[3] +
555 ((uint128_t) in1[8]) * in2x2[2];
557 out[2] += ((uint128_t) in1[3]) * in2x2[8] +
558 ((uint128_t) in1[4]) * in2x2[7] +
559 ((uint128_t) in1[5]) * in2x2[6] +
560 ((uint128_t) in1[6]) * in2x2[5] +
561 ((uint128_t) in1[7]) * in2x2[4] +
562 ((uint128_t) in1[8]) * in2x2[3];
564 out[3] += ((uint128_t) in1[4]) * in2x2[8] +
565 ((uint128_t) in1[5]) * in2x2[7] +
566 ((uint128_t) in1[6]) * in2x2[6] +
567 ((uint128_t) in1[7]) * in2x2[5] +
568 ((uint128_t) in1[8]) * in2x2[4];
570 out[4] += ((uint128_t) in1[5]) * in2x2[8] +
571 ((uint128_t) in1[6]) * in2x2[7] +
572 ((uint128_t) in1[7]) * in2x2[6] +
573 ((uint128_t) in1[8]) * in2x2[5];
575 out[5] += ((uint128_t) in1[6]) * in2x2[8] +
576 ((uint128_t) in1[7]) * in2x2[7] +
577 ((uint128_t) in1[8]) * in2x2[6];
579 out[6] += ((uint128_t) in1[7]) * in2x2[8] +
580 ((uint128_t) in1[8]) * in2x2[7];
582 out[7] += ((uint128_t) in1[8]) * in2x2[8];
585 static const limb bottom52bits = 0xfffffffffffff;
587 /* felem_reduce converts a largefelem to an felem.
591 * out[i] < 2^59 + 2^14
593 static void felem_reduce(felem out, const largefelem in)
595 u64 overflow1, overflow2;
597 out[0] = ((limb) in[0]) & bottom58bits;
598 out[1] = ((limb) in[1]) & bottom58bits;
599 out[2] = ((limb) in[2]) & bottom58bits;
600 out[3] = ((limb) in[3]) & bottom58bits;
601 out[4] = ((limb) in[4]) & bottom58bits;
602 out[5] = ((limb) in[5]) & bottom58bits;
603 out[6] = ((limb) in[6]) & bottom58bits;
604 out[7] = ((limb) in[7]) & bottom58bits;
605 out[8] = ((limb) in[8]) & bottom58bits;
609 out[1] += ((limb) in[0]) >> 58;
610 out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
611 /* out[1] < 2^58 + 2^6 + 2^58
613 out[2] += ((limb) (in[0] >> 64)) >> 52;
615 out[2] += ((limb) in[1]) >> 58;
616 out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
617 out[3] += ((limb) (in[1] >> 64)) >> 52;
619 out[3] += ((limb) in[2]) >> 58;
620 out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
621 out[4] += ((limb) (in[2] >> 64)) >> 52;
623 out[4] += ((limb) in[3]) >> 58;
624 out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
625 out[5] += ((limb) (in[3] >> 64)) >> 52;
627 out[5] += ((limb) in[4]) >> 58;
628 out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
629 out[6] += ((limb) (in[4] >> 64)) >> 52;
631 out[6] += ((limb) in[5]) >> 58;
632 out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
633 out[7] += ((limb) (in[5] >> 64)) >> 52;
635 out[7] += ((limb) in[6]) >> 58;
636 out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
637 out[8] += ((limb) (in[6] >> 64)) >> 52;
639 out[8] += ((limb) in[7]) >> 58;
640 out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
641 /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
643 overflow1 = ((limb) (in[7] >> 64)) >> 52;
645 overflow1 += ((limb) in[8]) >> 58;
646 overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
647 overflow2 = ((limb) (in[8] >> 64)) >> 52;
649 overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
650 overflow2 <<= 1; /* overflow2 < 2^13 */
652 out[0] += overflow1; /* out[0] < 2^60 */
653 out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
655 out[1] += out[0] >> 58; out[0] &= bottom58bits;
657 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
661 static void felem_square_reduce(felem out, const felem in)
664 felem_square(tmp, in);
665 felem_reduce(out, tmp);
668 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
671 felem_mul(tmp, in1, in2);
672 felem_reduce(out, tmp);
675 /* felem_inv calculates |out| = |in|^{-1}
677 * Based on Fermat's Little Theorem:
679 * a^{p-1} = 1 (mod p)
680 * a^{p-2} = a^{-1} (mod p)
682 static void felem_inv(felem out, const felem in)
684 felem ftmp, ftmp2, ftmp3, ftmp4;
688 felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */
689 felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
690 felem_assign(ftmp2, ftmp);
691 felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
692 felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
693 felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
695 felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
696 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
697 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
699 felem_assign(ftmp2, ftmp3);
700 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
701 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
702 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
703 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
704 felem_assign(ftmp4, ftmp3);
705 felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
706 felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
707 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
708 felem_assign(ftmp2, ftmp3);
710 for (i = 0; i < 8; i++)
712 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
714 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
715 felem_assign(ftmp2, ftmp3);
717 for (i = 0; i < 16; i++)
719 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
721 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
722 felem_assign(ftmp2, ftmp3);
724 for (i = 0; i < 32; i++)
726 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
728 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
729 felem_assign(ftmp2, ftmp3);
731 for (i = 0; i < 64; i++)
733 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
735 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
736 felem_assign(ftmp2, ftmp3);
738 for (i = 0; i < 128; i++)
740 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
742 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
743 felem_assign(ftmp2, ftmp3);
745 for (i = 0; i < 256; i++)
747 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
749 felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
751 for (i = 0; i < 9; i++)
753 felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
755 felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
756 felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp); /* 2^512 - 3 */
759 /* This is 2^521-1, expressed as an felem */
760 static const felem kPrime =
762 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
763 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
764 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
767 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
770 * in[i] < 2^59 + 2^14
772 static limb felem_is_zero(const felem in)
776 felem_assign(ftmp, in);
778 ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
780 ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
781 ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
782 ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
783 ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
784 ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
785 ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
786 ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
787 ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
788 /* ftmp[8] < 2^57 + 4 */
790 /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
791 * greater than our bound for ftmp[8]. Therefore we only have to check
792 * if the zero is zero or 2^521-1. */
806 /* We know that ftmp[i] < 2^63, therefore the only way that the top bit
807 * can be set is if is_zero was 0 before the decrement. */
808 is_zero = ((s64) is_zero) >> 63;
810 is_p = ftmp[0] ^ kPrime[0];
811 is_p |= ftmp[1] ^ kPrime[1];
812 is_p |= ftmp[2] ^ kPrime[2];
813 is_p |= ftmp[3] ^ kPrime[3];
814 is_p |= ftmp[4] ^ kPrime[4];
815 is_p |= ftmp[5] ^ kPrime[5];
816 is_p |= ftmp[6] ^ kPrime[6];
817 is_p |= ftmp[7] ^ kPrime[7];
818 is_p |= ftmp[8] ^ kPrime[8];
821 is_p = ((s64) is_p) >> 63;
827 static int felem_is_zero_int(const felem in)
829 return (int) (felem_is_zero(in) & ((limb)1));
832 /* felem_contract converts |in| to its unique, minimal representation.
834 * in[i] < 2^59 + 2^14
836 static void felem_contract(felem out, const felem in)
838 limb is_p, is_greater, sign;
839 static const limb two58 = ((limb)1) << 58;
841 felem_assign(out, in);
843 out[0] += out[8] >> 57; out[8] &= bottom57bits;
845 out[1] += out[0] >> 58; out[0] &= bottom58bits;
846 out[2] += out[1] >> 58; out[1] &= bottom58bits;
847 out[3] += out[2] >> 58; out[2] &= bottom58bits;
848 out[4] += out[3] >> 58; out[3] &= bottom58bits;
849 out[5] += out[4] >> 58; out[4] &= bottom58bits;
850 out[6] += out[5] >> 58; out[5] &= bottom58bits;
851 out[7] += out[6] >> 58; out[6] &= bottom58bits;
852 out[8] += out[7] >> 58; out[7] &= bottom58bits;
853 /* out[8] < 2^57 + 4 */
855 /* If the value is greater than 2^521-1 then we have to subtract
856 * 2^521-1 out. See the comments in felem_is_zero regarding why we
857 * don't test for other multiples of the prime. */
859 /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
861 is_p = out[0] ^ kPrime[0];
862 is_p |= out[1] ^ kPrime[1];
863 is_p |= out[2] ^ kPrime[2];
864 is_p |= out[3] ^ kPrime[3];
865 is_p |= out[4] ^ kPrime[4];
866 is_p |= out[5] ^ kPrime[5];
867 is_p |= out[6] ^ kPrime[6];
868 is_p |= out[7] ^ kPrime[7];
869 is_p |= out[8] ^ kPrime[8];
878 is_p = ((s64) is_p) >> 63;
881 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
893 /* In order to test that |out| >= 2^521-1 we need only test if out[8]
894 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
895 is_greater = out[8] >> 57;
896 is_greater |= is_greater << 32;
897 is_greater |= is_greater << 16;
898 is_greater |= is_greater << 8;
899 is_greater |= is_greater << 4;
900 is_greater |= is_greater << 2;
901 is_greater |= is_greater << 1;
902 is_greater = ((s64) is_greater) >> 63;
904 out[0] -= kPrime[0] & is_greater;
905 out[1] -= kPrime[1] & is_greater;
906 out[2] -= kPrime[2] & is_greater;
907 out[3] -= kPrime[3] & is_greater;
908 out[4] -= kPrime[4] & is_greater;
909 out[5] -= kPrime[5] & is_greater;
910 out[6] -= kPrime[6] & is_greater;
911 out[7] -= kPrime[7] & is_greater;
912 out[8] -= kPrime[8] & is_greater;
914 /* Eliminate negative coefficients */
915 sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
916 sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
917 sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
918 sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
919 sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
920 sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
921 sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
922 sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
923 sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
924 sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
925 sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
931 * Building on top of the field operations we have the operations on the
932 * elliptic curve group itself. Points on the curve are represented in Jacobian
935 /* point_double calcuates 2*(x_in, y_in, z_in)
937 * The method is taken from:
938 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
940 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
941 * while x_out == y_in is not (maybe this works, but it's not tested). */
943 point_double(felem x_out, felem y_out, felem z_out,
944 const felem x_in, const felem y_in, const felem z_in)
946 largefelem tmp, tmp2;
947 felem delta, gamma, beta, alpha, ftmp, ftmp2;
949 felem_assign(ftmp, x_in);
950 felem_assign(ftmp2, x_in);
953 felem_square(tmp, z_in);
954 felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
957 felem_square(tmp, y_in);
958 felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
961 felem_mul(tmp, x_in, gamma);
962 felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
964 /* alpha = 3*(x-delta)*(x+delta) */
965 felem_diff64(ftmp, delta);
967 felem_sum64(ftmp2, delta);
968 /* ftmp2[i] < 2^60 + 2^15 */
969 felem_scalar64(ftmp2, 3);
970 /* ftmp2[i] < 3*2^60 + 3*2^15 */
971 felem_mul(tmp, ftmp, ftmp2);
972 /* tmp[i] < 17(3*2^121 + 3*2^76)
973 * = 61*2^121 + 61*2^76
974 * < 64*2^121 + 64*2^76
977 felem_reduce(alpha, tmp);
979 /* x' = alpha^2 - 8*beta */
980 felem_square(tmp, alpha);
983 felem_assign(ftmp, beta);
984 felem_scalar64(ftmp, 8);
985 /* ftmp[i] < 2^62 + 2^17 */
986 felem_diff_128_64(tmp, ftmp);
987 /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
988 felem_reduce(x_out, tmp);
990 /* z' = (y + z)^2 - gamma - delta */
991 felem_sum64(delta, gamma);
992 /* delta[i] < 2^60 + 2^15 */
993 felem_assign(ftmp, y_in);
994 felem_sum64(ftmp, z_in);
995 /* ftmp[i] < 2^60 + 2^15 */
996 felem_square(tmp, ftmp);
997 /* tmp[i] < 17(2^122)
999 felem_diff_128_64(tmp, delta);
1000 /* tmp[i] < 2^127 + 2^63 */
1001 felem_reduce(z_out, tmp);
1003 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1004 felem_scalar64(beta, 4);
1005 /* beta[i] < 2^61 + 2^16 */
1006 felem_diff64(beta, x_out);
1007 /* beta[i] < 2^61 + 2^60 + 2^16 */
1008 felem_mul(tmp, alpha, beta);
1009 /* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1010 * = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1011 * = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1013 felem_square(tmp2, gamma);
1014 /* tmp2[i] < 17*(2^59 + 2^14)^2
1015 * = 17*(2^118 + 2^74 + 2^28) */
1016 felem_scalar128(tmp2, 8);
1017 /* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1018 * = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1020 felem_diff128(tmp, tmp2);
1021 /* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1022 * = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1023 * 2^74 + 2^69 + 2^34 + 2^30
1025 felem_reduce(y_out, tmp);
1028 /* copy_conditional copies in to out iff mask is all ones. */
1030 copy_conditional(felem out, const felem in, limb mask)
1033 for (i = 0; i < NLIMBS; ++i)
1035 const limb tmp = mask & (in[i] ^ out[i]);
1040 /* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1042 * The method is taken from
1043 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1044 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1046 * This function includes a branch for checking whether the two input points
1047 * are equal (while not equal to the point at infinity). This case never
1048 * happens during single point multiplication, so there is no timing leak for
1049 * ECDH or ECDSA signing. */
static void point_add(felem x3, felem y3, felem z3,
	const felem x1, const felem y1, const felem z1,
	const int mixed, const felem x2, const felem y2, const felem z2)
	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
	largefelem tmp, tmp2;
	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
	/* The point at infinity is encoded with z == 0; remember which inputs
	 * are infinity for the constant-time fix-ups at the end. */
	z1_is_zero = felem_is_zero(z1);
	z2_is_zero = felem_is_zero(z2);
	/* ftmp = z1z1 = z1**2 */
	felem_square(tmp, z1);
	felem_reduce(ftmp, tmp);
	/* Full projective path: z2 is not known to be 1. */
	/* ftmp2 = z2z2 = z2**2 */
	felem_square(tmp, z2);
	felem_reduce(ftmp2, tmp);
	/* u1 = ftmp3 = x1*z2z2 */
	felem_mul(tmp, x1, ftmp2);
	felem_reduce(ftmp3, tmp);
	/* ftmp5 = z1 + z2 */
	felem_assign(ftmp5, z1);
	felem_sum64(ftmp5, z2);
	/* ftmp5[i] < 2^61 */
	/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(ftmp5, tmp);
	/* ftmp2 = z2 * z2z2 */
	felem_mul(tmp, ftmp2, z2);
	felem_reduce(ftmp2, tmp);
	/* s1 = ftmp6 = y1 * z2**3 */
	felem_mul(tmp, y1, ftmp2);
	felem_reduce(ftmp6, tmp);
	/* Mixed-addition path: we'll assume z2 = 1 (special case z2 = 0 is
	 * handled later). */
	/* u1 = ftmp3 = x1*z2z2 */
	felem_assign(ftmp3, x1);
	/* ftmp5 = 2*z1z2 */
	felem_scalar(ftmp5, z1, 2);
	/* s1 = ftmp6 = y1 * z2**3 */
	felem_assign(ftmp6, y1);
	/* u2 = x2*z1z1 */
	felem_mul(tmp, x2, ftmp);
	/* tmp[i] < 17*2^120 */
	/* h = ftmp4 = u2 - u1 */
	felem_diff_128_64(tmp, ftmp3);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp4, tmp);
	/* h == 0 iff the two affine x coordinates agree */
	x_equal = felem_is_zero(ftmp4);
	/* z_out = ftmp5 * h */
	felem_mul(tmp, ftmp5, ftmp4);
	felem_reduce(z_out, tmp);
	/* ftmp = z1 * z1z1 */
	felem_mul(tmp, ftmp, z1);
	felem_reduce(ftmp, tmp);
	/* s2 = tmp = y2 * z1**3 */
	felem_mul(tmp, y2, ftmp);
	/* tmp[i] < 17*2^120 */
	/* r = ftmp5 = (s2 - s1)*2 */
	felem_diff_128_64(tmp, ftmp6);
	/* tmp[i] < 17*2^120 + 2^63 */
	felem_reduce(ftmp5, tmp);
	y_equal = felem_is_zero(ftmp5);
	felem_scalar64(ftmp5, 2);
	/* ftmp5[i] < 2^61 */
	/* Both affine coordinates agree and neither input is infinity: the
	 * addition formulae degenerate here, so fall back to doubling.
	 * NOTE(review): the short-circuiting && operators evaluate the flags
	 * sequentially; later upstream OpenSSL replaced this with bitwise
	 * & / ~ on the masks to avoid a potential side channel -- confirm
	 * against the current upstream ecp_nistp521.c. */
	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
		point_double(x3, y3, z3, x1, y1, z1);
	/* I = ftmp = (2h)**2 */
	felem_assign(ftmp, ftmp4);
	felem_scalar64(ftmp, 2);
	/* ftmp[i] < 2^61 */
	felem_square(tmp, ftmp);
	/* tmp[i] < 17*2^122 */
	felem_reduce(ftmp, tmp);
	/* J = ftmp2 = h * I */
	felem_mul(tmp, ftmp4, ftmp);
	felem_reduce(ftmp2, tmp);
	/* V = ftmp4 = U1 * I */
	felem_mul(tmp, ftmp3, ftmp);
	felem_reduce(ftmp4, tmp);
	/* x_out = r**2 - J - 2V */
	felem_square(tmp, ftmp5);
	/* tmp[i] < 17*2^122 */
	felem_diff_128_64(tmp, ftmp2);
	/* tmp[i] < 17*2^122 + 2^63 */
	felem_assign(ftmp3, ftmp4);
	felem_scalar64(ftmp4, 2);
	/* ftmp4[i] < 2^61 */
	felem_diff_128_64(tmp, ftmp4);
	/* tmp[i] < 17*2^122 + 2^64 */
	felem_reduce(x_out, tmp);
	/* y_out = r(V-x_out) - 2 * s1 * J */
	felem_diff64(ftmp3, x_out);
	/* ftmp3[i] < 2^60 + 2^60 = 2^61 */
	felem_mul(tmp, ftmp5, ftmp3);
	/* tmp[i] < 17*2^122 */
	felem_mul(tmp2, ftmp6, ftmp2);
	/* tmp2[i] < 17*2^120 */
	felem_scalar128(tmp2, 2);
	/* tmp2[i] < 17*2^121 */
	felem_diff128(tmp, tmp2);
	/* tmp[i] < 2^127 - 2^69 + 17*2^122
	 * = 2^126 - 2^122 - 2^6 - 2^2 - 1
	 * < 2^127 */
	felem_reduce(y_out, tmp);
	/* If either input was the point at infinity, the sum is simply the
	 * other input; patch that in without branching. */
	copy_conditional(x_out, x2, z1_is_zero);
	copy_conditional(x_out, x1, z2_is_zero);
	copy_conditional(y_out, y2, z1_is_zero);
	copy_conditional(y_out, y1, z2_is_zero);
	copy_conditional(z_out, z2, z1_is_zero);
	copy_conditional(z_out, z1, z2_is_zero);
	felem_assign(x3, x_out);
	felem_assign(y3, y_out);
	felem_assign(z3, z_out);
1203 /* Base point pre computation
1204 * --------------------------
1206 * Two different sorts of precomputed tables are used in the following code.
1207 * Each contain various points on the curve, where each point is three field
1208 * elements (x, y, z).
1210 * For the base point table, z is usually 1 (0 for the point at infinity).
1211 * This table has 16 elements:
1212 * index | bits | point
 * ------+---------+------------------------------
 *   0   | 0 0 0 0 | the point at infinity
 *   1   | 0 0 0 1 | G
 *   2   | 0 0 1 0 | 2^130G
1217 * 3 | 0 0 1 1 | (2^130 + 1)G
1218 * 4 | 0 1 0 0 | 2^260G
1219 * 5 | 0 1 0 1 | (2^260 + 1)G
1220 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1221 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1222 * 8 | 1 0 0 0 | 2^390G
1223 * 9 | 1 0 0 1 | (2^390 + 1)G
1224 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1225 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1226 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1227 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1228 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1229 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1231 * The reason for this is so that we can clock bits into four different
1232 * locations when doing simple scalar multiplies against the base point.
1234 * Tables for other points have table[i] = iG for i in 0 .. 16. */
/* gmul is the table of precomputed base points; entry i holds the multiple
 * of G encoded by the 4-bit index i (see the table above). z == 1 for every
 * real point, z == 0 for the point at infinity. */
static const felem gmul[16][3] =
	/* 0 : the point at infinity */
	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
	{0, 0, 0, 0, 0, 0, 0, 0, 0},
	{0, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 1 : G */
	{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
	0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
	0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
	{0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
	0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
	0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 2 : 2^130 G */
	{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
	0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
	0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
	{0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
	0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
	0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 3 : (2^130 + 1) G */
	{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
	0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
	0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
	{0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
	0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
	0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 4 : 2^260 G */
	{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
	0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
	0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
	{0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
	0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
	0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 5 : (2^260 + 1) G */
	{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
	0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
	0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
	{0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
	0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
	0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 6 : (2^260 + 2^130) G */
	{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
	0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
	0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
	{0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
	0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
	0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 7 : (2^260 + 2^130 + 1) G */
	{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
	0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
	0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
	{0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
	0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
	0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 8 : 2^390 G */
	{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
	0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
	0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
	{0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
	0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
	0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 9 : (2^390 + 1) G */
	{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
	0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
	0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
	{0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
	0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
	0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 10 : (2^390 + 2^130) G */
	{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
	0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
	0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
	{0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
	0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
	0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 11 : (2^390 + 2^130 + 1) G */
	{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
	0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
	0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
	{0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
	0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
	0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 12 : (2^390 + 2^260) G */
	{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
	0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
	0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
	{0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
	0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
	0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 13 : (2^390 + 2^260 + 1) G */
	{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
	0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
	0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
	{0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
	0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
	0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 14 : (2^390 + 2^260 + 2^130) G */
	{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
	0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
	0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
	{0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
	0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
	0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}},
	/* 15 : (2^390 + 2^260 + 2^130 + 1) G */
	{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
	0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
	0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
	{0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
	0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
	0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
	{1, 0, 0, 0, 0, 0, 0, 0, 0}}};
/* select_point selects the |idx|th point from a precomputation table and
 * copies it to out.  Every table entry is read and OR-accumulated under a
 * mask, so the memory access pattern is independent of |idx| (constant
 * time). */
static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
	limb *outlimbs = &out[0][0];
	/* start from the all-zero (infinity) encoding and accumulate */
	memset(outlimbs, 0, 3 * sizeof(felem));
	for (i = 0; i < size; i++)
		const limb *inlimbs = &pre_comp[i][0][0];
		/* i ^ idx is zero only for the wanted entry; it is normalised
		 * into a full-limb all-ones/all-zeros mask before use */
		limb mask = i ^ idx;
		for (j = 0; j < NLIMBS * 3; j++)
			outlimbs[j] |= inlimbs[j] & mask;
/* get_bit returns the |i|th bit in |in| (little-endian bit order: bit i
 * lives in byte i/8 at bit position i%8). */
static char get_bit(const felem_bytearray in, int i)
	return (in[i >> 3] >> (i & 7)) & 1;
/* Interleaved point multiplication using precomputed point multiples:
 * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
 * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
 * of the generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
static void batch_mul(felem x_out, felem y_out, felem z_out,
	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
	unsigned num, gen_mul = (g_scalar != NULL);
	felem nq[3], tmp[4];
	/* set nq to the point at infinity */
	memset(nq, 0, 3 * sizeof(felem));
	/* Loop over all scalars msb-to-lsb, interleaving additions
	 * of multiples of the generator (last quarter of rounds)
	 * and additions of other points multiples (every 5th round).
	 */
	skip = 1; /* save two point operations in the first round */
	for (i = (num_points ? 520 : 130); i >= 0; --i)
		/* double nq once per bit position */
		point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
		/* add multiples of the generator */
		if (gen_mul && (i <= 130))
			/* take one bit from each 130-bit quarter of g_scalar;
			 * the 4-bit result indexes the g_pre_comp table */
			bits = get_bit(g_scalar, i + 390) << 3;
			bits |= get_bit(g_scalar, i + 260) << 2;
			bits |= get_bit(g_scalar, i + 130) << 1;
			bits |= get_bit(g_scalar, i);
			/* select the point to add, in constant time */
			select_point(bits, 16, g_pre_comp, tmp);
			point_add(nq[0], nq[1], nq[2],
				nq[0], nq[1], nq[2],
				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
			/* in the first round nq is still infinity: copying the
			 * selected point is enough (see `skip` above) */
			memcpy(nq, tmp, 3 * sizeof(felem));
		/* do other additions every 5 doublings */
		if (num_points && (i % 5 == 0))
			/* loop over all scalars */
			for (num = 0; num < num_points; ++num)
				/* a signed window of 5 scalar bits (plus one
				 * look-ahead bit), recoded to sign/digit below */
				bits = get_bit(scalars[num], i + 4) << 5;
				bits |= get_bit(scalars[num], i + 3) << 4;
				bits |= get_bit(scalars[num], i + 2) << 3;
				bits |= get_bit(scalars[num], i + 1) << 2;
				bits |= get_bit(scalars[num], i) << 1;
				bits |= get_bit(scalars[num], i - 1);
				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
				/* select the point to add or subtract, in constant time */
				select_point(digit, 17, pre_comp[num], tmp);
				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
				copy_conditional(tmp[1], tmp[3], (-(limb) sign));
				point_add(nq[0], nq[1], nq[2],
					nq[0], nq[1], nq[2],
					mixed, tmp[0], tmp[1], tmp[2]);
				memcpy(nq, tmp, 3 * sizeof(felem));
	felem_assign(x_out, nq[0]);
	felem_assign(y_out, nq[1]);
	felem_assign(z_out, nq[2]);
/* Precomputation for the group generator. */
	/* g_pre_comp has the same layout as the static gmul table above:
	 * entry i is the multiple of the generator selected by index i. */
	felem g_pre_comp[16][3];
} NISTP521_PRE_COMP;
/* Returns the EC_METHOD for this 64-bit P-521 implementation: the hot paths
 * (set_curve, get_affine, mul, precompute) are specialised here, everything
 * else falls back to the generic GFp "simple"/"nist" routines. */
const EC_METHOD *EC_GFp_nistp521_method(void)
	static const EC_METHOD ret = {
	EC_FLAGS_DEFAULT_OCT,
	NID_X9_62_prime_field,
	ec_GFp_nistp521_group_init,
	ec_GFp_simple_group_finish,
	ec_GFp_simple_group_clear_finish,
	ec_GFp_nist_group_copy,
	ec_GFp_nistp521_group_set_curve,
	ec_GFp_simple_group_get_curve,
	ec_GFp_simple_group_get_degree,
	ec_GFp_simple_group_check_discriminant,
	ec_GFp_simple_point_init,
	ec_GFp_simple_point_finish,
	ec_GFp_simple_point_clear_finish,
	ec_GFp_simple_point_copy,
	ec_GFp_simple_point_set_to_infinity,
	ec_GFp_simple_set_Jprojective_coordinates_GFp,
	ec_GFp_simple_get_Jprojective_coordinates_GFp,
	ec_GFp_simple_point_set_affine_coordinates,
	ec_GFp_nistp521_point_get_affine_coordinates,
	0 /* point_set_compressed_coordinates */,
	ec_GFp_simple_invert,
	ec_GFp_simple_is_at_infinity,
	ec_GFp_simple_is_on_curve,
	ec_GFp_simple_make_affine,
	ec_GFp_simple_points_make_affine,
	ec_GFp_nistp521_points_mul,
	ec_GFp_nistp521_precompute_mult,
	ec_GFp_nistp521_have_precompute_mult,
	ec_GFp_nist_field_mul,
	ec_GFp_nist_field_sqr,
	0 /* field_encode */,
	0 /* field_decode */,
	0 /* field_set_to_one */ };
1524 /******************************************************************************/
1525 /* FUNCTIONS TO MANAGE PRECOMPUTATION
/* Allocates a zero-initialised precomputation object with refcount 1.
 * Returns NULL (after raising ERR_R_MALLOC_FAILURE) on allocation failure. */
static NISTP521_PRE_COMP *nistp521_pre_comp_new()
	NISTP521_PRE_COMP *ret = NULL;
	ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
	ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
	ret->references = 1;
/* "Duplicates" a precomputation object by bumping its reference count. */
static void *nistp521_pre_comp_dup(void *src_)
	NISTP521_PRE_COMP *src = src_;
	/* no need to actually copy, these objects never change! */
	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
/* Releases one reference to |pre_|; the object is freed once the count
 * reaches zero. */
static void nistp521_pre_comp_free(void *pre_)
	NISTP521_PRE_COMP *pre = pre_;
	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
/* Like nistp521_pre_comp_free, but scrubs the memory (OPENSSL_cleanse)
 * before releasing it. */
static void nistp521_pre_comp_clear_free(void *pre_)
	NISTP521_PRE_COMP *pre = pre_;
	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
	OPENSSL_cleanse(pre, sizeof(*pre));
1583 /******************************************************************************/
1584 /* OPENSSL EC_METHOD FUNCTIONS
/* Group init: delegate to the generic GFp init, then record that P-521 has
 * a = -3 (mod p), which the optimised doubling formulae rely on. */
int ec_GFp_nistp521_group_init(EC_GROUP *group)
	ret = ec_GFp_simple_group_init(group);
	group->a_is_minus3 = 1;
/* Sets the curve parameters.  Only the genuine NIST P-521 parameters are
 * accepted (the field arithmetic here is specialised to p = 2^521 - 1);
 * anything else raises EC_R_WRONG_CURVE_PARAMETERS. */
int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
	BN_CTX *new_ctx = NULL;
	BIGNUM *curve_p, *curve_a, *curve_b;
	/* fall back to a private BN_CTX when the caller supplied none */
	if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
		((curve_a = BN_CTX_get(ctx)) == NULL) ||
		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
	/* load the canonical (big-endian) P-521 parameters for comparison */
	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
		(BN_cmp(curve_b, b)))
		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
			EC_R_WRONG_CURVE_PARAMETERS);
	/* use the fast NIST reduction for this field */
	group->field_mod_func = BN_nist_mod_521;
	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
 * (X', Y') = (X/Z^2, Y/Z^3).  Fails for the point at infinity, which has
 * no affine representation. */
int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
	felem z1, z2, x_in, y_in, x_out, y_out;
	if (EC_POINT_is_at_infinity(group, point))
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
			EC_R_POINT_AT_INFINITY);
	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
		(!BN_to_felem(z1, &point->Z))) return 0;
	/* NOTE(review): z2 must hold 1/Z here (set by the preceding
	 * felem_inv call); then x' = X * (1/Z)^2 */
	felem_square(tmp, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
	felem_contract(x_out, x_in);
	if (!felem_to_BN(x, x_out))
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
	/* y' = Y * (1/Z)^3 */
	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
	felem_contract(y_out, y_in);
	if (!felem_to_BN(y, y_out))
		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
/* Converts |num| Jacobian points to affine form in place, delegating to the
 * shared nistp helper with the P-521 field callbacks. */
static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
	/* Runs in constant time, unless an input is the point at infinity
	 * (which normally shouldn't happen). */
	ec_GFp_nistp_points_make_affine_internal(
		(void (*)(void *)) felem_one,
		(int (*)(const void *)) felem_is_zero_int,
		(void (*)(void *, const void *)) felem_assign,
		(void (*)(void *, const void *)) felem_square_reduce,
		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
		(void (*)(void *, const void *)) felem_inv,
		(void (*)(void *, const void *)) felem_contract);
/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
 * Result is stored in r (r can equal one of the inputs). */
int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
	const BIGNUM *scalars[], BN_CTX *ctx)
	BN_CTX *new_ctx = NULL;
	BIGNUM *x, *y, *z, *tmp_scalar;
	felem_bytearray g_secret;
	felem_bytearray *secrets = NULL;        /* little-endian scalar bytes */
	felem (*pre_comp)[17][3] = NULL;        /* 0*P .. 16*P for each point */
	felem *tmp_felems = NULL;               /* scratch for make_points_affine */
	felem_bytearray tmp;
	unsigned i, num_bytes;
	int have_pre_comp = 0;
	size_t num_points = num;
	felem x_in, y_in, z_in, x_out, y_out, z_out;
	NISTP521_PRE_COMP *pre = NULL;
	felem (*g_pre_comp)[3] = NULL;
	EC_POINT *generator = NULL;
	const EC_POINT *p = NULL;
	const BIGNUM *p_scalar = NULL;
	/* fall back to a private BN_CTX when the caller supplied none */
	if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL) ||
		((z = BN_CTX_get(ctx)) == NULL) ||
		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
	/* look up any generator precomputation attached to this group */
	pre = EC_EX_DATA_get_data(group->extra_data,
		nistp521_pre_comp_dup, nistp521_pre_comp_free,
		nistp521_pre_comp_clear_free);
	/* we have precomputation, try to use it */
	g_pre_comp = &pre->g_pre_comp[0];
	/* try to use the standard precomputation */
	g_pre_comp = (felem (*)[3]) gmul;
	generator = EC_POINT_new(group);
	if (generator == NULL)
	/* get the generator from precomputation */
	if (!felem_to_BN(x, g_pre_comp[1][0]) ||
		!felem_to_BN(y, g_pre_comp[1][1]) ||
		!felem_to_BN(z, g_pre_comp[1][2]))
		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
		generator, x, y, z, ctx))
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		/* precomputation matches generator */
	/* we don't have valid precomputation:
	 * treat the generator as a random point */
	if (num_points >= 2)
		/* unless we precompute multiples for just one point,
		 * converting those into affine form is time well spent */
	secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
	pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
	tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
	if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
	/* we treat NULL scalars as 0, and NULL points as points at infinity,
	 * i.e., they contribute nothing to the linear combination */
	memset(secrets, 0, num_points * sizeof(felem_bytearray));
	memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
	for (i = 0; i < num_points; ++i)
		/* we didn't have a valid precomputation, so we pick
		 * the generator */
		p = EC_GROUP_get0_generator(group);
		/* the i^th point */
		p_scalar = scalars[i];
		if ((p_scalar != NULL) && (p != NULL))
			/* reduce scalar to 0 <= scalar < 2^521 */
			if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
				/* this is an unusual input, and we don't guarantee
				 * constant-timeness */
				if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
					ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
				num_bytes = BN_bn2bin(tmp_scalar, tmp);
			num_bytes = BN_bn2bin(p_scalar, tmp);
			/* BN_bn2bin is big-endian; batch_mul wants little-endian */
			flip_endian(secrets[i], tmp, num_bytes);
			/* precompute multiples */
			if ((!BN_to_felem(x_out, &p->X)) ||
				(!BN_to_felem(y_out, &p->Y)) ||
				(!BN_to_felem(z_out, &p->Z))) goto err;
			memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
			memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
			memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
			/* pre_comp[i][j] = j*P: odd j via addition, even j via
			 * doubling of entry j/2 */
			for (j = 2; j <= 16; ++j)
					pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
					pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
					0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
					pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
					pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
	make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
	/* the scalar for the generator */
	if ((scalar != NULL) && (have_pre_comp))
		memset(g_secret, 0, sizeof(g_secret));
		/* reduce scalar to 0 <= scalar < 2^521 */
		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
			/* this is an unusual input, and we don't guarantee
			 * constant-timeness */
			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
			num_bytes = BN_bn2bin(tmp_scalar, tmp);
		num_bytes = BN_bn2bin(scalar, tmp);
		flip_endian(g_secret, tmp, num_bytes);
		/* do the multiplication with generator precomputation*/
		batch_mul(x_out, y_out, z_out,
			(const felem_bytearray (*)) secrets, num_points,
			mixed, (const felem (*)[17][3]) pre_comp,
			(const felem (*)[3]) g_pre_comp);
	/* do the multiplication without generator precomputation */
	batch_mul(x_out, y_out, z_out,
		(const felem_bytearray (*)) secrets, num_points,
		NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
	/* reduce the output to its unique minimal representation */
	felem_contract(x_in, x_out);
	felem_contract(y_in, y_out);
	felem_contract(z_in, z_out);
	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
		(!felem_to_BN(z, z_in)))
		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
	/* common cleanup for success and error paths */
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	if (secrets != NULL)
		OPENSSL_free(secrets);
	if (pre_comp != NULL)
		OPENSSL_free(pre_comp);
	if (tmp_felems != NULL)
		OPENSSL_free(tmp_felems);
/* Builds (or copies) the generator precomputation table and attaches it to
 * |group| as extra data, replacing any previous table. */
int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
	NISTP521_PRE_COMP *pre = NULL;
	BN_CTX *new_ctx = NULL;
	EC_POINT *generator = NULL;
	felem tmp_felems[16];
	/* throw away old precomputation */
	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
	if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
	if (((x = BN_CTX_get(ctx)) == NULL) ||
		((y = BN_CTX_get(ctx)) == NULL))
	/* get the generator */
	if (group->generator == NULL) goto err;
	generator = EC_POINT_new(group);
	if (generator == NULL)
	/* build the standard P-521 generator from the stored curve params,
	 * to compare against the group's generator */
	BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
	BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
	if ((pre = nistp521_pre_comp_new()) == NULL)
	/* if the generator is the standard one, use built-in precomputation */
	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
	/* otherwise compute the table from the group's own generator */
	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
	/* compute 2^130*G, 2^260*G, 2^390*G */
	for (i = 1; i <= 4; i <<= 1)
		/* 130 doublings in total (1 here + 129 in the loop below)
		 * take table entry i to entry 2*i */
		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
		for (j = 0; j < 129; ++j)
			point_double(pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2],
				pre->g_pre_comp[2*i][0],
				pre->g_pre_comp[2*i][1],
				pre->g_pre_comp[2*i][2]);
	/* g_pre_comp[0] is the point at infinity */
	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
	/* the remaining multiples */
	/* 2^130*G + 2^260*G */
	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^130*G + 2^390*G */
	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	/* 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
		pre->g_pre_comp[4][2]);
	/* 2^130*G + 2^260*G + 2^390*G */
	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
		pre->g_pre_comp[2][2]);
	for (i = 1; i < 8; ++i)
		/* odd multiples: add G */
		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
			pre->g_pre_comp[1][2]);
	/* convert entries 1..15 to affine form (entry 0 stays infinity) */
	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
	if (generator != NULL)
		EC_POINT_free(generator);
	if (new_ctx != NULL)
		BN_CTX_free(new_ctx);
	nistp521_pre_comp_free(pre);
/* Reports whether generator precomputation is attached to |group|. */
int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
	if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
		nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
static void *dummy=&dummy; /* presumably keeps this translation unit non-empty when the NISTP code is compiled out (see the #ifndef at the top) -- TODO confirm the enclosing #else */