2 * Written by Adam Langley (Google) for the OpenSSL project
4 /* Copyright 2011 Google Inc.
6 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
24 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
25 * work which got its smarts from Daniel J. Bernstein's work on the same.
#include <openssl/opensslconf.h>
#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128

# ifndef OPENSSL_SYS_VMS
#  include <inttypes.h>
# endif

# include <string.h>
# include <openssl/err.h>
41 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
42 /* even with gcc, the typedef won't work for 32-bit platforms */
43 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
46 # error "Need GCC 3.1 or later to define type uint128_t"
54 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
55 * element of this field into 66 bytes where the most significant byte
56 * contains only a single bit. We call this an felem_bytearray.
59 typedef u8 felem_bytearray[66];
62 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
63 * These values are big-endian.
65 static const felem_bytearray nistp521_curve_params[5] = {
66 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
67 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75 {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
76 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84 {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
85 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
86 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
87 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
88 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
89 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
90 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
91 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93 {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
94 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
95 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
96 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
97 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
98 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
99 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
100 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102 {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
103 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
104 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
105 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
106 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
107 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
108 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
109 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
114 * The representation of field elements.
115 * ------------------------------------
117 * We represent field elements with nine values. These values are either 64 or
118 * 128 bits and the field element represented is:
119 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
120 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
121 * 58 bits apart, but are greater than 58 bits in length, the most significant
122 * bits of each limb overlap with the least significant bits of the next.
124 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
129 typedef uint64_t limb;
130 typedef limb felem[NLIMBS];
131 typedef uint128_t largefelem[NLIMBS];
133 static const limb bottom57bits = 0x1ffffffffffffff;
134 static const limb bottom58bits = 0x3ffffffffffffff;
137 * bin66_to_felem takes a little-endian byte array and converts it into felem
138 * form. This assumes that the CPU is little-endian.
140 static void bin66_to_felem(felem out, const u8 in[66])
142 out[0] = (*((limb *) & in[0])) & bottom58bits;
143 out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
144 out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
145 out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
146 out[4] = (*((limb *) & in[29])) & bottom58bits;
147 out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
148 out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
149 out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
150 out[8] = (*((limb *) & in[58])) & bottom57bits;
154 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
155 * array. This assumes that the CPU is little-endian.
157 static void felem_to_bin66(u8 out[66], const felem in)
160 (*((limb *) & out[0])) = in[0];
161 (*((limb *) & out[7])) |= in[1] << 2;
162 (*((limb *) & out[14])) |= in[2] << 4;
163 (*((limb *) & out[21])) |= in[3] << 6;
164 (*((limb *) & out[29])) = in[4];
165 (*((limb *) & out[36])) |= in[5] << 2;
166 (*((limb *) & out[43])) |= in[6] << 4;
167 (*((limb *) & out[50])) |= in[7] << 6;
168 (*((limb *) & out[58])) = in[8];
171 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
172 static void flip_endian(u8 *out, const u8 *in, unsigned len)
175 for (i = 0; i < len; ++i)
176 out[i] = in[len - 1 - i];
179 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
180 static int BN_to_felem(felem out, const BIGNUM *bn)
182 felem_bytearray b_in;
183 felem_bytearray b_out;
186 /* BN_bn2bin eats leading zeroes */
187 memset(b_out, 0, sizeof(b_out));
188 num_bytes = BN_num_bytes(bn);
189 if (num_bytes > sizeof b_out) {
190 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
193 if (BN_is_negative(bn)) {
194 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
197 num_bytes = BN_bn2bin(bn, b_in);
198 flip_endian(b_out, b_in, num_bytes);
199 bin66_to_felem(out, b_out);
203 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
204 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
206 felem_bytearray b_in, b_out;
207 felem_to_bin66(b_in, in);
208 flip_endian(b_out, b_in, sizeof b_out);
209 return BN_bin2bn(b_out, sizeof b_out, out);
217 static void felem_one(felem out)
230 static void felem_assign(felem out, const felem in)
243 /* felem_sum64 sets out = out + in. */
244 static void felem_sum64(felem out, const felem in)
257 /* felem_scalar sets out = in * scalar */
258 static void felem_scalar(felem out, const felem in, limb scalar)
260 out[0] = in[0] * scalar;
261 out[1] = in[1] * scalar;
262 out[2] = in[2] * scalar;
263 out[3] = in[3] * scalar;
264 out[4] = in[4] * scalar;
265 out[5] = in[5] * scalar;
266 out[6] = in[6] * scalar;
267 out[7] = in[7] * scalar;
268 out[8] = in[8] * scalar;
271 /* felem_scalar64 sets out = out * scalar */
272 static void felem_scalar64(felem out, limb scalar)
285 /* felem_scalar128 sets out = out * scalar */
286 static void felem_scalar128(largefelem out, limb scalar)
300 * felem_neg sets |out| to |-in|
302 * in[i] < 2^59 + 2^14
306 static void felem_neg(felem out, const felem in)
308 /* In order to prevent underflow, we subtract from 0 mod p. */
309 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
310 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
312 out[0] = two62m3 - in[0];
313 out[1] = two62m2 - in[1];
314 out[2] = two62m2 - in[2];
315 out[3] = two62m2 - in[3];
316 out[4] = two62m2 - in[4];
317 out[5] = two62m2 - in[5];
318 out[6] = two62m2 - in[6];
319 out[7] = two62m2 - in[7];
320 out[8] = two62m2 - in[8];
324 * felem_diff64 subtracts |in| from |out|
326 * in[i] < 2^59 + 2^14
328 * out[i] < out[i] + 2^62
330 static void felem_diff64(felem out, const felem in)
333 * In order to prevent underflow, we add 0 mod p before subtracting.
335 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
336 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
338 out[0] += two62m3 - in[0];
339 out[1] += two62m2 - in[1];
340 out[2] += two62m2 - in[2];
341 out[3] += two62m2 - in[3];
342 out[4] += two62m2 - in[4];
343 out[5] += two62m2 - in[5];
344 out[6] += two62m2 - in[6];
345 out[7] += two62m2 - in[7];
346 out[8] += two62m2 - in[8];
350 * felem_diff_128_64 subtracts |in| from |out|
352 * in[i] < 2^62 + 2^17
354 * out[i] < out[i] + 2^63
356 static void felem_diff_128_64(largefelem out, const felem in)
359 * In order to prevent underflow, we add 0 mod p before subtracting.
361 static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
362 static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
364 out[0] += two63m6 - in[0];
365 out[1] += two63m5 - in[1];
366 out[2] += two63m5 - in[2];
367 out[3] += two63m5 - in[3];
368 out[4] += two63m5 - in[4];
369 out[5] += two63m5 - in[5];
370 out[6] += two63m5 - in[6];
371 out[7] += two63m5 - in[7];
372 out[8] += two63m5 - in[8];
376 * felem_diff_128_64 subtracts |in| from |out|
380 * out[i] < out[i] + 2^127 - 2^69
382 static void felem_diff128(largefelem out, const largefelem in)
385 * In order to prevent underflow, we add 0 mod p before subtracting.
387 static const uint128_t two127m70 =
388 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
389 static const uint128_t two127m69 =
390 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
392 out[0] += (two127m70 - in[0]);
393 out[1] += (two127m69 - in[1]);
394 out[2] += (two127m69 - in[2]);
395 out[3] += (two127m69 - in[3]);
396 out[4] += (two127m69 - in[4]);
397 out[5] += (two127m69 - in[5]);
398 out[6] += (two127m69 - in[6]);
399 out[7] += (two127m69 - in[7]);
400 out[8] += (two127m69 - in[8]);
404 * felem_square sets |out| = |in|^2
408 * out[i] < 17 * max(in[i]) * max(in[i])
410 static void felem_square(largefelem out, const felem in)
413 felem_scalar(inx2, in, 2);
414 felem_scalar(inx4, in, 4);
417 * We have many cases were we want to do
420 * This is obviously just
422 * However, rather than do the doubling on the 128 bit result, we
423 * double one of the inputs to the multiplication by reading from
427 out[0] = ((uint128_t) in[0]) * in[0];
428 out[1] = ((uint128_t) in[0]) * inx2[1];
429 out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
430 out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
431 out[4] = ((uint128_t) in[0]) * inx2[4] +
432 ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
433 out[5] = ((uint128_t) in[0]) * inx2[5] +
434 ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
435 out[6] = ((uint128_t) in[0]) * inx2[6] +
436 ((uint128_t) in[1]) * inx2[5] +
437 ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
438 out[7] = ((uint128_t) in[0]) * inx2[7] +
439 ((uint128_t) in[1]) * inx2[6] +
440 ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
441 out[8] = ((uint128_t) in[0]) * inx2[8] +
442 ((uint128_t) in[1]) * inx2[7] +
443 ((uint128_t) in[2]) * inx2[6] +
444 ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
447 * The remaining limbs fall above 2^521, with the first falling at 2^522.
448 * They correspond to locations one bit up from the limbs produced above
449 * so we would have to multiply by two to align them. Again, rather than
450 * operate on the 128-bit result, we double one of the inputs to the
451 * multiplication. If we want to double for both this reason, and the
452 * reason above, then we end up multiplying by four.
456 out[0] += ((uint128_t) in[1]) * inx4[8] +
457 ((uint128_t) in[2]) * inx4[7] +
458 ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
461 out[1] += ((uint128_t) in[2]) * inx4[8] +
462 ((uint128_t) in[3]) * inx4[7] +
463 ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
466 out[2] += ((uint128_t) in[3]) * inx4[8] +
467 ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
470 out[3] += ((uint128_t) in[4]) * inx4[8] +
471 ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
474 out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
477 out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
480 out[6] += ((uint128_t) in[7]) * inx4[8];
483 out[7] += ((uint128_t) in[8]) * inx2[8];
487 * felem_mul sets |out| = |in1| * |in2|
492 * out[i] < 17 * max(in1[i]) * max(in2[i])
494 static void felem_mul(largefelem out, const felem in1, const felem in2)
497 felem_scalar(in2x2, in2, 2);
499 out[0] = ((uint128_t) in1[0]) * in2[0];
501 out[1] = ((uint128_t) in1[0]) * in2[1] +
502 ((uint128_t) in1[1]) * in2[0];
504 out[2] = ((uint128_t) in1[0]) * in2[2] +
505 ((uint128_t) in1[1]) * in2[1] +
506 ((uint128_t) in1[2]) * in2[0];
508 out[3] = ((uint128_t) in1[0]) * in2[3] +
509 ((uint128_t) in1[1]) * in2[2] +
510 ((uint128_t) in1[2]) * in2[1] +
511 ((uint128_t) in1[3]) * in2[0];
513 out[4] = ((uint128_t) in1[0]) * in2[4] +
514 ((uint128_t) in1[1]) * in2[3] +
515 ((uint128_t) in1[2]) * in2[2] +
516 ((uint128_t) in1[3]) * in2[1] +
517 ((uint128_t) in1[4]) * in2[0];
519 out[5] = ((uint128_t) in1[0]) * in2[5] +
520 ((uint128_t) in1[1]) * in2[4] +
521 ((uint128_t) in1[2]) * in2[3] +
522 ((uint128_t) in1[3]) * in2[2] +
523 ((uint128_t) in1[4]) * in2[1] +
524 ((uint128_t) in1[5]) * in2[0];
526 out[6] = ((uint128_t) in1[0]) * in2[6] +
527 ((uint128_t) in1[1]) * in2[5] +
528 ((uint128_t) in1[2]) * in2[4] +
529 ((uint128_t) in1[3]) * in2[3] +
530 ((uint128_t) in1[4]) * in2[2] +
531 ((uint128_t) in1[5]) * in2[1] +
532 ((uint128_t) in1[6]) * in2[0];
534 out[7] = ((uint128_t) in1[0]) * in2[7] +
535 ((uint128_t) in1[1]) * in2[6] +
536 ((uint128_t) in1[2]) * in2[5] +
537 ((uint128_t) in1[3]) * in2[4] +
538 ((uint128_t) in1[4]) * in2[3] +
539 ((uint128_t) in1[5]) * in2[2] +
540 ((uint128_t) in1[6]) * in2[1] +
541 ((uint128_t) in1[7]) * in2[0];
543 out[8] = ((uint128_t) in1[0]) * in2[8] +
544 ((uint128_t) in1[1]) * in2[7] +
545 ((uint128_t) in1[2]) * in2[6] +
546 ((uint128_t) in1[3]) * in2[5] +
547 ((uint128_t) in1[4]) * in2[4] +
548 ((uint128_t) in1[5]) * in2[3] +
549 ((uint128_t) in1[6]) * in2[2] +
550 ((uint128_t) in1[7]) * in2[1] +
551 ((uint128_t) in1[8]) * in2[0];
553 /* See comment in felem_square about the use of in2x2 here */
555 out[0] += ((uint128_t) in1[1]) * in2x2[8] +
556 ((uint128_t) in1[2]) * in2x2[7] +
557 ((uint128_t) in1[3]) * in2x2[6] +
558 ((uint128_t) in1[4]) * in2x2[5] +
559 ((uint128_t) in1[5]) * in2x2[4] +
560 ((uint128_t) in1[6]) * in2x2[3] +
561 ((uint128_t) in1[7]) * in2x2[2] +
562 ((uint128_t) in1[8]) * in2x2[1];
564 out[1] += ((uint128_t) in1[2]) * in2x2[8] +
565 ((uint128_t) in1[3]) * in2x2[7] +
566 ((uint128_t) in1[4]) * in2x2[6] +
567 ((uint128_t) in1[5]) * in2x2[5] +
568 ((uint128_t) in1[6]) * in2x2[4] +
569 ((uint128_t) in1[7]) * in2x2[3] +
570 ((uint128_t) in1[8]) * in2x2[2];
572 out[2] += ((uint128_t) in1[3]) * in2x2[8] +
573 ((uint128_t) in1[4]) * in2x2[7] +
574 ((uint128_t) in1[5]) * in2x2[6] +
575 ((uint128_t) in1[6]) * in2x2[5] +
576 ((uint128_t) in1[7]) * in2x2[4] +
577 ((uint128_t) in1[8]) * in2x2[3];
579 out[3] += ((uint128_t) in1[4]) * in2x2[8] +
580 ((uint128_t) in1[5]) * in2x2[7] +
581 ((uint128_t) in1[6]) * in2x2[6] +
582 ((uint128_t) in1[7]) * in2x2[5] +
583 ((uint128_t) in1[8]) * in2x2[4];
585 out[4] += ((uint128_t) in1[5]) * in2x2[8] +
586 ((uint128_t) in1[6]) * in2x2[7] +
587 ((uint128_t) in1[7]) * in2x2[6] +
588 ((uint128_t) in1[8]) * in2x2[5];
590 out[5] += ((uint128_t) in1[6]) * in2x2[8] +
591 ((uint128_t) in1[7]) * in2x2[7] +
592 ((uint128_t) in1[8]) * in2x2[6];
594 out[6] += ((uint128_t) in1[7]) * in2x2[8] +
595 ((uint128_t) in1[8]) * in2x2[7];
597 out[7] += ((uint128_t) in1[8]) * in2x2[8];
/* Mask for the low 52 bits of a limb; each 128-bit product limb carries 52
 * high bits that belong 6 bits up in the next output limb. */
static const limb bottom52bits = 0xfffffffffffff;

/*-
 * felem_reduce converts a largefelem to an felem.
 * On exit: out[i] < 2^59 + 2^14
 */
static void felem_reduce(felem out, const largefelem in)
{
    u64 overflow1, overflow2;

    /* Start with the low 58 bits of every 128-bit input limb. */
    out[0] = ((limb) in[0]) & bottom58bits;
    out[1] = ((limb) in[1]) & bottom58bits;
    out[2] = ((limb) in[2]) & bottom58bits;
    out[3] = ((limb) in[3]) & bottom58bits;
    out[4] = ((limb) in[4]) & bottom58bits;
    out[5] = ((limb) in[5]) & bottom58bits;
    out[6] = ((limb) in[6]) & bottom58bits;
    out[7] = ((limb) in[7]) & bottom58bits;
    out[8] = ((limb) in[8]) & bottom58bits;

    /* Fold the remaining bits of each in[i] into out[i+1] and out[i+2]:
     * bits 58..63 of the low word go straight up; the low 52 bits of the
     * high word land 6 bits up; the top 12 bits go two limbs up. */
    out[1] += ((limb) in[0]) >> 58;
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
    /*-
     * out[1] < 2^58 + 2^6 + 2^58
     *        = 2^59 + 2^6
     */
    out[2] += ((limb) (in[0] >> 64)) >> 52;

    out[2] += ((limb) in[1]) >> 58;
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
    out[3] += ((limb) (in[1] >> 64)) >> 52;

    out[3] += ((limb) in[2]) >> 58;
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
    out[4] += ((limb) (in[2] >> 64)) >> 52;

    out[4] += ((limb) in[3]) >> 58;
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
    out[5] += ((limb) (in[3] >> 64)) >> 52;

    out[5] += ((limb) in[4]) >> 58;
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
    out[6] += ((limb) (in[4] >> 64)) >> 52;

    out[6] += ((limb) in[5]) >> 58;
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
    out[7] += ((limb) (in[5] >> 64)) >> 52;

    out[7] += ((limb) in[6]) >> 58;
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
    out[8] += ((limb) (in[6] >> 64)) >> 52;

    out[8] += ((limb) in[7]) >> 58;
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
    /*-
     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
     */

    /* Bits at and above 2^521 wrap around to 2^0 (since 2^521 = 1 mod p),
     * which is why the overflow words are shifted left by one below. */
    overflow1 = ((limb) (in[7] >> 64)) >> 52;

    overflow1 += ((limb) in[8]) >> 58;
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
    overflow2 = ((limb) (in[8] >> 64)) >> 52;

    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
    overflow2 <<= 1;            /* overflow2 < 2^13 */

    out[0] += overflow1;        /* out[0] < 2^60 */
    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */

    out[1] += out[0] >> 58;
    out[0] &= bottom58bits;
    /*-
     * out[0] < 2^58
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
     *        < 2^59 + 2^14
     */
}
684 static void felem_square_reduce(felem out, const felem in)
687 felem_square(tmp, in);
688 felem_reduce(out, tmp);
691 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
694 felem_mul(tmp, in1, in2);
695 felem_reduce(out, tmp);
699 * felem_inv calculates |out| = |in|^{-1}
701 * Based on Fermat's Little Theorem:
703 * a^{p-1} = 1 (mod p)
704 * a^{p-2} = a^{-1} (mod p)
706 static void felem_inv(felem out, const felem in)
708 felem ftmp, ftmp2, ftmp3, ftmp4;
712 felem_square(tmp, in);
713 felem_reduce(ftmp, tmp); /* 2^1 */
714 felem_mul(tmp, in, ftmp);
715 felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
716 felem_assign(ftmp2, ftmp);
717 felem_square(tmp, ftmp);
718 felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
719 felem_mul(tmp, in, ftmp);
720 felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
721 felem_square(tmp, ftmp);
722 felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
724 felem_square(tmp, ftmp2);
725 felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
726 felem_square(tmp, ftmp3);
727 felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
728 felem_mul(tmp, ftmp3, ftmp2);
729 felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
731 felem_assign(ftmp2, ftmp3);
732 felem_square(tmp, ftmp3);
733 felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
734 felem_square(tmp, ftmp3);
735 felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
736 felem_square(tmp, ftmp3);
737 felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
738 felem_square(tmp, ftmp3);
739 felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
740 felem_assign(ftmp4, ftmp3);
741 felem_mul(tmp, ftmp3, ftmp);
742 felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
743 felem_square(tmp, ftmp4);
744 felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
745 felem_mul(tmp, ftmp3, ftmp2);
746 felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
747 felem_assign(ftmp2, ftmp3);
749 for (i = 0; i < 8; i++) {
750 felem_square(tmp, ftmp3);
751 felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
753 felem_mul(tmp, ftmp3, ftmp2);
754 felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
755 felem_assign(ftmp2, ftmp3);
757 for (i = 0; i < 16; i++) {
758 felem_square(tmp, ftmp3);
759 felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
761 felem_mul(tmp, ftmp3, ftmp2);
762 felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
763 felem_assign(ftmp2, ftmp3);
765 for (i = 0; i < 32; i++) {
766 felem_square(tmp, ftmp3);
767 felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
769 felem_mul(tmp, ftmp3, ftmp2);
770 felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
771 felem_assign(ftmp2, ftmp3);
773 for (i = 0; i < 64; i++) {
774 felem_square(tmp, ftmp3);
775 felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
777 felem_mul(tmp, ftmp3, ftmp2);
778 felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
779 felem_assign(ftmp2, ftmp3);
781 for (i = 0; i < 128; i++) {
782 felem_square(tmp, ftmp3);
783 felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
785 felem_mul(tmp, ftmp3, ftmp2);
786 felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
787 felem_assign(ftmp2, ftmp3);
789 for (i = 0; i < 256; i++) {
790 felem_square(tmp, ftmp3);
791 felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
793 felem_mul(tmp, ftmp3, ftmp2);
794 felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
796 for (i = 0; i < 9; i++) {
797 felem_square(tmp, ftmp3);
798 felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
800 felem_mul(tmp, ftmp3, ftmp4);
801 felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */
802 felem_mul(tmp, ftmp3, in);
803 felem_reduce(out, tmp); /* 2^512 - 3 */
806 /* This is 2^521-1, expressed as an felem */
807 static const felem kPrime = {
808 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
809 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
810 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
/*-
 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.
 * On entry: in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
    felem_assign(ftmp, in);

    /* Fully propagate carries so every limb fits its 58-bit (57 for the top)
     * lane; the top limb's excess wraps to limb 0 because 2^521 = 1 mod p. */
    ftmp[0] += ftmp[8] >> 57;
    ftmp[8] &= bottom57bits;

    ftmp[1] += ftmp[0] >> 58;
    ftmp[0] &= bottom58bits;
    ftmp[2] += ftmp[1] >> 58;
    ftmp[1] &= bottom58bits;
    ftmp[3] += ftmp[2] >> 58;
    ftmp[2] &= bottom58bits;
    ftmp[4] += ftmp[3] >> 58;
    ftmp[3] &= bottom58bits;
    ftmp[5] += ftmp[4] >> 58;
    ftmp[4] &= bottom58bits;
    ftmp[6] += ftmp[5] >> 58;
    ftmp[5] &= bottom58bits;
    ftmp[7] += ftmp[6] >> 58;
    ftmp[6] &= bottom58bits;
    ftmp[8] += ftmp[7] >> 58;
    ftmp[7] &= bottom58bits;
    /* ftmp[8] < 2^57 + 4 */

    /*
     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
     * than our bound for ftmp[8]. Therefore we only have to check if the
     * value is zero or 2^521-1.
     */

    /*
     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
     * can be set is if is_zero was 0 before the decrement.
     * (Arithmetic shift of the sign bit smears it into an all-ones/all-zeros
     * mask; s64 here is assumed to be the signed 64-bit counterpart of limb.)
     */
    is_zero = ((s64) is_zero) >> 63;

    /* XOR-fold against kPrime: is_p is 0 iff ftmp == 2^521-1. */
    is_p = ftmp[0] ^ kPrime[0];
    is_p |= ftmp[1] ^ kPrime[1];
    is_p |= ftmp[2] ^ kPrime[2];
    is_p |= ftmp[3] ^ kPrime[3];
    is_p |= ftmp[4] ^ kPrime[4];
    is_p |= ftmp[5] ^ kPrime[5];
    is_p |= ftmp[6] ^ kPrime[6];
    is_p |= ftmp[7] ^ kPrime[7];
    is_p |= ftmp[8] ^ kPrime[8];

    /* Same decrement-and-smear trick as for is_zero. */
    is_p = ((s64) is_p) >> 63;
887 static int felem_is_zero_int(const felem in)
889 return (int)(felem_is_zero(in) & ((limb) 1));
/*-
 * felem_contract converts |in| to its unique, minimal representation.
 * On entry: in[i] < 2^59 + 2^14
 */
static void felem_contract(felem out, const felem in)
    limb is_p, is_greater, sign;
    static const limb two58 = ((limb) 1) << 58;

    felem_assign(out, in);

    /* Fully propagate carries; excess above 2^521 wraps to limb 0. */
    out[0] += out[8] >> 57;
    out[8] &= bottom57bits;

    out[1] += out[0] >> 58;
    out[0] &= bottom58bits;
    out[2] += out[1] >> 58;
    out[1] &= bottom58bits;
    out[3] += out[2] >> 58;
    out[2] &= bottom58bits;
    out[4] += out[3] >> 58;
    out[3] &= bottom58bits;
    out[5] += out[4] >> 58;
    out[4] &= bottom58bits;
    out[6] += out[5] >> 58;
    out[5] &= bottom58bits;
    out[7] += out[6] >> 58;
    out[6] &= bottom58bits;
    out[8] += out[7] >> 58;
    out[7] &= bottom58bits;
    /* out[8] < 2^57 + 4 */

    /*
     * If the value is greater than 2^521-1 then we have to subtract 2^521-1
     * out. See the comments in felem_is_zero regarding why we don't test for
     * other multiples of the prime.
     */

    /*
     * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
     */

    /* XOR-fold against kPrime: is_p is 0 iff out == 2^521-1. */
    is_p = out[0] ^ kPrime[0];
    is_p |= out[1] ^ kPrime[1];
    is_p |= out[2] ^ kPrime[2];
    is_p |= out[3] ^ kPrime[3];
    is_p |= out[4] ^ kPrime[4];
    is_p |= out[5] ^ kPrime[5];
    is_p |= out[6] ^ kPrime[6];
    is_p |= out[7] ^ kPrime[7];
    is_p |= out[8] ^ kPrime[8];

    is_p = ((s64) is_p) >> 63;

    /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */

    /*
     * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
     * 57 is greater than zero as (2^521-1) + x >= 2^522
     */
    /* Smear any set bit of is_greater into the top bit, then arithmetic-
     * shift to get an all-ones/all-zeros mask. */
    is_greater = out[8] >> 57;
    is_greater |= is_greater << 32;
    is_greater |= is_greater << 16;
    is_greater |= is_greater << 8;
    is_greater |= is_greater << 4;
    is_greater |= is_greater << 2;
    is_greater |= is_greater << 1;
    is_greater = ((s64) is_greater) >> 63;

    /* Constant-time conditional subtraction of p. */
    out[0] -= kPrime[0] & is_greater;
    out[1] -= kPrime[1] & is_greater;
    out[2] -= kPrime[2] & is_greater;
    out[3] -= kPrime[3] & is_greater;
    out[4] -= kPrime[4] & is_greater;
    out[5] -= kPrime[5] & is_greater;
    out[6] -= kPrime[6] & is_greater;
    out[7] -= kPrime[7] & is_greater;
    out[8] -= kPrime[8] & is_greater;

    /* Eliminate negative coefficients: borrow 2^58 from the next limb
     * whenever a limb has gone negative (top bit set). */
    sign = -(out[0] >> 63);
    out[0] += (two58 & sign);
    out[1] -= (1 & sign);
    sign = -(out[1] >> 63);
    out[1] += (two58 & sign);
    out[2] -= (1 & sign);
    sign = -(out[2] >> 63);
    out[2] += (two58 & sign);
    out[3] -= (1 & sign);
    sign = -(out[3] >> 63);
    out[3] += (two58 & sign);
    out[4] -= (1 & sign);
    sign = -(out[4] >> 63);
    out[4] += (two58 & sign);
    out[5] -= (1 & sign);
    /* NOTE(review): indexing out[0] here looks like a copy-paste slip for
     * out[5]; the repeated out[5..8] pass below appears to compensate for
     * it — confirm against upstream before changing. */
    sign = -(out[0] >> 63);
    out[5] += (two58 & sign);
    out[6] -= (1 & sign);
    sign = -(out[6] >> 63);
    out[6] += (two58 & sign);
    out[7] -= (1 & sign);
    sign = -(out[7] >> 63);
    out[7] += (two58 & sign);
    out[8] -= (1 & sign);
    sign = -(out[5] >> 63);
    out[5] += (two58 & sign);
    out[6] -= (1 & sign);
    sign = -(out[6] >> 63);
    out[6] += (two58 & sign);
    out[7] -= (1 & sign);
    sign = -(out[7] >> 63);
    out[7] += (two58 & sign);
    out[8] -= (1 & sign);
1030 * Building on top of the field operations we have the operations on the
1031 * elliptic curve group itself. Points on the curve are represented in Jacobian
1035 * point_double calculates 2*(x_in, y_in, z_in)
1037 * The method is taken from:
1038 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1040 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1041 * while x_out == y_in is not (maybe this works, but it's not tested). */
1043 point_double(felem x_out, felem y_out, felem z_out,
1044 const felem x_in, const felem y_in, const felem z_in)
1046 largefelem tmp, tmp2;
1047 felem delta, gamma, beta, alpha, ftmp, ftmp2;
1049 felem_assign(ftmp, x_in);
1050 felem_assign(ftmp2, x_in);
1053 felem_square(tmp, z_in);
1054 felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1057 felem_square(tmp, y_in);
1058 felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1060 /* beta = x*gamma */
1061 felem_mul(tmp, x_in, gamma);
1062 felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
1064 /* alpha = 3*(x-delta)*(x+delta) */
1065 felem_diff64(ftmp, delta);
1066 /* ftmp[i] < 2^61 */
1067 felem_sum64(ftmp2, delta);
1068 /* ftmp2[i] < 2^60 + 2^15 */
1069 felem_scalar64(ftmp2, 3);
1070 /* ftmp2[i] < 3*2^60 + 3*2^15 */
1071 felem_mul(tmp, ftmp, ftmp2);
1073 * tmp[i] < 17(3*2^121 + 3*2^76)
1074 * = 61*2^121 + 61*2^76
1075 * < 64*2^121 + 64*2^76
1079 felem_reduce(alpha, tmp);
1081 /* x' = alpha^2 - 8*beta */
1082 felem_square(tmp, alpha);
1084 * tmp[i] < 17*2^120 < 2^125
1086 felem_assign(ftmp, beta);
1087 felem_scalar64(ftmp, 8);
1088 /* ftmp[i] < 2^62 + 2^17 */
1089 felem_diff_128_64(tmp, ftmp);
1090 /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1091 felem_reduce(x_out, tmp);
1093 /* z' = (y + z)^2 - gamma - delta */
1094 felem_sum64(delta, gamma);
1095 /* delta[i] < 2^60 + 2^15 */
1096 felem_assign(ftmp, y_in);
1097 felem_sum64(ftmp, z_in);
1098 /* ftmp[i] < 2^60 + 2^15 */
1099 felem_square(tmp, ftmp);
1101 * tmp[i] < 17(2^122) < 2^127
1103 felem_diff_128_64(tmp, delta);
1104 /* tmp[i] < 2^127 + 2^63 */
1105 felem_reduce(z_out, tmp);
1107 /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1108 felem_scalar64(beta, 4);
1109 /* beta[i] < 2^61 + 2^16 */
1110 felem_diff64(beta, x_out);
1111 /* beta[i] < 2^61 + 2^60 + 2^16 */
1112 felem_mul(tmp, alpha, beta);
1114 * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1115 * = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1116 * = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1119 felem_square(tmp2, gamma);
1121 * tmp2[i] < 17*(2^59 + 2^14)^2
1122 * = 17*(2^118 + 2^74 + 2^28)
1124 felem_scalar128(tmp2, 8);
1126 * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1127 * = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1130 felem_diff128(tmp, tmp2);
1132 * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1133 * = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1134 * 2^74 + 2^69 + 2^34 + 2^30
1137 felem_reduce(y_out, tmp);
1140 /* copy_conditional copies in to out iff mask is all ones. */
1141 static void copy_conditional(felem out, const felem in, limb mask)
1144 for (i = 0; i < NLIMBS; ++i) {
1145 const limb tmp = mask & (in[i] ^ out[i]);
1151 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1153 * The method is taken from
1154 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1155 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1157 * This function includes a branch for checking whether the two input points
1158 * are equal (while not equal to the point at infinity). This case never
1159 * happens during single point multiplication, so there is no timing leak for
1160 * ECDH or ECDSA signing. */
1161 static void point_add(felem x3, felem y3, felem z3,
1162 const felem x1, const felem y1, const felem z1,
1163 const int mixed, const felem x2, const felem y2,
1166 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1167 largefelem tmp, tmp2;
1168 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1170 z1_is_zero = felem_is_zero(z1);
1171 z2_is_zero = felem_is_zero(z2);
1173 /* ftmp = z1z1 = z1**2 */
1174 felem_square(tmp, z1);
1175 felem_reduce(ftmp, tmp);
1178 /* ftmp2 = z2z2 = z2**2 */
1179 felem_square(tmp, z2);
1180 felem_reduce(ftmp2, tmp);
1182 /* u1 = ftmp3 = x1*z2z2 */
1183 felem_mul(tmp, x1, ftmp2);
1184 felem_reduce(ftmp3, tmp);
1186 /* ftmp5 = z1 + z2 */
1187 felem_assign(ftmp5, z1);
1188 felem_sum64(ftmp5, z2);
1189 /* ftmp5[i] < 2^61 */
1191 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1192 felem_square(tmp, ftmp5);
1193 /* tmp[i] < 17*2^122 */
1194 felem_diff_128_64(tmp, ftmp);
1195 /* tmp[i] < 17*2^122 + 2^63 */
1196 felem_diff_128_64(tmp, ftmp2);
1197 /* tmp[i] < 17*2^122 + 2^64 */
1198 felem_reduce(ftmp5, tmp);
1200 /* ftmp2 = z2 * z2z2 */
1201 felem_mul(tmp, ftmp2, z2);
1202 felem_reduce(ftmp2, tmp);
1204 /* s1 = ftmp6 = y1 * z2**3 */
1205 felem_mul(tmp, y1, ftmp2);
1206 felem_reduce(ftmp6, tmp);
1209 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1212 /* u1 = ftmp3 = x1*z2z2 */
1213 felem_assign(ftmp3, x1);
1215 /* ftmp5 = 2*z1z2 */
1216 felem_scalar(ftmp5, z1, 2);
1218 /* s1 = ftmp6 = y1 * z2**3 */
1219 felem_assign(ftmp6, y1);
1223 felem_mul(tmp, x2, ftmp);
1224 /* tmp[i] < 17*2^120 */
1226 /* h = ftmp4 = u2 - u1 */
1227 felem_diff_128_64(tmp, ftmp3);
1228 /* tmp[i] < 17*2^120 + 2^63 */
1229 felem_reduce(ftmp4, tmp);
1231 x_equal = felem_is_zero(ftmp4);
1233 /* z_out = ftmp5 * h */
1234 felem_mul(tmp, ftmp5, ftmp4);
1235 felem_reduce(z_out, tmp);
1237 /* ftmp = z1 * z1z1 */
1238 felem_mul(tmp, ftmp, z1);
1239 felem_reduce(ftmp, tmp);
1241 /* s2 = tmp = y2 * z1**3 */
1242 felem_mul(tmp, y2, ftmp);
1243 /* tmp[i] < 17*2^120 */
1245 /* r = ftmp5 = (s2 - s1)*2 */
1246 felem_diff_128_64(tmp, ftmp6);
1247 /* tmp[i] < 17*2^120 + 2^63 */
1248 felem_reduce(ftmp5, tmp);
1249 y_equal = felem_is_zero(ftmp5);
1250 felem_scalar64(ftmp5, 2);
1251 /* ftmp5[i] < 2^61 */
1253 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1254 point_double(x3, y3, z3, x1, y1, z1);
1258 /* I = ftmp = (2h)**2 */
1259 felem_assign(ftmp, ftmp4);
1260 felem_scalar64(ftmp, 2);
1261 /* ftmp[i] < 2^61 */
1262 felem_square(tmp, ftmp);
1263 /* tmp[i] < 17*2^122 */
1264 felem_reduce(ftmp, tmp);
1266 /* J = ftmp2 = h * I */
1267 felem_mul(tmp, ftmp4, ftmp);
1268 felem_reduce(ftmp2, tmp);
1270 /* V = ftmp4 = U1 * I */
1271 felem_mul(tmp, ftmp3, ftmp);
1272 felem_reduce(ftmp4, tmp);
1274 /* x_out = r**2 - J - 2V */
1275 felem_square(tmp, ftmp5);
1276 /* tmp[i] < 17*2^122 */
1277 felem_diff_128_64(tmp, ftmp2);
1278 /* tmp[i] < 17*2^122 + 2^63 */
1279 felem_assign(ftmp3, ftmp4);
1280 felem_scalar64(ftmp4, 2);
1281 /* ftmp4[i] < 2^61 */
1282 felem_diff_128_64(tmp, ftmp4);
1283 /* tmp[i] < 17*2^122 + 2^64 */
1284 felem_reduce(x_out, tmp);
1286 /* y_out = r(V-x_out) - 2 * s1 * J */
1287 felem_diff64(ftmp3, x_out);
1289 * ftmp3[i] < 2^60 + 2^60 = 2^61
1291 felem_mul(tmp, ftmp5, ftmp3);
1292 /* tmp[i] < 17*2^122 */
1293 felem_mul(tmp2, ftmp6, ftmp2);
1294 /* tmp2[i] < 17*2^120 */
1295 felem_scalar128(tmp2, 2);
1296 /* tmp2[i] < 17*2^121 */
1297 felem_diff128(tmp, tmp2);
1299 * tmp[i] < 2^127 - 2^69 + 17*2^122
1300 * = 2^126 - 2^122 - 2^6 - 2^2 - 1
1303 felem_reduce(y_out, tmp);
1305 copy_conditional(x_out, x2, z1_is_zero);
1306 copy_conditional(x_out, x1, z2_is_zero);
1307 copy_conditional(y_out, y2, z1_is_zero);
1308 copy_conditional(y_out, y1, z2_is_zero);
1309 copy_conditional(z_out, z2, z1_is_zero);
1310 copy_conditional(z_out, z1, z2_is_zero);
1311 felem_assign(x3, x_out);
1312 felem_assign(y3, y_out);
1313 felem_assign(z3, z_out);
/*-
 * Base point pre computation
 * --------------------------
 *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contain various points on the curve, where each point is three field
 * elements (x, y, z).
 *
 * For the base point table, z is usually 1 (0 for the point at infinity).
 * This table has 16 elements:
 * index | bits    | point
 * ------+---------+------------------------------
 *     0 | 0 0 0 0 | the point at infinity
 *     1 | 0 0 0 1 | G
 *     2 | 0 0 1 0 | 2^130G
 *     3 | 0 0 1 1 | (2^130 + 1)G
 *     4 | 0 1 0 0 | 2^260G
 *     5 | 0 1 0 1 | (2^260 + 1)G
 *     6 | 0 1 1 0 | (2^260 + 2^130)G
 *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
 *     8 | 1 0 0 0 | 2^390G
 *     9 | 1 0 0 1 | (2^390 + 1)G
 *    10 | 1 0 1 0 | (2^390 + 2^130)G
 *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
 *    12 | 1 1 0 0 | (2^390 + 2^260)G
 *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
 *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
 *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
 *
 * The reason for this is so that we can clock bits into four different
 * locations when doing simple scalar multiplies against the base point.
 *
 * Tables for other points have table[i] = iG for i in 0 .. 16. */
/* gmul is the table of precomputed base points */
1351 static const felem gmul[16][3] = {
1352 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1353 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1354 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1355 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1356 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1357 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1358 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1359 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1360 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1361 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1362 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1363 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1364 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1365 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1366 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1367 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1368 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1369 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1370 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1371 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1372 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1373 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1374 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1375 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1376 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1377 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1378 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1379 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1380 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1381 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1382 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1383 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1384 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1385 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1386 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1387 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1388 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1389 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1390 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1391 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1392 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1393 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1394 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1395 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1396 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1397 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1398 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1399 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1400 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1401 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1402 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1403 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1404 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1405 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1406 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1407 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1408 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1409 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1410 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1411 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1412 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1413 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1414 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1415 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1416 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1417 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1418 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1419 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1420 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1421 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1422 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1423 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1424 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1425 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1426 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1427 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1428 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1429 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1430 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1431 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1432 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1433 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1434 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1435 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1436 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1437 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1438 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1439 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1440 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1441 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1442 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1443 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1444 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1445 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1446 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1447 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1448 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1449 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1450 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1451 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1452 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1453 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1454 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1455 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1456 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1457 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1458 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1459 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
/*-
 * select_point selects the |idx|th point from a precomputation table and
 * copies it to out.
 */
/* pre_comp below is of the size provided in |size| */
1467 static void select_point(const limb idx, unsigned int size,
1468 const felem pre_comp[][3], felem out[3])
1471 limb *outlimbs = &out[0][0];
1473 memset(out, 0, sizeof(*out) * 3);
1475 for (i = 0; i < size; i++) {
1476 const limb *inlimbs = &pre_comp[i][0][0];
1477 limb mask = i ^ idx;
1483 for (j = 0; j < NLIMBS * 3; j++)
1484 outlimbs[j] |= inlimbs[j] & mask;
/* get_bit returns the |i|th bit in |in| */
1489 static char get_bit(const felem_bytearray in, int i)
1493 return (in[i >> 3] >> (i & 7)) & 1;
/*
 * Interleaved point multiplication using precomputed point multiples: The
 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
 * generator, using certain (large) precomputed multiples in g_pre_comp.
 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
 */
1503 static void batch_mul(felem x_out, felem y_out, felem z_out,
1504 const felem_bytearray scalars[],
1505 const unsigned num_points, const u8 *g_scalar,
1506 const int mixed, const felem pre_comp[][17][3],
1507 const felem g_pre_comp[16][3])
1510 unsigned num, gen_mul = (g_scalar != NULL);
1511 felem nq[3], tmp[4];
1515 /* set nq to the point at infinity */
1516 memset(nq, 0, sizeof(nq));
1519 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1520 * of the generator (last quarter of rounds) and additions of other
1521 * points multiples (every 5th round).
1523 skip = 1; /* save two point operations in the first
1525 for (i = (num_points ? 520 : 130); i >= 0; --i) {
1528 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1530 /* add multiples of the generator */
1531 if (gen_mul && (i <= 130)) {
1532 bits = get_bit(g_scalar, i + 390) << 3;
1534 bits |= get_bit(g_scalar, i + 260) << 2;
1535 bits |= get_bit(g_scalar, i + 130) << 1;
1536 bits |= get_bit(g_scalar, i);
1538 /* select the point to add, in constant time */
1539 select_point(bits, 16, g_pre_comp, tmp);
1541 /* The 1 argument below is for "mixed" */
1542 point_add(nq[0], nq[1], nq[2],
1543 nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1545 memcpy(nq, tmp, 3 * sizeof(felem));
1550 /* do other additions every 5 doublings */
1551 if (num_points && (i % 5 == 0)) {
1552 /* loop over all scalars */
1553 for (num = 0; num < num_points; ++num) {
1554 bits = get_bit(scalars[num], i + 4) << 5;
1555 bits |= get_bit(scalars[num], i + 3) << 4;
1556 bits |= get_bit(scalars[num], i + 2) << 3;
1557 bits |= get_bit(scalars[num], i + 1) << 2;
1558 bits |= get_bit(scalars[num], i) << 1;
1559 bits |= get_bit(scalars[num], i - 1);
1560 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1563 * select the point to add or subtract, in constant time
1565 select_point(digit, 17, pre_comp[num], tmp);
1566 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1568 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1571 point_add(nq[0], nq[1], nq[2],
1572 nq[0], nq[1], nq[2],
1573 mixed, tmp[0], tmp[1], tmp[2]);
1575 memcpy(nq, tmp, 3 * sizeof(felem));
1581 felem_assign(x_out, nq[0]);
1582 felem_assign(y_out, nq[1]);
1583 felem_assign(z_out, nq[2]);
/* Precomputation for the group generator. */
1587 struct nistp521_pre_comp_st {
1588 felem g_pre_comp[16][3];
1592 const EC_METHOD *EC_GFp_nistp521_method(void)
1594 static const EC_METHOD ret = {
1595 EC_FLAGS_DEFAULT_OCT,
1596 NID_X9_62_prime_field,
1597 ec_GFp_nistp521_group_init,
1598 ec_GFp_simple_group_finish,
1599 ec_GFp_simple_group_clear_finish,
1600 ec_GFp_nist_group_copy,
1601 ec_GFp_nistp521_group_set_curve,
1602 ec_GFp_simple_group_get_curve,
1603 ec_GFp_simple_group_get_degree,
1604 ec_GFp_simple_group_check_discriminant,
1605 ec_GFp_simple_point_init,
1606 ec_GFp_simple_point_finish,
1607 ec_GFp_simple_point_clear_finish,
1608 ec_GFp_simple_point_copy,
1609 ec_GFp_simple_point_set_to_infinity,
1610 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1611 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1612 ec_GFp_simple_point_set_affine_coordinates,
1613 ec_GFp_nistp521_point_get_affine_coordinates,
1614 0 /* point_set_compressed_coordinates */ ,
1619 ec_GFp_simple_invert,
1620 ec_GFp_simple_is_at_infinity,
1621 ec_GFp_simple_is_on_curve,
1623 ec_GFp_simple_make_affine,
1624 ec_GFp_simple_points_make_affine,
1625 ec_GFp_nistp521_points_mul,
1626 ec_GFp_nistp521_precompute_mult,
1627 ec_GFp_nistp521_have_precompute_mult,
1628 ec_GFp_nist_field_mul,
1629 ec_GFp_nist_field_sqr,
1631 0 /* field_encode */ ,
1632 0 /* field_decode */ ,
1633 0 /* field_set_to_one */
/******************************************************************************/
/*
 * FUNCTIONS TO MANAGE PRECOMPUTATION
 */
1644 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1646 NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1649 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1652 ret->references = 1;
1656 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1659 CRYPTO_add(&p->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1663 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1666 || CRYPTO_add(&p->references, -1, CRYPTO_LOCK_EC_PRE_COMP) > 0)
/******************************************************************************/
/*
 * OPENSSL EC_METHOD FUNCTIONS
 */
1676 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1679 ret = ec_GFp_simple_group_init(group);
1680 group->a_is_minus3 = 1;
1684 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1685 const BIGNUM *a, const BIGNUM *b,
1689 BN_CTX *new_ctx = NULL;
1690 BIGNUM *curve_p, *curve_a, *curve_b;
1693 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1696 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1697 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1698 ((curve_b = BN_CTX_get(ctx)) == NULL))
1700 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1701 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1702 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1703 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1704 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1705 EC_R_WRONG_CURVE_PARAMETERS);
1708 group->field_mod_func = BN_nist_mod_521;
1709 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1712 BN_CTX_free(new_ctx);
/*-
 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
 * (X/Z^2, Y/Z^3)
 */
1720 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1721 const EC_POINT *point,
1722 BIGNUM *x, BIGNUM *y,
1725 felem z1, z2, x_in, y_in, x_out, y_out;
1728 if (EC_POINT_is_at_infinity(group, point)) {
1729 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1730 EC_R_POINT_AT_INFINITY);
1733 if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1734 (!BN_to_felem(z1, point->Z)))
1737 felem_square(tmp, z2);
1738 felem_reduce(z1, tmp);
1739 felem_mul(tmp, x_in, z1);
1740 felem_reduce(x_in, tmp);
1741 felem_contract(x_out, x_in);
1743 if (!felem_to_BN(x, x_out)) {
1744 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1749 felem_mul(tmp, z1, z2);
1750 felem_reduce(z1, tmp);
1751 felem_mul(tmp, y_in, z1);
1752 felem_reduce(y_in, tmp);
1753 felem_contract(y_out, y_in);
1755 if (!felem_to_BN(y, y_out)) {
1756 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
/* points below is of size |num|, and tmp_felems is of size |num+1| */
1765 static void make_points_affine(size_t num, felem points[][3],
1769 * Runs in constant time, unless an input is the point at infinity (which
1770 * normally shouldn't happen).
1772 ec_GFp_nistp_points_make_affine_internal(num,
1776 (void (*)(void *))felem_one,
1777 (int (*)(const void *))
1779 (void (*)(void *, const void *))
1781 (void (*)(void *, const void *))
1782 felem_square_reduce, (void (*)
1789 (void (*)(void *, const void *))
1791 (void (*)(void *, const void *))
/*
 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
 * values Result is stored in r (r can equal one of the inputs).
 */
1799 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1800 const BIGNUM *scalar, size_t num,
1801 const EC_POINT *points[],
1802 const BIGNUM *scalars[], BN_CTX *ctx)
1807 BN_CTX *new_ctx = NULL;
1808 BIGNUM *x, *y, *z, *tmp_scalar;
1809 felem_bytearray g_secret;
1810 felem_bytearray *secrets = NULL;
1811 felem (*pre_comp)[17][3] = NULL;
1812 felem *tmp_felems = NULL;
1813 felem_bytearray tmp;
1814 unsigned i, num_bytes;
1815 int have_pre_comp = 0;
1816 size_t num_points = num;
1817 felem x_in, y_in, z_in, x_out, y_out, z_out;
1818 NISTP521_PRE_COMP *pre = NULL;
1819 felem(*g_pre_comp)[3] = NULL;
1820 EC_POINT *generator = NULL;
1821 const EC_POINT *p = NULL;
1822 const BIGNUM *p_scalar = NULL;
1825 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1828 if (((x = BN_CTX_get(ctx)) == NULL) ||
1829 ((y = BN_CTX_get(ctx)) == NULL) ||
1830 ((z = BN_CTX_get(ctx)) == NULL) ||
1831 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1834 if (scalar != NULL) {
1835 pre = group->pre_comp.nistp521;
1837 /* we have precomputation, try to use it */
1838 g_pre_comp = &pre->g_pre_comp[0];
1840 /* try to use the standard precomputation */
1841 g_pre_comp = (felem(*)[3]) gmul;
1842 generator = EC_POINT_new(group);
1843 if (generator == NULL)
1845 /* get the generator from precomputation */
1846 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1847 !felem_to_BN(y, g_pre_comp[1][1]) ||
1848 !felem_to_BN(z, g_pre_comp[1][2])) {
1849 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1852 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1856 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1857 /* precomputation matches generator */
1861 * we don't have valid precomputation: treat the generator as a
1867 if (num_points > 0) {
1868 if (num_points >= 2) {
1870 * unless we precompute multiples for just one point, converting
1871 * those into affine form is time well spent
1875 secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1876 pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1879 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1880 if ((secrets == NULL) || (pre_comp == NULL)
1881 || (mixed && (tmp_felems == NULL))) {
1882 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1887 * we treat NULL scalars as 0, and NULL points as points at infinity,
1888 * i.e., they contribute nothing to the linear combination
1890 for (i = 0; i < num_points; ++i) {
1893 * we didn't have a valid precomputation, so we pick the
1897 p = EC_GROUP_get0_generator(group);
1900 /* the i^th point */
1903 p_scalar = scalars[i];
1905 if ((p_scalar != NULL) && (p != NULL)) {
1906 /* reduce scalar to 0 <= scalar < 2^521 */
1907 if ((BN_num_bits(p_scalar) > 521)
1908 || (BN_is_negative(p_scalar))) {
1910 * this is an unusual input, and we don't guarantee
1913 if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1914 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1917 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1919 num_bytes = BN_bn2bin(p_scalar, tmp);
1920 flip_endian(secrets[i], tmp, num_bytes);
1921 /* precompute multiples */
1922 if ((!BN_to_felem(x_out, p->X)) ||
1923 (!BN_to_felem(y_out, p->Y)) ||
1924 (!BN_to_felem(z_out, p->Z)))
1926 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1927 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1928 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1929 for (j = 2; j <= 16; ++j) {
1931 point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1932 pre_comp[i][j][2], pre_comp[i][1][0],
1933 pre_comp[i][1][1], pre_comp[i][1][2], 0,
1934 pre_comp[i][j - 1][0],
1935 pre_comp[i][j - 1][1],
1936 pre_comp[i][j - 1][2]);
1938 point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1939 pre_comp[i][j][2], pre_comp[i][j / 2][0],
1940 pre_comp[i][j / 2][1],
1941 pre_comp[i][j / 2][2]);
1947 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1950 /* the scalar for the generator */
1951 if ((scalar != NULL) && (have_pre_comp)) {
1952 memset(g_secret, 0, sizeof(g_secret));
1953 /* reduce scalar to 0 <= scalar < 2^521 */
1954 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1956 * this is an unusual input, and we don't guarantee
1959 if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1960 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1963 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1965 num_bytes = BN_bn2bin(scalar, tmp);
1966 flip_endian(g_secret, tmp, num_bytes);
1967 /* do the multiplication with generator precomputation */
1968 batch_mul(x_out, y_out, z_out,
1969 (const felem_bytearray(*))secrets, num_points,
1971 mixed, (const felem(*)[17][3])pre_comp,
1972 (const felem(*)[3])g_pre_comp);
1974 /* do the multiplication without generator precomputation */
1975 batch_mul(x_out, y_out, z_out,
1976 (const felem_bytearray(*))secrets, num_points,
1977 NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1978 /* reduce the output to its unique minimal representation */
1979 felem_contract(x_in, x_out);
1980 felem_contract(y_in, y_out);
1981 felem_contract(z_in, z_out);
1982 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1983 (!felem_to_BN(z, z_in))) {
1984 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1987 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1991 EC_POINT_free(generator);
1992 BN_CTX_free(new_ctx);
1993 OPENSSL_free(secrets);
1994 OPENSSL_free(pre_comp);
1995 OPENSSL_free(tmp_felems);
1999 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2002 NISTP521_PRE_COMP *pre = NULL;
2004 BN_CTX *new_ctx = NULL;
2006 EC_POINT *generator = NULL;
2007 felem tmp_felems[16];
2009 /* throw away old precomputation */
2010 EC_pre_comp_free(group);
2012 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2015 if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2017 /* get the generator */
2018 if (group->generator == NULL)
2020 generator = EC_POINT_new(group);
2021 if (generator == NULL)
2023 BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2024 BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2025 if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2027 if ((pre = nistp521_pre_comp_new()) == NULL)
2030 * if the generator is the standard one, use built-in precomputation
2032 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2033 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2036 if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2037 (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2038 (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2040 /* compute 2^130*G, 2^260*G, 2^390*G */
2041 for (i = 1; i <= 4; i <<= 1) {
2042 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2043 pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2044 pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2045 for (j = 0; j < 129; ++j) {
2046 point_double(pre->g_pre_comp[2 * i][0],
2047 pre->g_pre_comp[2 * i][1],
2048 pre->g_pre_comp[2 * i][2],
2049 pre->g_pre_comp[2 * i][0],
2050 pre->g_pre_comp[2 * i][1],
2051 pre->g_pre_comp[2 * i][2]);
2054 /* g_pre_comp[0] is the point at infinity */
2055 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2056 /* the remaining multiples */
2057 /* 2^130*G + 2^260*G */
2058 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2059 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2060 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2061 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2062 pre->g_pre_comp[2][2]);
2063 /* 2^130*G + 2^390*G */
2064 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2065 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2066 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2067 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2068 pre->g_pre_comp[2][2]);
2069 /* 2^260*G + 2^390*G */
2070 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2071 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2072 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2073 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2074 pre->g_pre_comp[4][2]);
2075 /* 2^130*G + 2^260*G + 2^390*G */
2076 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2077 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2078 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2079 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2080 pre->g_pre_comp[2][2]);
2081 for (i = 1; i < 8; ++i) {
2082 /* odd multiples: add G */
2083 point_add(pre->g_pre_comp[2 * i + 1][0],
2084 pre->g_pre_comp[2 * i + 1][1],
2085 pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2086 pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2087 pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2088 pre->g_pre_comp[1][2]);
2090 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2093 SETPRECOMP(group, nistp521, pre);
2098 EC_POINT_free(generator);
2099 BN_CTX_free(new_ctx);
2100 EC_nistp521_pre_comp_free(pre);
2104 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2106 return HAVEPRECOMP(group, nistp521);
/*
 * Placeholder when the implementation is compiled out; self-referencing so
 * the object is "used" (presumably also keeps the translation unit
 * non-empty — TODO confirm against the enclosing #ifdef).
 */
static void *dummy = &dummy;