1 /* crypto/ec/ecp_nistp521.c */
3 * Written by Adam Langley (Google) for the OpenSSL project
5 /* Copyright 2011 Google Inc.
7 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
22 * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
24 * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25 * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26 * work which got its smarts from Daniel J. Bernstein's work on the same.
#include <openssl/opensslconf.h>
#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128

# include <string.h>

# ifndef OPENSSL_SYS_VMS
# include <inttypes.h>

# include <openssl/err.h>
42 # if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43 /* even with gcc, the typedef won't work for 32-bit platforms */
44 typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit
47 # error "Need GCC 3.1 or later to define type uint128_t"
55 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56 * element of this field into 66 bytes where the most significant byte
57 * contains only a single bit. We call this an felem_bytearray.
60 typedef u8 felem_bytearray[66];
63 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64 * These values are big-endian.
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},                 /* p = 2^521 - 1 */
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},                 /* a = p - 3 */
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
115 * The representation of field elements.
116 * ------------------------------------
118 * We represent field elements with nine values. These values are either 64 or
119 * 128 bits and the field element represented is:
120 * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p)
121 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122 * 58 bits apart, but are greater than 58 bits in length, the most significant
123 * bits of each limb overlap with the least significant bits of the next.
125 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
130 typedef uint64_t limb;
131 typedef limb felem[NLIMBS];
132 typedef uint128_t largefelem[NLIMBS];
134 static const limb bottom57bits = 0x1ffffffffffffff;
135 static const limb bottom58bits = 0x3ffffffffffffff;
138 * bin66_to_felem takes a little-endian byte array and converts it into felem
139 * form. This assumes that the CPU is little-endian.
141 static void bin66_to_felem(felem out, const u8 in[66])
143 out[0] = (*((limb *) & in[0])) & bottom58bits;
144 out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145 out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146 out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147 out[4] = (*((limb *) & in[29])) & bottom58bits;
148 out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149 out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150 out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151 out[8] = (*((limb *) & in[58])) & bottom57bits;
155 * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156 * array. This assumes that the CPU is little-endian.
158 static void felem_to_bin66(u8 out[66], const felem in)
161 (*((limb *) & out[0])) = in[0];
162 (*((limb *) & out[7])) |= in[1] << 2;
163 (*((limb *) & out[14])) |= in[2] << 4;
164 (*((limb *) & out[21])) |= in[3] << 6;
165 (*((limb *) & out[29])) = in[4];
166 (*((limb *) & out[36])) |= in[5] << 2;
167 (*((limb *) & out[43])) |= in[6] << 4;
168 (*((limb *) & out[50])) |= in[7] << 6;
169 (*((limb *) & out[58])) = in[8];
172 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
173 static void flip_endian(u8 *out, const u8 *in, unsigned len)
176 for (i = 0; i < len; ++i)
177 out[i] = in[len - 1 - i];
180 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
181 static int BN_to_felem(felem out, const BIGNUM *bn)
183 felem_bytearray b_in;
184 felem_bytearray b_out;
187 /* BN_bn2bin eats leading zeroes */
188 memset(b_out, 0, sizeof b_out);
189 num_bytes = BN_num_bytes(bn);
190 if (num_bytes > sizeof b_out) {
191 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
194 if (BN_is_negative(bn)) {
195 ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
198 num_bytes = BN_bn2bin(bn, b_in);
199 flip_endian(b_out, b_in, num_bytes);
200 bin66_to_felem(out, b_out);
204 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
205 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
207 felem_bytearray b_in, b_out;
208 felem_to_bin66(b_in, in);
209 flip_endian(b_out, b_in, sizeof b_out);
210 return BN_bin2bn(b_out, sizeof b_out, out);
218 static void felem_one(felem out)
231 static void felem_assign(felem out, const felem in)
244 /* felem_sum64 sets out = out + in. */
245 static void felem_sum64(felem out, const felem in)
258 /* felem_scalar sets out = in * scalar */
259 static void felem_scalar(felem out, const felem in, limb scalar)
261 out[0] = in[0] * scalar;
262 out[1] = in[1] * scalar;
263 out[2] = in[2] * scalar;
264 out[3] = in[3] * scalar;
265 out[4] = in[4] * scalar;
266 out[5] = in[5] * scalar;
267 out[6] = in[6] * scalar;
268 out[7] = in[7] * scalar;
269 out[8] = in[8] * scalar;
272 /* felem_scalar64 sets out = out * scalar */
273 static void felem_scalar64(felem out, limb scalar)
286 /* felem_scalar128 sets out = out * scalar */
287 static void felem_scalar128(largefelem out, limb scalar)
301 * felem_neg sets |out| to |-in|
303 * in[i] < 2^59 + 2^14
307 static void felem_neg(felem out, const felem in)
309 /* In order to prevent underflow, we subtract from 0 mod p. */
310 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
311 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
313 out[0] = two62m3 - in[0];
314 out[1] = two62m2 - in[1];
315 out[2] = two62m2 - in[2];
316 out[3] = two62m2 - in[3];
317 out[4] = two62m2 - in[4];
318 out[5] = two62m2 - in[5];
319 out[6] = two62m2 - in[6];
320 out[7] = two62m2 - in[7];
321 out[8] = two62m2 - in[8];
325 * felem_diff64 subtracts |in| from |out|
327 * in[i] < 2^59 + 2^14
329 * out[i] < out[i] + 2^62
331 static void felem_diff64(felem out, const felem in)
334 * In order to prevent underflow, we add 0 mod p before subtracting.
336 static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
337 static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
339 out[0] += two62m3 - in[0];
340 out[1] += two62m2 - in[1];
341 out[2] += two62m2 - in[2];
342 out[3] += two62m2 - in[3];
343 out[4] += two62m2 - in[4];
344 out[5] += two62m2 - in[5];
345 out[6] += two62m2 - in[6];
346 out[7] += two62m2 - in[7];
347 out[8] += two62m2 - in[8];
351 * felem_diff_128_64 subtracts |in| from |out|
353 * in[i] < 2^62 + 2^17
355 * out[i] < out[i] + 2^63
357 static void felem_diff_128_64(largefelem out, const felem in)
360 * In order to prevent underflow, we add 0 mod p before subtracting.
362 static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
363 static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
365 out[0] += two63m6 - in[0];
366 out[1] += two63m5 - in[1];
367 out[2] += two63m5 - in[2];
368 out[3] += two63m5 - in[3];
369 out[4] += two63m5 - in[4];
370 out[5] += two63m5 - in[5];
371 out[6] += two63m5 - in[6];
372 out[7] += two63m5 - in[7];
373 out[8] += two63m5 - in[8];
377 * felem_diff_128_64 subtracts |in| from |out|
381 * out[i] < out[i] + 2^127 - 2^69
383 static void felem_diff128(largefelem out, const largefelem in)
386 * In order to prevent underflow, we add 0 mod p before subtracting.
388 static const uint128_t two127m70 =
389 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
390 static const uint128_t two127m69 =
391 (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
393 out[0] += (two127m70 - in[0]);
394 out[1] += (two127m69 - in[1]);
395 out[2] += (two127m69 - in[2]);
396 out[3] += (two127m69 - in[3]);
397 out[4] += (two127m69 - in[4]);
398 out[5] += (two127m69 - in[5]);
399 out[6] += (two127m69 - in[6]);
400 out[7] += (two127m69 - in[7]);
401 out[8] += (two127m69 - in[8]);
/*-
 * felem_square sets |out| = |in|^2
 * On entry:
 *   in[i] < 2^59 + 2^14
 * On exit:
 *   out[i] < 17 * max(in[i]) * max(in[i])
 */
static void felem_square(largefelem out, const felem in)
{
    felem inx2, inx4;
    felem_scalar(inx2, in, 2);
    felem_scalar(inx4, in, 4);

    /*-
     * We have many cases were we want to do
     *   in[x] * in[y] +
     *   in[y] * in[x]
     * This is obviously just
     *   2 * in[x] * in[y]
     * However, rather than do the doubling on the 128 bit result, we
     * double one of the inputs to the multiplication by reading from
     * |inx2|.
     */

    /* Diagonal terms use |in| directly; cross terms use |inx2|. */
    out[0] = ((uint128_t) in[0]) * in[0];
    out[1] = ((uint128_t) in[0]) * inx2[1];
    out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
    out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
    out[4] = ((uint128_t) in[0]) * inx2[4] +
        ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
    out[5] = ((uint128_t) in[0]) * inx2[5] +
        ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
    out[6] = ((uint128_t) in[0]) * inx2[6] +
        ((uint128_t) in[1]) * inx2[5] +
        ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
    out[7] = ((uint128_t) in[0]) * inx2[7] +
        ((uint128_t) in[1]) * inx2[6] +
        ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
    out[8] = ((uint128_t) in[0]) * inx2[8] +
        ((uint128_t) in[1]) * inx2[7] +
        ((uint128_t) in[2]) * inx2[6] +
        ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];

    /*-
     * The remaining limbs fall above 2^521, with the first falling at 2^522.
     * They correspond to locations one bit up from the limbs produced above
     * so we would have to multiply by two to align them. Again, rather than
     * operate on the 128-bit result, we double one of the inputs to the
     * multiplication. If we want to double for both this reason, and the
     * reason above, then we end up multiplying by four.
     */

    /* 9 */
    out[0] += ((uint128_t) in[1]) * inx4[8] +
        ((uint128_t) in[2]) * inx4[7] +
        ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];

    /* 10 */
    out[1] += ((uint128_t) in[2]) * inx4[8] +
        ((uint128_t) in[3]) * inx4[7] +
        ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];

    /* 11 */
    out[2] += ((uint128_t) in[3]) * inx4[8] +
        ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];

    /* 12 */
    out[3] += ((uint128_t) in[4]) * inx4[8] +
        ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];

    /* 13 */
    out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];

    /* 14 */
    out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];

    /* 15 */
    out[6] += ((uint128_t) in[7]) * inx4[8];

    /* 16 */
    out[7] += ((uint128_t) in[8]) * inx2[8];
}
/*-
 * felem_mul sets |out| = |in1| * |in2|
 * On exit:
 *   out[i] < 17 * max(in1[i]) * max(in2[i])
 */
static void felem_mul(largefelem out, const felem in1, const felem in2)
{
    felem in2x2;
    felem_scalar(in2x2, in2, 2);

    /* Schoolbook multiplication: limbs at or below position 8. */
    out[0] = ((uint128_t) in1[0]) * in2[0];

    out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];

    out[2] = ((uint128_t) in1[0]) * in2[2] +
        ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];

    out[3] = ((uint128_t) in1[0]) * in2[3] +
        ((uint128_t) in1[1]) * in2[2] +
        ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];

    out[4] = ((uint128_t) in1[0]) * in2[4] +
        ((uint128_t) in1[1]) * in2[3] +
        ((uint128_t) in1[2]) * in2[2] +
        ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];

    out[5] = ((uint128_t) in1[0]) * in2[5] +
        ((uint128_t) in1[1]) * in2[4] +
        ((uint128_t) in1[2]) * in2[3] +
        ((uint128_t) in1[3]) * in2[2] +
        ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];

    out[6] = ((uint128_t) in1[0]) * in2[6] +
        ((uint128_t) in1[1]) * in2[5] +
        ((uint128_t) in1[2]) * in2[4] +
        ((uint128_t) in1[3]) * in2[3] +
        ((uint128_t) in1[4]) * in2[2] +
        ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];

    out[7] = ((uint128_t) in1[0]) * in2[7] +
        ((uint128_t) in1[1]) * in2[6] +
        ((uint128_t) in1[2]) * in2[5] +
        ((uint128_t) in1[3]) * in2[4] +
        ((uint128_t) in1[4]) * in2[3] +
        ((uint128_t) in1[5]) * in2[2] +
        ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];

    out[8] = ((uint128_t) in1[0]) * in2[8] +
        ((uint128_t) in1[1]) * in2[7] +
        ((uint128_t) in1[2]) * in2[6] +
        ((uint128_t) in1[3]) * in2[5] +
        ((uint128_t) in1[4]) * in2[4] +
        ((uint128_t) in1[5]) * in2[3] +
        ((uint128_t) in1[6]) * in2[2] +
        ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];

    /*
     * Products above position 8 land one bit above the corresponding
     * low limb (2^522 = 2*2^521 == 2 mod p), so they are folded in with a
     * doubled second operand. See comment in felem_square about the use of
     * in2x2 here.
     */
    /* 9 */
    out[0] += ((uint128_t) in1[1]) * in2x2[8] +
        ((uint128_t) in1[2]) * in2x2[7] +
        ((uint128_t) in1[3]) * in2x2[6] +
        ((uint128_t) in1[4]) * in2x2[5] +
        ((uint128_t) in1[5]) * in2x2[4] +
        ((uint128_t) in1[6]) * in2x2[3] +
        ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];

    /* 10 */
    out[1] += ((uint128_t) in1[2]) * in2x2[8] +
        ((uint128_t) in1[3]) * in2x2[7] +
        ((uint128_t) in1[4]) * in2x2[6] +
        ((uint128_t) in1[5]) * in2x2[5] +
        ((uint128_t) in1[6]) * in2x2[4] +
        ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];

    /* 11 */
    out[2] += ((uint128_t) in1[3]) * in2x2[8] +
        ((uint128_t) in1[4]) * in2x2[7] +
        ((uint128_t) in1[5]) * in2x2[6] +
        ((uint128_t) in1[6]) * in2x2[5] +
        ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];

    /* 12 */
    out[3] += ((uint128_t) in1[4]) * in2x2[8] +
        ((uint128_t) in1[5]) * in2x2[7] +
        ((uint128_t) in1[6]) * in2x2[6] +
        ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];

    /* 13 */
    out[4] += ((uint128_t) in1[5]) * in2x2[8] +
        ((uint128_t) in1[6]) * in2x2[7] +
        ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];

    /* 14 */
    out[5] += ((uint128_t) in1[6]) * in2x2[8] +
        ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];

    /* 15 */
    out[6] += ((uint128_t) in1[7]) * in2x2[8] +
        ((uint128_t) in1[8]) * in2x2[7];

    /* 16 */
    out[7] += ((uint128_t) in1[8]) * in2x2[8];
}
static const limb bottom52bits = 0xfffffffffffff; /* mask of the low 52 bits */
/*-
 * felem_reduce converts a largefelem to an felem.
 * On exit:
 *   out[i] < 2^59 + 2^14
 */
static void felem_reduce(felem out, const largefelem in)
{
    u64 overflow1, overflow2;

    /* Keep the bottom 58 bits of each 128-bit limb. */
    out[0] = ((limb) in[0]) & bottom58bits;
    out[1] = ((limb) in[1]) & bottom58bits;
    out[2] = ((limb) in[2]) & bottom58bits;
    out[3] = ((limb) in[3]) & bottom58bits;
    out[4] = ((limb) in[4]) & bottom58bits;
    out[5] = ((limb) in[5]) & bottom58bits;
    out[6] = ((limb) in[6]) & bottom58bits;
    out[7] = ((limb) in[7]) & bottom58bits;
    out[8] = ((limb) in[8]) & bottom58bits;

    /* out[i] < 2^58 */

    /*
     * For each input limb, the bits above position 58 belong to the next
     * limb: bits [58,64) from the low 64-bit half, plus the low 52 bits of
     * the high half shifted up by 6 (64 - 58); bits beyond that spill two
     * limbs up.
     */
    out[1] += ((limb) in[0]) >> 58;
    out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
    /*-
     * out[1] < 2^58 + 2^6 + 2^58
     *        < 2^59 + 2^6
     */
    out[2] += ((limb) (in[0] >> 64)) >> 52;

    out[2] += ((limb) in[1]) >> 58;
    out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
    out[3] += ((limb) (in[1] >> 64)) >> 52;

    out[3] += ((limb) in[2]) >> 58;
    out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
    out[4] += ((limb) (in[2] >> 64)) >> 52;

    out[4] += ((limb) in[3]) >> 58;
    out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
    out[5] += ((limb) (in[3] >> 64)) >> 52;

    out[5] += ((limb) in[4]) >> 58;
    out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
    out[6] += ((limb) (in[4] >> 64)) >> 52;

    out[6] += ((limb) in[5]) >> 58;
    out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
    out[7] += ((limb) (in[5] >> 64)) >> 52;

    out[7] += ((limb) in[6]) >> 58;
    out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
    out[8] += ((limb) (in[6] >> 64)) >> 52;

    out[8] += ((limb) in[7]) >> 58;
    out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
    /*-
     * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
     */

    /*
     * Overflow past limb 8 sits at bit position 2^522 and beyond. Since
     * 2^522 = 2*2^521 == 2 (mod p), these values wrap around to limbs 0
     * and 1 after being doubled (the <<= 1 below).
     */
    overflow1 = ((limb) (in[7] >> 64)) >> 52;

    overflow1 += ((limb) in[8]) >> 58;
    overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
    overflow2 = ((limb) (in[8] >> 64)) >> 52;

    overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
    overflow2 <<= 1;            /* overflow2 < 2^13 */

    out[0] += overflow1;        /* out[0] < 2^60 */
    out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */

    /* One final carry from limb 0 into limb 1. */
    out[1] += out[0] >> 58;
    out[0] &= bottom58bits;
    /*-
     * out[0] < 2^58
     * out[1] < 2^59 + 2^6 + 2^13 + 2^2
     *        < 2^59 + 2^14
     */
}
670 static void felem_square_reduce(felem out, const felem in)
673 felem_square(tmp, in);
674 felem_reduce(out, tmp);
677 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
680 felem_mul(tmp, in1, in2);
681 felem_reduce(out, tmp);
/*-
 * felem_inv calculates |out| = |in|^{-1}
 *
 * Based on Fermat's Little Theorem:
 *   a^{p-1} = 1 (mod p)
 *   a^{p-2} = a^{-1} (mod p)
 *
 * The exponent p-2 = 2^521 - 3 is computed with an addition chain; the
 * comment after each step gives the exponent held in the working value
 * (e.g. "2^4 - 2^0" means the value is in^(2^4 - 1)).
 */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2, ftmp3, ftmp4;
    largefelem tmp;
    unsigned i;

    felem_square(tmp, in);
    felem_reduce(ftmp, tmp);    /* 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
    felem_assign(ftmp2, ftmp);
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
    felem_mul(tmp, in, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
    felem_square(tmp, ftmp);
    felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */

    felem_square(tmp, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */

    felem_assign(ftmp2, ftmp3);
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
    felem_square(tmp, ftmp3);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
    felem_assign(ftmp4, ftmp3);
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
    felem_square(tmp, ftmp4);
    felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
    felem_assign(ftmp2, ftmp3);

    /* Repeated square-and-multiply doubles the run of set exponent bits. */
    for (i = 0; i < 8; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
    felem_assign(ftmp2, ftmp3);
    for (i = 0; i < 16; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
    felem_assign(ftmp2, ftmp3);
    for (i = 0; i < 32; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
    felem_assign(ftmp2, ftmp3);
    for (i = 0; i < 64; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
    felem_assign(ftmp2, ftmp3);
    for (i = 0; i < 128; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
    felem_assign(ftmp2, ftmp3);
    for (i = 0; i < 256; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
    }
    felem_mul(tmp, ftmp3, ftmp2);
    felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
    for (i = 0; i < 9; i++) {
        felem_square(tmp, ftmp3);
        felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
    }
    /* Multiply by ftmp4 (exponent 2^9 - 2^2) to fill in the low bits. */
    felem_mul(tmp, ftmp3, ftmp4);
    felem_reduce(ftmp3, tmp);   /* 2^521 - 2^2 (old comment said 2^512) */
    felem_mul(tmp, ftmp3, in);
    felem_reduce(out, tmp);     /* 2^521 - 3 = p - 2 (old comment said 2^512) */
}
/* This is 2^521-1, expressed as an felem: eight 58-bit limbs of all-ones
 * followed by one 57-bit limb of all-ones (8*58 + 57 = 521). */
static const felem kPrime = {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
};
/*-
 * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
 * otherwise.
 * On entry:
 *   in[i] < 2^59 + 2^14
 */
static limb felem_is_zero(const felem in)
{
    felem ftmp;
    limb is_zero, is_p;

    felem_assign(ftmp, in);

    /* Carry-propagate so each limb is (nearly) within its nominal width. */
    ftmp[0] += ftmp[8] >> 57;
    ftmp[8] &= bottom57bits;
    /* ftmp[8] < 2^57 */
    ftmp[1] += ftmp[0] >> 58;
    ftmp[0] &= bottom58bits;
    ftmp[2] += ftmp[1] >> 58;
    ftmp[1] &= bottom58bits;
    ftmp[3] += ftmp[2] >> 58;
    ftmp[2] &= bottom58bits;
    ftmp[4] += ftmp[3] >> 58;
    ftmp[3] &= bottom58bits;
    ftmp[5] += ftmp[4] >> 58;
    ftmp[4] &= bottom58bits;
    ftmp[6] += ftmp[5] >> 58;
    ftmp[5] &= bottom58bits;
    ftmp[7] += ftmp[6] >> 58;
    ftmp[6] &= bottom58bits;
    ftmp[8] += ftmp[7] >> 58;
    ftmp[7] &= bottom58bits;
    /* ftmp[8] < 2^57 + 4 */

    /*
     * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
     * than our bound for ftmp[8]. Therefore we only have to check if the
     * zero is zero or 2^521-1.
     */

    /* NOTE(review): the is_zero accumulation below was missing from this
     * copy of the file; restored to match the surviving comment — verify
     * against the upstream source. */
    is_zero = 0;
    is_zero |= ftmp[0];
    is_zero |= ftmp[1];
    is_zero |= ftmp[2];
    is_zero |= ftmp[3];
    is_zero |= ftmp[4];
    is_zero |= ftmp[5];
    is_zero |= ftmp[6];
    is_zero |= ftmp[7];
    is_zero |= ftmp[8];

    is_zero--;
    /*
     * We know that ftmp[i] < 2^63, therefore the only way that the top bit
     * can be set is if is_zero was 0 before the decrement.
     */
    is_zero = ((s64) is_zero) >> 63;

    /* Constant-time comparison against the only other representation of 0. */
    is_p = ftmp[0] ^ kPrime[0];
    is_p |= ftmp[1] ^ kPrime[1];
    is_p |= ftmp[2] ^ kPrime[2];
    is_p |= ftmp[3] ^ kPrime[3];
    is_p |= ftmp[4] ^ kPrime[4];
    is_p |= ftmp[5] ^ kPrime[5];
    is_p |= ftmp[6] ^ kPrime[6];
    is_p |= ftmp[7] ^ kPrime[7];
    is_p |= ftmp[8] ^ kPrime[8];

    is_p--;
    is_p = ((s64) is_p) >> 63;

    is_zero |= is_p;
    return is_zero;
}
873 static int felem_is_zero_int(const felem in)
875 return (int)(felem_is_zero(in) & ((limb) 1));
879 * felem_contract converts |in| to its unique, minimal representation.
881 * in[i] < 2^59 + 2^14
883 static void felem_contract(felem out, const felem in)
885 limb is_p, is_greater, sign;
886 static const limb two58 = ((limb) 1) << 58;
888 felem_assign(out, in);
890 out[0] += out[8] >> 57;
891 out[8] &= bottom57bits;
893 out[1] += out[0] >> 58;
894 out[0] &= bottom58bits;
895 out[2] += out[1] >> 58;
896 out[1] &= bottom58bits;
897 out[3] += out[2] >> 58;
898 out[2] &= bottom58bits;
899 out[4] += out[3] >> 58;
900 out[3] &= bottom58bits;
901 out[5] += out[4] >> 58;
902 out[4] &= bottom58bits;
903 out[6] += out[5] >> 58;
904 out[5] &= bottom58bits;
905 out[7] += out[6] >> 58;
906 out[6] &= bottom58bits;
907 out[8] += out[7] >> 58;
908 out[7] &= bottom58bits;
909 /* out[8] < 2^57 + 4 */
912 * If the value is greater than 2^521-1 then we have to subtract 2^521-1
913 * out. See the comments in felem_is_zero regarding why we don't test for
914 * other multiples of the prime.
918 * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
921 is_p = out[0] ^ kPrime[0];
922 is_p |= out[1] ^ kPrime[1];
923 is_p |= out[2] ^ kPrime[2];
924 is_p |= out[3] ^ kPrime[3];
925 is_p |= out[4] ^ kPrime[4];
926 is_p |= out[5] ^ kPrime[5];
927 is_p |= out[6] ^ kPrime[6];
928 is_p |= out[7] ^ kPrime[7];
929 is_p |= out[8] ^ kPrime[8];
938 is_p = ((s64) is_p) >> 63;
941 /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
954 * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
955 * 57 is greater than zero as (2^521-1) + x >= 2^522
957 is_greater = out[8] >> 57;
958 is_greater |= is_greater << 32;
959 is_greater |= is_greater << 16;
960 is_greater |= is_greater << 8;
961 is_greater |= is_greater << 4;
962 is_greater |= is_greater << 2;
963 is_greater |= is_greater << 1;
964 is_greater = ((s64) is_greater) >> 63;
966 out[0] -= kPrime[0] & is_greater;
967 out[1] -= kPrime[1] & is_greater;
968 out[2] -= kPrime[2] & is_greater;
969 out[3] -= kPrime[3] & is_greater;
970 out[4] -= kPrime[4] & is_greater;
971 out[5] -= kPrime[5] & is_greater;
972 out[6] -= kPrime[6] & is_greater;
973 out[7] -= kPrime[7] & is_greater;
974 out[8] -= kPrime[8] & is_greater;
976 /* Eliminate negative coefficients */
977 sign = -(out[0] >> 63);
978 out[0] += (two58 & sign);
979 out[1] -= (1 & sign);
980 sign = -(out[1] >> 63);
981 out[1] += (two58 & sign);
982 out[2] -= (1 & sign);
983 sign = -(out[2] >> 63);
984 out[2] += (two58 & sign);
985 out[3] -= (1 & sign);
986 sign = -(out[3] >> 63);
987 out[3] += (two58 & sign);
988 out[4] -= (1 & sign);
989 sign = -(out[4] >> 63);
990 out[4] += (two58 & sign);
991 out[5] -= (1 & sign);
992 sign = -(out[0] >> 63);
993 out[5] += (two58 & sign);
994 out[6] -= (1 & sign);
995 sign = -(out[6] >> 63);
996 out[6] += (two58 & sign);
997 out[7] -= (1 & sign);
998 sign = -(out[7] >> 63);
999 out[7] += (two58 & sign);
1000 out[8] -= (1 & sign);
1001 sign = -(out[5] >> 63);
1002 out[5] += (two58 & sign);
1003 out[6] -= (1 & sign);
1004 sign = -(out[6] >> 63);
1005 out[6] += (two58 & sign);
1006 out[7] -= (1 & sign);
1007 sign = -(out[7] >> 63);
1008 out[7] += (two58 & sign);
1009 out[8] -= (1 & sign);
1016 * Building on top of the field operations we have the operations on the
1017 * elliptic curve group itself. Points on the curve are represented in Jacobian
1021 * point_double calcuates 2*(x_in, y_in, z_in)
1023 * The method is taken from:
1024 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1026 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1027 * while x_out == y_in is not (maybe this works, but it's not tested). */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 61*2^121 + 61*2^76
     *        < 64*2^121 + 64*2^76
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is reused to hold gamma + delta so one subtraction suffices. */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^69 + 2^34 + 2^30
     */
    felem_reduce(y_out, tmp);
}
1126 /* copy_conditional copies in to out iff mask is all ones. */
1127 static void copy_conditional(felem out, const felem in, limb mask)
1130 for (i = 0; i < NLIMBS; ++i) {
1131 const limb tmp = mask & (in[i] ^ out[i]);
/*
 * NOTE(review): lines are missing from this extract (braces, else-branches
 * of the "mixed" special case, the early return after point_double, and
 * several bound comments). Code below is kept byte-identical; only
 * comments were added/corrected.
 */
1137 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1139 * The method is taken from
1140 * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1141 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1143 * This function includes a branch for checking whether the two input points
1144 * are equal (while not equal to the point at infinity). This case never
1145 * happens during single point multiplication, so there is no timing leak for
1146 * ECDH or ECDSA signing. */
1147 static void point_add(felem x3, felem y3, felem z3,
1148 const felem x1, const felem y1, const felem z1,
1149 const int mixed, const felem x2, const felem y2,
1152 felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1153 largefelem tmp, tmp2;
1154 limb x_equal, y_equal, z1_is_zero, z2_is_zero;
/* infinity is encoded as z == 0; record it up front, used by the final
 * copy_conditional fix-ups and the doubling check below */
1156 z1_is_zero = felem_is_zero(z1);
1157 z2_is_zero = felem_is_zero(z2);
1159 /* ftmp = z1z1 = z1**2 */
1160 felem_square(tmp, z1);
1161 felem_reduce(ftmp, tmp);
/* full (non-mixed) path: compute z2-dependent quantities */
1164 /* ftmp2 = z2z2 = z2**2 */
1165 felem_square(tmp, z2);
1166 felem_reduce(ftmp2, tmp);
1168 /* u1 = ftmp3 = x1*z2z2 */
1169 felem_mul(tmp, x1, ftmp2);
1170 felem_reduce(ftmp3, tmp);
1172 /* ftmp5 = z1 + z2 */
1173 felem_assign(ftmp5, z1);
1174 felem_sum64(ftmp5, z2);
1175 /* ftmp5[i] < 2^61 */
1177 /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1178 felem_square(tmp, ftmp5);
1179 /* tmp[i] < 17*2^122 */
1180 felem_diff_128_64(tmp, ftmp);
1181 /* tmp[i] < 17*2^122 + 2^63 */
1182 felem_diff_128_64(tmp, ftmp2);
1183 /* tmp[i] < 17*2^122 + 2^64 */
1184 felem_reduce(ftmp5, tmp);
1186 /* ftmp2 = z2 * z2z2 */
1187 felem_mul(tmp, ftmp2, z2);
1188 felem_reduce(ftmp2, tmp);
1190 /* s1 = ftmp6 = y1 * z2**3 */
1191 felem_mul(tmp, y1, ftmp2);
1192 felem_reduce(ftmp6, tmp);
/* mixed path: z2 == 1 so the multiplications above collapse to copies */
1195 * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1198 /* u1 = ftmp3 = x1*z2z2 */
1199 felem_assign(ftmp3, x1);
1201 /* ftmp5 = 2*z1z2 */
1202 felem_scalar(ftmp5, z1, 2);
1204 /* s1 = ftmp6 = y1 * z2**3 */
1205 felem_assign(ftmp6, y1);
/* u2 = x2*z1z1 */
1209 felem_mul(tmp, x2, ftmp);
1210 /* tmp[i] < 17*2^120 */
1212 /* h = ftmp4 = u2 - u1 */
1213 felem_diff_128_64(tmp, ftmp3);
1214 /* tmp[i] < 17*2^120 + 2^63 */
1215 felem_reduce(ftmp4, tmp);
1217 x_equal = felem_is_zero(ftmp4);
1219 /* z_out = ftmp5 * h */
1220 felem_mul(tmp, ftmp5, ftmp4);
1221 felem_reduce(z_out, tmp);
1223 /* ftmp = z1 * z1z1 */
1224 felem_mul(tmp, ftmp, z1);
1225 felem_reduce(ftmp, tmp);
1227 /* s2 = tmp = y2 * z1**3 */
1228 felem_mul(tmp, y2, ftmp);
1229 /* tmp[i] < 17*2^120 */
1231 /* r = ftmp5 = (s2 - s1)*2 */
1232 felem_diff_128_64(tmp, ftmp6);
1233 /* tmp[i] < 17*2^120 + 2^63 */
1234 felem_reduce(ftmp5, tmp);
1235 y_equal = felem_is_zero(ftmp5);
1236 felem_scalar64(ftmp5, 2);
1237 /* ftmp5[i] < 2^61 */
/*
 * NOTE(review): the add-2007-bl formulae degenerate when the affine points
 * are equal, so doubling is dispatched here. Upstream OpenSSL later
 * replaced this short-circuiting boolean test with a bitwise, constant-time
 * "points_equal" computation -- confirm whether that hardening is wanted
 * before reusing this code.
 */
1239 if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1240 point_double(x3, y3, z3, x1, y1, z1);
1244 /* I = ftmp = (2h)**2 */
1245 felem_assign(ftmp, ftmp4);
1246 felem_scalar64(ftmp, 2);
1247 /* ftmp[i] < 2^61 */
1248 felem_square(tmp, ftmp);
1249 /* tmp[i] < 17*2^122 */
1250 felem_reduce(ftmp, tmp);
1252 /* J = ftmp2 = h * I */
1253 felem_mul(tmp, ftmp4, ftmp);
1254 felem_reduce(ftmp2, tmp);
1256 /* V = ftmp4 = U1 * I */
1257 felem_mul(tmp, ftmp3, ftmp);
1258 felem_reduce(ftmp4, tmp);
1260 /* x_out = r**2 - J - 2V */
1261 felem_square(tmp, ftmp5);
1262 /* tmp[i] < 17*2^122 */
1263 felem_diff_128_64(tmp, ftmp2);
1264 /* tmp[i] < 17*2^122 + 2^63 */
1265 felem_assign(ftmp3, ftmp4);
1266 felem_scalar64(ftmp4, 2);
1267 /* ftmp4[i] < 2^61 */
1268 felem_diff_128_64(tmp, ftmp4);
1269 /* tmp[i] < 17*2^122 + 2^64 */
1270 felem_reduce(x_out, tmp);
1272 /* y_out = r(V-x_out) - 2 * s1 * J */
1273 felem_diff64(ftmp3, x_out);
1275 * ftmp3[i] < 2^60 + 2^60 = 2^61
1277 felem_mul(tmp, ftmp5, ftmp3);
1278 /* tmp[i] < 17*2^122 */
1279 felem_mul(tmp2, ftmp6, ftmp2);
1280 /* tmp2[i] < 17*2^120 */
1281 felem_scalar128(tmp2, 2);
1282 /* tmp2[i] < 17*2^121 */
1283 felem_diff128(tmp, tmp2);
1285 * tmp[i] < 2^127 - 2^69 + 17*2^122
1286 * = 2^126 - 2^122 - 2^6 - 2^2 - 1
1289 felem_reduce(y_out, tmp);
/* if either input was the point at infinity, the formulae above produced
 * garbage; overwrite with the other input, in constant time */
1291 copy_conditional(x_out, x2, z1_is_zero);
1292 copy_conditional(x_out, x1, z2_is_zero);
1293 copy_conditional(y_out, y2, z1_is_zero);
1294 copy_conditional(y_out, y1, z2_is_zero);
1295 copy_conditional(z_out, z2, z1_is_zero);
1296 copy_conditional(z_out, z1, z2_is_zero);
1297 felem_assign(x3, x_out);
1298 felem_assign(y3, y_out);
1299 felem_assign(z3, z_out);
1303 * Base point pre computation
1304 * --------------------------
1306 * Two different sorts of precomputed tables are used in the following code.
1307  * Each contains various points on the curve, where each point is three field
1308 * elements (x, y, z).
1310 * For the base point table, z is usually 1 (0 for the point at infinity).
1311 * This table has 16 elements:
1312 * index | bits | point
1313 * ------+---------+------------------------------
1316 * 2 | 0 0 1 0 | 2^130G
1317 * 3 | 0 0 1 1 | (2^130 + 1)G
1318 * 4 | 0 1 0 0 | 2^260G
1319 * 5 | 0 1 0 1 | (2^260 + 1)G
1320 * 6 | 0 1 1 0 | (2^260 + 2^130)G
1321 * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1322 * 8 | 1 0 0 0 | 2^390G
1323 * 9 | 1 0 0 1 | (2^390 + 1)G
1324 * 10 | 1 0 1 0 | (2^390 + 2^130)G
1325 * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1326 * 12 | 1 1 0 0 | (2^390 + 2^260)G
1327 * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1328 * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1329 * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1331 * The reason for this is so that we can clock bits into four different
1332 * locations when doing simple scalar multiplies against the base point.
1334 * Tables for other points have table[i] = iG for i in 0 .. 16. */
/*
 * gmul[i] holds the precomputed point selected by the 4-bit index i, as
 * described in the table above: bit k of i selects a 2^(130*k)*G term.
 * Coordinates are Jacobian (X, Y, Z) felems of nine 58-bit limbs; entry 0
 * is the point at infinity (all-zero), all other entries have Z = 1.
 */
1336 /* gmul is the table of precomputed base points */
1337 static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1338 {0, 0, 0, 0, 0, 0, 0, 0, 0},
1339 {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1340 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1341 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1342 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1343 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1344 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1345 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1346 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1347 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1348 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1349 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1350 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1351 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1352 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1353 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1354 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1355 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1356 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1357 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1358 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1359 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1360 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1361 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1362 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1363 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1364 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1365 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1366 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1367 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1368 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1369 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1370 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1371 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1372 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1373 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1374 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1375 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1376 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1377 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1378 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1379 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1380 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1381 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1382 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1383 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1384 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1385 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1386 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1387 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1388 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1390 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1391 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1392 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1393 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1394 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1395 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1397 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1398 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1399 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1400 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1401 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1402 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1404 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1405 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1406 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1407 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1408 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1409 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1411 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1412 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1413 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1414 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1415 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1416 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1418 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1419 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1420 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1421 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1422 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1423 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1425 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1426 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1427 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1428 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1429 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1430 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1432 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1433 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1434 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1435 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1436 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1437 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1439 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1440 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1441 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1442 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1443 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1444 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
/*
 * NOTE(review): the constant-time mask construction is missing from this
 * extract. Upstream folds (i ^ idx) down to a single bit and then
 * subtracts 1 ("mask |= mask >> 4; ... mask &= 1; mask--;") so that mask
 * is all-ones only when i == idx. Without those lines the OR below is
 * wrong -- restore them from upstream before use.
 */
1448  * select_point selects the |idx|th point from a precomputation table and
1451 /* pre_comp below is of the size provided in |size| */
1452 static void select_point(const limb idx, unsigned int size,
1453                          const felem pre_comp[][3], felem out[3])
1456 limb *outlimbs = &out[0][0];
1457 memset(outlimbs, 0, 3 * sizeof(felem));
1459 for (i = 0; i < size; i++) {
1460 const limb *inlimbs = &pre_comp[i][0][0];
1461 limb mask = i ^ idx;
/* accumulate the selected entry without an idx-dependent branch */
1467 for (j = 0; j < NLIMBS * 3; j++)
1468 outlimbs[j] |= inlimbs[j] & mask;
1472 /* get_bit returns the |i|th bit in |in| */
1473 static char get_bit(const felem_bytearray in, int i)
1477 return (in[i >> 3] >> (i & 7)) & 1;
/*
 * NOTE(review): this extract is missing lines (declarations of i, j, bits,
 * sign, digit and skip, the braces, the "if (!skip)" guards around the
 * point_add calls, and the skip = 0 updates). Code kept byte-identical;
 * comments only.
 */
1481 * Interleaved point multiplication using precomputed point multiples: The
1482 * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1483 * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1484 * generator, using certain (large) precomputed multiples in g_pre_comp.
1485 * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1487 static void batch_mul(felem x_out, felem y_out, felem z_out,
1488 const felem_bytearray scalars[],
1489 const unsigned num_points, const u8 *g_scalar,
1490 const int mixed, const felem pre_comp[][17][3],
1491 const felem g_pre_comp[16][3])
1494 unsigned num, gen_mul = (g_scalar != NULL);
1495 felem nq[3], tmp[4];
1499 /* set nq to the point at infinity */
1500 memset(nq, 0, 3 * sizeof(felem));
1503 * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1504 * of the generator (last quarter of rounds) and additions of other
1505 * points multiples (every 5th round).
1507 skip = 1; /* save two point operations in the first
/* 521 bits for arbitrary points, 131 rounds suffice for the 4-way split
 * generator table */
1509 for (i = (num_points ? 520 : 130); i >= 0; --i) {
1512 point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1514 /* add multiples of the generator */
1515 if (gen_mul && (i <= 130)) {
/* clock one bit from each 130-bit quarter of g_scalar into a 4-bit
 * table index (see the gmul table layout above) */
1516 bits = get_bit(g_scalar, i + 390) << 3;
1518 bits |= get_bit(g_scalar, i + 260) << 2;
1519 bits |= get_bit(g_scalar, i + 130) << 1;
1520 bits |= get_bit(g_scalar, i);
1522 /* select the point to add, in constant time */
1523 select_point(bits, 16, g_pre_comp, tmp);
1525 /* The 1 argument below is for "mixed" */
1526 point_add(nq[0], nq[1], nq[2],
1527 nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
/* first addition: nq is still infinity, so just copy */
1529 memcpy(nq, tmp, 3 * sizeof(felem));
1534 /* do other additions every 5 doublings */
1535 if (num_points && (i % 5 == 0)) {
1536 /* loop over all scalars */
1537 for (num = 0; num < num_points; ++num) {
/* 6-bit signed window (bit i-1 may be index -1; get_bit must
 * return 0 for negative indices) */
1538 bits = get_bit(scalars[num], i + 4) << 5;
1539 bits |= get_bit(scalars[num], i + 3) << 4;
1540 bits |= get_bit(scalars[num], i + 2) << 3;
1541 bits |= get_bit(scalars[num], i + 1) << 2;
1542 bits |= get_bit(scalars[num], i) << 1;
1543 bits |= get_bit(scalars[num], i - 1);
1544 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1547 * select the point to add or subtract, in constant time
1549 select_point(digit, 17, pre_comp[num], tmp);
1550 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
/* conditionally negate Y, constant time, when sign is set */
1552 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1555 point_add(nq[0], nq[1], nq[2],
1556 nq[0], nq[1], nq[2],
1557 mixed, tmp[0], tmp[1], tmp[2]);
1559 memcpy(nq, tmp, 3 * sizeof(felem));
1565 felem_assign(x_out, nq[0]);
1566 felem_assign(y_out, nq[1]);
1567 felem_assign(z_out, nq[2]);
/*
 * NOTE(review): the typedef opener ("typedef struct nistp521_pre_comp_st
 * {") and the "int references;" refcount field are missing from this
 * extract -- both are required by nistp521_pre_comp_new/dup/free below.
 */
1570 /* Precomputation for the group generator. */
1572 felem g_pre_comp[16][3];
1574 } NISTP521_PRE_COMP;
/*
 * EC_GFp_nistp521_method returns the EC_METHOD vtable for this constant-
 * time P-521 implementation: nistp521-specific entries for group init,
 * set_curve, get_affine_coordinates, mul and precompute; generic "simple"
 * and "nist" entries for the rest.
 * NOTE(review): some vtable slots (e.g. point2oct/oct2point, add, dbl),
 * the closing "};" and the "return &ret;" are missing from this extract.
 */
1576 const EC_METHOD *EC_GFp_nistp521_method(void)
1578 static const EC_METHOD ret = {
1579 EC_FLAGS_DEFAULT_OCT,
1580 NID_X9_62_prime_field,
1581 ec_GFp_nistp521_group_init,
1582 ec_GFp_simple_group_finish,
1583 ec_GFp_simple_group_clear_finish,
1584 ec_GFp_nist_group_copy,
1585 ec_GFp_nistp521_group_set_curve,
1586 ec_GFp_simple_group_get_curve,
1587 ec_GFp_simple_group_get_degree,
1588 ec_GFp_simple_group_check_discriminant,
1589 ec_GFp_simple_point_init,
1590 ec_GFp_simple_point_finish,
1591 ec_GFp_simple_point_clear_finish,
1592 ec_GFp_simple_point_copy,
1593 ec_GFp_simple_point_set_to_infinity,
1594 ec_GFp_simple_set_Jprojective_coordinates_GFp,
1595 ec_GFp_simple_get_Jprojective_coordinates_GFp,
1596 ec_GFp_simple_point_set_affine_coordinates,
1597 ec_GFp_nistp521_point_get_affine_coordinates,
1598 0 /* point_set_compressed_coordinates */ ,
1603 ec_GFp_simple_invert,
1604 ec_GFp_simple_is_at_infinity,
1605 ec_GFp_simple_is_on_curve,
1607 ec_GFp_simple_make_affine,
1608 ec_GFp_simple_points_make_affine,
1609 ec_GFp_nistp521_points_mul,
1610 ec_GFp_nistp521_precompute_mult,
1611 ec_GFp_nistp521_have_precompute_mult,
1612 ec_GFp_nist_field_mul,
1613 ec_GFp_nist_field_sqr,
1615 0 /* field_encode */ ,
1616 0 /* field_decode */ ,
1617 0 /* field_set_to_one */
1623 /******************************************************************************/
1625 * FUNCTIONS TO MANAGE PRECOMPUTATION
1628 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1630 NISTP521_PRE_COMP *ret = NULL;
1631 ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1633 ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1636 memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1637 ret->references = 1;
/*
 * nistp521_pre_comp_dup "duplicates" by bumping the reference count under
 * the EC_PRE_COMP lock; the object is immutable so no deep copy is needed.
 * NOTE(review): the "return src_;" line is missing from this extract.
 */
1641 static void *nistp521_pre_comp_dup(void *src_)
1643 NISTP521_PRE_COMP *src = src_;
1645 /* no need to actually copy, these objects never change! */
1646 CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
/*
 * nistp521_pre_comp_free drops one reference and frees the object when the
 * count reaches zero.
 * NOTE(review): the NULL check, the "if (i > 0) return;" early-out and the
 * OPENSSL_free call are missing from this extract.
 */
1651 static void nistp521_pre_comp_free(void *pre_)
1654 NISTP521_PRE_COMP *pre = pre_;
1659 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
/*
 * nistp521_pre_comp_clear_free: like nistp521_pre_comp_free, but scrubs the
 * memory with OPENSSL_cleanse before releasing it (for secret-adjacent
 * data). NOTE(review): the NULL check, early-out and OPENSSL_free call are
 * missing from this extract.
 */
1666 static void nistp521_pre_comp_clear_free(void *pre_)
1669 NISTP521_PRE_COMP *pre = pre_;
1674 i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1678 OPENSSL_cleanse(pre, sizeof(*pre));
1682 /******************************************************************************/
1684 * OPENSSL EC_METHOD FUNCTIONS
/*
 * ec_GFp_nistp521_group_init: standard simple-group init, then records that
 * a = -3 (mod p) holds for P-521, enabling the a = -3 doubling shortcut.
 * NOTE(review): the "int ret;" declaration and "return ret;" are missing
 * from this extract.
 */
1687 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1690 ret = ec_GFp_simple_group_init(group);
1691 group->a_is_minus3 = 1;
/*
 * ec_GFp_nistp521_group_set_curve rejects any (p, a, b) that is not
 * exactly the P-521 parameter set, installs the fast NIST reduction for
 * this prime, and defers the rest to the generic implementation.
 * NOTE(review): "int ret = 0;", BN_CTX_start/end, the "goto err" targets
 * and the final "return ret;" are missing from this extract.
 */
1695 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1696 const BIGNUM *a, const BIGNUM *b,
1700 BN_CTX *new_ctx = NULL;
1701 BIGNUM *curve_p, *curve_a, *curve_b;
1704 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1707 if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1708 ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1709 ((curve_b = BN_CTX_get(ctx)) == NULL))
/* load the canonical big-endian P-521 parameters for comparison */
1711 BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1712 BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1713 BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1714 if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1715 ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1716 EC_R_WRONG_CURVE_PARAMETERS);
1719 group->field_mod_func = BN_nist_mod_521;
1720 ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1723 if (new_ctx != NULL)
1724 BN_CTX_free(new_ctx);
/*
 * NOTE(review): the field inversion "felem_inv(z2, z1);" (and the
 * largefelem tmp declaration, braces and returns) are missing from this
 * extract -- z2 is used below before any visible assignment. Restore from
 * upstream before use.
 */
1729 * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1732 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1733 const EC_POINT *point,
1734 BIGNUM *x, BIGNUM *y,
1737 felem z1, z2, x_in, y_in, x_out, y_out;
/* the point at infinity has no affine representation */
1740 if (EC_POINT_is_at_infinity(group, point)) {
1741 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1742 EC_R_POINT_AT_INFINITY);
1745 if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1746 (!BN_to_felem(z1, &point->Z)))
/* x' = X / Z^2 (z2 holds 1/Z here) */
1749 felem_square(tmp, z2);
1750 felem_reduce(z1, tmp);
1751 felem_mul(tmp, x_in, z1);
1752 felem_reduce(x_in, tmp);
1753 felem_contract(x_out, x_in);
1755 if (!felem_to_BN(x, x_out)) {
1756 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
/* y' = Y / Z^3 = Y * (1/Z) * (1/Z^2) */
1761 felem_mul(tmp, z1, z2);
1762 felem_reduce(z1, tmp);
1763 felem_mul(tmp, y_in, z1);
1764 felem_reduce(y_in, tmp);
1765 felem_contract(y_out, y_in);
1767 if (!felem_to_BN(y, y_out)) {
1768 ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
/*
 * make_points_affine batch-converts |num| Jacobian points to affine form
 * via the shared nistp helper, passing the felem primitives through
 * generic void* function-pointer casts.
 * NOTE(review): several cast targets (felem_is_zero_int, felem_assign,
 * felem_mul_reduce, felem_inv, felem_contract), the parameter list tail
 * and closing punctuation are missing from this extract.
 */
1776 /* points below is of size |num|, and tmp_felems is of size |num+1| */
1777 static void make_points_affine(size_t num, felem points[][3],
1781 * Runs in constant time, unless an input is the point at infinity (which
1782 * normally shouldn't happen).
1784 ec_GFp_nistp_points_make_affine_internal(num,
1788 (void (*)(void *))felem_one,
1789 (int (*)(const void *))
1791 (void (*)(void *, const void *))
1793 (void (*)(void *, const void *))
1794 felem_square_reduce, (void (*)
1801 (void (*)(void *, const void *))
1803 (void (*)(void *, const void *))
/*
 * NOTE(review): this extract is missing lines throughout (declarations of
 * ret/j/mixed, "goto err" statements, else branches, BN_CTX_start, the
 * have_pre_comp/num_points adjustments and closing braces). Code kept
 * byte-identical; comments only.
 */
1808 * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1809 * values Result is stored in r (r can equal one of the inputs).
1811 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1812 const BIGNUM *scalar, size_t num,
1813 const EC_POINT *points[],
1814 const BIGNUM *scalars[], BN_CTX *ctx)
1819 BN_CTX *new_ctx = NULL;
1820 BIGNUM *x, *y, *z, *tmp_scalar;
1821 felem_bytearray g_secret;
1822 felem_bytearray *secrets = NULL;
1823 felem(*pre_comp)[17][3] = NULL;
1824 felem *tmp_felems = NULL;
1825 felem_bytearray tmp;
1826 unsigned i, num_bytes;
1827 int have_pre_comp = 0;
1828 size_t num_points = num;
1829 felem x_in, y_in, z_in, x_out, y_out, z_out;
1830 NISTP521_PRE_COMP *pre = NULL;
1831 felem(*g_pre_comp)[3] = NULL;
1832 EC_POINT *generator = NULL;
1833 const EC_POINT *p = NULL;
1834 const BIGNUM *p_scalar = NULL;
1837 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1840 if (((x = BN_CTX_get(ctx)) == NULL) ||
1841 ((y = BN_CTX_get(ctx)) == NULL) ||
1842 ((z = BN_CTX_get(ctx)) == NULL) ||
1843 ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
/* decide which generator precomputation (if any) can be used */
1846 if (scalar != NULL) {
1847 pre = EC_EX_DATA_get_data(group->extra_data,
1848 nistp521_pre_comp_dup,
1849 nistp521_pre_comp_free,
1850 nistp521_pre_comp_clear_free);
1852 /* we have precomputation, try to use it */
1853 g_pre_comp = &pre->g_pre_comp[0];
1855 /* try to use the standard precomputation */
1856 g_pre_comp = (felem(*)[3]) gmul;
1857 generator = EC_POINT_new(group);
1858 if (generator == NULL)
1860 /* get the generator from precomputation */
1861 if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1862 !felem_to_BN(y, g_pre_comp[1][1]) ||
1863 !felem_to_BN(z, g_pre_comp[1][2])) {
1864 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1867 if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1871 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1872 /* precomputation matches generator */
1876 * we don't have valid precomputation: treat the generator as a
1882 if (num_points > 0) {
1883 if (num_points >= 2) {
1885 * unless we precompute multiples for just one point, converting
1886 * those into affine form is time well spent
1890 secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1891 pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1894 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1895 if ((secrets == NULL) || (pre_comp == NULL)
1896 || (mixed && (tmp_felems == NULL))) {
1897 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1902 * we treat NULL scalars as 0, and NULL points as points at infinity,
1903 * i.e., they contribute nothing to the linear combination
1905 memset(secrets, 0, num_points * sizeof(felem_bytearray));
1906 memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1907 for (i = 0; i < num_points; ++i) {
1910 * we didn't have a valid precomputation, so we pick the
1914 p = EC_GROUP_get0_generator(group);
1917 /* the i^th point */
1920 p_scalar = scalars[i];
1922 if ((p_scalar != NULL) && (p != NULL)) {
1923 /* reduce scalar to 0 <= scalar < 2^521 */
1924 if ((BN_num_bits(p_scalar) > 521)
1925 || (BN_is_negative(p_scalar))) {
1927 * this is an unusual input, and we don't guarantee
1930 if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1931 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1934 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1936 num_bytes = BN_bn2bin(p_scalar, tmp);
/* convert big-endian BN bytes to the little-endian layout get_bit uses */
1937 flip_endian(secrets[i], tmp, num_bytes);
1938 /* precompute multiples */
1939 if ((!BN_to_felem(x_out, &p->X)) ||
1940 (!BN_to_felem(y_out, &p->Y)) ||
1941 (!BN_to_felem(z_out, &p->Z)))
1943 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1944 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1945 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
/* table[j]: odd j = table[j-1] + P, even j = 2 * table[j/2] */
1946 for (j = 2; j <= 16; ++j) {
1948 point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1949 pre_comp[i][j][2], pre_comp[i][1][0],
1950 pre_comp[i][1][1], pre_comp[i][1][2], 0,
1951 pre_comp[i][j - 1][0],
1952 pre_comp[i][j - 1][1],
1953 pre_comp[i][j - 1][2]);
1955 point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1956 pre_comp[i][j][2], pre_comp[i][j / 2][0],
1957 pre_comp[i][j / 2][1],
1958 pre_comp[i][j / 2][2]);
1964 make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1967 /* the scalar for the generator */
1968 if ((scalar != NULL) && (have_pre_comp)) {
1969 memset(g_secret, 0, sizeof(g_secret));
1970 /* reduce scalar to 0 <= scalar < 2^521 */
1971 if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1973 * this is an unusual input, and we don't guarantee
1976 if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1977 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1980 num_bytes = BN_bn2bin(tmp_scalar, tmp);
1982 num_bytes = BN_bn2bin(scalar, tmp);
1983 flip_endian(g_secret, tmp, num_bytes);
1984 /* do the multiplication with generator precomputation */
1985 batch_mul(x_out, y_out, z_out,
1986 (const felem_bytearray(*))secrets, num_points,
1988 mixed, (const felem(*)[17][3])pre_comp,
1989 (const felem(*)[3])g_pre_comp);
1991 /* do the multiplication without generator precomputation */
1992 batch_mul(x_out, y_out, z_out,
1993 (const felem_bytearray(*))secrets, num_points,
1994 NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1995 /* reduce the output to its unique minimal representation */
1996 felem_contract(x_in, x_out);
1997 felem_contract(y_in, y_out);
1998 felem_contract(z_in, z_out);
1999 if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2000 (!felem_to_BN(z, z_in))) {
2001 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2004 ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
/* common cleanup path (err:) */
2008 if (generator != NULL)
2009 EC_POINT_free(generator);
2010 if (new_ctx != NULL)
2011 BN_CTX_free(new_ctx);
2012 if (secrets != NULL)
2013 OPENSSL_free(secrets);
2014 if (pre_comp != NULL)
2015 OPENSSL_free(pre_comp);
2016 if (tmp_felems != NULL)
2017 OPENSSL_free(tmp_felems);
/*
 * ec_GFp_nistp521_precompute_mult builds (or copies) the 16-entry generator
 * table used by batch_mul and attaches it to the group's extra data.
 * NOTE(review): lines are missing from this extract ("int ret = 0;",
 * i/j declarations, x/y declarations, "goto err"/"goto done" statements,
 * braces, the "pre = NULL;" after a successful set_data, and the final
 * "return ret;").
 */
2021 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2024 NISTP521_PRE_COMP *pre = NULL;
2026 BN_CTX *new_ctx = NULL;
2028 EC_POINT *generator = NULL;
2029 felem tmp_felems[16];
2031 /* throw away old precomputation */
2032 EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2033 nistp521_pre_comp_free,
2034 nistp521_pre_comp_clear_free);
2036 if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2039 if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2041 /* get the generator */
2042 if (group->generator == NULL)
2044 generator = EC_POINT_new(group);
2045 if (generator == NULL)
/* construct the canonical P-521 base point from the stored parameters */
2047 BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2048 BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2049 if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2051 if ((pre = nistp521_pre_comp_new()) == NULL)
2054 * if the generator is the standard one, use built-in precomputation
2056 if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2057 memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
/* non-standard generator: compute the table from scratch */
2061 if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2062 (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2063 (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2065 /* compute 2^130*G, 2^260*G, 2^390*G */
2066 for (i = 1; i <= 4; i <<= 1) {
2067 point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2068 pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2069 pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
/* 129 more doublings, 130 total per step */
2070 for (j = 0; j < 129; ++j) {
2071 point_double(pre->g_pre_comp[2 * i][0],
2072 pre->g_pre_comp[2 * i][1],
2073 pre->g_pre_comp[2 * i][2],
2074 pre->g_pre_comp[2 * i][0],
2075 pre->g_pre_comp[2 * i][1],
2076 pre->g_pre_comp[2 * i][2]);
2079 /* g_pre_comp[0] is the point at infinity */
2080 memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2081 /* the remaining multiples */
2082 /* 2^130*G + 2^260*G */
2083 point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2084 pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2085 pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2086 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2087 pre->g_pre_comp[2][2]);
2088 /* 2^130*G + 2^390*G */
2089 point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2090 pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2091 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2092 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2093 pre->g_pre_comp[2][2]);
2094 /* 2^260*G + 2^390*G */
2095 point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2096 pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2097 pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2098 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2099 pre->g_pre_comp[4][2]);
2100 /* 2^130*G + 2^260*G + 2^390*G */
2101 point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2102 pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2103 pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2104 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2105 pre->g_pre_comp[2][2]);
2106 for (i = 1; i < 8; ++i) {
2107 /* odd multiples: add G */
2108 point_add(pre->g_pre_comp[2 * i + 1][0],
2109 pre->g_pre_comp[2 * i + 1][1],
2110 pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2111 pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2112 pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2113 pre->g_pre_comp[1][2]);
2115 make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2117 if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2118 nistp521_pre_comp_free,
2119 nistp521_pre_comp_clear_free))
/* cleanup path (err:) */
2125 if (generator != NULL)
2126 EC_POINT_free(generator);
2127 if (new_ctx != NULL)
2128 BN_CTX_free(new_ctx);
/* pre is non-NULL here only when set_data failed (ownership not taken) */
2130 nistp521_pre_comp_free(pre);
/*
 * ec_GFp_nistp521_have_precompute_mult reports whether generator
 * precomputation data is attached to the group (1) or not (0).
 * NOTE(review): the "!= NULL" comparison, return statements and braces are
 * missing from this extract.
 */
2134 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2136 if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2137 nistp521_pre_comp_free,
2138 nistp521_pre_comp_clear_free)
/* #else branch of OPENSSL_NO_EC_NISTP_64_GCC_128: self-referential dummy
 * definition so the translation unit is never empty (ISO C requires at
 * least one declaration per translation unit). */
2146 static void *dummy = &dummy;