xref: /src/crypto/openssl/crypto/ec/ecp_nistp521.c (revision f25b8c9fb4f58cf61adb47d7570abe7caa6d385d)
1 /*
2  * Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2011 Google Inc.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * ECDSA low level APIs are deprecated for public use, but still ok for
28  * internal use.
29  */
30 #include "internal/deprecated.h"
31 
32 /*
33  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
34  *
35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
37  * work which got its smarts from Daniel J. Bernstein's work on the same.
38  */
39 
40 #include <openssl/e_os2.h>
41 
42 #include <string.h>
43 #include <openssl/err.h>
44 #include "ec_local.h"
45 
46 #include "internal/numbers.h"
47 
48 #ifndef INT128_MAX
49 #error "Your compiler doesn't appear to support 128-bit integer types"
50 #endif
51 
52 typedef uint8_t u8;
53 typedef uint64_t u64;
54 
55 /*
56  * The underlying field. P521 operates over GF(2^521-1). We can serialize an
57  * element of this field into 66 bytes where the most significant byte
58  * contains only a single bit. We call this an felem_bytearray.
59  */
60 
61 typedef u8 felem_bytearray[66];
62 
63 /*
64  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
65  * These values are big-endian.
66  */
67 static const felem_bytearray nistp521_curve_params[5] = {
68     { 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
69         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76         0xff, 0xff },
77     { 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
78         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
85         0xff, 0xfc },
86     { 0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
87         0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
88         0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
89         0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
90         0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
91         0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
92         0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
93         0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
94         0x3f, 0x00 },
95     { 0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
96         0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
97         0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
98         0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
99         0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
100         0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
101         0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
102         0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
103         0xbd, 0x66 },
104     { 0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
105         0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
106         0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
107         0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
108         0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
109         0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
110         0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
111         0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
112         0x66, 0x50 }
113 };
114 
115 /*-
116  * The representation of field elements.
117  * ------------------------------------
118  *
119  * We represent field elements with nine values. These values are either 64 or
120  * 128 bits and the field element represented is:
121  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
122  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
123  * 58 bits apart, but are greater than 58 bits in length, the most significant
124  * bits of each limb overlap with the least significant bits of the next.
125  *
126  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
127  * 'largefelem' */
128 
129 #define NLIMBS 9
130 
131 typedef uint64_t limb;
132 typedef limb limb_aX __attribute((__aligned__(1)));
133 typedef limb felem[NLIMBS];
134 typedef uint128_t largefelem[NLIMBS];
135 
136 static const limb bottom57bits = 0x1ffffffffffffff;
137 static const limb bottom58bits = 0x3ffffffffffffff;
138 
139 /*
140  * bin66_to_felem takes a little-endian byte array and converts it into felem
141  * form. This assumes that the CPU is little-endian.
142  */
bin66_to_felem(felem out,const u8 in[66])143 static void bin66_to_felem(felem out, const u8 in[66])
144 {
145     out[0] = (*((limb *)&in[0])) & bottom58bits;
146     out[1] = (*((limb_aX *)&in[7]) >> 2) & bottom58bits;
147     out[2] = (*((limb_aX *)&in[14]) >> 4) & bottom58bits;
148     out[3] = (*((limb_aX *)&in[21]) >> 6) & bottom58bits;
149     out[4] = (*((limb_aX *)&in[29])) & bottom58bits;
150     out[5] = (*((limb_aX *)&in[36]) >> 2) & bottom58bits;
151     out[6] = (*((limb_aX *)&in[43]) >> 4) & bottom58bits;
152     out[7] = (*((limb_aX *)&in[50]) >> 6) & bottom58bits;
153     out[8] = (*((limb_aX *)&in[58])) & bottom57bits;
154 }
155 
156 /*
157  * felem_to_bin66 takes an felem and serializes into a little endian, 66 byte
158  * array. This assumes that the CPU is little-endian.
159  */
felem_to_bin66(u8 out[66],const felem in)160 static void felem_to_bin66(u8 out[66], const felem in)
161 {
162     memset(out, 0, 66);
163     (*((limb *)&out[0])) = in[0];
164     (*((limb_aX *)&out[7])) |= in[1] << 2;
165     (*((limb_aX *)&out[14])) |= in[2] << 4;
166     (*((limb_aX *)&out[21])) |= in[3] << 6;
167     (*((limb_aX *)&out[29])) = in[4];
168     (*((limb_aX *)&out[36])) |= in[5] << 2;
169     (*((limb_aX *)&out[43])) |= in[6] << 4;
170     (*((limb_aX *)&out[50])) |= in[7] << 6;
171     (*((limb_aX *)&out[58])) = in[8];
172 }
173 
174 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
BN_to_felem(felem out,const BIGNUM * bn)175 static int BN_to_felem(felem out, const BIGNUM *bn)
176 {
177     felem_bytearray b_out;
178     int num_bytes;
179 
180     if (BN_is_negative(bn)) {
181         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
182         return 0;
183     }
184     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
185     if (num_bytes < 0) {
186         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
187         return 0;
188     }
189     bin66_to_felem(out, b_out);
190     return 1;
191 }
192 
193 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
felem_to_BN(BIGNUM * out,const felem in)194 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
195 {
196     felem_bytearray b_out;
197     felem_to_bin66(b_out, in);
198     return BN_lebin2bn(b_out, sizeof(b_out), out);
199 }
200 
201 /*-
202  * Field operations
203  * ----------------
204  */
205 
felem_one(felem out)206 static void felem_one(felem out)
207 {
208     out[0] = 1;
209     out[1] = 0;
210     out[2] = 0;
211     out[3] = 0;
212     out[4] = 0;
213     out[5] = 0;
214     out[6] = 0;
215     out[7] = 0;
216     out[8] = 0;
217 }
218 
felem_assign(felem out,const felem in)219 static void felem_assign(felem out, const felem in)
220 {
221     out[0] = in[0];
222     out[1] = in[1];
223     out[2] = in[2];
224     out[3] = in[3];
225     out[4] = in[4];
226     out[5] = in[5];
227     out[6] = in[6];
228     out[7] = in[7];
229     out[8] = in[8];
230 }
231 
232 /* felem_sum64 sets out = out + in. */
felem_sum64(felem out,const felem in)233 static void felem_sum64(felem out, const felem in)
234 {
235     out[0] += in[0];
236     out[1] += in[1];
237     out[2] += in[2];
238     out[3] += in[3];
239     out[4] += in[4];
240     out[5] += in[5];
241     out[6] += in[6];
242     out[7] += in[7];
243     out[8] += in[8];
244 }
245 
246 /* felem_scalar sets out = in * scalar */
felem_scalar(felem out,const felem in,limb scalar)247 static void felem_scalar(felem out, const felem in, limb scalar)
248 {
249     out[0] = in[0] * scalar;
250     out[1] = in[1] * scalar;
251     out[2] = in[2] * scalar;
252     out[3] = in[3] * scalar;
253     out[4] = in[4] * scalar;
254     out[5] = in[5] * scalar;
255     out[6] = in[6] * scalar;
256     out[7] = in[7] * scalar;
257     out[8] = in[8] * scalar;
258 }
259 
260 /* felem_scalar64 sets out = out * scalar */
felem_scalar64(felem out,limb scalar)261 static void felem_scalar64(felem out, limb scalar)
262 {
263     out[0] *= scalar;
264     out[1] *= scalar;
265     out[2] *= scalar;
266     out[3] *= scalar;
267     out[4] *= scalar;
268     out[5] *= scalar;
269     out[6] *= scalar;
270     out[7] *= scalar;
271     out[8] *= scalar;
272 }
273 
274 /* felem_scalar128 sets out = out * scalar */
felem_scalar128(largefelem out,limb scalar)275 static void felem_scalar128(largefelem out, limb scalar)
276 {
277     out[0] *= scalar;
278     out[1] *= scalar;
279     out[2] *= scalar;
280     out[3] *= scalar;
281     out[4] *= scalar;
282     out[5] *= scalar;
283     out[6] *= scalar;
284     out[7] *= scalar;
285     out[8] *= scalar;
286 }
287 
288 /*-
289  * felem_neg sets |out| to |-in|
290  * On entry:
291  *   in[i] < 2^59 + 2^14
292  * On exit:
293  *   out[i] < 2^62
294  */
felem_neg(felem out,const felem in)295 static void felem_neg(felem out, const felem in)
296 {
297     /* In order to prevent underflow, we subtract from 0 mod p. */
298     static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
299     static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
300 
301     out[0] = two62m3 - in[0];
302     out[1] = two62m2 - in[1];
303     out[2] = two62m2 - in[2];
304     out[3] = two62m2 - in[3];
305     out[4] = two62m2 - in[4];
306     out[5] = two62m2 - in[5];
307     out[6] = two62m2 - in[6];
308     out[7] = two62m2 - in[7];
309     out[8] = two62m2 - in[8];
310 }
311 
312 /*-
313  * felem_diff64 subtracts |in| from |out|
314  * On entry:
315  *   in[i] < 2^59 + 2^14
316  * On exit:
317  *   out[i] < out[i] + 2^62
318  */
felem_diff64(felem out,const felem in)319 static void felem_diff64(felem out, const felem in)
320 {
321     /*
322      * In order to prevent underflow, we add 0 mod p before subtracting.
323      */
324     static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
325     static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
326 
327     out[0] += two62m3 - in[0];
328     out[1] += two62m2 - in[1];
329     out[2] += two62m2 - in[2];
330     out[3] += two62m2 - in[3];
331     out[4] += two62m2 - in[4];
332     out[5] += two62m2 - in[5];
333     out[6] += two62m2 - in[6];
334     out[7] += two62m2 - in[7];
335     out[8] += two62m2 - in[8];
336 }
337 
338 /*-
339  * felem_diff_128_64 subtracts |in| from |out|
340  * On entry:
341  *   in[i] < 2^62 + 2^17
342  * On exit:
343  *   out[i] < out[i] + 2^63
344  */
felem_diff_128_64(largefelem out,const felem in)345 static void felem_diff_128_64(largefelem out, const felem in)
346 {
347     /*
348      * In order to prevent underflow, we add 64p mod p (which is equivalent
349      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
350      * digit number with all bits set to 1. See "The representation of field
351      * elements" comment above for a description of how limbs are used to
352      * represent a number. 64p is represented with 8 limbs containing a number
353      * with 58 bits set and one limb with a number with 57 bits set.
354      */
355     static const limb two63m6 = (((limb)1) << 63) - (((limb)1) << 6);
356     static const limb two63m5 = (((limb)1) << 63) - (((limb)1) << 5);
357 
358     out[0] += two63m6 - in[0];
359     out[1] += two63m5 - in[1];
360     out[2] += two63m5 - in[2];
361     out[3] += two63m5 - in[3];
362     out[4] += two63m5 - in[4];
363     out[5] += two63m5 - in[5];
364     out[6] += two63m5 - in[6];
365     out[7] += two63m5 - in[7];
366     out[8] += two63m5 - in[8];
367 }
368 
369 /*-
370  * felem_diff_128_64 subtracts |in| from |out|
371  * On entry:
372  *   in[i] < 2^126
373  * On exit:
374  *   out[i] < out[i] + 2^127 - 2^69
375  */
felem_diff128(largefelem out,const largefelem in)376 static void felem_diff128(largefelem out, const largefelem in)
377 {
378     /*
379      * In order to prevent underflow, we add 0 mod p before subtracting.
380      */
381     static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
382     static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
383 
384     out[0] += (two127m70 - in[0]);
385     out[1] += (two127m69 - in[1]);
386     out[2] += (two127m69 - in[2]);
387     out[3] += (two127m69 - in[3]);
388     out[4] += (two127m69 - in[4]);
389     out[5] += (two127m69 - in[5]);
390     out[6] += (two127m69 - in[6]);
391     out[7] += (two127m69 - in[7]);
392     out[8] += (two127m69 - in[8]);
393 }
394 
395 /*-
396  * felem_square sets |out| = |in|^2
397  * On entry:
398  *   in[i] < 2^62
399  * On exit:
400  *   out[i] < 17 * max(in[i]) * max(in[i])
401  */
felem_square_ref(largefelem out,const felem in)402 static void felem_square_ref(largefelem out, const felem in)
403 {
404     felem inx2, inx4;
405     felem_scalar(inx2, in, 2);
406     felem_scalar(inx4, in, 4);
407 
408     /*-
409      * We have many cases were we want to do
410      *   in[x] * in[y] +
411      *   in[y] * in[x]
412      * This is obviously just
413      *   2 * in[x] * in[y]
414      * However, rather than do the doubling on the 128 bit result, we
415      * double one of the inputs to the multiplication by reading from
416      * |inx2|
417      */
418 
419     out[0] = ((uint128_t)in[0]) * in[0];
420     out[1] = ((uint128_t)in[0]) * inx2[1];
421     out[2] = ((uint128_t)in[0]) * inx2[2] + ((uint128_t)in[1]) * in[1];
422     out[3] = ((uint128_t)in[0]) * inx2[3] + ((uint128_t)in[1]) * inx2[2];
423     out[4] = ((uint128_t)in[0]) * inx2[4] + ((uint128_t)in[1]) * inx2[3] + ((uint128_t)in[2]) * in[2];
424     out[5] = ((uint128_t)in[0]) * inx2[5] + ((uint128_t)in[1]) * inx2[4] + ((uint128_t)in[2]) * inx2[3];
425     out[6] = ((uint128_t)in[0]) * inx2[6] + ((uint128_t)in[1]) * inx2[5] + ((uint128_t)in[2]) * inx2[4] + ((uint128_t)in[3]) * in[3];
426     out[7] = ((uint128_t)in[0]) * inx2[7] + ((uint128_t)in[1]) * inx2[6] + ((uint128_t)in[2]) * inx2[5] + ((uint128_t)in[3]) * inx2[4];
427     out[8] = ((uint128_t)in[0]) * inx2[8] + ((uint128_t)in[1]) * inx2[7] + ((uint128_t)in[2]) * inx2[6] + ((uint128_t)in[3]) * inx2[5] + ((uint128_t)in[4]) * in[4];
428 
429     /*
430      * The remaining limbs fall above 2^521, with the first falling at 2^522.
431      * They correspond to locations one bit up from the limbs produced above
432      * so we would have to multiply by two to align them. Again, rather than
433      * operate on the 128-bit result, we double one of the inputs to the
434      * multiplication. If we want to double for both this reason, and the
435      * reason above, then we end up multiplying by four.
436      */
437 
438     /* 9 */
439     out[0] += ((uint128_t)in[1]) * inx4[8] + ((uint128_t)in[2]) * inx4[7] + ((uint128_t)in[3]) * inx4[6] + ((uint128_t)in[4]) * inx4[5];
440 
441     /* 10 */
442     out[1] += ((uint128_t)in[2]) * inx4[8] + ((uint128_t)in[3]) * inx4[7] + ((uint128_t)in[4]) * inx4[6] + ((uint128_t)in[5]) * inx2[5];
443 
444     /* 11 */
445     out[2] += ((uint128_t)in[3]) * inx4[8] + ((uint128_t)in[4]) * inx4[7] + ((uint128_t)in[5]) * inx4[6];
446 
447     /* 12 */
448     out[3] += ((uint128_t)in[4]) * inx4[8] + ((uint128_t)in[5]) * inx4[7] + ((uint128_t)in[6]) * inx2[6];
449 
450     /* 13 */
451     out[4] += ((uint128_t)in[5]) * inx4[8] + ((uint128_t)in[6]) * inx4[7];
452 
453     /* 14 */
454     out[5] += ((uint128_t)in[6]) * inx4[8] + ((uint128_t)in[7]) * inx2[7];
455 
456     /* 15 */
457     out[6] += ((uint128_t)in[7]) * inx4[8];
458 
459     /* 16 */
460     out[7] += ((uint128_t)in[8]) * inx2[8];
461 }
462 
463 /*-
464  * felem_mul sets |out| = |in1| * |in2|
465  * On entry:
466  *   in1[i] < 2^64
467  *   in2[i] < 2^63
468  * On exit:
469  *   out[i] < 17 * max(in1[i]) * max(in2[i])
470  */
felem_mul_ref(largefelem out,const felem in1,const felem in2)471 static void felem_mul_ref(largefelem out, const felem in1, const felem in2)
472 {
473     felem in2x2;
474     felem_scalar(in2x2, in2, 2);
475 
476     out[0] = ((uint128_t)in1[0]) * in2[0];
477 
478     out[1] = ((uint128_t)in1[0]) * in2[1] + ((uint128_t)in1[1]) * in2[0];
479 
480     out[2] = ((uint128_t)in1[0]) * in2[2] + ((uint128_t)in1[1]) * in2[1] + ((uint128_t)in1[2]) * in2[0];
481 
482     out[3] = ((uint128_t)in1[0]) * in2[3] + ((uint128_t)in1[1]) * in2[2] + ((uint128_t)in1[2]) * in2[1] + ((uint128_t)in1[3]) * in2[0];
483 
484     out[4] = ((uint128_t)in1[0]) * in2[4] + ((uint128_t)in1[1]) * in2[3] + ((uint128_t)in1[2]) * in2[2] + ((uint128_t)in1[3]) * in2[1] + ((uint128_t)in1[4]) * in2[0];
485 
486     out[5] = ((uint128_t)in1[0]) * in2[5] + ((uint128_t)in1[1]) * in2[4] + ((uint128_t)in1[2]) * in2[3] + ((uint128_t)in1[3]) * in2[2] + ((uint128_t)in1[4]) * in2[1] + ((uint128_t)in1[5]) * in2[0];
487 
488     out[6] = ((uint128_t)in1[0]) * in2[6] + ((uint128_t)in1[1]) * in2[5] + ((uint128_t)in1[2]) * in2[4] + ((uint128_t)in1[3]) * in2[3] + ((uint128_t)in1[4]) * in2[2] + ((uint128_t)in1[5]) * in2[1] + ((uint128_t)in1[6]) * in2[0];
489 
490     out[7] = ((uint128_t)in1[0]) * in2[7] + ((uint128_t)in1[1]) * in2[6] + ((uint128_t)in1[2]) * in2[5] + ((uint128_t)in1[3]) * in2[4] + ((uint128_t)in1[4]) * in2[3] + ((uint128_t)in1[5]) * in2[2] + ((uint128_t)in1[6]) * in2[1] + ((uint128_t)in1[7]) * in2[0];
491 
492     out[8] = ((uint128_t)in1[0]) * in2[8] + ((uint128_t)in1[1]) * in2[7] + ((uint128_t)in1[2]) * in2[6] + ((uint128_t)in1[3]) * in2[5] + ((uint128_t)in1[4]) * in2[4] + ((uint128_t)in1[5]) * in2[3] + ((uint128_t)in1[6]) * in2[2] + ((uint128_t)in1[7]) * in2[1] + ((uint128_t)in1[8]) * in2[0];
493 
494     /* See comment in felem_square about the use of in2x2 here */
495 
496     out[0] += ((uint128_t)in1[1]) * in2x2[8] + ((uint128_t)in1[2]) * in2x2[7] + ((uint128_t)in1[3]) * in2x2[6] + ((uint128_t)in1[4]) * in2x2[5] + ((uint128_t)in1[5]) * in2x2[4] + ((uint128_t)in1[6]) * in2x2[3] + ((uint128_t)in1[7]) * in2x2[2] + ((uint128_t)in1[8]) * in2x2[1];
497 
498     out[1] += ((uint128_t)in1[2]) * in2x2[8] + ((uint128_t)in1[3]) * in2x2[7] + ((uint128_t)in1[4]) * in2x2[6] + ((uint128_t)in1[5]) * in2x2[5] + ((uint128_t)in1[6]) * in2x2[4] + ((uint128_t)in1[7]) * in2x2[3] + ((uint128_t)in1[8]) * in2x2[2];
499 
500     out[2] += ((uint128_t)in1[3]) * in2x2[8] + ((uint128_t)in1[4]) * in2x2[7] + ((uint128_t)in1[5]) * in2x2[6] + ((uint128_t)in1[6]) * in2x2[5] + ((uint128_t)in1[7]) * in2x2[4] + ((uint128_t)in1[8]) * in2x2[3];
501 
502     out[3] += ((uint128_t)in1[4]) * in2x2[8] + ((uint128_t)in1[5]) * in2x2[7] + ((uint128_t)in1[6]) * in2x2[6] + ((uint128_t)in1[7]) * in2x2[5] + ((uint128_t)in1[8]) * in2x2[4];
503 
504     out[4] += ((uint128_t)in1[5]) * in2x2[8] + ((uint128_t)in1[6]) * in2x2[7] + ((uint128_t)in1[7]) * in2x2[6] + ((uint128_t)in1[8]) * in2x2[5];
505 
506     out[5] += ((uint128_t)in1[6]) * in2x2[8] + ((uint128_t)in1[7]) * in2x2[7] + ((uint128_t)in1[8]) * in2x2[6];
507 
508     out[6] += ((uint128_t)in1[7]) * in2x2[8] + ((uint128_t)in1[8]) * in2x2[7];
509 
510     out[7] += ((uint128_t)in1[8]) * in2x2[8];
511 }
512 
513 static const limb bottom52bits = 0xfffffffffffff;
514 
515 /*-
516  * felem_reduce converts a largefelem to an felem.
517  * On entry:
518  *   in[i] < 2^128
519  * On exit:
520  *   out[i] < 2^59 + 2^14
521  */
felem_reduce(felem out,const largefelem in)522 static void felem_reduce(felem out, const largefelem in)
523 {
524     u64 overflow1, overflow2;
525 
526     out[0] = ((limb)in[0]) & bottom58bits;
527     out[1] = ((limb)in[1]) & bottom58bits;
528     out[2] = ((limb)in[2]) & bottom58bits;
529     out[3] = ((limb)in[3]) & bottom58bits;
530     out[4] = ((limb)in[4]) & bottom58bits;
531     out[5] = ((limb)in[5]) & bottom58bits;
532     out[6] = ((limb)in[6]) & bottom58bits;
533     out[7] = ((limb)in[7]) & bottom58bits;
534     out[8] = ((limb)in[8]) & bottom58bits;
535 
536     /* out[i] < 2^58 */
537 
538     out[1] += ((limb)in[0]) >> 58;
539     out[1] += (((limb)(in[0] >> 64)) & bottom52bits) << 6;
540     /*-
541      * out[1] < 2^58 + 2^6 + 2^58
542      *        = 2^59 + 2^6
543      */
544     out[2] += ((limb)(in[0] >> 64)) >> 52;
545 
546     out[2] += ((limb)in[1]) >> 58;
547     out[2] += (((limb)(in[1] >> 64)) & bottom52bits) << 6;
548     out[3] += ((limb)(in[1] >> 64)) >> 52;
549 
550     out[3] += ((limb)in[2]) >> 58;
551     out[3] += (((limb)(in[2] >> 64)) & bottom52bits) << 6;
552     out[4] += ((limb)(in[2] >> 64)) >> 52;
553 
554     out[4] += ((limb)in[3]) >> 58;
555     out[4] += (((limb)(in[3] >> 64)) & bottom52bits) << 6;
556     out[5] += ((limb)(in[3] >> 64)) >> 52;
557 
558     out[5] += ((limb)in[4]) >> 58;
559     out[5] += (((limb)(in[4] >> 64)) & bottom52bits) << 6;
560     out[6] += ((limb)(in[4] >> 64)) >> 52;
561 
562     out[6] += ((limb)in[5]) >> 58;
563     out[6] += (((limb)(in[5] >> 64)) & bottom52bits) << 6;
564     out[7] += ((limb)(in[5] >> 64)) >> 52;
565 
566     out[7] += ((limb)in[6]) >> 58;
567     out[7] += (((limb)(in[6] >> 64)) & bottom52bits) << 6;
568     out[8] += ((limb)(in[6] >> 64)) >> 52;
569 
570     out[8] += ((limb)in[7]) >> 58;
571     out[8] += (((limb)(in[7] >> 64)) & bottom52bits) << 6;
572     /*-
573      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
574      *            < 2^59 + 2^13
575      */
576     overflow1 = ((limb)(in[7] >> 64)) >> 52;
577 
578     overflow1 += ((limb)in[8]) >> 58;
579     overflow1 += (((limb)(in[8] >> 64)) & bottom52bits) << 6;
580     overflow2 = ((limb)(in[8] >> 64)) >> 52;
581 
582     overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */
583     overflow2 <<= 1; /* overflow2 < 2^13 */
584 
585     out[0] += overflow1; /* out[0] < 2^60 */
586     out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */
587 
588     out[1] += out[0] >> 58;
589     out[0] &= bottom58bits;
590     /*-
591      * out[0] < 2^58
592      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
593      *        < 2^59 + 2^14
594      */
595 }
596 
597 #if defined(ECP_NISTP521_ASM)
598 static void felem_square_wrapper(largefelem out, const felem in);
599 static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2);
600 
601 static void (*felem_square_p)(largefelem out, const felem in) = felem_square_wrapper;
602 static void (*felem_mul_p)(largefelem out, const felem in1, const felem in2) = felem_mul_wrapper;
603 
604 void p521_felem_square(largefelem out, const felem in);
605 void p521_felem_mul(largefelem out, const felem in1, const felem in2);
606 
607 #if defined(_ARCH_PPC64)
608 #include "crypto/ppc_arch.h"
609 #endif
610 
felem_select(void)611 static void felem_select(void)
612 {
613 #if defined(_ARCH_PPC64)
614     if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
615         felem_square_p = p521_felem_square;
616         felem_mul_p = p521_felem_mul;
617 
618         return;
619     }
620 #endif
621 
622     /* Default */
623     felem_square_p = felem_square_ref;
624     felem_mul_p = felem_mul_ref;
625 }
626 
felem_square_wrapper(largefelem out,const felem in)627 static void felem_square_wrapper(largefelem out, const felem in)
628 {
629     felem_select();
630     felem_square_p(out, in);
631 }
632 
felem_mul_wrapper(largefelem out,const felem in1,const felem in2)633 static void felem_mul_wrapper(largefelem out, const felem in1, const felem in2)
634 {
635     felem_select();
636     felem_mul_p(out, in1, in2);
637 }
638 
639 #define felem_square felem_square_p
640 #define felem_mul felem_mul_p
641 #else
642 #define felem_square felem_square_ref
643 #define felem_mul felem_mul_ref
644 #endif
645 
felem_square_reduce(felem out,const felem in)646 static void felem_square_reduce(felem out, const felem in)
647 {
648     largefelem tmp;
649     felem_square(tmp, in);
650     felem_reduce(out, tmp);
651 }
652 
felem_mul_reduce(felem out,const felem in1,const felem in2)653 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
654 {
655     largefelem tmp;
656     felem_mul(tmp, in1, in2);
657     felem_reduce(out, tmp);
658 }
659 
660 /*-
661  * felem_inv calculates |out| = |in|^{-1}
662  *
663  * Based on Fermat's Little Theorem:
664  *   a^p = a (mod p)
665  *   a^{p-1} = 1 (mod p)
666  *   a^{p-2} = a^{-1} (mod p)
667  */
felem_inv(felem out,const felem in)668 static void felem_inv(felem out, const felem in)
669 {
670     felem ftmp, ftmp2, ftmp3, ftmp4;
671     largefelem tmp;
672     unsigned i;
673 
674     felem_square(tmp, in);
675     felem_reduce(ftmp, tmp); /* 2^1 */
676     felem_mul(tmp, in, ftmp);
677     felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
678     felem_assign(ftmp2, ftmp);
679     felem_square(tmp, ftmp);
680     felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
681     felem_mul(tmp, in, ftmp);
682     felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */
683     felem_square(tmp, ftmp);
684     felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */
685 
686     felem_square(tmp, ftmp2);
687     felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */
688     felem_square(tmp, ftmp3);
689     felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */
690     felem_mul(tmp, ftmp3, ftmp2);
691     felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */
692 
693     felem_assign(ftmp2, ftmp3);
694     felem_square(tmp, ftmp3);
695     felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */
696     felem_square(tmp, ftmp3);
697     felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */
698     felem_square(tmp, ftmp3);
699     felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */
700     felem_square(tmp, ftmp3);
701     felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */
702     felem_mul(tmp, ftmp3, ftmp);
703     felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */
704     felem_square(tmp, ftmp4);
705     felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */
706     felem_mul(tmp, ftmp3, ftmp2);
707     felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */
708     felem_assign(ftmp2, ftmp3);
709 
710     for (i = 0; i < 8; i++) {
711         felem_square(tmp, ftmp3);
712         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
713     }
714     felem_mul(tmp, ftmp3, ftmp2);
715     felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */
716     felem_assign(ftmp2, ftmp3);
717 
718     for (i = 0; i < 16; i++) {
719         felem_square(tmp, ftmp3);
720         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
721     }
722     felem_mul(tmp, ftmp3, ftmp2);
723     felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */
724     felem_assign(ftmp2, ftmp3);
725 
726     for (i = 0; i < 32; i++) {
727         felem_square(tmp, ftmp3);
728         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
729     }
730     felem_mul(tmp, ftmp3, ftmp2);
731     felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */
732     felem_assign(ftmp2, ftmp3);
733 
734     for (i = 0; i < 64; i++) {
735         felem_square(tmp, ftmp3);
736         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
737     }
738     felem_mul(tmp, ftmp3, ftmp2);
739     felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */
740     felem_assign(ftmp2, ftmp3);
741 
742     for (i = 0; i < 128; i++) {
743         felem_square(tmp, ftmp3);
744         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
745     }
746     felem_mul(tmp, ftmp3, ftmp2);
747     felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */
748     felem_assign(ftmp2, ftmp3);
749 
750     for (i = 0; i < 256; i++) {
751         felem_square(tmp, ftmp3);
752         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
753     }
754     felem_mul(tmp, ftmp3, ftmp2);
755     felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */
756 
757     for (i = 0; i < 9; i++) {
758         felem_square(tmp, ftmp3);
759         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
760     }
761     felem_mul(tmp, ftmp3, ftmp4);
762     felem_reduce(ftmp3, tmp); /* 2^521 - 2^2 */
763     felem_mul(tmp, ftmp3, in);
764     felem_reduce(out, tmp); /* 2^521 - 3 */
765 }
766 
767 /* This is 2^521-1, expressed as an felem */
768 static const felem kPrime = {
769     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
770     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
771     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
772 };
773 
774 /*-
775  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
776  * otherwise.
777  * On entry:
778  *   in[i] < 2^59 + 2^14
779  */
felem_is_zero(const felem in)780 static limb felem_is_zero(const felem in)
781 {
782     felem ftmp;
783     limb is_zero, is_p;
784     felem_assign(ftmp, in);
785 
786     ftmp[0] += ftmp[8] >> 57;
787     ftmp[8] &= bottom57bits;
788     /* ftmp[8] < 2^57 */
789     ftmp[1] += ftmp[0] >> 58;
790     ftmp[0] &= bottom58bits;
791     ftmp[2] += ftmp[1] >> 58;
792     ftmp[1] &= bottom58bits;
793     ftmp[3] += ftmp[2] >> 58;
794     ftmp[2] &= bottom58bits;
795     ftmp[4] += ftmp[3] >> 58;
796     ftmp[3] &= bottom58bits;
797     ftmp[5] += ftmp[4] >> 58;
798     ftmp[4] &= bottom58bits;
799     ftmp[6] += ftmp[5] >> 58;
800     ftmp[5] &= bottom58bits;
801     ftmp[7] += ftmp[6] >> 58;
802     ftmp[6] &= bottom58bits;
803     ftmp[8] += ftmp[7] >> 58;
804     ftmp[7] &= bottom58bits;
805     /* ftmp[8] < 2^57 + 4 */
806 
807     /*
808      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
809      * than our bound for ftmp[8]. Therefore we only have to check if the
810      * zero is zero or 2^521-1.
811      */
812 
813     is_zero = 0;
814     is_zero |= ftmp[0];
815     is_zero |= ftmp[1];
816     is_zero |= ftmp[2];
817     is_zero |= ftmp[3];
818     is_zero |= ftmp[4];
819     is_zero |= ftmp[5];
820     is_zero |= ftmp[6];
821     is_zero |= ftmp[7];
822     is_zero |= ftmp[8];
823 
824     is_zero--;
825     /*
826      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
827      * can be set is if is_zero was 0 before the decrement.
828      */
829     is_zero = 0 - (is_zero >> 63);
830 
831     is_p = ftmp[0] ^ kPrime[0];
832     is_p |= ftmp[1] ^ kPrime[1];
833     is_p |= ftmp[2] ^ kPrime[2];
834     is_p |= ftmp[3] ^ kPrime[3];
835     is_p |= ftmp[4] ^ kPrime[4];
836     is_p |= ftmp[5] ^ kPrime[5];
837     is_p |= ftmp[6] ^ kPrime[6];
838     is_p |= ftmp[7] ^ kPrime[7];
839     is_p |= ftmp[8] ^ kPrime[8];
840 
841     is_p--;
842     is_p = 0 - (is_p >> 63);
843 
844     is_zero |= is_p;
845     return is_zero;
846 }
847 
felem_is_zero_int(const void * in)848 static int felem_is_zero_int(const void *in)
849 {
850     return (int)(felem_is_zero(in) & ((limb)1));
851 }
852 
853 /*-
854  * felem_contract converts |in| to its unique, minimal representation.
855  * On entry:
856  *   in[i] < 2^59 + 2^14
857  */
felem_contract(felem out,const felem in)858 static void felem_contract(felem out, const felem in)
859 {
860     limb is_p, is_greater, sign;
861     static const limb two58 = ((limb)1) << 58;
862 
863     felem_assign(out, in);
864 
865     out[0] += out[8] >> 57;
866     out[8] &= bottom57bits;
867     /* out[8] < 2^57 */
868     out[1] += out[0] >> 58;
869     out[0] &= bottom58bits;
870     out[2] += out[1] >> 58;
871     out[1] &= bottom58bits;
872     out[3] += out[2] >> 58;
873     out[2] &= bottom58bits;
874     out[4] += out[3] >> 58;
875     out[3] &= bottom58bits;
876     out[5] += out[4] >> 58;
877     out[4] &= bottom58bits;
878     out[6] += out[5] >> 58;
879     out[5] &= bottom58bits;
880     out[7] += out[6] >> 58;
881     out[6] &= bottom58bits;
882     out[8] += out[7] >> 58;
883     out[7] &= bottom58bits;
884     /* out[8] < 2^57 + 4 */
885 
886     /*
887      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
888      * out. See the comments in felem_is_zero regarding why we don't test for
889      * other multiples of the prime.
890      */
891 
892     /*
893      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
894      */
895 
896     is_p = out[0] ^ kPrime[0];
897     is_p |= out[1] ^ kPrime[1];
898     is_p |= out[2] ^ kPrime[2];
899     is_p |= out[3] ^ kPrime[3];
900     is_p |= out[4] ^ kPrime[4];
901     is_p |= out[5] ^ kPrime[5];
902     is_p |= out[6] ^ kPrime[6];
903     is_p |= out[7] ^ kPrime[7];
904     is_p |= out[8] ^ kPrime[8];
905 
906     is_p--;
907     is_p &= is_p << 32;
908     is_p &= is_p << 16;
909     is_p &= is_p << 8;
910     is_p &= is_p << 4;
911     is_p &= is_p << 2;
912     is_p &= is_p << 1;
913     is_p = 0 - (is_p >> 63);
914     is_p = ~is_p;
915 
916     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
917 
918     out[0] &= is_p;
919     out[1] &= is_p;
920     out[2] &= is_p;
921     out[3] &= is_p;
922     out[4] &= is_p;
923     out[5] &= is_p;
924     out[6] &= is_p;
925     out[7] &= is_p;
926     out[8] &= is_p;
927 
928     /*
929      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
930      * 57 is greater than zero as (2^521-1) + x >= 2^522
931      */
932     is_greater = out[8] >> 57;
933     is_greater |= is_greater << 32;
934     is_greater |= is_greater << 16;
935     is_greater |= is_greater << 8;
936     is_greater |= is_greater << 4;
937     is_greater |= is_greater << 2;
938     is_greater |= is_greater << 1;
939     is_greater = 0 - (is_greater >> 63);
940 
941     out[0] -= kPrime[0] & is_greater;
942     out[1] -= kPrime[1] & is_greater;
943     out[2] -= kPrime[2] & is_greater;
944     out[3] -= kPrime[3] & is_greater;
945     out[4] -= kPrime[4] & is_greater;
946     out[5] -= kPrime[5] & is_greater;
947     out[6] -= kPrime[6] & is_greater;
948     out[7] -= kPrime[7] & is_greater;
949     out[8] -= kPrime[8] & is_greater;
950 
951     /* Eliminate negative coefficients */
952     sign = -(out[0] >> 63);
953     out[0] += (two58 & sign);
954     out[1] -= (1 & sign);
955     sign = -(out[1] >> 63);
956     out[1] += (two58 & sign);
957     out[2] -= (1 & sign);
958     sign = -(out[2] >> 63);
959     out[2] += (two58 & sign);
960     out[3] -= (1 & sign);
961     sign = -(out[3] >> 63);
962     out[3] += (two58 & sign);
963     out[4] -= (1 & sign);
964     sign = -(out[4] >> 63);
965     out[4] += (two58 & sign);
966     out[5] -= (1 & sign);
967     sign = -(out[0] >> 63);
968     out[5] += (two58 & sign);
969     out[6] -= (1 & sign);
970     sign = -(out[6] >> 63);
971     out[6] += (two58 & sign);
972     out[7] -= (1 & sign);
973     sign = -(out[7] >> 63);
974     out[7] += (two58 & sign);
975     out[8] -= (1 & sign);
976     sign = -(out[5] >> 63);
977     out[5] += (two58 & sign);
978     out[6] -= (1 & sign);
979     sign = -(out[6] >> 63);
980     out[6] += (two58 & sign);
981     out[7] -= (1 & sign);
982     sign = -(out[7] >> 63);
983     out[7] += (two58 & sign);
984     out[8] -= (1 & sign);
985 }
986 
987 /*-
988  * Group operations
989  * ----------------
990  *
991  * Building on top of the field operations we have the operations on the
992  * elliptic curve group itself. Points on the curve are represented in Jacobian
993  * coordinates */
994 
995 /*-
996  * point_double calculates 2*(x_in, y_in, z_in)
997  *
998  * The method is taken from:
999  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1000  *
1001  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1002  * while x_out == y_in is not (maybe this works, but it's not tested). */
1003 static void
point_double(felem x_out,felem y_out,felem z_out,const felem x_in,const felem y_in,const felem z_in)1004 point_double(felem x_out, felem y_out, felem z_out,
1005     const felem x_in, const felem y_in, const felem z_in)
1006 {
1007     largefelem tmp, tmp2;
1008     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1009 
1010     felem_assign(ftmp, x_in);
1011     felem_assign(ftmp2, x_in);
1012 
1013     /* delta = z^2 */
1014     felem_square(tmp, z_in);
1015     felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */
1016 
1017     /* gamma = y^2 */
1018     felem_square(tmp, y_in);
1019     felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */
1020 
1021     /* beta = x*gamma */
1022     felem_mul(tmp, x_in, gamma);
1023     felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */
1024 
1025     /* alpha = 3*(x-delta)*(x+delta) */
1026     felem_diff64(ftmp, delta);
1027     /* ftmp[i] < 2^61 */
1028     felem_sum64(ftmp2, delta);
1029     /* ftmp2[i] < 2^60 + 2^15 */
1030     felem_scalar64(ftmp2, 3);
1031     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1032     felem_mul(tmp, ftmp, ftmp2);
1033     /*-
1034      * tmp[i] < 17(3*2^121 + 3*2^76)
1035      *        = 61*2^121 + 61*2^76
1036      *        < 64*2^121 + 64*2^76
1037      *        = 2^127 + 2^82
1038      *        < 2^128
1039      */
1040     felem_reduce(alpha, tmp);
1041 
1042     /* x' = alpha^2 - 8*beta */
1043     felem_square(tmp, alpha);
1044     /*
1045      * tmp[i] < 17*2^120 < 2^125
1046      */
1047     felem_assign(ftmp, beta);
1048     felem_scalar64(ftmp, 8);
1049     /* ftmp[i] < 2^62 + 2^17 */
1050     felem_diff_128_64(tmp, ftmp);
1051     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1052     felem_reduce(x_out, tmp);
1053 
1054     /* z' = (y + z)^2 - gamma - delta */
1055     felem_sum64(delta, gamma);
1056     /* delta[i] < 2^60 + 2^15 */
1057     felem_assign(ftmp, y_in);
1058     felem_sum64(ftmp, z_in);
1059     /* ftmp[i] < 2^60 + 2^15 */
1060     felem_square(tmp, ftmp);
1061     /*
1062      * tmp[i] < 17(2^122) < 2^127
1063      */
1064     felem_diff_128_64(tmp, delta);
1065     /* tmp[i] < 2^127 + 2^63 */
1066     felem_reduce(z_out, tmp);
1067 
1068     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1069     felem_scalar64(beta, 4);
1070     /* beta[i] < 2^61 + 2^16 */
1071     felem_diff64(beta, x_out);
1072     /* beta[i] < 2^61 + 2^60 + 2^16 */
1073     felem_mul(tmp, alpha, beta);
1074     /*-
1075      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1076      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1077      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1078      *        < 2^128
1079      */
1080     felem_square(tmp2, gamma);
1081     /*-
1082      * tmp2[i] < 17*(2^59 + 2^14)^2
1083      *         = 17*(2^118 + 2^74 + 2^28)
1084      */
1085     felem_scalar128(tmp2, 8);
1086     /*-
1087      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1088      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1089      *         < 2^126
1090      */
1091     felem_diff128(tmp, tmp2);
1092     /*-
1093      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1094      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1095      *          2^74 + 2^69 + 2^34 + 2^30
1096      *        < 2^128
1097      */
1098     felem_reduce(y_out, tmp);
1099 }
1100 
1101 /* copy_conditional copies in to out iff mask is all ones. */
copy_conditional(felem out,const felem in,limb mask)1102 static void copy_conditional(felem out, const felem in, limb mask)
1103 {
1104     unsigned i;
1105     for (i = 0; i < NLIMBS; ++i) {
1106         const limb tmp = mask & (in[i] ^ out[i]);
1107         out[i] ^= tmp;
1108     }
1109 }
1110 
1111 /*-
1112  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1113  *
1114  * The method is taken from
1115  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1116  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1117  *
1118  * This function includes a branch for checking whether the two input points
1119  * are equal (while not equal to the point at infinity). See comment below
1120  * on constant-time.
1121  */
point_add(felem x3,felem y3,felem z3,const felem x1,const felem y1,const felem z1,const int mixed,const felem x2,const felem y2,const felem z2)1122 static void point_add(felem x3, felem y3, felem z3,
1123     const felem x1, const felem y1, const felem z1,
1124     const int mixed, const felem x2, const felem y2,
1125     const felem z2)
1126 {
1127     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1128     largefelem tmp, tmp2;
1129     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1130     limb points_equal;
1131 
1132     z1_is_zero = felem_is_zero(z1);
1133     z2_is_zero = felem_is_zero(z2);
1134 
1135     /* ftmp = z1z1 = z1**2 */
1136     felem_square(tmp, z1);
1137     felem_reduce(ftmp, tmp);
1138 
1139     if (!mixed) {
1140         /* ftmp2 = z2z2 = z2**2 */
1141         felem_square(tmp, z2);
1142         felem_reduce(ftmp2, tmp);
1143 
1144         /* u1 = ftmp3 = x1*z2z2 */
1145         felem_mul(tmp, x1, ftmp2);
1146         felem_reduce(ftmp3, tmp);
1147 
1148         /* ftmp5 = z1 + z2 */
1149         felem_assign(ftmp5, z1);
1150         felem_sum64(ftmp5, z2);
1151         /* ftmp5[i] < 2^61 */
1152 
1153         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1154         felem_square(tmp, ftmp5);
1155         /* tmp[i] < 17*2^122 */
1156         felem_diff_128_64(tmp, ftmp);
1157         /* tmp[i] < 17*2^122 + 2^63 */
1158         felem_diff_128_64(tmp, ftmp2);
1159         /* tmp[i] < 17*2^122 + 2^64 */
1160         felem_reduce(ftmp5, tmp);
1161 
1162         /* ftmp2 = z2 * z2z2 */
1163         felem_mul(tmp, ftmp2, z2);
1164         felem_reduce(ftmp2, tmp);
1165 
1166         /* s1 = ftmp6 = y1 * z2**3 */
1167         felem_mul(tmp, y1, ftmp2);
1168         felem_reduce(ftmp6, tmp);
1169     } else {
1170         /*
1171          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1172          */
1173 
1174         /* u1 = ftmp3 = x1*z2z2 */
1175         felem_assign(ftmp3, x1);
1176 
1177         /* ftmp5 = 2*z1z2 */
1178         felem_scalar(ftmp5, z1, 2);
1179 
1180         /* s1 = ftmp6 = y1 * z2**3 */
1181         felem_assign(ftmp6, y1);
1182     }
1183 
1184     /* u2 = x2*z1z1 */
1185     felem_mul(tmp, x2, ftmp);
1186     /* tmp[i] < 17*2^120 */
1187 
1188     /* h = ftmp4 = u2 - u1 */
1189     felem_diff_128_64(tmp, ftmp3);
1190     /* tmp[i] < 17*2^120 + 2^63 */
1191     felem_reduce(ftmp4, tmp);
1192 
1193     x_equal = felem_is_zero(ftmp4);
1194 
1195     /* z_out = ftmp5 * h */
1196     felem_mul(tmp, ftmp5, ftmp4);
1197     felem_reduce(z_out, tmp);
1198 
1199     /* ftmp = z1 * z1z1 */
1200     felem_mul(tmp, ftmp, z1);
1201     felem_reduce(ftmp, tmp);
1202 
1203     /* s2 = tmp = y2 * z1**3 */
1204     felem_mul(tmp, y2, ftmp);
1205     /* tmp[i] < 17*2^120 */
1206 
1207     /* r = ftmp5 = (s2 - s1)*2 */
1208     felem_diff_128_64(tmp, ftmp6);
1209     /* tmp[i] < 17*2^120 + 2^63 */
1210     felem_reduce(ftmp5, tmp);
1211     y_equal = felem_is_zero(ftmp5);
1212     felem_scalar64(ftmp5, 2);
1213     /* ftmp5[i] < 2^61 */
1214 
1215     /*
1216      * The formulae are incorrect if the points are equal, in affine coordinates
1217      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1218      * happens.
1219      *
1220      * We use bitwise operations to avoid potential side-channels introduced by
1221      * the short-circuiting behaviour of boolean operators.
1222      *
1223      * The special case of either point being the point at infinity (z1 and/or
1224      * z2 are zero), is handled separately later on in this function, so we
1225      * avoid jumping to point_double here in those special cases.
1226      *
1227      * Notice the comment below on the implications of this branching for timing
1228      * leaks and why it is considered practically irrelevant.
1229      */
1230     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1231 
1232     if (points_equal) {
1233         /*
1234          * This is obviously not constant-time but it will almost-never happen
1235          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1236          * where the intermediate value gets very close to the group order.
1237          * Since |ossl_ec_GFp_nistp_recode_scalar_bits| produces signed digits
1238          * for the scalar, it's possible for the intermediate value to be a small
1239          * negative multiple of the base point, and for the final signed digit
1240          * to be the same value. We believe that this only occurs for the scalar
1241          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1242          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1243          * 71e913863f7, in that case the penultimate intermediate is -9G and
1244          * the final digit is also -9G. Since this only happens for a single
1245          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1246          * check whether a secret scalar was that exact value, can already do
1247          * so.)
1248          */
1249         point_double(x3, y3, z3, x1, y1, z1);
1250         return;
1251     }
1252 
1253     /* I = ftmp = (2h)**2 */
1254     felem_assign(ftmp, ftmp4);
1255     felem_scalar64(ftmp, 2);
1256     /* ftmp[i] < 2^61 */
1257     felem_square(tmp, ftmp);
1258     /* tmp[i] < 17*2^122 */
1259     felem_reduce(ftmp, tmp);
1260 
1261     /* J = ftmp2 = h * I */
1262     felem_mul(tmp, ftmp4, ftmp);
1263     felem_reduce(ftmp2, tmp);
1264 
1265     /* V = ftmp4 = U1 * I */
1266     felem_mul(tmp, ftmp3, ftmp);
1267     felem_reduce(ftmp4, tmp);
1268 
1269     /* x_out = r**2 - J - 2V */
1270     felem_square(tmp, ftmp5);
1271     /* tmp[i] < 17*2^122 */
1272     felem_diff_128_64(tmp, ftmp2);
1273     /* tmp[i] < 17*2^122 + 2^63 */
1274     felem_assign(ftmp3, ftmp4);
1275     felem_scalar64(ftmp4, 2);
1276     /* ftmp4[i] < 2^61 */
1277     felem_diff_128_64(tmp, ftmp4);
1278     /* tmp[i] < 17*2^122 + 2^64 */
1279     felem_reduce(x_out, tmp);
1280 
1281     /* y_out = r(V-x_out) - 2 * s1 * J */
1282     felem_diff64(ftmp3, x_out);
1283     /*
1284      * ftmp3[i] < 2^60 + 2^60 = 2^61
1285      */
1286     felem_mul(tmp, ftmp5, ftmp3);
1287     /* tmp[i] < 17*2^122 */
1288     felem_mul(tmp2, ftmp6, ftmp2);
1289     /* tmp2[i] < 17*2^120 */
1290     felem_scalar128(tmp2, 2);
1291     /* tmp2[i] < 17*2^121 */
1292     felem_diff128(tmp, tmp2);
1293     /*-
1294      * tmp[i] < 2^127 - 2^69 + 17*2^122
1295      *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1296      *        < 2^127
1297      */
1298     felem_reduce(y_out, tmp);
1299 
1300     copy_conditional(x_out, x2, z1_is_zero);
1301     copy_conditional(x_out, x1, z2_is_zero);
1302     copy_conditional(y_out, y2, z1_is_zero);
1303     copy_conditional(y_out, y1, z2_is_zero);
1304     copy_conditional(z_out, z2, z1_is_zero);
1305     copy_conditional(z_out, z1, z2_is_zero);
1306     felem_assign(x3, x_out);
1307     felem_assign(y3, y_out);
1308     felem_assign(z3, z_out);
1309 }
1310 
1311 /*-
1312  * Base point pre computation
1313  * --------------------------
1314  *
1315  * Two different sorts of precomputed tables are used in the following code.
1316  * Each contain various points on the curve, where each point is three field
1317  * elements (x, y, z).
1318  *
1319  * For the base point table, z is usually 1 (0 for the point at infinity).
1320  * This table has 16 elements:
1321  * index | bits    | point
1322  * ------+---------+------------------------------
1323  *     0 | 0 0 0 0 | 0G
1324  *     1 | 0 0 0 1 | 1G
1325  *     2 | 0 0 1 0 | 2^130G
1326  *     3 | 0 0 1 1 | (2^130 + 1)G
1327  *     4 | 0 1 0 0 | 2^260G
1328  *     5 | 0 1 0 1 | (2^260 + 1)G
1329  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1330  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1331  *     8 | 1 0 0 0 | 2^390G
1332  *     9 | 1 0 0 1 | (2^390 + 1)G
1333  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1334  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1335  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1336  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1337  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1338  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1339  *
1340  * The reason for this is so that we can clock bits into four different
1341  * locations when doing simple scalar multiplies against the base point.
1342  *
1343  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1344 
1345 /* gmul is the table of precomputed base points */
1346 static const felem gmul[16][3] = {
1347     { { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1348         { 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1349         { 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
1350     { { 0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1351           0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1352           0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404 },
1353         { 0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1354             0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1355             0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b },
1356         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1357     { { 0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1358           0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1359           0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5 },
1360         { 0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1361             0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1362             0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7 },
1363         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1364     { { 0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1365           0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1366           0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9 },
1367         { 0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1368             0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1369             0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe },
1370         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1371     { { 0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1372           0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1373           0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065 },
1374         { 0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1375             0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1376             0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524 },
1377         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1378     { { 0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1379           0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1380           0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe },
1381         { 0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1382             0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1383             0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7 },
1384         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1385     { { 0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1386           0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1387           0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256 },
1388         { 0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1389             0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1390             0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd },
1391         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1392     { { 0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1393           0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1394           0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23 },
1395         { 0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1396             0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1397             0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e },
1398         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1399     { { 0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1400           0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1401           0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5 },
1402         { 0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1403             0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1404             0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242 },
1405         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1406     { { 0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1407           0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1408           0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203 },
1409         { 0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1410             0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1411             0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f },
1412         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1413     { { 0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1414           0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1415           0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a },
1416         { 0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1417             0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1418             0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a },
1419         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1420     { { 0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1421           0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1422           0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b },
1423         { 0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1424             0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1425             0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f },
1426         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1427     { { 0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1428           0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1429           0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf },
1430         { 0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1431             0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1432             0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d },
1433         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1434     { { 0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1435           0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1436           0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684 },
1437         { 0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1438             0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1439             0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81 },
1440         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1441     { { 0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1442           0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1443           0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d },
1444         { 0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1445             0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1446             0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42 },
1447         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } },
1448     { { 0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1449           0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1450           0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f },
1451         { 0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1452             0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1453             0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055 },
1454         { 1, 0, 0, 0, 0, 0, 0, 0, 0 } }
1455 };
1456 
1457 /*
1458  * select_point selects the |idx|th point from a precomputation table and
1459  * copies it to out.
1460  */
1461 /* pre_comp below is of the size provided in |size| */
select_point(const limb idx,unsigned int size,const felem pre_comp[][3],felem out[3])1462 static void select_point(const limb idx, unsigned int size,
1463     const felem pre_comp[][3], felem out[3])
1464 {
1465     unsigned i, j;
1466     limb *outlimbs = &out[0][0];
1467 
1468     memset(out, 0, sizeof(*out) * 3);
1469 
1470     for (i = 0; i < size; i++) {
1471         const limb *inlimbs = &pre_comp[i][0][0];
1472         limb mask = i ^ idx;
1473         mask |= mask >> 4;
1474         mask |= mask >> 2;
1475         mask |= mask >> 1;
1476         mask &= 1;
1477         mask--;
1478         for (j = 0; j < NLIMBS * 3; j++)
1479             outlimbs[j] |= inlimbs[j] & mask;
1480     }
1481 }
1482 
1483 /* get_bit returns the |i|th bit in |in| */
get_bit(const felem_bytearray in,int i)1484 static char get_bit(const felem_bytearray in, int i)
1485 {
1486     if (i < 0)
1487         return 0;
1488     return (in[i >> 3] >> (i & 7)) & 1;
1489 }
1490 
1491 /*
1492  * Interleaved point multiplication using precomputed point multiples: The
1493  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1494  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1495  * generator, using certain (large) precomputed multiples in g_pre_comp.
1496  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1497  */
batch_mul(felem x_out,felem y_out,felem z_out,const felem_bytearray scalars[],const unsigned num_points,const u8 * g_scalar,const int mixed,const felem pre_comp[][17][3],const felem g_pre_comp[16][3])1498 static void batch_mul(felem x_out, felem y_out, felem z_out,
1499     const felem_bytearray scalars[],
1500     const unsigned num_points, const u8 *g_scalar,
1501     const int mixed, const felem pre_comp[][17][3],
1502     const felem g_pre_comp[16][3])
1503 {
1504     int i, skip;
1505     unsigned num, gen_mul = (g_scalar != NULL);
1506     felem nq[3], tmp[4];
1507     limb bits;
1508     u8 sign, digit;
1509 
1510     /* set nq to the point at infinity */
1511     memset(nq, 0, sizeof(nq));
1512 
1513     /*
1514      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1515      * of the generator (last quarter of rounds) and additions of other
1516      * points multiples (every 5th round).
1517      */
1518     skip = 1; /* save two point operations in the first
1519                * round */
1520     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1521         /* double */
1522         if (!skip)
1523             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1524 
1525         /* add multiples of the generator */
1526         if (gen_mul && (i <= 130)) {
1527             bits = get_bit(g_scalar, i + 390) << 3;
1528             if (i < 130) {
1529                 bits |= get_bit(g_scalar, i + 260) << 2;
1530                 bits |= get_bit(g_scalar, i + 130) << 1;
1531                 bits |= get_bit(g_scalar, i);
1532             }
1533             /* select the point to add, in constant time */
1534             select_point(bits, 16, g_pre_comp, tmp);
1535             if (!skip) {
1536                 /* The 1 argument below is for "mixed" */
1537                 point_add(nq[0], nq[1], nq[2],
1538                     nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1539             } else {
1540                 memcpy(nq, tmp, 3 * sizeof(felem));
1541                 skip = 0;
1542             }
1543         }
1544 
1545         /* do other additions every 5 doublings */
1546         if (num_points && (i % 5 == 0)) {
1547             /* loop over all scalars */
1548             for (num = 0; num < num_points; ++num) {
1549                 bits = get_bit(scalars[num], i + 4) << 5;
1550                 bits |= get_bit(scalars[num], i + 3) << 4;
1551                 bits |= get_bit(scalars[num], i + 2) << 3;
1552                 bits |= get_bit(scalars[num], i + 1) << 2;
1553                 bits |= get_bit(scalars[num], i) << 1;
1554                 bits |= get_bit(scalars[num], i - 1);
1555                 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1556 
1557                 /*
1558                  * select the point to add or subtract, in constant time
1559                  */
1560                 select_point(digit, 17, pre_comp[num], tmp);
1561                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1562                                             * point */
1563                 copy_conditional(tmp[1], tmp[3], (-(limb)sign));
1564 
1565                 if (!skip) {
1566                     point_add(nq[0], nq[1], nq[2],
1567                         nq[0], nq[1], nq[2],
1568                         mixed, tmp[0], tmp[1], tmp[2]);
1569                 } else {
1570                     memcpy(nq, tmp, 3 * sizeof(felem));
1571                     skip = 0;
1572                 }
1573             }
1574         }
1575     }
1576     felem_assign(x_out, nq[0]);
1577     felem_assign(y_out, nq[1]);
1578     felem_assign(z_out, nq[2]);
1579 }
1580 
1581 /* Precomputation for the group generator. */
1582 struct nistp521_pre_comp_st {
1583     felem g_pre_comp[16][3];
1584     CRYPTO_REF_COUNT references;
1585 };
1586 
EC_GFp_nistp521_method(void)1587 const EC_METHOD *EC_GFp_nistp521_method(void)
1588 {
1589     static const EC_METHOD ret = {
1590         EC_FLAGS_DEFAULT_OCT,
1591         NID_X9_62_prime_field,
1592         ossl_ec_GFp_nistp521_group_init,
1593         ossl_ec_GFp_simple_group_finish,
1594         ossl_ec_GFp_simple_group_clear_finish,
1595         ossl_ec_GFp_nist_group_copy,
1596         ossl_ec_GFp_nistp521_group_set_curve,
1597         ossl_ec_GFp_simple_group_get_curve,
1598         ossl_ec_GFp_simple_group_get_degree,
1599         ossl_ec_group_simple_order_bits,
1600         ossl_ec_GFp_simple_group_check_discriminant,
1601         ossl_ec_GFp_simple_point_init,
1602         ossl_ec_GFp_simple_point_finish,
1603         ossl_ec_GFp_simple_point_clear_finish,
1604         ossl_ec_GFp_simple_point_copy,
1605         ossl_ec_GFp_simple_point_set_to_infinity,
1606         ossl_ec_GFp_simple_point_set_affine_coordinates,
1607         ossl_ec_GFp_nistp521_point_get_affine_coordinates,
1608         0 /* point_set_compressed_coordinates */,
1609         0 /* point2oct */,
1610         0 /* oct2point */,
1611         ossl_ec_GFp_simple_add,
1612         ossl_ec_GFp_simple_dbl,
1613         ossl_ec_GFp_simple_invert,
1614         ossl_ec_GFp_simple_is_at_infinity,
1615         ossl_ec_GFp_simple_is_on_curve,
1616         ossl_ec_GFp_simple_cmp,
1617         ossl_ec_GFp_simple_make_affine,
1618         ossl_ec_GFp_simple_points_make_affine,
1619         ossl_ec_GFp_nistp521_points_mul,
1620         ossl_ec_GFp_nistp521_precompute_mult,
1621         ossl_ec_GFp_nistp521_have_precompute_mult,
1622         ossl_ec_GFp_nist_field_mul,
1623         ossl_ec_GFp_nist_field_sqr,
1624         0 /* field_div */,
1625         ossl_ec_GFp_simple_field_inv,
1626         0 /* field_encode */,
1627         0 /* field_decode */,
1628         0, /* field_set_to_one */
1629         ossl_ec_key_simple_priv2oct,
1630         ossl_ec_key_simple_oct2priv,
1631         0, /* set private */
1632         ossl_ec_key_simple_generate_key,
1633         ossl_ec_key_simple_check_key,
1634         ossl_ec_key_simple_generate_public_key,
1635         0, /* keycopy */
1636         0, /* keyfinish */
1637         ossl_ecdh_simple_compute_key,
1638         ossl_ecdsa_simple_sign_setup,
1639         ossl_ecdsa_simple_sign_sig,
1640         ossl_ecdsa_simple_verify_sig,
1641         0, /* field_inverse_mod_ord */
1642         0, /* blind_coordinates */
1643         0, /* ladder_pre */
1644         0, /* ladder_step */
1645         0 /* ladder_post */
1646     };
1647 
1648     return &ret;
1649 }
1650 
1651 /******************************************************************************/
1652 /*
1653  * FUNCTIONS TO MANAGE PRECOMPUTATION
1654  */
1655 
nistp521_pre_comp_new(void)1656 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1657 {
1658     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1659 
1660     if (ret == NULL)
1661         return ret;
1662 
1663     if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1664         OPENSSL_free(ret);
1665         return NULL;
1666     }
1667     return ret;
1668 }
1669 
EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP * p)1670 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1671 {
1672     int i;
1673     if (p != NULL)
1674         CRYPTO_UP_REF(&p->references, &i);
1675     return p;
1676 }
1677 
EC_nistp521_pre_comp_free(NISTP521_PRE_COMP * p)1678 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1679 {
1680     int i;
1681 
1682     if (p == NULL)
1683         return;
1684 
1685     CRYPTO_DOWN_REF(&p->references, &i);
1686     REF_PRINT_COUNT("EC_nistp521", i, p);
1687     if (i > 0)
1688         return;
1689     REF_ASSERT_ISNT(i < 0);
1690 
1691     CRYPTO_FREE_REF(&p->references);
1692     OPENSSL_free(p);
1693 }
1694 
1695 /******************************************************************************/
1696 /*
1697  * OPENSSL EC_METHOD FUNCTIONS
1698  */
1699 
ossl_ec_GFp_nistp521_group_init(EC_GROUP * group)1700 int ossl_ec_GFp_nistp521_group_init(EC_GROUP *group)
1701 {
1702     int ret;
1703     ret = ossl_ec_GFp_simple_group_init(group);
1704     group->a_is_minus3 = 1;
1705     return ret;
1706 }
1707 
ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP * group,const BIGNUM * p,const BIGNUM * a,const BIGNUM * b,BN_CTX * ctx)1708 int ossl_ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1709     const BIGNUM *a, const BIGNUM *b,
1710     BN_CTX *ctx)
1711 {
1712     int ret = 0;
1713     BIGNUM *curve_p, *curve_a, *curve_b;
1714 #ifndef FIPS_MODULE
1715     BN_CTX *new_ctx = NULL;
1716 
1717     if (ctx == NULL)
1718         ctx = new_ctx = BN_CTX_new();
1719 #endif
1720     if (ctx == NULL)
1721         return 0;
1722 
1723     BN_CTX_start(ctx);
1724     curve_p = BN_CTX_get(ctx);
1725     curve_a = BN_CTX_get(ctx);
1726     curve_b = BN_CTX_get(ctx);
1727     if (curve_b == NULL)
1728         goto err;
1729     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1730     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1731     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1732     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1733         ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1734         goto err;
1735     }
1736     group->field_mod_func = BN_nist_mod_521;
1737     ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1738 err:
1739     BN_CTX_end(ctx);
1740 #ifndef FIPS_MODULE
1741     BN_CTX_free(new_ctx);
1742 #endif
1743     return ret;
1744 }
1745 
1746 /*
1747  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1748  * (X/Z^2, Y/Z^3)
1749  */
ossl_ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP * group,const EC_POINT * point,BIGNUM * x,BIGNUM * y,BN_CTX * ctx)1750 int ossl_ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1751     const EC_POINT *point,
1752     BIGNUM *x, BIGNUM *y,
1753     BN_CTX *ctx)
1754 {
1755     felem z1, z2, x_in, y_in, x_out, y_out;
1756     largefelem tmp;
1757 
1758     if (EC_POINT_is_at_infinity(group, point)) {
1759         ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1760         return 0;
1761     }
1762     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || (!BN_to_felem(z1, point->Z)))
1763         return 0;
1764     felem_inv(z2, z1);
1765     felem_square(tmp, z2);
1766     felem_reduce(z1, tmp);
1767     felem_mul(tmp, x_in, z1);
1768     felem_reduce(x_in, tmp);
1769     felem_contract(x_out, x_in);
1770     if (x != NULL) {
1771         if (!felem_to_BN(x, x_out)) {
1772             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1773             return 0;
1774         }
1775     }
1776     felem_mul(tmp, z1, z2);
1777     felem_reduce(z1, tmp);
1778     felem_mul(tmp, y_in, z1);
1779     felem_reduce(y_in, tmp);
1780     felem_contract(y_out, y_in);
1781     if (y != NULL) {
1782         if (!felem_to_BN(y, y_out)) {
1783             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1784             return 0;
1785         }
1786     }
1787     return 1;
1788 }
1789 
1790 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
make_points_affine(size_t num,felem points[][3],felem tmp_felems[])1791 static void make_points_affine(size_t num, felem points[][3],
1792     felem tmp_felems[])
1793 {
1794     /*
1795      * Runs in constant time, unless an input is the point at infinity (which
1796      * normally shouldn't happen).
1797      */
1798     ossl_ec_GFp_nistp_points_make_affine_internal(num,
1799         points,
1800         sizeof(felem),
1801         tmp_felems,
1802         (void (*)(void *))felem_one,
1803         felem_is_zero_int,
1804         (void (*)(void *, const void *))
1805             felem_assign,
1806         (void (*)(void *, const void *))
1807             felem_square_reduce,
1808         (void (*)(void *,
1809             const void
1810                 *,
1811             const void
1812                 *))
1813             felem_mul_reduce,
1814         (void (*)(void *, const void *))
1815             felem_inv,
1816         (void (*)(void *, const void *))
1817             felem_contract);
1818 }
1819 
1820 /*
1821  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1822  * values Result is stored in r (r can equal one of the inputs).
1823  */
ossl_ec_GFp_nistp521_points_mul(const EC_GROUP * group,EC_POINT * r,const BIGNUM * scalar,size_t num,const EC_POINT * points[],const BIGNUM * scalars[],BN_CTX * ctx)1824 int ossl_ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1825     const BIGNUM *scalar, size_t num,
1826     const EC_POINT *points[],
1827     const BIGNUM *scalars[], BN_CTX *ctx)
1828 {
1829     int ret = 0;
1830     int j;
1831     int mixed = 0;
1832     BIGNUM *x, *y, *z, *tmp_scalar;
1833     felem_bytearray g_secret;
1834     felem_bytearray *secrets = NULL;
1835     felem(*pre_comp)[17][3] = NULL;
1836     felem *tmp_felems = NULL;
1837     unsigned i;
1838     int num_bytes;
1839     int have_pre_comp = 0;
1840     size_t num_points = num;
1841     felem x_in, y_in, z_in, x_out, y_out, z_out;
1842     NISTP521_PRE_COMP *pre = NULL;
1843     felem(*g_pre_comp)[3] = NULL;
1844     EC_POINT *generator = NULL;
1845     const EC_POINT *p = NULL;
1846     const BIGNUM *p_scalar = NULL;
1847 
1848     BN_CTX_start(ctx);
1849     x = BN_CTX_get(ctx);
1850     y = BN_CTX_get(ctx);
1851     z = BN_CTX_get(ctx);
1852     tmp_scalar = BN_CTX_get(ctx);
1853     if (tmp_scalar == NULL)
1854         goto err;
1855 
1856     if (scalar != NULL) {
1857         pre = group->pre_comp.nistp521;
1858         if (pre)
1859             /* we have precomputation, try to use it */
1860             g_pre_comp = &pre->g_pre_comp[0];
1861         else
1862             /* try to use the standard precomputation */
1863             g_pre_comp = (felem(*)[3])gmul;
1864         generator = EC_POINT_new(group);
1865         if (generator == NULL)
1866             goto err;
1867         /* get the generator from precomputation */
1868         if (!felem_to_BN(x, g_pre_comp[1][0]) || !felem_to_BN(y, g_pre_comp[1][1]) || !felem_to_BN(z, g_pre_comp[1][2])) {
1869             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1870             goto err;
1871         }
1872         if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
1873                 generator,
1874                 x, y, z, ctx))
1875             goto err;
1876         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1877             /* precomputation matches generator */
1878             have_pre_comp = 1;
1879         else
1880             /*
1881              * we don't have valid precomputation: treat the generator as a
1882              * random point
1883              */
1884             num_points++;
1885     }
1886 
1887     if (num_points > 0) {
1888         if (num_points >= 2) {
1889             /*
1890              * unless we precompute multiples for just one point, converting
1891              * those into affine form is time well spent
1892              */
1893             mixed = 1;
1894         }
1895         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1896         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1897         if (mixed)
1898             tmp_felems = OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1899         if ((secrets == NULL) || (pre_comp == NULL)
1900             || (mixed && (tmp_felems == NULL)))
1901             goto err;
1902 
1903         /*
1904          * we treat NULL scalars as 0, and NULL points as points at infinity,
1905          * i.e., they contribute nothing to the linear combination
1906          */
1907         for (i = 0; i < num_points; ++i) {
1908             if (i == num) {
1909                 /*
1910                  * we didn't have a valid precomputation, so we pick the
1911                  * generator
1912                  */
1913                 p = EC_GROUP_get0_generator(group);
1914                 p_scalar = scalar;
1915             } else {
1916                 /* the i^th point */
1917                 p = points[i];
1918                 p_scalar = scalars[i];
1919             }
1920             if ((p_scalar != NULL) && (p != NULL)) {
1921                 /* reduce scalar to 0 <= scalar < 2^521 */
1922                 if ((BN_num_bits(p_scalar) > 521)
1923                     || (BN_is_negative(p_scalar))) {
1924                     /*
1925                      * this is an unusual input, and we don't guarantee
1926                      * constant-timeness
1927                      */
1928                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1929                         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1930                         goto err;
1931                     }
1932                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1933                         secrets[i], sizeof(secrets[i]));
1934                 } else {
1935                     num_bytes = BN_bn2lebinpad(p_scalar,
1936                         secrets[i], sizeof(secrets[i]));
1937                 }
1938                 if (num_bytes < 0) {
1939                     ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1940                     goto err;
1941                 }
1942                 /* precompute multiples */
1943                 if ((!BN_to_felem(x_out, p->X)) || (!BN_to_felem(y_out, p->Y)) || (!BN_to_felem(z_out, p->Z)))
1944                     goto err;
1945                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1946                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1947                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1948                 for (j = 2; j <= 16; ++j) {
1949                     if (j & 1) {
1950                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1951                             pre_comp[i][j][2], pre_comp[i][1][0],
1952                             pre_comp[i][1][1], pre_comp[i][1][2], 0,
1953                             pre_comp[i][j - 1][0],
1954                             pre_comp[i][j - 1][1],
1955                             pre_comp[i][j - 1][2]);
1956                     } else {
1957                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1958                             pre_comp[i][j][2], pre_comp[i][j / 2][0],
1959                             pre_comp[i][j / 2][1],
1960                             pre_comp[i][j / 2][2]);
1961                     }
1962                 }
1963             }
1964         }
1965         if (mixed)
1966             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1967     }
1968 
1969     /* the scalar for the generator */
1970     if ((scalar != NULL) && (have_pre_comp)) {
1971         memset(g_secret, 0, sizeof(g_secret));
1972         /* reduce scalar to 0 <= scalar < 2^521 */
1973         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1974             /*
1975              * this is an unusual input, and we don't guarantee
1976              * constant-timeness
1977              */
1978             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
1979                 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1980                 goto err;
1981             }
1982             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
1983         } else {
1984             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
1985         }
1986         /* do the multiplication with generator precomputation */
1987         batch_mul(x_out, y_out, z_out,
1988             (const felem_bytearray(*))secrets, num_points,
1989             g_secret,
1990             mixed, (const felem(*)[17][3])pre_comp,
1991             (const felem(*)[3])g_pre_comp);
1992     } else {
1993         /* do the multiplication without generator precomputation */
1994         batch_mul(x_out, y_out, z_out,
1995             (const felem_bytearray(*))secrets, num_points,
1996             NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1997     }
1998     /* reduce the output to its unique minimal representation */
1999     felem_contract(x_in, x_out);
2000     felem_contract(y_in, y_out);
2001     felem_contract(z_in, z_out);
2002     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || (!felem_to_BN(z, z_in))) {
2003         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2004         goto err;
2005     }
2006     ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2007         ctx);
2008 
2009 err:
2010     BN_CTX_end(ctx);
2011     EC_POINT_free(generator);
2012     OPENSSL_free(secrets);
2013     OPENSSL_free(pre_comp);
2014     OPENSSL_free(tmp_felems);
2015     return ret;
2016 }
2017 
ossl_ec_GFp_nistp521_precompute_mult(EC_GROUP * group,BN_CTX * ctx)2018 int ossl_ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2019 {
2020     int ret = 0;
2021     NISTP521_PRE_COMP *pre = NULL;
2022     int i, j;
2023     BIGNUM *x, *y;
2024     EC_POINT *generator = NULL;
2025     felem tmp_felems[16];
2026 #ifndef FIPS_MODULE
2027     BN_CTX *new_ctx = NULL;
2028 #endif
2029 
2030     /* throw away old precomputation */
2031     EC_pre_comp_free(group);
2032 
2033 #ifndef FIPS_MODULE
2034     if (ctx == NULL)
2035         ctx = new_ctx = BN_CTX_new();
2036 #endif
2037     if (ctx == NULL)
2038         return 0;
2039 
2040     BN_CTX_start(ctx);
2041     x = BN_CTX_get(ctx);
2042     y = BN_CTX_get(ctx);
2043     if (y == NULL)
2044         goto err;
2045     /* get the generator */
2046     if (group->generator == NULL)
2047         goto err;
2048     generator = EC_POINT_new(group);
2049     if (generator == NULL)
2050         goto err;
2051     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2052     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2053     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2054         goto err;
2055     if ((pre = nistp521_pre_comp_new()) == NULL)
2056         goto err;
2057     /*
2058      * if the generator is the standard one, use built-in precomputation
2059      */
2060     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2061         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2062         goto done;
2063     }
2064     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) || (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) || (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2065         goto err;
2066     /* compute 2^130*G, 2^260*G, 2^390*G */
2067     for (i = 1; i <= 4; i <<= 1) {
2068         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2069             pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2070             pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2071         for (j = 0; j < 129; ++j) {
2072             point_double(pre->g_pre_comp[2 * i][0],
2073                 pre->g_pre_comp[2 * i][1],
2074                 pre->g_pre_comp[2 * i][2],
2075                 pre->g_pre_comp[2 * i][0],
2076                 pre->g_pre_comp[2 * i][1],
2077                 pre->g_pre_comp[2 * i][2]);
2078         }
2079     }
2080     /* g_pre_comp[0] is the point at infinity */
2081     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2082     /* the remaining multiples */
2083     /* 2^130*G + 2^260*G */
2084     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2085         pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2086         pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2087         0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2088         pre->g_pre_comp[2][2]);
2089     /* 2^130*G + 2^390*G */
2090     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2091         pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2092         pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2093         0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2094         pre->g_pre_comp[2][2]);
2095     /* 2^260*G + 2^390*G */
2096     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2097         pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2098         pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2099         0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2100         pre->g_pre_comp[4][2]);
2101     /* 2^130*G + 2^260*G + 2^390*G */
2102     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2103         pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2104         pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2105         0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2106         pre->g_pre_comp[2][2]);
2107     for (i = 1; i < 8; ++i) {
2108         /* odd multiples: add G */
2109         point_add(pre->g_pre_comp[2 * i + 1][0],
2110             pre->g_pre_comp[2 * i + 1][1],
2111             pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2112             pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2113             pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2114             pre->g_pre_comp[1][2]);
2115     }
2116     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2117 
2118 done:
2119     SETPRECOMP(group, nistp521, pre);
2120     ret = 1;
2121     pre = NULL;
2122 err:
2123     BN_CTX_end(ctx);
2124     EC_POINT_free(generator);
2125 #ifndef FIPS_MODULE
2126     BN_CTX_free(new_ctx);
2127 #endif
2128     EC_nistp521_pre_comp_free(pre);
2129     return ret;
2130 }
2131 
ossl_ec_GFp_nistp521_have_precompute_mult(const EC_GROUP * group)2132 int ossl_ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2133 {
2134     return HAVEPRECOMP(group, nistp521);
2135 }
2136