xref: /src/crypto/openssl/crypto/ec/ecp_nistp256.c (revision f25b8c9fb4f58cf61adb47d7570abe7caa6d385d)
1 /*
2  * Copyright 2011-2024 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2011 Google Inc.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * ECDSA low level APIs are deprecated for public use, but still ok for
28  * internal use.
29  */
30 #include "internal/deprecated.h"
31 
32 /*
33  * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
34  *
35  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
36  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
37  * work which got its smarts from Daniel J. Bernstein's work on the same.
38  */
39 
40 #include <openssl/opensslconf.h>
41 
42 #include <stdint.h>
43 #include <string.h>
44 #include <openssl/err.h>
45 #include "ec_local.h"
46 
47 #include "internal/numbers.h"
48 
49 #ifndef INT128_MAX
50 #error "Your compiler doesn't appear to support 128-bit integer types"
51 #endif
52 
53 typedef uint8_t u8;
54 typedef uint32_t u32;
55 typedef uint64_t u64;
56 
57 /*
58  * The underlying field. P256 operates over GF(2^256-2^224+2^192+2^96-1). We
59  * can serialize an element of this field into 32 bytes. We call this an
60  * felem_bytearray.
61  */
62 
63 typedef u8 felem_bytearray[32];
64 
65 /*
66  * These are the parameters of P256, taken from FIPS 186-3, page 86. These
67  * values are big-endian.
68  */
69 static const felem_bytearray nistp256_curve_params[5] = {
70     { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */
71         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
72         0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
73         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
74     { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */
75         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
76         0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
77         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc },
78     { 0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, /* b */
79         0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
80         0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
81         0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b },
82     { 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */
83         0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
84         0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
85         0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96 },
86     { 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */
87         0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
88         0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
89         0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5 }
90 };
91 
92 /*-
93  * The representation of field elements.
94  * ------------------------------------
95  *
96  * We represent field elements with either four 128-bit values, eight 128-bit
97  * values, or four 64-bit values. The field element represented is:
98  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
99  * or:
100  *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[7]*2^448  (mod p)
101  *
102  * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
103  * apart, but are 128-bits wide, the most significant bits of each limb overlap
104  * with the least significant bits of the next.
105  *
106  * A field element with four limbs is an 'felem'. One with eight limbs is a
107  * 'longfelem'
108  *
109  * A field element with four, 64-bit values is called a 'smallfelem'. Small
110  * values are used as intermediate values before multiplication.
111  */
112 
113 #define NLIMBS 4
114 
115 typedef uint128_t limb;
116 typedef limb felem[NLIMBS];
117 typedef limb longfelem[NLIMBS * 2];
118 typedef u64 smallfelem[NLIMBS];
119 
120 /* This is the value of the prime as four 64-bit words, little-endian. */
121 static const u64 kPrime[4] = {
122     0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul
123 };
124 static const u64 bottom63bits = 0x7ffffffffffffffful;
125 
126 /*
127  * bin32_to_felem takes a little-endian byte array and converts it into felem
128  * form. This assumes that the CPU is little-endian.
129  */
bin32_to_felem(felem out,const u8 in[32])130 static void bin32_to_felem(felem out, const u8 in[32])
131 {
132     out[0] = *((u64 *)&in[0]);
133     out[1] = *((u64 *)&in[8]);
134     out[2] = *((u64 *)&in[16]);
135     out[3] = *((u64 *)&in[24]);
136 }
137 
138 /*
139  * smallfelem_to_bin32 takes a smallfelem and serializes into a little
140  * endian, 32 byte array. This assumes that the CPU is little-endian.
141  */
smallfelem_to_bin32(u8 out[32],const smallfelem in)142 static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
143 {
144     *((u64 *)&out[0]) = in[0];
145     *((u64 *)&out[8]) = in[1];
146     *((u64 *)&out[16]) = in[2];
147     *((u64 *)&out[24]) = in[3];
148 }
149 
150 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
BN_to_felem(felem out,const BIGNUM * bn)151 static int BN_to_felem(felem out, const BIGNUM *bn)
152 {
153     felem_bytearray b_out;
154     int num_bytes;
155 
156     if (BN_is_negative(bn)) {
157         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
158         return 0;
159     }
160     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
161     if (num_bytes < 0) {
162         ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
163         return 0;
164     }
165     bin32_to_felem(out, b_out);
166     return 1;
167 }
168 
169 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
smallfelem_to_BN(BIGNUM * out,const smallfelem in)170 static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
171 {
172     felem_bytearray b_out;
173     smallfelem_to_bin32(b_out, in);
174     return BN_lebin2bn(b_out, sizeof(b_out), out);
175 }
176 
177 /*-
178  * Field operations
179  * ----------------
180  */
181 
smallfelem_one(smallfelem out)182 static void smallfelem_one(smallfelem out)
183 {
184     out[0] = 1;
185     out[1] = 0;
186     out[2] = 0;
187     out[3] = 0;
188 }
189 
smallfelem_assign(smallfelem out,const smallfelem in)190 static void smallfelem_assign(smallfelem out, const smallfelem in)
191 {
192     out[0] = in[0];
193     out[1] = in[1];
194     out[2] = in[2];
195     out[3] = in[3];
196 }
197 
felem_assign(felem out,const felem in)198 static void felem_assign(felem out, const felem in)
199 {
200     out[0] = in[0];
201     out[1] = in[1];
202     out[2] = in[2];
203     out[3] = in[3];
204 }
205 
206 /* felem_sum sets out = out + in. */
felem_sum(felem out,const felem in)207 static void felem_sum(felem out, const felem in)
208 {
209     out[0] += in[0];
210     out[1] += in[1];
211     out[2] += in[2];
212     out[3] += in[3];
213 }
214 
215 /* felem_small_sum sets out = out + in. */
felem_small_sum(felem out,const smallfelem in)216 static void felem_small_sum(felem out, const smallfelem in)
217 {
218     out[0] += in[0];
219     out[1] += in[1];
220     out[2] += in[2];
221     out[3] += in[3];
222 }
223 
224 /* felem_scalar sets out = out * scalar */
felem_scalar(felem out,const u64 scalar)225 static void felem_scalar(felem out, const u64 scalar)
226 {
227     out[0] *= scalar;
228     out[1] *= scalar;
229     out[2] *= scalar;
230     out[3] *= scalar;
231 }
232 
233 /* longfelem_scalar sets out = out * scalar */
longfelem_scalar(longfelem out,const u64 scalar)234 static void longfelem_scalar(longfelem out, const u64 scalar)
235 {
236     out[0] *= scalar;
237     out[1] *= scalar;
238     out[2] *= scalar;
239     out[3] *= scalar;
240     out[4] *= scalar;
241     out[5] *= scalar;
242     out[6] *= scalar;
243     out[7] *= scalar;
244 }
245 
246 #define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
247 #define two105 (((limb)1) << 105)
248 #define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
249 
250 /* zero105 is 0 mod p */
251 static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 };
252 
253 /*-
254  * smallfelem_neg sets |out| to |-small|
255  * On exit:
256  *   out[i] < out[i] + 2^105
257  */
smallfelem_neg(felem out,const smallfelem small)258 static void smallfelem_neg(felem out, const smallfelem small)
259 {
260     /* In order to prevent underflow, we subtract from 0 mod p. */
261     out[0] = zero105[0] - small[0];
262     out[1] = zero105[1] - small[1];
263     out[2] = zero105[2] - small[2];
264     out[3] = zero105[3] - small[3];
265 }
266 
267 /*-
268  * felem_diff subtracts |in| from |out|
269  * On entry:
270  *   in[i] < 2^104
271  * On exit:
272  *   out[i] < out[i] + 2^105
273  */
felem_diff(felem out,const felem in)274 static void felem_diff(felem out, const felem in)
275 {
276     /*
277      * In order to prevent underflow, we add 0 mod p before subtracting.
278      */
279     out[0] += zero105[0];
280     out[1] += zero105[1];
281     out[2] += zero105[2];
282     out[3] += zero105[3];
283 
284     out[0] -= in[0];
285     out[1] -= in[1];
286     out[2] -= in[2];
287     out[3] -= in[3];
288 }
289 
290 #define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
291 #define two107 (((limb)1) << 107)
292 #define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
293 
294 /* zero107 is 0 mod p */
295 static const felem zero107 = {
296     two107m43m11, two107, two107m43p11, two107m43p11
297 };
298 
299 /*-
300  * An alternative felem_diff for larger inputs |in|
301  * felem_diff_zero107 subtracts |in| from |out|
302  * On entry:
303  *   in[i] < 2^106
304  * On exit:
305  *   out[i] < out[i] + 2^107
306  */
felem_diff_zero107(felem out,const felem in)307 static void felem_diff_zero107(felem out, const felem in)
308 {
309     /*
310      * In order to prevent underflow, we add 0 mod p before subtracting.
311      */
312     out[0] += zero107[0];
313     out[1] += zero107[1];
314     out[2] += zero107[2];
315     out[3] += zero107[3];
316 
317     out[0] -= in[0];
318     out[1] -= in[1];
319     out[2] -= in[2];
320     out[3] -= in[3];
321 }
322 
323 /*-
324  * longfelem_diff subtracts |in| from |out|
325  * On entry:
326  *   in[i] < 7*2^67
327  * On exit:
328  *   out[i] < out[i] + 2^70 + 2^40
329  */
longfelem_diff(longfelem out,const longfelem in)330 static void longfelem_diff(longfelem out, const longfelem in)
331 {
332     static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6);
333     static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40);
334     static const limb two70 = (((limb)1) << 70);
335     static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6);
336     static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6);
337 
338     /* add 0 mod p to avoid underflow */
339     out[0] += two70m8p6;
340     out[1] += two70p40;
341     out[2] += two70;
342     out[3] += two70m40m38p6;
343     out[4] += two70m6;
344     out[5] += two70m6;
345     out[6] += two70m6;
346     out[7] += two70m6;
347 
348     /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
349     out[0] -= in[0];
350     out[1] -= in[1];
351     out[2] -= in[2];
352     out[3] -= in[3];
353     out[4] -= in[4];
354     out[5] -= in[5];
355     out[6] -= in[6];
356     out[7] -= in[7];
357 }
358 
359 #define two64m0 (((limb)1) << 64) - 1
360 #define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
361 #define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
362 #define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
363 
364 /* zero110 is 0 mod p */
365 static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
366 
367 /*-
368  * felem_shrink converts an felem into a smallfelem. The result isn't quite
369  * minimal as the value may be greater than p.
370  *
371  * On entry:
372  *   in[i] < 2^109
373  * On exit:
374  *   out[i] < 2^64
375  */
felem_shrink(smallfelem out,const felem in)376 static void felem_shrink(smallfelem out, const felem in)
377 {
378     felem tmp;
379     u64 a, b, mask;
380     u64 high, low;
381     static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
382 
383     /* Carry 2->3 */
384     tmp[3] = zero110[3] + in[3] + ((u64)(in[2] >> 64));
385     /* tmp[3] < 2^110 */
386 
387     tmp[2] = zero110[2] + (u64)in[2];
388     tmp[0] = zero110[0] + in[0];
389     tmp[1] = zero110[1] + in[1];
390     /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
391 
392     /*
393      * We perform two partial reductions where we eliminate the high-word of
394      * tmp[3]. We don't update the other words till the end.
395      */
396     a = tmp[3] >> 64; /* a < 2^46 */
397     tmp[3] = (u64)tmp[3];
398     tmp[3] -= a;
399     tmp[3] += ((limb)a) << 32;
400     /* tmp[3] < 2^79 */
401 
402     b = a;
403     a = tmp[3] >> 64; /* a < 2^15 */
404     b += a; /* b < 2^46 + 2^15 < 2^47 */
405     tmp[3] = (u64)tmp[3];
406     tmp[3] -= a;
407     tmp[3] += ((limb)a) << 32;
408     /* tmp[3] < 2^64 + 2^47 */
409 
410     /*
411      * This adjusts the other two words to complete the two partial
412      * reductions.
413      */
414     tmp[0] += b;
415     tmp[1] -= (((limb)b) << 32);
416 
417     /*
418      * In order to make space in tmp[3] for the carry from 2 -> 3, we
419      * conditionally subtract kPrime if tmp[3] is large enough.
420      */
421     high = (u64)(tmp[3] >> 64);
422     /* As tmp[3] < 2^65, high is either 1 or 0 */
423     high = 0 - high;
424     /*-
425      * high is:
426      *   all ones   if the high word of tmp[3] is 1
427      *   all zeros  if the high word of tmp[3] if 0
428      */
429     low = (u64)tmp[3];
430     mask = 0 - (low >> 63);
431     /*-
432      * mask is:
433      *   all ones   if the MSB of low is 1
434      *   all zeros  if the MSB of low if 0
435      */
436     low &= bottom63bits;
437     low -= kPrime3Test;
438     /* if low was greater than kPrime3Test then the MSB is zero */
439     low = ~low;
440     low = 0 - (low >> 63);
441     /*-
442      * low is:
443      *   all ones   if low was > kPrime3Test
444      *   all zeros  if low was <= kPrime3Test
445      */
446     mask = (mask & low) | high;
447     tmp[0] -= mask & kPrime[0];
448     tmp[1] -= mask & kPrime[1];
449     /* kPrime[2] is zero, so omitted */
450     tmp[3] -= mask & kPrime[3];
451     /* tmp[3] < 2**64 - 2**32 + 1 */
452 
453     tmp[1] += ((u64)(tmp[0] >> 64));
454     tmp[0] = (u64)tmp[0];
455     tmp[2] += ((u64)(tmp[1] >> 64));
456     tmp[1] = (u64)tmp[1];
457     tmp[3] += ((u64)(tmp[2] >> 64));
458     tmp[2] = (u64)tmp[2];
459     /* tmp[i] < 2^64 */
460 
461     out[0] = tmp[0];
462     out[1] = tmp[1];
463     out[2] = tmp[2];
464     out[3] = tmp[3];
465 }
466 
467 /* smallfelem_expand converts a smallfelem to an felem */
smallfelem_expand(felem out,const smallfelem in)468 static void smallfelem_expand(felem out, const smallfelem in)
469 {
470     out[0] = in[0];
471     out[1] = in[1];
472     out[2] = in[2];
473     out[3] = in[3];
474 }
475 
476 /*-
477  * smallfelem_square sets |out| = |small|^2
478  * On entry:
479  *   small[i] < 2^64
480  * On exit:
481  *   out[i] < 7 * 2^64 < 2^67
482  */
smallfelem_square(longfelem out,const smallfelem small)483 static void smallfelem_square(longfelem out, const smallfelem small)
484 {
485     limb a;
486     u64 high, low;
487 
488     a = ((uint128_t)small[0]) * small[0];
489     low = a;
490     high = a >> 64;
491     out[0] = low;
492     out[1] = high;
493 
494     a = ((uint128_t)small[0]) * small[1];
495     low = a;
496     high = a >> 64;
497     out[1] += low;
498     out[1] += low;
499     out[2] = high;
500 
501     a = ((uint128_t)small[0]) * small[2];
502     low = a;
503     high = a >> 64;
504     out[2] += low;
505     out[2] *= 2;
506     out[3] = high;
507 
508     a = ((uint128_t)small[0]) * small[3];
509     low = a;
510     high = a >> 64;
511     out[3] += low;
512     out[4] = high;
513 
514     a = ((uint128_t)small[1]) * small[2];
515     low = a;
516     high = a >> 64;
517     out[3] += low;
518     out[3] *= 2;
519     out[4] += high;
520 
521     a = ((uint128_t)small[1]) * small[1];
522     low = a;
523     high = a >> 64;
524     out[2] += low;
525     out[3] += high;
526 
527     a = ((uint128_t)small[1]) * small[3];
528     low = a;
529     high = a >> 64;
530     out[4] += low;
531     out[4] *= 2;
532     out[5] = high;
533 
534     a = ((uint128_t)small[2]) * small[3];
535     low = a;
536     high = a >> 64;
537     out[5] += low;
538     out[5] *= 2;
539     out[6] = high;
540     out[6] += high;
541 
542     a = ((uint128_t)small[2]) * small[2];
543     low = a;
544     high = a >> 64;
545     out[4] += low;
546     out[5] += high;
547 
548     a = ((uint128_t)small[3]) * small[3];
549     low = a;
550     high = a >> 64;
551     out[6] += low;
552     out[7] = high;
553 }
554 
555 /*-
556  * felem_square sets |out| = |in|^2
557  * On entry:
558  *   in[i] < 2^109
559  * On exit:
560  *   out[i] < 7 * 2^64 < 2^67
561  */
felem_square(longfelem out,const felem in)562 static void felem_square(longfelem out, const felem in)
563 {
564     u64 small[4];
565     felem_shrink(small, in);
566     smallfelem_square(out, small);
567 }
568 
569 /*-
570  * smallfelem_mul sets |out| = |small1| * |small2|
571  * On entry:
572  *   small1[i] < 2^64
573  *   small2[i] < 2^64
574  * On exit:
575  *   out[i] < 7 * 2^64 < 2^67
576  */
smallfelem_mul(longfelem out,const smallfelem small1,const smallfelem small2)577 static void smallfelem_mul(longfelem out, const smallfelem small1,
578     const smallfelem small2)
579 {
580     limb a;
581     u64 high, low;
582 
583     a = ((uint128_t)small1[0]) * small2[0];
584     low = a;
585     high = a >> 64;
586     out[0] = low;
587     out[1] = high;
588 
589     a = ((uint128_t)small1[0]) * small2[1];
590     low = a;
591     high = a >> 64;
592     out[1] += low;
593     out[2] = high;
594 
595     a = ((uint128_t)small1[1]) * small2[0];
596     low = a;
597     high = a >> 64;
598     out[1] += low;
599     out[2] += high;
600 
601     a = ((uint128_t)small1[0]) * small2[2];
602     low = a;
603     high = a >> 64;
604     out[2] += low;
605     out[3] = high;
606 
607     a = ((uint128_t)small1[1]) * small2[1];
608     low = a;
609     high = a >> 64;
610     out[2] += low;
611     out[3] += high;
612 
613     a = ((uint128_t)small1[2]) * small2[0];
614     low = a;
615     high = a >> 64;
616     out[2] += low;
617     out[3] += high;
618 
619     a = ((uint128_t)small1[0]) * small2[3];
620     low = a;
621     high = a >> 64;
622     out[3] += low;
623     out[4] = high;
624 
625     a = ((uint128_t)small1[1]) * small2[2];
626     low = a;
627     high = a >> 64;
628     out[3] += low;
629     out[4] += high;
630 
631     a = ((uint128_t)small1[2]) * small2[1];
632     low = a;
633     high = a >> 64;
634     out[3] += low;
635     out[4] += high;
636 
637     a = ((uint128_t)small1[3]) * small2[0];
638     low = a;
639     high = a >> 64;
640     out[3] += low;
641     out[4] += high;
642 
643     a = ((uint128_t)small1[1]) * small2[3];
644     low = a;
645     high = a >> 64;
646     out[4] += low;
647     out[5] = high;
648 
649     a = ((uint128_t)small1[2]) * small2[2];
650     low = a;
651     high = a >> 64;
652     out[4] += low;
653     out[5] += high;
654 
655     a = ((uint128_t)small1[3]) * small2[1];
656     low = a;
657     high = a >> 64;
658     out[4] += low;
659     out[5] += high;
660 
661     a = ((uint128_t)small1[2]) * small2[3];
662     low = a;
663     high = a >> 64;
664     out[5] += low;
665     out[6] = high;
666 
667     a = ((uint128_t)small1[3]) * small2[2];
668     low = a;
669     high = a >> 64;
670     out[5] += low;
671     out[6] += high;
672 
673     a = ((uint128_t)small1[3]) * small2[3];
674     low = a;
675     high = a >> 64;
676     out[6] += low;
677     out[7] = high;
678 }
679 
680 /*-
681  * felem_mul sets |out| = |in1| * |in2|
682  * On entry:
683  *   in1[i] < 2^109
684  *   in2[i] < 2^109
685  * On exit:
686  *   out[i] < 7 * 2^64 < 2^67
687  */
felem_mul(longfelem out,const felem in1,const felem in2)688 static void felem_mul(longfelem out, const felem in1, const felem in2)
689 {
690     smallfelem small1, small2;
691     felem_shrink(small1, in1);
692     felem_shrink(small2, in2);
693     smallfelem_mul(out, small1, small2);
694 }
695 
696 /*-
697  * felem_small_mul sets |out| = |small1| * |in2|
698  * On entry:
699  *   small1[i] < 2^64
700  *   in2[i] < 2^109
701  * On exit:
702  *   out[i] < 7 * 2^64 < 2^67
703  */
felem_small_mul(longfelem out,const smallfelem small1,const felem in2)704 static void felem_small_mul(longfelem out, const smallfelem small1,
705     const felem in2)
706 {
707     smallfelem small2;
708     felem_shrink(small2, in2);
709     smallfelem_mul(out, small1, small2);
710 }
711 
712 #define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
713 #define two100 (((limb)1) << 100)
714 #define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
715 /* zero100 is 0 mod p */
716 static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 };
717 
718 /*-
719  * Internal function for the different flavours of felem_reduce.
720  * felem_reduce_ reduces the higher coefficients in[4]-in[7].
721  * On entry:
722  *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7]
723  *   out[1] >= in[7] + 2^32*in[4]
724  *   out[2] >= in[5] + 2^32*in[5]
725  *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
726  * On exit:
727  *   out[0] <= out[0] + in[4] + 2^32*in[5]
728  *   out[1] <= out[1] + in[5] + 2^33*in[6]
729  *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
730  *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
731  */
felem_reduce_(felem out,const longfelem in)732 static void felem_reduce_(felem out, const longfelem in)
733 {
734     int128_t c;
735     /* combine common terms from below */
736     c = in[4] + (in[5] << 32);
737     out[0] += c;
738     out[3] -= c;
739 
740     c = in[5] - in[7];
741     out[1] += c;
742     out[2] -= c;
743 
744     /* the remaining terms */
745     /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
746     out[1] -= (in[4] << 32);
747     out[3] += (in[4] << 32);
748 
749     /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
750     out[2] -= (in[5] << 32);
751 
752     /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
753     out[0] -= in[6];
754     out[0] -= (in[6] << 32);
755     out[1] += (in[6] << 33);
756     out[2] += (in[6] * 2);
757     out[3] -= (in[6] << 32);
758 
759     /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
760     out[0] -= in[7];
761     out[0] -= (in[7] << 32);
762     out[2] += (in[7] << 33);
763     out[3] += (in[7] * 3);
764 }
765 
766 /*-
767  * felem_reduce converts a longfelem into an felem.
768  * To be called directly after felem_square or felem_mul.
769  * On entry:
770  *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
771  *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
772  * On exit:
773  *   out[i] < 2^101
774  */
felem_reduce(felem out,const longfelem in)775 static void felem_reduce(felem out, const longfelem in)
776 {
777     out[0] = zero100[0] + in[0];
778     out[1] = zero100[1] + in[1];
779     out[2] = zero100[2] + in[2];
780     out[3] = zero100[3] + in[3];
781 
782     felem_reduce_(out, in);
783 
784     /*-
785      * out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
786      * out[1] > 2^100 - 2^64 - 7*2^96 > 0
787      * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
788      * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
789      *
790      * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
791      * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
792      * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
793      * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
794      */
795 }
796 
797 /*-
798  * felem_reduce_zero105 converts a larger longfelem into an felem.
799  * On entry:
800  *   in[0] < 2^71
801  * On exit:
802  *   out[i] < 2^106
803  */
felem_reduce_zero105(felem out,const longfelem in)804 static void felem_reduce_zero105(felem out, const longfelem in)
805 {
806     out[0] = zero105[0] + in[0];
807     out[1] = zero105[1] + in[1];
808     out[2] = zero105[2] + in[2];
809     out[3] = zero105[3] + in[3];
810 
811     felem_reduce_(out, in);
812 
813     /*-
814      * out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
815      * out[1] > 2^105 - 2^71 - 2^103 > 0
816      * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
817      * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
818      *
819      * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
820      * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
821      * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
822      * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
823      */
824 }
825 
826 /*
827  * subtract_u64 sets *result = *result - v and *carry to one if the
828  * subtraction underflowed.
829  */
subtract_u64(u64 * result,u64 * carry,u64 v)830 static void subtract_u64(u64 *result, u64 *carry, u64 v)
831 {
832     uint128_t r = *result;
833     r -= v;
834     *carry = (r >> 64) & 1;
835     *result = (u64)r;
836 }
837 
838 /*
839  * felem_contract converts |in| to its unique, minimal representation. On
840  * entry: in[i] < 2^109
841  */
felem_contract(smallfelem out,const felem in)842 static void felem_contract(smallfelem out, const felem in)
843 {
844     unsigned i;
845     u64 all_equal_so_far = 0, result = 0, carry;
846 
847     felem_shrink(out, in);
848     /* small is minimal except that the value might be > p */
849 
850     all_equal_so_far--;
851     /*
852      * We are doing a constant time test if out >= kPrime. We need to compare
853      * each u64, from most-significant to least significant. For each one, if
854      * all words so far have been equal (m is all ones) then a non-equal
855      * result is the answer. Otherwise we continue.
856      */
857     for (i = 3; i < 4; i--) {
858         u64 equal;
859         uint128_t a = ((uint128_t)kPrime[i]) - out[i];
860         /*
861          * if out[i] > kPrime[i] then a will underflow and the high 64-bits
862          * will all be set.
863          */
864         result |= all_equal_so_far & ((u64)(a >> 64));
865 
866         /*
867          * if kPrime[i] == out[i] then |equal| will be all zeros and the
868          * decrement will make it all ones.
869          */
870         equal = kPrime[i] ^ out[i];
871         equal--;
872         equal &= equal << 32;
873         equal &= equal << 16;
874         equal &= equal << 8;
875         equal &= equal << 4;
876         equal &= equal << 2;
877         equal &= equal << 1;
878         equal = 0 - (equal >> 63);
879 
880         all_equal_so_far &= equal;
881     }
882 
883     /*
884      * if all_equal_so_far is still all ones then the two values are equal
885      * and so out >= kPrime is true.
886      */
887     result |= all_equal_so_far;
888 
889     /* if out >= kPrime then we subtract kPrime. */
890     subtract_u64(&out[0], &carry, result & kPrime[0]);
891     subtract_u64(&out[1], &carry, carry);
892     subtract_u64(&out[2], &carry, carry);
893     subtract_u64(&out[3], &carry, carry);
894 
895     subtract_u64(&out[1], &carry, result & kPrime[1]);
896     subtract_u64(&out[2], &carry, carry);
897     subtract_u64(&out[3], &carry, carry);
898 
899     subtract_u64(&out[2], &carry, result & kPrime[2]);
900     subtract_u64(&out[3], &carry, carry);
901 
902     subtract_u64(&out[3], &carry, result & kPrime[3]);
903 }
904 
smallfelem_square_contract(smallfelem out,const smallfelem in)905 static void smallfelem_square_contract(smallfelem out, const smallfelem in)
906 {
907     longfelem longtmp;
908     felem tmp;
909 
910     smallfelem_square(longtmp, in);
911     felem_reduce(tmp, longtmp);
912     felem_contract(out, tmp);
913 }
914 
smallfelem_mul_contract(smallfelem out,const smallfelem in1,const smallfelem in2)915 static void smallfelem_mul_contract(smallfelem out, const smallfelem in1,
916     const smallfelem in2)
917 {
918     longfelem longtmp;
919     felem tmp;
920 
921     smallfelem_mul(longtmp, in1, in2);
922     felem_reduce(tmp, longtmp);
923     felem_contract(out, tmp);
924 }
925 
926 /*-
927  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
928  * otherwise.
929  * On entry:
930  *   small[i] < 2^64
931  */
smallfelem_is_zero(const smallfelem small)932 static limb smallfelem_is_zero(const smallfelem small)
933 {
934     limb result;
935     u64 is_p;
936 
937     u64 is_zero = small[0] | small[1] | small[2] | small[3];
938     is_zero--;
939     is_zero &= is_zero << 32;
940     is_zero &= is_zero << 16;
941     is_zero &= is_zero << 8;
942     is_zero &= is_zero << 4;
943     is_zero &= is_zero << 2;
944     is_zero &= is_zero << 1;
945     is_zero = 0 - (is_zero >> 63);
946 
947     is_p = (small[0] ^ kPrime[0]) | (small[1] ^ kPrime[1]) | (small[2] ^ kPrime[2]) | (small[3] ^ kPrime[3]);
948     is_p--;
949     is_p &= is_p << 32;
950     is_p &= is_p << 16;
951     is_p &= is_p << 8;
952     is_p &= is_p << 4;
953     is_p &= is_p << 2;
954     is_p &= is_p << 1;
955     is_p = 0 - (is_p >> 63);
956 
957     is_zero |= is_p;
958 
959     result = is_zero;
960     result |= ((limb)is_zero) << 64;
961     return result;
962 }
963 
smallfelem_is_zero_int(const void * small)964 static int smallfelem_is_zero_int(const void *small)
965 {
966     return (int)(smallfelem_is_zero(small) & ((limb)1));
967 }
968 
969 /*-
970  * felem_inv calculates |out| = |in|^{-1}
971  *
972  * Based on Fermat's Little Theorem:
973  *   a^p = a (mod p)
974  *   a^{p-1} = 1 (mod p)
975  *   a^{p-2} = a^{-1} (mod p)
976  */
felem_inv(felem out,const felem in)977 static void felem_inv(felem out, const felem in)
978 {
979     felem ftmp, ftmp2;
980     /* each e_I will hold |in|^{2^I - 1} */
981     felem e2, e4, e8, e16, e32, e64;
982     longfelem tmp;
983     unsigned i;
984 
985     felem_square(tmp, in);
986     felem_reduce(ftmp, tmp); /* 2^1 */
987     felem_mul(tmp, in, ftmp);
988     felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */
989     felem_assign(e2, ftmp);
990     felem_square(tmp, ftmp);
991     felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */
992     felem_square(tmp, ftmp);
993     felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */
994     felem_mul(tmp, ftmp, e2);
995     felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */
996     felem_assign(e4, ftmp);
997     felem_square(tmp, ftmp);
998     felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */
999     felem_square(tmp, ftmp);
1000     felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */
1001     felem_square(tmp, ftmp);
1002     felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */
1003     felem_square(tmp, ftmp);
1004     felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */
1005     felem_mul(tmp, ftmp, e4);
1006     felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */
1007     felem_assign(e8, ftmp);
1008     for (i = 0; i < 8; i++) {
1009         felem_square(tmp, ftmp);
1010         felem_reduce(ftmp, tmp);
1011     } /* 2^16 - 2^8 */
1012     felem_mul(tmp, ftmp, e8);
1013     felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */
1014     felem_assign(e16, ftmp);
1015     for (i = 0; i < 16; i++) {
1016         felem_square(tmp, ftmp);
1017         felem_reduce(ftmp, tmp);
1018     } /* 2^32 - 2^16 */
1019     felem_mul(tmp, ftmp, e16);
1020     felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */
1021     felem_assign(e32, ftmp);
1022     for (i = 0; i < 32; i++) {
1023         felem_square(tmp, ftmp);
1024         felem_reduce(ftmp, tmp);
1025     } /* 2^64 - 2^32 */
1026     felem_assign(e64, ftmp);
1027     felem_mul(tmp, ftmp, in);
1028     felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */
1029     for (i = 0; i < 192; i++) {
1030         felem_square(tmp, ftmp);
1031         felem_reduce(ftmp, tmp);
1032     } /* 2^256 - 2^224 + 2^192 */
1033 
1034     felem_mul(tmp, e64, e32);
1035     felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */
1036     for (i = 0; i < 16; i++) {
1037         felem_square(tmp, ftmp2);
1038         felem_reduce(ftmp2, tmp);
1039     } /* 2^80 - 2^16 */
1040     felem_mul(tmp, ftmp2, e16);
1041     felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */
1042     for (i = 0; i < 8; i++) {
1043         felem_square(tmp, ftmp2);
1044         felem_reduce(ftmp2, tmp);
1045     } /* 2^88 - 2^8 */
1046     felem_mul(tmp, ftmp2, e8);
1047     felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */
1048     for (i = 0; i < 4; i++) {
1049         felem_square(tmp, ftmp2);
1050         felem_reduce(ftmp2, tmp);
1051     } /* 2^92 - 2^4 */
1052     felem_mul(tmp, ftmp2, e4);
1053     felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */
1054     felem_square(tmp, ftmp2);
1055     felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */
1056     felem_square(tmp, ftmp2);
1057     felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */
1058     felem_mul(tmp, ftmp2, e2);
1059     felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */
1060     felem_square(tmp, ftmp2);
1061     felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */
1062     felem_square(tmp, ftmp2);
1063     felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */
1064     felem_mul(tmp, ftmp2, in);
1065     felem_reduce(ftmp2, tmp); /* 2^96 - 3 */
1066 
1067     felem_mul(tmp, ftmp2, ftmp);
1068     felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
1069 }
1070 
smallfelem_inv_contract(smallfelem out,const smallfelem in)1071 static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
1072 {
1073     felem tmp;
1074 
1075     smallfelem_expand(tmp, in);
1076     felem_inv(tmp, tmp);
1077     felem_contract(out, tmp);
1078 }
1079 
1080 /*-
1081  * Group operations
1082  * ----------------
1083  *
1084  * Building on top of the field operations we have the operations on the
1085  * elliptic curve group itself. Points on the curve are represented in Jacobian
1086  * coordinates
1087  */
1088 
1089 /*-
1090  * point_double calculates 2*(x_in, y_in, z_in)
1091  *
1092  * The method is taken from:
1093  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1094  *
1095  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1096  * while x_out == y_in is not (maybe this works, but it's not tested).
1097  */
1098 static void
point_double(felem x_out,felem y_out,felem z_out,const felem x_in,const felem y_in,const felem z_in)1099 point_double(felem x_out, felem y_out, felem z_out,
1100     const felem x_in, const felem y_in, const felem z_in)
1101 {
1102     longfelem tmp, tmp2;
1103     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1104     smallfelem small1, small2;
1105 
1106     felem_assign(ftmp, x_in);
1107     /* ftmp[i] < 2^106 */
1108     felem_assign(ftmp2, x_in);
1109     /* ftmp2[i] < 2^106 */
1110 
1111     /* delta = z^2 */
1112     felem_square(tmp, z_in);
1113     felem_reduce(delta, tmp);
1114     /* delta[i] < 2^101 */
1115 
1116     /* gamma = y^2 */
1117     felem_square(tmp, y_in);
1118     felem_reduce(gamma, tmp);
1119     /* gamma[i] < 2^101 */
1120     felem_shrink(small1, gamma);
1121 
1122     /* beta = x*gamma */
1123     felem_small_mul(tmp, small1, x_in);
1124     felem_reduce(beta, tmp);
1125     /* beta[i] < 2^101 */
1126 
1127     /* alpha = 3*(x-delta)*(x+delta) */
1128     felem_diff(ftmp, delta);
1129     /* ftmp[i] < 2^105 + 2^106 < 2^107 */
1130     felem_sum(ftmp2, delta);
1131     /* ftmp2[i] < 2^105 + 2^106 < 2^107 */
1132     felem_scalar(ftmp2, 3);
1133     /* ftmp2[i] < 3 * 2^107 < 2^109 */
1134     felem_mul(tmp, ftmp, ftmp2);
1135     felem_reduce(alpha, tmp);
1136     /* alpha[i] < 2^101 */
1137     felem_shrink(small2, alpha);
1138 
1139     /* x' = alpha^2 - 8*beta */
1140     smallfelem_square(tmp, small2);
1141     felem_reduce(x_out, tmp);
1142     felem_assign(ftmp, beta);
1143     felem_scalar(ftmp, 8);
1144     /* ftmp[i] < 8 * 2^101 = 2^104 */
1145     felem_diff(x_out, ftmp);
1146     /* x_out[i] < 2^105 + 2^101 < 2^106 */
1147 
1148     /* z' = (y + z)^2 - gamma - delta */
1149     felem_sum(delta, gamma);
1150     /* delta[i] < 2^101 + 2^101 = 2^102 */
1151     felem_assign(ftmp, y_in);
1152     felem_sum(ftmp, z_in);
1153     /* ftmp[i] < 2^106 + 2^106 = 2^107 */
1154     felem_square(tmp, ftmp);
1155     felem_reduce(z_out, tmp);
1156     felem_diff(z_out, delta);
1157     /* z_out[i] < 2^105 + 2^101 < 2^106 */
1158 
1159     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1160     felem_scalar(beta, 4);
1161     /* beta[i] < 4 * 2^101 = 2^103 */
1162     felem_diff_zero107(beta, x_out);
1163     /* beta[i] < 2^107 + 2^103 < 2^108 */
1164     felem_small_mul(tmp, small2, beta);
1165     /* tmp[i] < 7 * 2^64 < 2^67 */
1166     smallfelem_square(tmp2, small1);
1167     /* tmp2[i] < 7 * 2^64 */
1168     longfelem_scalar(tmp2, 8);
1169     /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
1170     longfelem_diff(tmp, tmp2);
1171     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1172     felem_reduce_zero105(y_out, tmp);
1173     /* y_out[i] < 2^106 */
1174 }
1175 
1176 /*
1177  * point_double_small is the same as point_double, except that it operates on
1178  * smallfelems
1179  */
1180 static void
point_double_small(smallfelem x_out,smallfelem y_out,smallfelem z_out,const smallfelem x_in,const smallfelem y_in,const smallfelem z_in)1181 point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
1182     const smallfelem x_in, const smallfelem y_in,
1183     const smallfelem z_in)
1184 {
1185     felem felem_x_out, felem_y_out, felem_z_out;
1186     felem felem_x_in, felem_y_in, felem_z_in;
1187 
1188     smallfelem_expand(felem_x_in, x_in);
1189     smallfelem_expand(felem_y_in, y_in);
1190     smallfelem_expand(felem_z_in, z_in);
1191     point_double(felem_x_out, felem_y_out, felem_z_out,
1192         felem_x_in, felem_y_in, felem_z_in);
1193     felem_shrink(x_out, felem_x_out);
1194     felem_shrink(y_out, felem_y_out);
1195     felem_shrink(z_out, felem_z_out);
1196 }
1197 
1198 /* copy_conditional copies in to out iff mask is all ones. */
copy_conditional(felem out,const felem in,limb mask)1199 static void copy_conditional(felem out, const felem in, limb mask)
1200 {
1201     unsigned i;
1202     for (i = 0; i < NLIMBS; ++i) {
1203         const limb tmp = mask & (in[i] ^ out[i]);
1204         out[i] ^= tmp;
1205     }
1206 }
1207 
1208 /* copy_small_conditional copies in to out iff mask is all ones. */
copy_small_conditional(felem out,const smallfelem in,limb mask)1209 static void copy_small_conditional(felem out, const smallfelem in, limb mask)
1210 {
1211     unsigned i;
1212     const u64 mask64 = mask;
1213     for (i = 0; i < NLIMBS; ++i) {
1214         out[i] = ((limb)(in[i] & mask64)) | (out[i] & ~mask);
1215     }
1216 }
1217 
1218 /*-
1219  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1220  *
1221  * The method is taken from:
1222  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1223  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1224  *
1225  * This function includes a branch for checking whether the two input points
1226  * are equal, (while not equal to the point at infinity). This case never
1227  * happens during single point multiplication, so there is no timing leak for
1228  * ECDH or ECDSA signing.
1229  */
point_add(felem x3,felem y3,felem z3,const felem x1,const felem y1,const felem z1,const int mixed,const smallfelem x2,const smallfelem y2,const smallfelem z2)1230 static void point_add(felem x3, felem y3, felem z3,
1231     const felem x1, const felem y1, const felem z1,
1232     const int mixed, const smallfelem x2,
1233     const smallfelem y2, const smallfelem z2)
1234 {
1235     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1236     longfelem tmp, tmp2;
1237     smallfelem small1, small2, small3, small4, small5;
1238     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1239     limb points_equal;
1240 
1241     felem_shrink(small3, z1);
1242 
1243     z1_is_zero = smallfelem_is_zero(small3);
1244     z2_is_zero = smallfelem_is_zero(z2);
1245 
1246     /* ftmp = z1z1 = z1**2 */
1247     smallfelem_square(tmp, small3);
1248     felem_reduce(ftmp, tmp);
1249     /* ftmp[i] < 2^101 */
1250     felem_shrink(small1, ftmp);
1251 
1252     if (!mixed) {
1253         /* ftmp2 = z2z2 = z2**2 */
1254         smallfelem_square(tmp, z2);
1255         felem_reduce(ftmp2, tmp);
1256         /* ftmp2[i] < 2^101 */
1257         felem_shrink(small2, ftmp2);
1258 
1259         felem_shrink(small5, x1);
1260 
1261         /* u1 = ftmp3 = x1*z2z2 */
1262         smallfelem_mul(tmp, small5, small2);
1263         felem_reduce(ftmp3, tmp);
1264         /* ftmp3[i] < 2^101 */
1265 
1266         /* ftmp5 = z1 + z2 */
1267         felem_assign(ftmp5, z1);
1268         felem_small_sum(ftmp5, z2);
1269         /* ftmp5[i] < 2^107 */
1270 
1271         /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
1272         felem_square(tmp, ftmp5);
1273         felem_reduce(ftmp5, tmp);
1274         /* ftmp2 = z2z2 + z1z1 */
1275         felem_sum(ftmp2, ftmp);
1276         /* ftmp2[i] < 2^101 + 2^101 = 2^102 */
1277         felem_diff(ftmp5, ftmp2);
1278         /* ftmp5[i] < 2^105 + 2^101 < 2^106 */
1279 
1280         /* ftmp2 = z2 * z2z2 */
1281         smallfelem_mul(tmp, small2, z2);
1282         felem_reduce(ftmp2, tmp);
1283 
1284         /* s1 = ftmp2 = y1 * z2**3 */
1285         felem_mul(tmp, y1, ftmp2);
1286         felem_reduce(ftmp6, tmp);
1287         /* ftmp6[i] < 2^101 */
1288     } else {
1289         /*
1290          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1291          */
1292 
1293         /* u1 = ftmp3 = x1*z2z2 */
1294         felem_assign(ftmp3, x1);
1295         /* ftmp3[i] < 2^106 */
1296 
1297         /* ftmp5 = 2z1z2 */
1298         felem_assign(ftmp5, z1);
1299         felem_scalar(ftmp5, 2);
1300         /* ftmp5[i] < 2*2^106 = 2^107 */
1301 
1302         /* s1 = ftmp2 = y1 * z2**3 */
1303         felem_assign(ftmp6, y1);
1304         /* ftmp6[i] < 2^106 */
1305     }
1306 
1307     /* u2 = x2*z1z1 */
1308     smallfelem_mul(tmp, x2, small1);
1309     felem_reduce(ftmp4, tmp);
1310 
1311     /* h = ftmp4 = u2 - u1 */
1312     felem_diff_zero107(ftmp4, ftmp3);
1313     /* ftmp4[i] < 2^107 + 2^101 < 2^108 */
1314     felem_shrink(small4, ftmp4);
1315 
1316     x_equal = smallfelem_is_zero(small4);
1317 
1318     /* z_out = ftmp5 * h */
1319     felem_small_mul(tmp, small4, ftmp5);
1320     felem_reduce(z_out, tmp);
1321     /* z_out[i] < 2^101 */
1322 
1323     /* ftmp = z1 * z1z1 */
1324     smallfelem_mul(tmp, small1, small3);
1325     felem_reduce(ftmp, tmp);
1326 
1327     /* s2 = tmp = y2 * z1**3 */
1328     felem_small_mul(tmp, y2, ftmp);
1329     felem_reduce(ftmp5, tmp);
1330 
1331     /* r = ftmp5 = (s2 - s1)*2 */
1332     felem_diff_zero107(ftmp5, ftmp6);
1333     /* ftmp5[i] < 2^107 + 2^107 = 2^108 */
1334     felem_scalar(ftmp5, 2);
1335     /* ftmp5[i] < 2^109 */
1336     felem_shrink(small1, ftmp5);
1337     y_equal = smallfelem_is_zero(small1);
1338 
1339     /*
1340      * The formulae are incorrect if the points are equal, in affine coordinates
1341      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1342      * happens.
1343      *
1344      * We use bitwise operations to avoid potential side-channels introduced by
1345      * the short-circuiting behaviour of boolean operators.
1346      *
1347      * The special case of either point being the point at infinity (z1 and/or
1348      * z2 are zero), is handled separately later on in this function, so we
1349      * avoid jumping to point_double here in those special cases.
1350      */
1351     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1352 
1353     if (points_equal) {
1354         /*
1355          * This is obviously not constant-time but, as mentioned before, this
1356          * case never happens during single point multiplication, so there is no
1357          * timing leak for ECDH or ECDSA signing.
1358          */
1359         point_double(x3, y3, z3, x1, y1, z1);
1360         return;
1361     }
1362 
1363     /* I = ftmp = (2h)**2 */
1364     felem_assign(ftmp, ftmp4);
1365     felem_scalar(ftmp, 2);
1366     /* ftmp[i] < 2*2^108 = 2^109 */
1367     felem_square(tmp, ftmp);
1368     felem_reduce(ftmp, tmp);
1369 
1370     /* J = ftmp2 = h * I */
1371     felem_mul(tmp, ftmp4, ftmp);
1372     felem_reduce(ftmp2, tmp);
1373 
1374     /* V = ftmp4 = U1 * I */
1375     felem_mul(tmp, ftmp3, ftmp);
1376     felem_reduce(ftmp4, tmp);
1377 
1378     /* x_out = r**2 - J - 2V */
1379     smallfelem_square(tmp, small1);
1380     felem_reduce(x_out, tmp);
1381     felem_assign(ftmp3, ftmp4);
1382     felem_scalar(ftmp4, 2);
1383     felem_sum(ftmp4, ftmp2);
1384     /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
1385     felem_diff(x_out, ftmp4);
1386     /* x_out[i] < 2^105 + 2^101 */
1387 
1388     /* y_out = r(V-x_out) - 2 * s1 * J */
1389     felem_diff_zero107(ftmp3, x_out);
1390     /* ftmp3[i] < 2^107 + 2^101 < 2^108 */
1391     felem_small_mul(tmp, small1, ftmp3);
1392     felem_mul(tmp2, ftmp6, ftmp2);
1393     longfelem_scalar(tmp2, 2);
1394     /* tmp2[i] < 2*2^67 = 2^68 */
1395     longfelem_diff(tmp, tmp2);
1396     /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
1397     felem_reduce_zero105(y_out, tmp);
1398     /* y_out[i] < 2^106 */
1399 
1400     copy_small_conditional(x_out, x2, z1_is_zero);
1401     copy_conditional(x_out, x1, z2_is_zero);
1402     copy_small_conditional(y_out, y2, z1_is_zero);
1403     copy_conditional(y_out, y1, z2_is_zero);
1404     copy_small_conditional(z_out, z2, z1_is_zero);
1405     copy_conditional(z_out, z1, z2_is_zero);
1406     felem_assign(x3, x_out);
1407     felem_assign(y3, y_out);
1408     felem_assign(z3, z_out);
1409 }
1410 
1411 /*
1412  * point_add_small is the same as point_add, except that it operates on
1413  * smallfelems
1414  */
point_add_small(smallfelem x3,smallfelem y3,smallfelem z3,smallfelem x1,smallfelem y1,smallfelem z1,smallfelem x2,smallfelem y2,smallfelem z2)1415 static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
1416     smallfelem x1, smallfelem y1, smallfelem z1,
1417     smallfelem x2, smallfelem y2, smallfelem z2)
1418 {
1419     felem felem_x3, felem_y3, felem_z3;
1420     felem felem_x1, felem_y1, felem_z1;
1421     smallfelem_expand(felem_x1, x1);
1422     smallfelem_expand(felem_y1, y1);
1423     smallfelem_expand(felem_z1, z1);
1424     point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0,
1425         x2, y2, z2);
1426     felem_shrink(x3, felem_x3);
1427     felem_shrink(y3, felem_y3);
1428     felem_shrink(z3, felem_z3);
1429 }
1430 
1431 /*-
1432  * Base point pre computation
1433  * --------------------------
1434  *
1435  * Two different sorts of precomputed tables are used in the following code.
1436  * Each contain various points on the curve, where each point is three field
1437  * elements (x, y, z).
1438  *
1439  * For the base point table, z is usually 1 (0 for the point at infinity).
1440  * This table has 2 * 16 elements, starting with the following:
1441  * index | bits    | point
1442  * ------+---------+------------------------------
1443  *     0 | 0 0 0 0 | 0G
1444  *     1 | 0 0 0 1 | 1G
1445  *     2 | 0 0 1 0 | 2^64G
1446  *     3 | 0 0 1 1 | (2^64 + 1)G
1447  *     4 | 0 1 0 0 | 2^128G
1448  *     5 | 0 1 0 1 | (2^128 + 1)G
1449  *     6 | 0 1 1 0 | (2^128 + 2^64)G
1450  *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
1451  *     8 | 1 0 0 0 | 2^192G
1452  *     9 | 1 0 0 1 | (2^192 + 1)G
1453  *    10 | 1 0 1 0 | (2^192 + 2^64)G
1454  *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
1455  *    12 | 1 1 0 0 | (2^192 + 2^128)G
1456  *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
1457  *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
1458  *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
1459  * followed by a copy of this with each element multiplied by 2^32.
1460  *
1461  * The reason for this is so that we can clock bits into four different
1462  * locations when doing simple scalar multiplies against the base point,
1463  * and then another four locations using the second 16 elements.
1464  *
1465  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1466 
1467 /* gmul is the table of precomputed base points */
1468 static const smallfelem gmul[2][16][3] = {
1469     { { { 0, 0, 0, 0 },
1470           { 0, 0, 0, 0 },
1471           { 0, 0, 0, 0 } },
1472         { { 0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2,
1473               0x6b17d1f2e12c4247 },
1474             { 0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16,
1475                 0x4fe342e2fe1a7f9b },
1476             { 1, 0, 0, 0 } },
1477         { { 0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de,
1478               0x0fa822bc2811aaa5 },
1479             { 0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b,
1480                 0xbff44ae8f5dba80d },
1481             { 1, 0, 0, 0 } },
1482         { { 0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789,
1483               0x300a4bbc89d6726f },
1484             { 0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f,
1485                 0x72aac7e0d09b4644 },
1486             { 1, 0, 0, 0 } },
1487         { { 0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e,
1488               0x447d739beedb5e67 },
1489             { 0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7,
1490                 0x2d4825ab834131ee },
1491             { 1, 0, 0, 0 } },
1492         { { 0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60,
1493               0xef9519328a9c72ff },
1494             { 0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c,
1495                 0x611e9fc37dbb2c9b },
1496             { 1, 0, 0, 0 } },
1497         { { 0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf,
1498               0x550663797b51f5d8 },
1499             { 0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5,
1500                 0x157164848aecb851 },
1501             { 1, 0, 0, 0 } },
1502         { { 0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391,
1503               0xeb5d7745b21141ea },
1504             { 0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee,
1505                 0xeafd72ebdbecc17b },
1506             { 1, 0, 0, 0 } },
1507         { { 0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5,
1508               0xa6d39677a7849276 },
1509             { 0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf,
1510                 0x674f84749b0b8816 },
1511             { 1, 0, 0, 0 } },
1512         { { 0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb,
1513               0x4e769e7672c9ddad },
1514             { 0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281,
1515                 0x42b99082de830663 },
1516             { 1, 0, 0, 0 } },
1517         { { 0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478,
1518               0x78878ef61c6ce04d },
1519             { 0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def,
1520                 0xb6cb3f5d7b72c321 },
1521             { 1, 0, 0, 0 } },
1522         { { 0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae,
1523               0x0c88bc4d716b1287 },
1524             { 0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa,
1525                 0xdd5ddea3f3901dc6 },
1526             { 1, 0, 0, 0 } },
1527         { { 0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3,
1528               0x68f344af6b317466 },
1529             { 0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3,
1530                 0x31b9c405f8540a20 },
1531             { 1, 0, 0, 0 } },
1532         { { 0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0,
1533               0x4052bf4b6f461db9 },
1534             { 0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8,
1535                 0xfecf4d5190b0fc61 },
1536             { 1, 0, 0, 0 } },
1537         { { 0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a,
1538               0x1eddbae2c802e41a },
1539             { 0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0,
1540                 0x43104d86560ebcfc },
1541             { 1, 0, 0, 0 } },
1542         { { 0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a,
1543               0xb48e26b484f7a21c },
1544             { 0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668,
1545                 0xfac015404d4d3dab },
1546             { 1, 0, 0, 0 } } },
1547     { { { 0, 0, 0, 0 },
1548           { 0, 0, 0, 0 },
1549           { 0, 0, 0, 0 } },
1550         { { 0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da,
1551               0x7fe36b40af22af89 },
1552             { 0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1,
1553                 0xe697d45825b63624 },
1554             { 1, 0, 0, 0 } },
1555         { { 0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902,
1556               0x4a5b506612a677a6 },
1557             { 0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40,
1558                 0xeb13461ceac089f1 },
1559             { 1, 0, 0, 0 } },
1560         { { 0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857,
1561               0x0781b8291c6a220a },
1562             { 0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434,
1563                 0x690cde8df0151593 },
1564             { 1, 0, 0, 0 } },
1565         { { 0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326,
1566               0x8a535f566ec73617 },
1567             { 0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf,
1568                 0x0455c08468b08bd7 },
1569             { 1, 0, 0, 0 } },
1570         { { 0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279,
1571               0x06bada7ab77f8276 },
1572             { 0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70,
1573                 0x5b476dfd0e6cb18a },
1574             { 1, 0, 0, 0 } },
1575         { { 0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8,
1576               0x3e29864e8a2ec908 },
1577             { 0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed,
1578                 0x239b90ea3dc31e7e },
1579             { 1, 0, 0, 0 } },
1580         { { 0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4,
1581               0x820f4dd949f72ff7 },
1582             { 0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3,
1583                 0x140406ec783a05ec },
1584             { 1, 0, 0, 0 } },
1585         { { 0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe,
1586               0x68f6b8542783dfee },
1587             { 0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028,
1588                 0xcbe1feba92e40ce6 },
1589             { 1, 0, 0, 0 } },
1590         { { 0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927,
1591               0xd0b2f94d2f420109 },
1592             { 0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a,
1593                 0x971459828b0719e5 },
1594             { 1, 0, 0, 0 } },
1595         { { 0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687,
1596               0x961610004a866aba },
1597             { 0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c,
1598                 0x7acb9fadcee75e44 },
1599             { 1, 0, 0, 0 } },
1600         { { 0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea,
1601               0x24eb9acca333bf5b },
1602             { 0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d,
1603                 0x69f891c5acd079cc },
1604             { 1, 0, 0, 0 } },
1605         { { 0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514,
1606               0xe51f547c5972a107 },
1607             { 0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06,
1608                 0x1c309a2b25bb1387 },
1609             { 1, 0, 0, 0 } },
1610         { { 0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828,
1611               0x20b87b8aa2c4e503 },
1612             { 0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044,
1613                 0xf5c6fa49919776be },
1614             { 1, 0, 0, 0 } },
1615         { { 0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56,
1616               0x1ed7d1b9332010b9 },
1617             { 0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24,
1618                 0x3a2b03f03217257a },
1619             { 1, 0, 0, 0 } },
1620         { { 0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b,
1621               0x15fee545c78dd9f6 },
1622             { 0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb,
1623                 0x4ab5b6b2b8753f81 },
1624             { 1, 0, 0, 0 } } }
1625 };
1626 
1627 /*
1628  * select_point selects the |idx|th point from a precomputation table and
1629  * copies it to out.
1630  */
select_point(const u64 idx,unsigned int size,const smallfelem pre_comp[16][3],smallfelem out[3])1631 static void select_point(const u64 idx, unsigned int size,
1632     const smallfelem pre_comp[16][3], smallfelem out[3])
1633 {
1634     unsigned i, j;
1635     u64 *outlimbs = &out[0][0];
1636 
1637     memset(out, 0, sizeof(*out) * 3);
1638 
1639     for (i = 0; i < size; i++) {
1640         const u64 *inlimbs = (u64 *)&pre_comp[i][0][0];
1641         u64 mask = i ^ idx;
1642         mask |= mask >> 4;
1643         mask |= mask >> 2;
1644         mask |= mask >> 1;
1645         mask &= 1;
1646         mask--;
1647         for (j = 0; j < NLIMBS * 3; j++)
1648             outlimbs[j] |= inlimbs[j] & mask;
1649     }
1650 }
1651 
1652 /* get_bit returns the |i|th bit in |in| */
get_bit(const felem_bytearray in,int i)1653 static char get_bit(const felem_bytearray in, int i)
1654 {
1655     if ((i < 0) || (i >= 256))
1656         return 0;
1657     return (in[i >> 3] >> (i & 7)) & 1;
1658 }
1659 
1660 /*
1661  * Interleaved point multiplication using precomputed point multiples: The
1662  * small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], the scalars
1663  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1664  * generator, using certain (large) precomputed multiples in g_pre_comp.
1665  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1666  */
batch_mul(felem x_out,felem y_out,felem z_out,const felem_bytearray scalars[],const unsigned num_points,const u8 * g_scalar,const int mixed,const smallfelem pre_comp[][17][3],const smallfelem g_pre_comp[2][16][3])1667 static void batch_mul(felem x_out, felem y_out, felem z_out,
1668     const felem_bytearray scalars[],
1669     const unsigned num_points, const u8 *g_scalar,
1670     const int mixed, const smallfelem pre_comp[][17][3],
1671     const smallfelem g_pre_comp[2][16][3])
1672 {
1673     int i, skip;
1674     unsigned num, gen_mul = (g_scalar != NULL);
1675     felem nq[3], ftmp;
1676     smallfelem tmp[3];
1677     u64 bits;
1678     u8 sign, digit;
1679 
1680     /* set nq to the point at infinity */
1681     memset(nq, 0, sizeof(nq));
1682 
1683     /*
1684      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1685      * of the generator (two in each of the last 32 rounds) and additions of
1686      * other points multiples (every 5th round).
1687      */
1688     skip = 1; /* save two point operations in the first
1689                * round */
1690     for (i = (num_points ? 255 : 31); i >= 0; --i) {
1691         /* double */
1692         if (!skip)
1693             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1694 
1695         /* add multiples of the generator */
1696         if (gen_mul && (i <= 31)) {
1697             /* first, look 32 bits upwards */
1698             bits = get_bit(g_scalar, i + 224) << 3;
1699             bits |= get_bit(g_scalar, i + 160) << 2;
1700             bits |= get_bit(g_scalar, i + 96) << 1;
1701             bits |= get_bit(g_scalar, i + 32);
1702             /* select the point to add, in constant time */
1703             select_point(bits, 16, g_pre_comp[1], tmp);
1704 
1705             if (!skip) {
1706                 /* Arg 1 below is for "mixed" */
1707                 point_add(nq[0], nq[1], nq[2],
1708                     nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1709             } else {
1710                 smallfelem_expand(nq[0], tmp[0]);
1711                 smallfelem_expand(nq[1], tmp[1]);
1712                 smallfelem_expand(nq[2], tmp[2]);
1713                 skip = 0;
1714             }
1715 
1716             /* second, look at the current position */
1717             bits = get_bit(g_scalar, i + 192) << 3;
1718             bits |= get_bit(g_scalar, i + 128) << 2;
1719             bits |= get_bit(g_scalar, i + 64) << 1;
1720             bits |= get_bit(g_scalar, i);
1721             /* select the point to add, in constant time */
1722             select_point(bits, 16, g_pre_comp[0], tmp);
1723             /* Arg 1 below is for "mixed" */
1724             point_add(nq[0], nq[1], nq[2],
1725                 nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1726         }
1727 
1728         /* do other additions every 5 doublings */
1729         if (num_points && (i % 5 == 0)) {
1730             /* loop over all scalars */
1731             for (num = 0; num < num_points; ++num) {
1732                 bits = get_bit(scalars[num], i + 4) << 5;
1733                 bits |= get_bit(scalars[num], i + 3) << 4;
1734                 bits |= get_bit(scalars[num], i + 2) << 3;
1735                 bits |= get_bit(scalars[num], i + 1) << 2;
1736                 bits |= get_bit(scalars[num], i) << 1;
1737                 bits |= get_bit(scalars[num], i - 1);
1738                 ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1739 
1740                 /*
1741                  * select the point to add or subtract, in constant time
1742                  */
1743                 select_point(digit, 17, pre_comp[num], tmp);
1744                 smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative
1745                                                * point */
1746                 copy_small_conditional(ftmp, tmp[1], (((limb)sign) - 1));
1747                 felem_contract(tmp[1], ftmp);
1748 
1749                 if (!skip) {
1750                     point_add(nq[0], nq[1], nq[2],
1751                         nq[0], nq[1], nq[2],
1752                         mixed, tmp[0], tmp[1], tmp[2]);
1753                 } else {
1754                     smallfelem_expand(nq[0], tmp[0]);
1755                     smallfelem_expand(nq[1], tmp[1]);
1756                     smallfelem_expand(nq[2], tmp[2]);
1757                     skip = 0;
1758                 }
1759             }
1760         }
1761     }
1762     felem_assign(x_out, nq[0]);
1763     felem_assign(y_out, nq[1]);
1764     felem_assign(z_out, nq[2]);
1765 }
1766 
1767 /* Precomputation for the group generator. */
1768 struct nistp256_pre_comp_st {
1769     smallfelem g_pre_comp[2][16][3];
1770     CRYPTO_REF_COUNT references;
1771 };
1772 
EC_GFp_nistp256_method(void)1773 const EC_METHOD *EC_GFp_nistp256_method(void)
1774 {
1775     static const EC_METHOD ret = {
1776         EC_FLAGS_DEFAULT_OCT,
1777         NID_X9_62_prime_field,
1778         ossl_ec_GFp_nistp256_group_init,
1779         ossl_ec_GFp_simple_group_finish,
1780         ossl_ec_GFp_simple_group_clear_finish,
1781         ossl_ec_GFp_nist_group_copy,
1782         ossl_ec_GFp_nistp256_group_set_curve,
1783         ossl_ec_GFp_simple_group_get_curve,
1784         ossl_ec_GFp_simple_group_get_degree,
1785         ossl_ec_group_simple_order_bits,
1786         ossl_ec_GFp_simple_group_check_discriminant,
1787         ossl_ec_GFp_simple_point_init,
1788         ossl_ec_GFp_simple_point_finish,
1789         ossl_ec_GFp_simple_point_clear_finish,
1790         ossl_ec_GFp_simple_point_copy,
1791         ossl_ec_GFp_simple_point_set_to_infinity,
1792         ossl_ec_GFp_simple_point_set_affine_coordinates,
1793         ossl_ec_GFp_nistp256_point_get_affine_coordinates,
1794         0 /* point_set_compressed_coordinates */,
1795         0 /* point2oct */,
1796         0 /* oct2point */,
1797         ossl_ec_GFp_simple_add,
1798         ossl_ec_GFp_simple_dbl,
1799         ossl_ec_GFp_simple_invert,
1800         ossl_ec_GFp_simple_is_at_infinity,
1801         ossl_ec_GFp_simple_is_on_curve,
1802         ossl_ec_GFp_simple_cmp,
1803         ossl_ec_GFp_simple_make_affine,
1804         ossl_ec_GFp_simple_points_make_affine,
1805         ossl_ec_GFp_nistp256_points_mul,
1806         ossl_ec_GFp_nistp256_precompute_mult,
1807         ossl_ec_GFp_nistp256_have_precompute_mult,
1808         ossl_ec_GFp_nist_field_mul,
1809         ossl_ec_GFp_nist_field_sqr,
1810         0 /* field_div */,
1811         ossl_ec_GFp_simple_field_inv,
1812         0 /* field_encode */,
1813         0 /* field_decode */,
1814         0, /* field_set_to_one */
1815         ossl_ec_key_simple_priv2oct,
1816         ossl_ec_key_simple_oct2priv,
1817         0, /* set private */
1818         ossl_ec_key_simple_generate_key,
1819         ossl_ec_key_simple_check_key,
1820         ossl_ec_key_simple_generate_public_key,
1821         0, /* keycopy */
1822         0, /* keyfinish */
1823         ossl_ecdh_simple_compute_key,
1824         ossl_ecdsa_simple_sign_setup,
1825         ossl_ecdsa_simple_sign_sig,
1826         ossl_ecdsa_simple_verify_sig,
1827         0, /* field_inverse_mod_ord */
1828         0, /* blind_coordinates */
1829         0, /* ladder_pre */
1830         0, /* ladder_step */
1831         0 /* ladder_post */
1832     };
1833 
1834     return &ret;
1835 }
1836 
1837 /******************************************************************************/
1838 /*
1839  * FUNCTIONS TO MANAGE PRECOMPUTATION
1840  */
1841 
nistp256_pre_comp_new(void)1842 static NISTP256_PRE_COMP *nistp256_pre_comp_new(void)
1843 {
1844     NISTP256_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1845 
1846     if (ret == NULL)
1847         return ret;
1848 
1849     if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1850         OPENSSL_free(ret);
1851         return NULL;
1852     }
1853     return ret;
1854 }
1855 
EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP * p)1856 NISTP256_PRE_COMP *EC_nistp256_pre_comp_dup(NISTP256_PRE_COMP *p)
1857 {
1858     int i;
1859     if (p != NULL)
1860         CRYPTO_UP_REF(&p->references, &i);
1861     return p;
1862 }
1863 
EC_nistp256_pre_comp_free(NISTP256_PRE_COMP * pre)1864 void EC_nistp256_pre_comp_free(NISTP256_PRE_COMP *pre)
1865 {
1866     int i;
1867 
1868     if (pre == NULL)
1869         return;
1870 
1871     CRYPTO_DOWN_REF(&pre->references, &i);
1872     REF_PRINT_COUNT("EC_nistp256", i, pre);
1873     if (i > 0)
1874         return;
1875     REF_ASSERT_ISNT(i < 0);
1876 
1877     CRYPTO_FREE_REF(&pre->references);
1878     OPENSSL_free(pre);
1879 }
1880 
1881 /******************************************************************************/
1882 /*
1883  * OPENSSL EC_METHOD FUNCTIONS
1884  */
1885 
ossl_ec_GFp_nistp256_group_init(EC_GROUP * group)1886 int ossl_ec_GFp_nistp256_group_init(EC_GROUP *group)
1887 {
1888     int ret;
1889     ret = ossl_ec_GFp_simple_group_init(group);
1890     group->a_is_minus3 = 1;
1891     return ret;
1892 }
1893 
ossl_ec_GFp_nistp256_group_set_curve(EC_GROUP * group,const BIGNUM * p,const BIGNUM * a,const BIGNUM * b,BN_CTX * ctx)1894 int ossl_ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1895     const BIGNUM *a, const BIGNUM *b,
1896     BN_CTX *ctx)
1897 {
1898     int ret = 0;
1899     BIGNUM *curve_p, *curve_a, *curve_b;
1900 #ifndef FIPS_MODULE
1901     BN_CTX *new_ctx = NULL;
1902 
1903     if (ctx == NULL)
1904         ctx = new_ctx = BN_CTX_new();
1905 #endif
1906     if (ctx == NULL)
1907         return 0;
1908 
1909     BN_CTX_start(ctx);
1910     curve_p = BN_CTX_get(ctx);
1911     curve_a = BN_CTX_get(ctx);
1912     curve_b = BN_CTX_get(ctx);
1913     if (curve_b == NULL)
1914         goto err;
1915     BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
1916     BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
1917     BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
1918     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1919         ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
1920         goto err;
1921     }
1922     group->field_mod_func = BN_nist_mod_256;
1923     ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1924 err:
1925     BN_CTX_end(ctx);
1926 #ifndef FIPS_MODULE
1927     BN_CTX_free(new_ctx);
1928 #endif
1929     return ret;
1930 }
1931 
1932 /*
1933  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1934  * (X/Z^2, Y/Z^3)
1935  */
ossl_ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP * group,const EC_POINT * point,BIGNUM * x,BIGNUM * y,BN_CTX * ctx)1936 int ossl_ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
1937     const EC_POINT *point,
1938     BIGNUM *x, BIGNUM *y,
1939     BN_CTX *ctx)
1940 {
1941     felem z1, z2, x_in, y_in;
1942     smallfelem x_out, y_out;
1943     longfelem tmp;
1944 
1945     if (EC_POINT_is_at_infinity(group, point)) {
1946         ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
1947         return 0;
1948     }
1949     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) || (!BN_to_felem(z1, point->Z)))
1950         return 0;
1951     felem_inv(z2, z1);
1952     felem_square(tmp, z2);
1953     felem_reduce(z1, tmp);
1954     felem_mul(tmp, x_in, z1);
1955     felem_reduce(x_in, tmp);
1956     felem_contract(x_out, x_in);
1957     if (x != NULL) {
1958         if (!smallfelem_to_BN(x, x_out)) {
1959             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1960             return 0;
1961         }
1962     }
1963     felem_mul(tmp, z1, z2);
1964     felem_reduce(z1, tmp);
1965     felem_mul(tmp, y_in, z1);
1966     felem_reduce(y_in, tmp);
1967     felem_contract(y_out, y_in);
1968     if (y != NULL) {
1969         if (!smallfelem_to_BN(y, y_out)) {
1970             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
1971             return 0;
1972         }
1973     }
1974     return 1;
1975 }
1976 
1977 /* points below is of size |num|, and tmp_smallfelems is of size |num+1| */
make_points_affine(size_t num,smallfelem points[][3],smallfelem tmp_smallfelems[])1978 static void make_points_affine(size_t num, smallfelem points[][3],
1979     smallfelem tmp_smallfelems[])
1980 {
1981     /*
1982      * Runs in constant time, unless an input is the point at infinity (which
1983      * normally shouldn't happen).
1984      */
1985     ossl_ec_GFp_nistp_points_make_affine_internal(num,
1986         points,
1987         sizeof(smallfelem),
1988         tmp_smallfelems,
1989         (void (*)(void *))smallfelem_one,
1990         smallfelem_is_zero_int,
1991         (void (*)(void *, const void *))
1992             smallfelem_assign,
1993         (void (*)(void *, const void *))
1994             smallfelem_square_contract,
1995         (void (*)(void *, const void *,
1996             const void *))
1997             smallfelem_mul_contract,
1998         (void (*)(void *, const void *))
1999             smallfelem_inv_contract,
2000         /* nothing to contract */
2001         (void (*)(void *, const void *))
2002             smallfelem_assign);
2003 }
2004 
2005 /*
2006  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
2007  * values Result is stored in r (r can equal one of the inputs).
2008  */
ossl_ec_GFp_nistp256_points_mul(const EC_GROUP * group,EC_POINT * r,const BIGNUM * scalar,size_t num,const EC_POINT * points[],const BIGNUM * scalars[],BN_CTX * ctx)2009 int ossl_ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
2010     const BIGNUM *scalar, size_t num,
2011     const EC_POINT *points[],
2012     const BIGNUM *scalars[], BN_CTX *ctx)
2013 {
2014     int ret = 0;
2015     int j;
2016     int mixed = 0;
2017     BIGNUM *x, *y, *z, *tmp_scalar;
2018     felem_bytearray g_secret;
2019     felem_bytearray *secrets = NULL;
2020     smallfelem(*pre_comp)[17][3] = NULL;
2021     smallfelem *tmp_smallfelems = NULL;
2022     unsigned i;
2023     int num_bytes;
2024     int have_pre_comp = 0;
2025     size_t num_points = num;
2026     smallfelem x_in, y_in, z_in;
2027     felem x_out, y_out, z_out;
2028     NISTP256_PRE_COMP *pre = NULL;
2029     const smallfelem(*g_pre_comp)[16][3] = NULL;
2030     EC_POINT *generator = NULL;
2031     const EC_POINT *p = NULL;
2032     const BIGNUM *p_scalar = NULL;
2033 
2034     BN_CTX_start(ctx);
2035     x = BN_CTX_get(ctx);
2036     y = BN_CTX_get(ctx);
2037     z = BN_CTX_get(ctx);
2038     tmp_scalar = BN_CTX_get(ctx);
2039     if (tmp_scalar == NULL)
2040         goto err;
2041 
2042     if (scalar != NULL) {
2043         pre = group->pre_comp.nistp256;
2044         if (pre)
2045             /* we have precomputation, try to use it */
2046             g_pre_comp = (const smallfelem(*)[16][3])pre->g_pre_comp;
2047         else
2048             /* try to use the standard precomputation */
2049             g_pre_comp = &gmul[0];
2050         generator = EC_POINT_new(group);
2051         if (generator == NULL)
2052             goto err;
2053         /* get the generator from precomputation */
2054         if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) || !smallfelem_to_BN(y, g_pre_comp[0][1][1]) || !smallfelem_to_BN(z, g_pre_comp[0][1][2])) {
2055             ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2056             goto err;
2057         }
2058         if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
2059                 generator,
2060                 x, y, z, ctx))
2061             goto err;
2062         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
2063             /* precomputation matches generator */
2064             have_pre_comp = 1;
2065         else
2066             /*
2067              * we don't have valid precomputation: treat the generator as a
2068              * random point
2069              */
2070             num_points++;
2071     }
2072     if (num_points > 0) {
2073         if (num_points >= 3) {
2074             /*
2075              * unless we precompute multiples for just one or two points,
2076              * converting those into affine form is time well spent
2077              */
2078             mixed = 1;
2079         }
2080         secrets = OPENSSL_malloc(sizeof(*secrets) * num_points);
2081         pre_comp = OPENSSL_malloc(sizeof(*pre_comp) * num_points);
2082         if (mixed)
2083             tmp_smallfelems = OPENSSL_malloc(sizeof(*tmp_smallfelems) * (num_points * 17 + 1));
2084         if ((secrets == NULL) || (pre_comp == NULL)
2085             || (mixed && (tmp_smallfelems == NULL)))
2086             goto err;
2087 
2088         /*
2089          * we treat NULL scalars as 0, and NULL points as points at infinity,
2090          * i.e., they contribute nothing to the linear combination
2091          */
2092         memset(secrets, 0, sizeof(*secrets) * num_points);
2093         memset(pre_comp, 0, sizeof(*pre_comp) * num_points);
2094         for (i = 0; i < num_points; ++i) {
2095             if (i == num) {
2096                 /*
2097                  * we didn't have a valid precomputation, so we pick the
2098                  * generator
2099                  */
2100                 p = EC_GROUP_get0_generator(group);
2101                 p_scalar = scalar;
2102             } else {
2103                 /* the i^th point */
2104                 p = points[i];
2105                 p_scalar = scalars[i];
2106             }
2107             if ((p_scalar != NULL) && (p != NULL)) {
2108                 /* reduce scalar to 0 <= scalar < 2^256 */
2109                 if ((BN_num_bits(p_scalar) > 256)
2110                     || (BN_is_negative(p_scalar))) {
2111                     /*
2112                      * this is an unusual input, and we don't guarantee
2113                      * constant-timeness
2114                      */
2115                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
2116                         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2117                         goto err;
2118                     }
2119                     num_bytes = BN_bn2lebinpad(tmp_scalar,
2120                         secrets[i], sizeof(secrets[i]));
2121                 } else {
2122                     num_bytes = BN_bn2lebinpad(p_scalar,
2123                         secrets[i], sizeof(secrets[i]));
2124                 }
2125                 if (num_bytes < 0) {
2126                     ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2127                     goto err;
2128                 }
2129                 /* precompute multiples */
2130                 if ((!BN_to_felem(x_out, p->X)) || (!BN_to_felem(y_out, p->Y)) || (!BN_to_felem(z_out, p->Z)))
2131                     goto err;
2132                 felem_shrink(pre_comp[i][1][0], x_out);
2133                 felem_shrink(pre_comp[i][1][1], y_out);
2134                 felem_shrink(pre_comp[i][1][2], z_out);
2135                 for (j = 2; j <= 16; ++j) {
2136                     if (j & 1) {
2137                         point_add_small(pre_comp[i][j][0], pre_comp[i][j][1],
2138                             pre_comp[i][j][2], pre_comp[i][1][0],
2139                             pre_comp[i][1][1], pre_comp[i][1][2],
2140                             pre_comp[i][j - 1][0],
2141                             pre_comp[i][j - 1][1],
2142                             pre_comp[i][j - 1][2]);
2143                     } else {
2144                         point_double_small(pre_comp[i][j][0],
2145                             pre_comp[i][j][1],
2146                             pre_comp[i][j][2],
2147                             pre_comp[i][j / 2][0],
2148                             pre_comp[i][j / 2][1],
2149                             pre_comp[i][j / 2][2]);
2150                     }
2151                 }
2152             }
2153         }
2154         if (mixed)
2155             make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
2156     }
2157 
2158     /* the scalar for the generator */
2159     if ((scalar != NULL) && (have_pre_comp)) {
2160         memset(g_secret, 0, sizeof(g_secret));
2161         /* reduce scalar to 0 <= scalar < 2^256 */
2162         if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) {
2163             /*
2164              * this is an unusual input, and we don't guarantee
2165              * constant-timeness
2166              */
2167             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2168                 ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2169                 goto err;
2170             }
2171             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2172         } else {
2173             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2174         }
2175         /* do the multiplication with generator precomputation */
2176         batch_mul(x_out, y_out, z_out,
2177             (const felem_bytearray(*))secrets, num_points,
2178             g_secret,
2179             mixed, (const smallfelem(*)[17][3])pre_comp, g_pre_comp);
2180     } else {
2181         /* do the multiplication without generator precomputation */
2182         batch_mul(x_out, y_out, z_out,
2183             (const felem_bytearray(*))secrets, num_points,
2184             NULL, mixed, (const smallfelem(*)[17][3])pre_comp, NULL);
2185     }
2186     /* reduce the output to its unique minimal representation */
2187     felem_contract(x_in, x_out);
2188     felem_contract(y_in, y_out);
2189     felem_contract(z_in, z_out);
2190     if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) || (!smallfelem_to_BN(z, z_in))) {
2191         ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
2192         goto err;
2193     }
2194     ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
2195         ctx);
2196 
2197 err:
2198     BN_CTX_end(ctx);
2199     EC_POINT_free(generator);
2200     OPENSSL_free(secrets);
2201     OPENSSL_free(pre_comp);
2202     OPENSSL_free(tmp_smallfelems);
2203     return ret;
2204 }
2205 
ossl_ec_GFp_nistp256_precompute_mult(EC_GROUP * group,BN_CTX * ctx)2206 int ossl_ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2207 {
2208     int ret = 0;
2209     NISTP256_PRE_COMP *pre = NULL;
2210     int i, j;
2211     BIGNUM *x, *y;
2212     EC_POINT *generator = NULL;
2213     smallfelem tmp_smallfelems[32];
2214     felem x_tmp, y_tmp, z_tmp;
2215 #ifndef FIPS_MODULE
2216     BN_CTX *new_ctx = NULL;
2217 #endif
2218 
2219     /* throw away old precomputation */
2220     EC_pre_comp_free(group);
2221 
2222 #ifndef FIPS_MODULE
2223     if (ctx == NULL)
2224         ctx = new_ctx = BN_CTX_new();
2225 #endif
2226     if (ctx == NULL)
2227         return 0;
2228 
2229     BN_CTX_start(ctx);
2230     x = BN_CTX_get(ctx);
2231     y = BN_CTX_get(ctx);
2232     if (y == NULL)
2233         goto err;
2234     /* get the generator */
2235     if (group->generator == NULL)
2236         goto err;
2237     generator = EC_POINT_new(group);
2238     if (generator == NULL)
2239         goto err;
2240     BN_bin2bn(nistp256_curve_params[3], sizeof(felem_bytearray), x);
2241     BN_bin2bn(nistp256_curve_params[4], sizeof(felem_bytearray), y);
2242     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2243         goto err;
2244     if ((pre = nistp256_pre_comp_new()) == NULL)
2245         goto err;
2246     /*
2247      * if the generator is the standard one, use built-in precomputation
2248      */
2249     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2250         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2251         goto done;
2252     }
2253     if ((!BN_to_felem(x_tmp, group->generator->X)) || (!BN_to_felem(y_tmp, group->generator->Y)) || (!BN_to_felem(z_tmp, group->generator->Z)))
2254         goto err;
2255     felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
2256     felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
2257     felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
2258     /*
2259      * compute 2^64*G, 2^128*G, 2^192*G for the first table, 2^32*G, 2^96*G,
2260      * 2^160*G, 2^224*G for the second one
2261      */
2262     for (i = 1; i <= 8; i <<= 1) {
2263         point_double_small(pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2264             pre->g_pre_comp[1][i][2], pre->g_pre_comp[0][i][0],
2265             pre->g_pre_comp[0][i][1],
2266             pre->g_pre_comp[0][i][2]);
2267         for (j = 0; j < 31; ++j) {
2268             point_double_small(pre->g_pre_comp[1][i][0],
2269                 pre->g_pre_comp[1][i][1],
2270                 pre->g_pre_comp[1][i][2],
2271                 pre->g_pre_comp[1][i][0],
2272                 pre->g_pre_comp[1][i][1],
2273                 pre->g_pre_comp[1][i][2]);
2274         }
2275         if (i == 8)
2276             break;
2277         point_double_small(pre->g_pre_comp[0][2 * i][0],
2278             pre->g_pre_comp[0][2 * i][1],
2279             pre->g_pre_comp[0][2 * i][2],
2280             pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1],
2281             pre->g_pre_comp[1][i][2]);
2282         for (j = 0; j < 31; ++j) {
2283             point_double_small(pre->g_pre_comp[0][2 * i][0],
2284                 pre->g_pre_comp[0][2 * i][1],
2285                 pre->g_pre_comp[0][2 * i][2],
2286                 pre->g_pre_comp[0][2 * i][0],
2287                 pre->g_pre_comp[0][2 * i][1],
2288                 pre->g_pre_comp[0][2 * i][2]);
2289         }
2290     }
2291     for (i = 0; i < 2; i++) {
2292         /* g_pre_comp[i][0] is the point at infinity */
2293         memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
2294         /* the remaining multiples */
2295         /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
2296         point_add_small(pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
2297             pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
2298             pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
2299             pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2300             pre->g_pre_comp[i][2][2]);
2301         /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
2302         point_add_small(pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
2303             pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
2304             pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2305             pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2306             pre->g_pre_comp[i][2][2]);
2307         /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
2308         point_add_small(pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
2309             pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
2310             pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
2311             pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
2312             pre->g_pre_comp[i][4][2]);
2313         /*
2314          * 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G
2315          */
2316         point_add_small(pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
2317             pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
2318             pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
2319             pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
2320             pre->g_pre_comp[i][2][2]);
2321         for (j = 1; j < 8; ++j) {
2322             /* odd multiples: add G resp. 2^32*G */
2323             point_add_small(pre->g_pre_comp[i][2 * j + 1][0],
2324                 pre->g_pre_comp[i][2 * j + 1][1],
2325                 pre->g_pre_comp[i][2 * j + 1][2],
2326                 pre->g_pre_comp[i][2 * j][0],
2327                 pre->g_pre_comp[i][2 * j][1],
2328                 pre->g_pre_comp[i][2 * j][2],
2329                 pre->g_pre_comp[i][1][0],
2330                 pre->g_pre_comp[i][1][1],
2331                 pre->g_pre_comp[i][1][2]);
2332         }
2333     }
2334     make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
2335 
2336 done:
2337     SETPRECOMP(group, nistp256, pre);
2338     pre = NULL;
2339     ret = 1;
2340 
2341 err:
2342     BN_CTX_end(ctx);
2343     EC_POINT_free(generator);
2344 #ifndef FIPS_MODULE
2345     BN_CTX_free(new_ctx);
2346 #endif
2347     EC_nistp256_pre_comp_free(pre);
2348     return ret;
2349 }
2350 
ossl_ec_GFp_nistp256_have_precompute_mult(const EC_GROUP * group)2351 int ossl_ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
2352 {
2353     return HAVEPRECOMP(group, nistp256);
2354 }
2355