xref: /linux/arch/x86/crypto/curve25519-x86_64.c (revision 44a8c96edd0ee9320a1ad87afc7b10f38e55d5ec)
1 // SPDX-License-Identifier: GPL-2.0 OR MIT
2 /*
3  * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4  * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
5  */
6 
7 #include <crypto/curve25519.h>
8 #include <crypto/internal/kpp.h>
9 
10 #include <linux/export.h>
11 #include <linux/types.h>
12 #include <linux/jump_label.h>
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/scatterlist.h>
16 
17 #include <asm/cpufeature.h>
18 #include <asm/processor.h>
19 
eq_mask(u64 a,u64 b)20 static __always_inline u64 eq_mask(u64 a, u64 b)
21 {
22 	u64 x = a ^ b;
23 	u64 minus_x = ~x + (u64)1U;
24 	u64 x_or_minus_x = x | minus_x;
25 	u64 xnx = x_or_minus_x >> (u32)63U;
26 	return xnx - (u64)1U;
27 }
28 
gte_mask(u64 a,u64 b)29 static __always_inline u64 gte_mask(u64 a, u64 b)
30 {
31 	u64 x = a;
32 	u64 y = b;
33 	u64 x_xor_y = x ^ y;
34 	u64 x_sub_y = x - y;
35 	u64 x_sub_y_xor_y = x_sub_y ^ y;
36 	u64 q = x_xor_y | x_sub_y_xor_y;
37 	u64 x_xor_q = x ^ q;
38 	u64 x_xor_q_ = x_xor_q >> (u32)63U;
39 	return x_xor_q_ - (u64)1U;
40 }
41 
/*
 * add_scalar - 256-bit addition of a 64-bit scalar: out = f1 + f2.
 * @out: four-limb little-endian destination
 * @f1:  four-limb source operand
 * @f2:  64-bit scalar added to the low limb
 *
 * Returns the carry out of the top limb (0 or 1).  The xor-clearing of
 * r8-r11 also zeroes CF/OF so the adcx chain starts clean; the zeroed
 * limbs let the carry from f2 + f1[0] ripple through f1[1..3].
 */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
	u64 carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8d, %%r8d;"
		"  xor %%r9d, %%r9d;"
		"  xor %%r10d, %%r10d;"
		"  xor %%r11d, %%r11d;"
		"  xor %k1, %k1;"

		/* Begin addition chain */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
		: "+&r"(f2), "=&r"(carry_r)
		: "r"(out), "r"(f1)
		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");

	return carry_r;
}
74 
/*
 * fadd - field addition: out = f1 + f2 reduced mod p = 2^255 - 19.
 *
 * Performs the raw four-limb add, then folds the carry out of the top
 * limb back in using 2^256 == 38 (mod p).  Note %0 (f2's register) is
 * reused as a scratch holding the constant 38 once the raw add is done.
 */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%ecx, %%ecx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2)
		: "r"(out), "r"(f1)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
115 
/*
 * fsub - field subtraction: out = f1 - f2 reduced mod p = 2^255 - 19.
 *
 * Performs the raw four-limb subtract, then folds the borrow out of the
 * top limb back in by subtracting borrow*38 (since 2^256 == 38 mod p).
 */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
	asm volatile(
		/* Compute the raw subtraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Subtract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
		:
		: "r"(out), "r"(f1), "r"(f2)
		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
}
157 
/*
 * fmul - field multiplication: out = f1 * f2 reduced mod p = 2^255 - 19.
 * @out: four-limb destination
 * @f1:  four-limb multiplicand
 * @f2:  four-limb multiplier
 * @tmp: 8-element scratch buffer for the 512-bit raw product
 *
 * Schoolbook 4x4 multiply using mulx with the dual adcx/adox carry
 * chains (ADX), accumulating the 8-limb product into tmp; the high four
 * limbs are then folded into the low four via 2^256 == 38 (mod p).
 * After the multiply the operand registers are repointed ("line up
 * pointers"): %0 <- tmp, %2 <- out, so the reduction reads tmp and
 * writes out.
 */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}
290 
/*
 * fmul2 - two independent field multiplications in one asm block:
 *   out[0..3] = f1[0..3] * f2[0..3]  (mod 2^255 - 19)
 *   out[4..7] = f1[4..7] * f2[4..7]  (mod 2^255 - 19)
 * @tmp: 16-element scratch holding both 512-bit raw products.
 *
 * Same mulx/adcx/adox schoolbook multiply as fmul(), run back to back on
 * the two operand pairs, followed by two independent 38-fold reductions
 * (2^256 == 38 mod p).  Batching the pair keeps the ladder code
 * (point_add_and_double/point_double) in one asm block per step.
 */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 64(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 72(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 72(%2), %%r8;"
		"  movq %%r8, 72(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 80(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 80(%2), %%r8;"
		"  movq %%r8, 80(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 88(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 88(%2), %%r8;"
		"  movq %%r8, 88(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 96(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 104(%2);"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 112(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 120(%2);"

		/* Line up pointers */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 40(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 48(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}
541 
/*
 * fmul_scalar - field multiplication by a small scalar:
 * out = f1 * f2 (mod 2^255 - 19).
 * @f2: must be smaller than 2^17 so the top carry limb stays small
 *      enough that the imul by 38 in the reduction cannot overflow.
 *
 * f2 is pinned to rdx because mulx takes one implicit operand there.
 * The four mulx products are chained with adcx, then the high limb is
 * folded back via *38 (2^256 == 38 mod p), same as the other helpers.
 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
	register u64 f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
		"  adcx %%rbx, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f2_r)
		: "r"(out), "r"(f1)
		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "memory", "cc");
}
585 
/*
 * cswap2 - constant-time conditional swap of two 8-limb buffers
 * (i.e. two field elements each): if bit == 1, exchange *p1 and *p2;
 * if bit == 0, leave both untouched.
 *
 * bit must be exactly 0 or 1: adding ~0 sets CF iff bit was 1, and each
 * limb pair is then exchanged with a pair of cmovc instructions, so the
 * instruction trace is identical for both values of bit (no secret-
 * dependent branches).
 *
 * NOTE(review): despite the const qualifiers, both p1 and p2 are
 * written through by the asm (movq ... 0(%1)/0(%2)); the "memory"
 * clobber covers the stores.
 */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
	asm volatile(
		/* Transfer bit into CF flag */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
		: "+&r"(bit)
		: "r"(p1), "r"(p2)
		: "%r8", "%r9", "%r10", "memory", "cc");
}
668 
/*
 * fsqr - field squaring: out = f * f reduced mod p = 2^255 - 19.
 * @tmp: 8-element scratch holding the 512-bit raw square
 *
 * Cheaper than fmul(out, f, f, tmp): each off-diagonal product f[i]*f[j]
 * (i != j) is computed once, the partial sums are doubled using two
 * parallel adcx/adox carry chains, and the diagonal squares f[i]^2 are
 * then added in.  The reduction folds the high four limbs via
 * 2^256 == 38 (mod p), identical to fmul's tail.
 */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}
777 
/*
 * fsqr2 - two independent field squarings in one asm block:
 *   out[0..3] = f[0..3]^2  (mod 2^255 - 19)
 *   out[4..7] = f[4..7]^2  (mod 2^255 - 19)
 * @tmp: 16-element scratch holding both 512-bit raw squares.
 *
 * Same off-diagonal-once / double-via-dual-carry-chains / add-diagonal
 * scheme as fsqr(), run back to back on the two inputs, followed by two
 * independent 38-fold reductions (2^256 == 38 mod p).
 */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%0), %%rdx;" /* f[0] */
		"  mulxq 8(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 16(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 24(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 24(%0), %%rdx;" /* f[3] */
		"  mulxq 8(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 16(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 8(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 16(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 0(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 8(%1);"
		"  movq 8(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 16(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 24(%1);"
		"  movq 16(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 32(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 40(%1);"
		"  movq 24(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 48(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 56(%1);"

		/* Step 1: Compute all partial products */
		"  movq 32(%0), %%rdx;" /* f[0] */
		"  mulxq 40(%0), %%r8, %%r14;"
		"  xor %%r15d, %%r15d;" /* f[1]*f[0] */
		"  mulxq 48(%0), %%r9, %%r10;"
		"  adcx %%r14, %%r9;" /* f[2]*f[0] */
		"  mulxq 56(%0), %%rax, %%rcx;"
		"  adcx %%rax, %%r10;" /* f[3]*f[0] */
		"  movq 56(%0), %%rdx;" /* f[3] */
		"  mulxq 40(%0), %%r11, %%rbx;"
		"  adcx %%rcx, %%r11;" /* f[1]*f[3] */
		"  mulxq 48(%0), %%rax, %%r13;"
		"  adcx %%rax, %%rbx;" /* f[2]*f[3] */
		"  movq 40(%0), %%rdx;"
		"  adcx %%r15, %%r13;" /* f1 */
		"  mulxq 48(%0), %%rax, %%rcx;"
		"  mov $0, %%r14;" /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15d, %%r15d;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%rbx;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%rbx, %%rbx;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
		"  movq %%rax, 64(%1);"
		"  add %%rcx, %%r8;"
		"  movq %%r8, 72(%1);"
		"  movq 40(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
		"  adcx %%rax, %%r9;"
		"  movq %%r9, 80(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 88(%1);"
		"  movq 48(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
		"  adcx %%rax, %%r11;"
		"  movq %%r11, 96(%1);"
		"  adcx %%rcx, %%rbx;"
		"  movq %%rbx, 104(%1);"
		"  movq 56(%0), %%rdx;"
		"  mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
		"  adcx %%rax, %%r13;"
		"  movq %%r13, 112(%1);"
		"  adcx %%rcx, %%r14;"
		"  movq %%r14, 120(%1);"

		/* Line up pointers */
		"  mov %1, %0;"
		"  mov %2, %1;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %%ecx, %%ecx;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%1);"
		: "+&r"(f), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
		  "%r13", "%r14", "%r15", "memory", "cc");
}
975 
/*
 * point_add_and_double - one Montgomery-ladder step: combined
 * differential addition and doubling on projective x-coordinates
 * (the xDBLADD formulas of RFC 7748, sec. 5).
 * @q:        the base point's x-coordinate x1 (4 limbs)
 * @p01_tmp1: working buffer: limbs 0-7 hold (x2,z2), limbs 8-15 hold
 *            (x3,z3), limbs 16-31 are scratch (tmp1); updated in place
 * @tmp2:     16-limb scratch passed down to fmul/fmul2/fsqr2
 *
 * Many of the locals below alias into tmp1 (ab == a|b, dc == d|c) so
 * that pairs of values sit contiguously for the batched fmul2/fsqr2
 * calls; the exact statement order is load-bearing.
 */
static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
	u64 *nq = p01_tmp1;
	u64 *nq_p1 = p01_tmp1 + (u32)8U;
	u64 *tmp1 = p01_tmp1 + (u32)16U;
	u64 *x1 = q;
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *z3 = nq_p1 + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *ab = tmp1;		/* (a, b) viewed as one 8-limb pair */
	u64 *dc = tmp1 + (u32)8U;	/* (d, c) viewed as one 8-limb pair */
	u64 *x3;
	u64 *z31;
	u64 *d0;
	u64 *c0;
	u64 *a1;
	u64 *b1;
	u64 *d;
	u64 *c;
	u64 *ab1;
	u64 *dc1;
	fadd(a, x2, z2);		/* A = x2 + z2 */
	fsub(b, x2, z2);		/* B = x2 - z2 */
	x3 = nq_p1;
	z31 = nq_p1 + (u32)4U;
	d0 = dc;
	c0 = dc + (u32)4U;
	fadd(c0, x3, z31);		/* C = x3 + z3 */
	fsub(d0, x3, z31);		/* D = x3 - z3 */
	fmul2(dc, dc, ab, tmp2);	/* DA = D*A, CB = C*B (batched) */
	fadd(x3, d0, c0);		/* DA + CB */
	fsub(z31, d0, c0);		/* DA - CB */
	a1 = tmp1;
	b1 = tmp1 + (u32)4U;
	d = tmp1 + (u32)8U;
	c = tmp1 + (u32)12U;
	ab1 = tmp1;
	dc1 = tmp1 + (u32)8U;
	fsqr2(dc1, ab1, tmp2);		/* d = A^2 (=AA), c = B^2 (=BB) */
	fsqr2(nq_p1, nq_p1, tmp2);	/* x3 = (DA+CB)^2, z3 = (DA-CB)^2 */
	a1[0U] = c[0U];			/* a1 = BB (copy before c is clobbered) */
	a1[1U] = c[1U];
	a1[2U] = c[2U];
	a1[3U] = c[3U];
	fsub(c, d, c);			/* E = AA - BB */
	fmul_scalar(b1, c, (u64)121665U);	/* a24 * E */
	fadd(b1, b1, d);		/* AA + a24*E */
	fmul2(nq, dc1, ab1, tmp2);	/* x2 = AA*BB, z2 = E*(AA + a24*E) */
	fmul(z3, z3, x1, tmp2);		/* z3 = x1 * (DA-CB)^2 */
}
1028 
/*
 * point_double - projective x-coordinate doubling: (x2,z2) <- 2*(x2,z2)
 * (the xDBL half of the RFC 7748 ladder formulas).
 * @nq:   8 limbs holding (x2, z2), updated in place
 * @tmp1: 16-limb scratch; a/b/d/c/ab/dc all alias into it so that the
 *        pairs are contiguous for the batched fsqr2/fmul2 calls
 * @tmp2: 16-limb scratch passed down to the multipliers
 */
static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
	u64 *x2 = nq;
	u64 *z2 = nq + (u32)4U;
	u64 *a = tmp1;
	u64 *b = tmp1 + (u32)4U;
	u64 *d = tmp1 + (u32)8U;
	u64 *c = tmp1 + (u32)12U;
	u64 *ab = tmp1;
	u64 *dc = tmp1 + (u32)8U;
	fadd(a, x2, z2);		/* A = x2 + z2 */
	fsub(b, x2, z2);		/* B = x2 - z2 */
	fsqr2(dc, ab, tmp2);		/* d = A^2 (=AA), c = B^2 (=BB) */
	a[0U] = c[0U];			/* a = BB (copy before c is clobbered) */
	a[1U] = c[1U];
	a[2U] = c[2U];
	a[3U] = c[3U];
	fsub(c, d, c);			/* E = AA - BB */
	fmul_scalar(b, c, (u64)121665U);	/* a24 * E */
	fadd(b, b, d);			/* AA + a24*E */
	fmul2(nq, dc, ab, tmp2);	/* x2 = AA*BB, z2 = E*(AA + a24*E) */
}
1051 
/*
 * montgomery_ladder - constant-time X25519 scalar multiplication:
 * out = x-coordinate of key * P, where P has x-coordinate init1.
 * @out:   4-limb result (projective pair (x, z) is accumulated in p0;
 *         the first 8 limbs are copied out)
 * @key:   32-byte scalar, processed MSB-first from bit 253 down to bit 3
 * @init1: 8 limbs (x, z) of the input point; also passed as the base
 *         x-coordinate into each ladder step
 *
 * p01_tmp1_swap layout: limbs 0-7 = p0 = (x2,z2), limbs 8-15 =
 * (x3,z3), limbs 16-31 = tmp1 scratch, limb 32 = previous swap bit.
 * The ladder conditionally swaps the two points with cswap2 based on
 * the XOR of consecutive key bits, so the per-iteration work is
 * identical regardless of the (secret) key.
 *
 * The three trailing point_double calls account for the bottom bits not
 * covered by the loop — presumably relying on the scalar being clamped
 * (low 3 bits zero); confirm against the caller's clamping.
 * Secret intermediates are wiped with memzero_explicit before return.
 */
static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
	u64 tmp2[16U] = { 0U };
	u64 p01_tmp1_swap[33U] = { 0U };
	u64 *p0 = p01_tmp1_swap;
	u64 *p01 = p01_tmp1_swap;
	u64 *p03 = p01;
	u64 *p11 = p01 + (u32)8U;
	u64 *x0;
	u64 *z0;
	u64 *p01_tmp1;
	u64 *p01_tmp11;
	u64 *nq10;
	u64 *nq_p11;
	u64 *swap1;
	u64 sw0;
	u64 *nq1;
	u64 *tmp1;
	memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
	x0 = p03;
	z0 = p03 + (u32)4U;
	/* p0 = the point at infinity in projective x-coordinates: (1, 0) */
	x0[0U] = (u64)1U;
	x0[1U] = (u64)0U;
	x0[2U] = (u64)0U;
	x0[3U] = (u64)0U;
	z0[0U] = (u64)0U;
	z0[1U] = (u64)0U;
	z0[2U] = (u64)0U;
	z0[3U] = (u64)0U;
	p01_tmp1 = p01_tmp1_swap;
	p01_tmp11 = p01_tmp1_swap;
	nq10 = p01_tmp1_swap;
	nq_p11 = p01_tmp1_swap + (u32)8U;
	swap1 = p01_tmp1_swap + (u32)32U;
	/* First iteration unrolled with swap bit forced to 1 */
	cswap2((u64)1U, nq10, nq_p11);
	point_add_and_double(init1, p01_tmp11, tmp2);
	swap1[0U] = (u64)1U;
	{
		u32 i;
		/* Bits 253 down to 3: swap only when the bit changes */
		for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
			u64 *p01_tmp12 = p01_tmp1_swap;
			u64 *swap2 = p01_tmp1_swap + (u32)32U;
			u64 *nq2 = p01_tmp12;
			u64 *nq_p12 = p01_tmp12 + (u32)8U;
			u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
			u64 sw = swap2[0U] ^ bit;
			cswap2(sw, nq2, nq_p12);
			point_add_and_double(init1, p01_tmp12, tmp2);
			swap2[0U] = bit;
		}
	}
	/* Undo the final conditional swap */
	sw0 = swap1[0U];
	cswap2(sw0, nq10, nq_p11);
	nq1 = p01_tmp1;
	tmp1 = p01_tmp1 + (u32)16U;
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	point_double(nq1, tmp1, tmp2);
	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));

	memzero_explicit(tmp2, sizeof(tmp2));
	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
1115 
/* Repeated field squaring: o = inp^(2^n1).  One initial squaring out
 * of inp, then n1 - 1 further squarings in place.  Callers always pass
 * n1 >= 1.
 */
static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
	u32 remaining;

	fsqr(o, inp, tmp);
	for (remaining = n1 - (u32)1U; remaining > (u32)0U; remaining--)
		fsqr(o, o, tmp);
}
1123 
/* Modular inverse via Fermat's little theorem: o = i^(p - 2) mod p,
 * with p = 2^255 - 19, using the classic curve25519 addition chain
 * (254 squarings, 11 multiplies).  Constant time — no secret-dependent
 * branches or memory accesses.
 */
static void finv(u64 *o, const u64 *i, u64 *tmp)
{
	u64 t1[16U] = { 0U };
	u64 *a0 = t1;
	u64 *b = t1 + (u32)4U;
	u64 *c = t1 + (u32)8U;
	u64 *t00 = t1 + (u32)12U;
	u64 *tmp1 = tmp;
	u64 *a;
	u64 *t0;
	/* Exponent of i tracked in the comments. */
	fsquare_times(a0, i, tmp1, (u32)1U);	/* a0 = i^2 */
	fsquare_times(t00, a0, tmp1, (u32)2U);	/* t00 = i^8 */
	fmul(b, t00, i, tmp);			/* b = i^9 */
	fmul(a0, b, a0, tmp);			/* a0 = i^11 */
	fsquare_times(t00, a0, tmp1, (u32)1U);	/* t00 = i^22 */
	fmul(b, t00, b, tmp);			/* b = i^(2^5 - 1) */
	fsquare_times(t00, b, tmp1, (u32)5U);
	fmul(b, t00, b, tmp);			/* b = i^(2^10 - 1) */
	fsquare_times(t00, b, tmp1, (u32)10U);
	fmul(c, t00, b, tmp);			/* c = i^(2^20 - 1) */
	fsquare_times(t00, c, tmp1, (u32)20U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^40 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)10U);
	fmul(b, t00, b, tmp);			/* b = i^(2^50 - 1) */
	fsquare_times(t00, b, tmp1, (u32)50U);
	fmul(c, t00, b, tmp);			/* c = i^(2^100 - 1) */
	fsquare_times(t00, c, tmp1, (u32)100U);
	fmul(t00, t00, c, tmp);			/* t00 = i^(2^200 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)50U);
	fmul(t00, t00, b, tmp);			/* t00 = i^(2^250 - 1) */
	fsquare_times(t00, t00, tmp1, (u32)5U);	/* t00 = i^(2^255 - 32) */
	a = t1;
	t0 = t1 + (u32)12U;
	/* o = i^(2^255 - 32) * i^11 = i^(2^255 - 21) = i^(p - 2) */
	fmul(o, t0, a, tmp);
}
1159 
/* Fully reduce the field element f modulo p = 2^255 - 19 and write the
 * four canonical limbs to b.  Two rounds of folding bit 255 back in as
 * +19, then one constant-time conditional subtraction of p, yield the
 * unique representative in [0, p).  f is modified in place.
 */
static void store_felem(u64 *b, u64 *f)
{
	u64 top;
	u64 limb0;
	u64 limb1;
	u64 limb2;
	u64 limb3;
	u64 sub_mask;

	/* First fold: clear bit 255 and add 19 for each time it was set. */
	top = f[3U] >> (u32)63U;
	f[3U] = f[3U] & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top);

	/* Second fold: the addition above may have set bit 255 again. */
	top = f[3U] >> (u32)63U;
	f[3U] = f[3U] & (u64)0x7fffffffffffffffU;
	add_scalar(f, f, (u64)19U * top);

	limb0 = f[0U];
	limb1 = f[1U];
	limb2 = f[2U];
	limb3 = f[3U];

	/* sub_mask is all-ones iff f >= p, computed without branches. */
	sub_mask = gte_mask(limb0, (u64)0xffffffffffffffedU) &
		   eq_mask(limb1, (u64)0xffffffffffffffffU) &
		   eq_mask(limb2, (u64)0xffffffffffffffffU) &
		   eq_mask(limb3, (u64)0x7fffffffffffffffU);

	/* Conditionally subtract p (the masked limbs of p) and store. */
	b[0U] = limb0 - (sub_mask & (u64)0xffffffffffffffedU);
	b[1U] = limb1 - (sub_mask & (u64)0xffffffffffffffffU);
	b[2U] = limb2 - (sub_mask & (u64)0xffffffffffffffffU);
	b[3U] = limb3 - (sub_mask & (u64)0x7fffffffffffffffU);
}
1211 
/* Serialize a projective x-only point (X : Z) held in i (8 limbs) to
 * the 32-byte little-endian output o: compute the affine coordinate
 * X / Z = X * Z^-1 and store it fully reduced modulo 2^255 - 19.
 */
static void encode_point(u8 *o, const u64 *i)
{
	const u64 *proj_x = i;
	const u64 *proj_z = i + (u32)4U;
	u64 affine_x[4U] = { 0U };
	u64 scratch[16U] = { 0U };

	finv(affine_x, proj_z, scratch);
	fmul(affine_x, affine_x, proj_x, scratch);
	store_felem((u64 *)o, affine_x);
}
1222 
/* X25519 with a variable base point: out = priv * pub, all three as
 * 32-byte little-endian strings.  Scalar clamping is implicit in
 * montgomery_ladder(), so priv is used as-is.
 */
static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
	u64 init1[8U] = { 0U };
	u64 tmp[4U] = { 0U };
	u64 tmp3;
	u64 *x;
	u64 *z;
	{
		/* Load the public value as four little-endian 64-bit limbs.
		 * NOTE(review): raw u64 load from a u8 buffer — relies on
		 * x86-64 tolerating unaligned accesses and being
		 * little-endian, which holds for this arch-specific file. */
		u32 i;
		for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
			u64 *os = tmp;
			const u8 *bj = pub + i * (u32)8U;
			u64 u = *(u64 *)bj;
			u64 r = u;
			u64 x0 = r;
			os[i] = x0;
		}
	}
	/* Mask off bit 255 of the public value (ignored by X25519). */
	tmp3 = tmp[3U];
	tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
	/* init1 = (x : 1) in projective x-only coordinates. */
	x = init1;
	z = init1 + (u32)4U;
	z[0U] = (u64)1U;
	z[1U] = (u64)0U;
	z[2U] = (u64)0U;
	z[3U] = (u64)0U;
	x[0U] = tmp[0U];
	x[1U] = tmp[1U];
	x[2U] = tmp[2U];
	x[3U] = tmp[3U];
	montgomery_ladder(init1, priv, init1);
	encode_point(out, init1);
}
1256 
1257 /* The below constants were generated using this sage script:
1258  *
1259  * #!/usr/bin/env sage
1260  * import sys
1261  * from sage.all import *
1262  * def limbs(n):
1263  * 	n = int(n)
1264  * 	l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
1265  * 	return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
1266  * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
1267  * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
1268  * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
1269  * print("static const u64 table_ladder[] = {")
1270  * p = ec.lift_x(9)
1271  * for i in range(252):
1272  * 	l = (p[0] + p[2]) / (p[0] - p[2])
1273  * 	print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
1274  * 	p = p * 2
1275  * print("};")
1276  *
1277  */
1278 
1279 static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
1280 
1281 static const u64 table_ladder[] = {
1282 	0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
1283 	0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
1284 	0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
1285 	0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
1286 	0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
1287 	0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
1288 	0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
1289 	0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
1290 	0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
1291 	0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
1292 	0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
1293 	0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
1294 	0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
1295 	0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
1296 	0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
1297 	0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
1298 	0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
1299 	0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
1300 	0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
1301 	0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
1302 	0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
1303 	0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
1304 	0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
1305 	0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
1306 	0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
1307 	0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
1308 	0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
1309 	0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
1310 	0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
1311 	0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
1312 	0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
1313 	0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
1314 	0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
1315 	0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
1316 	0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
1317 	0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
1318 	0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
1319 	0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
1320 	0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
1321 	0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
1322 	0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
1323 	0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
1324 	0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
1325 	0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
1326 	0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
1327 	0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
1328 	0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
1329 	0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
1330 	0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
1331 	0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
1332 	0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
1333 	0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
1334 	0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
1335 	0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
1336 	0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
1337 	0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
1338 	0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
1339 	0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
1340 	0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
1341 	0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
1342 	0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
1343 	0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
1344 	0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
1345 	0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
1346 	0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
1347 	0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
1348 	0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
1349 	0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
1350 	0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
1351 	0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
1352 	0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
1353 	0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
1354 	0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
1355 	0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
1356 	0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
1357 	0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
1358 	0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
1359 	0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
1360 	0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
1361 	0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
1362 	0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
1363 	0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
1364 	0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
1365 	0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
1366 	0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
1367 	0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
1368 	0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
1369 	0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
1370 	0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
1371 	0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
1372 	0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
1373 	0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
1374 	0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
1375 	0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
1376 	0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
1377 	0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
1378 	0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
1379 	0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
1380 	0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
1381 	0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
1382 	0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
1383 	0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
1384 	0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
1385 	0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
1386 	0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
1387 	0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
1388 	0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
1389 	0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
1390 	0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
1391 	0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
1392 	0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
1393 	0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
1394 	0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
1395 	0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
1396 	0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
1397 	0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
1398 	0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
1399 	0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
1400 	0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
1401 	0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
1402 	0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
1403 	0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
1404 	0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
1405 	0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
1406 	0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
1407 	0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
1408 	0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
1409 	0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
1410 	0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
1411 	0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
1412 	0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
1413 	0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
1414 	0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
1415 	0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
1416 	0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
1417 	0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
1418 	0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
1419 	0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
1420 	0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
1421 	0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
1422 	0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
1423 	0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
1424 	0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
1425 	0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
1426 	0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
1427 	0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
1428 	0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
1429 	0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
1430 	0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
1431 	0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
1432 	0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
1433 	0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
1434 	0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
1435 	0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
1436 	0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
1437 	0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
1438 	0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
1439 	0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
1440 	0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
1441 	0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
1442 	0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
1443 	0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
1444 	0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
1445 	0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
1446 	0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
1447 	0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
1448 	0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
1449 	0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
1450 	0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
1451 	0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
1452 	0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
1453 	0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
1454 	0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
1455 	0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
1456 	0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
1457 	0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
1458 	0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
1459 	0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
1460 	0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
1461 	0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
1462 	0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
1463 	0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
1464 	0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
1465 	0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
1466 	0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
1467 	0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
1468 	0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
1469 	0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
1470 	0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
1471 	0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
1472 	0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
1473 	0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
1474 	0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
1475 	0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
1476 	0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
1477 	0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
1478 	0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
1479 	0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
1480 	0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
1481 	0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
1482 	0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
1483 	0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
1484 	0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
1485 	0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
1486 	0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
1487 	0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
1488 	0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
1489 	0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
1490 	0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
1491 	0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
1492 	0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
1493 	0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
1494 	0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
1495 	0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
1496 	0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
1497 	0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
1498 	0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
1499 	0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
1500 	0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
1501 	0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
1502 	0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
1503 	0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
1504 	0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
1505 	0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
1506 	0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
1507 	0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
1508 	0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
1509 	0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
1510 	0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
1511 	0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
1512 	0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
1513 	0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
1514 	0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
1515 	0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
1516 	0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
1517 	0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
1518 	0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
1519 	0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
1520 	0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
1521 	0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
1522 	0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
1523 	0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
1524 	0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
1525 	0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
1526 	0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
1527 	0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
1528 	0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
1529 	0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
1530 	0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
1531 	0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
1532 	0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
1533 	0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
1534 };
1535 
/* X25519 with the fixed base point (x = 9): out = priv * G, sped up by
 * the precomputed table_ladder[] of (x + z)/(x - z) values for 2^k * G
 * (see the sage script above).  The ladder starts from points offset
 * via p_minus_s (x-coordinate of G - lift_x(1), per the same script).
 */
static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
	u64 swap = 1;
	int i, j, k;
	/* tmp layout: two working (X:Z) points (16 limbs), 32 limbs of
	 * scratch, then the clamped 4-limb key. */
	u64 tmp[16 + 32 + 4];
	u64 *x1 = &tmp[0];
	u64 *z1 = &tmp[4];
	u64 *x2 = &tmp[8];
	u64 *z2 = &tmp[12];
	u64 *xz1 = &tmp[0];
	u64 *xz2 = &tmp[8];
	u64 *a = &tmp[0 + 16];
	u64 *b = &tmp[4 + 16];
	u64 *c = &tmp[8 + 16];
	u64 *ab = &tmp[0 + 16];
	u64 *abcd = &tmp[0 + 16];
	u64 *ef = &tmp[16 + 16];
	u64 *efgh = &tmp[16 + 16];
	u64 *key = &tmp[0 + 16 + 32];

	/* Standard X25519 clamping of the private scalar. */
	memcpy(key, priv, 32);
	((u8 *)key)[0] &= 248;
	((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;

	/* Initial points: (x1:z1) = (1:1), (x2:z2) = (p_minus_s:1). */
	x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
	z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
	z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
	memcpy(x2, p_minus_s, sizeof(p_minus_s));

	/* Scan scalar bits 3..254 (bits 0-2 and 255 are fixed by the
	 * clamping above); k = bit - 3 indexes table_ladder entries
	 * 0..251. */
	j = 3;
	for (i = 0; i < 4; ++i) {
		while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
			u64 bit = (key[i] >> j) & 1;
			k = (64 * i + j - 3);
			/* Constant-time swap on bit transitions, as in a
			 * standard Montgomery ladder. */
			swap = swap ^ bit;
			cswap2(swap, xz1, xz2);
			swap = bit;
			/* One ladder step folded with the precomputed
			 * (x+z)/(x-z) ratio of 2^k * G. */
			fsub(b, x1, z1);
			fadd(a, x1, z1);
			fmul(c, &table_ladder[4 * k], b, ef);
			fsub(b, a, c);
			fadd(a, a, c);
			fsqr2(ab, ab, efgh);
			fmul2(xz1, xz2, ab, efgh);
			++j;
		}
		j = 0;
	}

	/* Account for the three clamped low bits: multiply by 8. */
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	point_double(xz1, abcd, efgh);
	encode_point(out, xz1);

	/* Wipe the clamped key and all intermediates from the stack. */
	memzero_explicit(tmp, sizeof(tmp));
}
1592 
/* Enabled once at init when the CPU has both BMI2 and ADX; selects the
 * assembly implementation over the generic C one in the entry points. */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2_adx);
1594 
/* Scalar multiplication of an arbitrary @basepoint by @secret,
 * dispatching to the BMI2/ADX assembly implementation when available
 * and to the portable generic implementation otherwise.
 */
void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
		     const u8 secret[CURVE25519_KEY_SIZE],
		     const u8 basepoint[CURVE25519_KEY_SIZE])
{
	if (!static_branch_likely(&curve25519_use_bmi2_adx)) {
		curve25519_generic(mypublic, secret, basepoint);
		return;
	}

	curve25519_ever64(mypublic, secret, basepoint);
}
EXPORT_SYMBOL(curve25519_arch);
1605 
/* Derive the public key for @secret (scalar multiplication by the
 * standard base point), using the assembly fixed-base ladder when the
 * CPU supports BMI2/ADX and the generic implementation otherwise.
 */
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
			  const u8 secret[CURVE25519_KEY_SIZE])
{
	if (!static_branch_likely(&curve25519_use_bmi2_adx)) {
		curve25519_generic(pub, secret, curve25519_base_point);
		return;
	}

	curve25519_ever64_base(pub, secret);
}
EXPORT_SYMBOL(curve25519_base_arch);
1615 
curve25519_set_secret(struct crypto_kpp * tfm,const void * buf,unsigned int len)1616 static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
1617 				 unsigned int len)
1618 {
1619 	u8 *secret = kpp_tfm_ctx(tfm);
1620 
1621 	if (!len)
1622 		curve25519_generate_secret(secret);
1623 	else if (len == CURVE25519_KEY_SIZE &&
1624 		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
1625 		memcpy(secret, buf, CURVE25519_KEY_SIZE);
1626 	else
1627 		return -EINVAL;
1628 	return 0;
1629 }
1630 
curve25519_generate_public_key(struct kpp_request * req)1631 static int curve25519_generate_public_key(struct kpp_request *req)
1632 {
1633 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1634 	const u8 *secret = kpp_tfm_ctx(tfm);
1635 	u8 buf[CURVE25519_KEY_SIZE];
1636 	int copied, nbytes;
1637 
1638 	if (req->src)
1639 		return -EINVAL;
1640 
1641 	curve25519_base_arch(buf, secret);
1642 
1643 	/* might want less than we've got */
1644 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1645 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1646 								nbytes),
1647 				     buf, nbytes);
1648 	if (copied != nbytes)
1649 		return -EINVAL;
1650 	return 0;
1651 }
1652 
curve25519_compute_shared_secret(struct kpp_request * req)1653 static int curve25519_compute_shared_secret(struct kpp_request *req)
1654 {
1655 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
1656 	const u8 *secret = kpp_tfm_ctx(tfm);
1657 	u8 public_key[CURVE25519_KEY_SIZE];
1658 	u8 buf[CURVE25519_KEY_SIZE];
1659 	int copied, nbytes;
1660 
1661 	if (!req->src)
1662 		return -EINVAL;
1663 
1664 	copied = sg_copy_to_buffer(req->src,
1665 				   sg_nents_for_len(req->src,
1666 						    CURVE25519_KEY_SIZE),
1667 				   public_key, CURVE25519_KEY_SIZE);
1668 	if (copied != CURVE25519_KEY_SIZE)
1669 		return -EINVAL;
1670 
1671 	curve25519_arch(buf, secret, public_key);
1672 
1673 	/* might want less than we've got */
1674 	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
1675 	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
1676 								nbytes),
1677 				     buf, nbytes);
1678 	if (copied != nbytes)
1679 		return -EINVAL;
1680 	return 0;
1681 }
1682 
/* KPP .max_size: public key and shared secret are always the fixed
 * 32-byte Curve25519 key size, regardless of the tfm. */
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}
1687 
/* Registration descriptor for the crypto KPP API; priority 200 ranks
 * this driver above the generic "curve25519" implementation. */
static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-x86",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	/* Per-tfm context is just the raw 32-byte private key. */
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_generate_public_key,
	.compute_shared_secret	= curve25519_compute_shared_secret,
	.max_size		= curve25519_max_size,
};
1700 
1701 
curve25519_mod_init(void)1702 static int __init curve25519_mod_init(void)
1703 {
1704 	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
1705 		static_branch_enable(&curve25519_use_bmi2_adx);
1706 	else
1707 		return 0;
1708 	return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
1709 		crypto_register_kpp(&curve25519_alg) : 0;
1710 }
1711 
curve25519_mod_exit(void)1712 static void __exit curve25519_mod_exit(void)
1713 {
1714 	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
1715 	    static_branch_likely(&curve25519_use_bmi2_adx))
1716 		crypto_unregister_kpp(&curve25519_alg);
1717 }
1718 
module_init(curve25519_mod_init);
module_exit(curve25519_mod_exit);

/* Aliases let the crypto subsystem autoload this module by algorithm
 * name or driver name. */
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-x86");
MODULE_DESCRIPTION("Curve25519 algorithm, ADX optimized");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
1727