xref: /linux/arch/arm64/crypto/ghash-ce-core.S (revision 370c3883195566ee3e7d79e0146c3d735a406573)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
4 *
5 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
6 */
7
8#include <linux/linkage.h>
9#include <asm/assembler.h>
10
	/*
	 * Register aliases for the GHASH code.  SHASH is loaded from h[0]
	 * and HH/HH3/HH4 from h[1..3] of the caller-supplied h[4][2] table
	 * (presumably successive powers of the hash key H — confirm against
	 * the C glue code).  SHASH2/HH34 hold the folded (hi ^ lo) halves
	 * used for the Karatsuba middle products.  Note IN1 deliberately
	 * overlaps XH (v7): the two are never live at the same time.
	 */
	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	XL2		.req	v8
	XM2		.req	v9
	XH2		.req	v10
	XL3		.req	v11
	XM3		.req	v12
	XH3		.req	v13
	TT3		.req	v14
	TT4		.req	v15
	HH		.req	v16
	HH3		.req	v17
	HH4		.req	v18
	HH34		.req	v19
33
	.text
	.arch		armv8-a+crypto

	/*
	 * Preload and precompute the per-call GHASH constants for the p64
	 * (PMULL-based) code paths.  On entry x3 points at h[4][2]; SHASH
	 * (= h[0]) must already be loaded.  Loads HH/HH3/HH4 from h[1..3],
	 * derives the folded Karatsuba operands SHASH2 and HH34, and builds
	 * the reduction constant in MASK.
	 */
	.macro		__pmull_pre_p64
	add		x8, x3, #16
	ld1		{HH.2d-HH4.2d}, [x8]		// h[1]..h[3]

	// SHASH2 = {SHASH.lo ^ SHASH.hi, HH.lo ^ HH.hi}
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	// HH34 = {HH3.lo ^ HH3.hi, HH4.lo ^ HH4.hi}
	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	// MASK = 0xe1 << 57 in each doubleword: the GHASH field polynomial
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm
52
	/*
	 * Reduce the 256-bit product accumulated in XL (low), XH (high) and
	 * XM (middle, minus the T1 correction applied below) modulo the
	 * GHASH polynomial held in MASK.  Result is left in XL/XM/T2 for the
	 * caller's final folding (callers follow this with
	 * eor T2, T2, XH / eor XL, XL, T2).  Clobbers T2.
	 */
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b		// fold in Karatsuba correction

	mov		XH.d[0], XM.d[1]		// redistribute the middle
	mov		XM.d[1], XL.d[0]		// 128 bits across XH:XM

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d		// second reduction step
	.endm
64
65	/*
66	 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
67	 *			       u64 const h[4][2], const char *head)
68	 */
69SYM_FUNC_START(pmull_ghash_update_p64)
70	ld1		{SHASH.2d}, [x3]
71	ld1		{XL.2d}, [x1]
72
73	__pmull_pre_p64
74
75	/* do the head block first, if supplied */
76	cbz		x4, 0f
77	ld1		{T1.2d}, [x4]
78	mov		x4, xzr
79	b		3f
80
810:
82	tbnz		w0, #0, 2f		// skip until #blocks is a
83	tbnz		w0, #1, 2f		// round multiple of 4
84
851:	ld1		{XM3.16b-TT4.16b}, [x2], #64
86
87	sub		w0, w0, #4
88
89	rev64		T1.16b, XM3.16b
90	rev64		T2.16b, XH3.16b
91	rev64		TT4.16b, TT4.16b
92	rev64		TT3.16b, TT3.16b
93
94	ext		IN1.16b, TT4.16b, TT4.16b, #8
95	ext		XL3.16b, TT3.16b, TT3.16b, #8
96
97	eor		TT4.16b, TT4.16b, IN1.16b
98	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
99	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
100	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
101
102	eor		TT3.16b, TT3.16b, XL3.16b
103	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
104	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
105	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
106
107	ext		IN1.16b, T2.16b, T2.16b, #8
108	eor		XL2.16b, XL2.16b, XL3.16b
109	eor		XH2.16b, XH2.16b, XH3.16b
110	eor		XM2.16b, XM2.16b, XM3.16b
111
112	eor		T2.16b, T2.16b, IN1.16b
113	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
114	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
115	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
116
117	eor		XL2.16b, XL2.16b, XL3.16b
118	eor		XH2.16b, XH2.16b, XH3.16b
119	eor		XM2.16b, XM2.16b, XM3.16b
120
121	ext		IN1.16b, T1.16b, T1.16b, #8
122	ext		TT3.16b, XL.16b, XL.16b, #8
123	eor		XL.16b, XL.16b, IN1.16b
124	eor		T1.16b, T1.16b, TT3.16b
125
126	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
127	eor		T1.16b, T1.16b, XL.16b
128	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
129	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
130
131	eor		XL.16b, XL.16b, XL2.16b
132	eor		XH.16b, XH.16b, XH2.16b
133	eor		XM.16b, XM.16b, XM2.16b
134
135	eor		T2.16b, XL.16b, XH.16b
136	ext		T1.16b, XL.16b, XH.16b, #8
137	eor		XM.16b, XM.16b, T2.16b
138
139	__pmull_reduce_p64
140
141	eor		T2.16b, T2.16b, XH.16b
142	eor		XL.16b, XL.16b, T2.16b
143
144	cbz		w0, 5f
145	b		1b
146
1472:	ld1		{T1.2d}, [x2], #16
148	sub		w0, w0, #1
149
1503:	/* multiply XL by SHASH in GF(2^128) */
151CPU_LE(	rev64		T1.16b, T1.16b	)
152
153	ext		T2.16b, XL.16b, XL.16b, #8
154	ext		IN1.16b, T1.16b, T1.16b, #8
155	eor		T1.16b, T1.16b, T2.16b
156	eor		XL.16b, XL.16b, IN1.16b
157
158	pmull2		XH.1q, XL.2d, SHASH.2d		// a1 * b1
159	eor		T1.16b, T1.16b, XL.16b
160	pmull		XL.1q, XL.1d, SHASH.1d		// a0 * b0
161	pmull		XM.1q, T1.1d, SHASH2.1d		// (a1 + a0)(b1 + b0)
162
1634:	eor		T2.16b, XL.16b, XH.16b
164	ext		T1.16b, XL.16b, XH.16b, #8
165	eor		XM.16b, XM.16b, T2.16b
166
167	__pmull_reduce_p64
168
169	eor		T2.16b, T2.16b, XH.16b
170	eor		XL.16b, XL.16b, T2.16b
171
172	cbnz		w0, 0b
173
1745:	st1		{XL.2d}, [x1]
175	ret
176SYM_FUNC_END(pmull_ghash_update_p64)
177
	/*
	 * Additional register aliases for the AES-GCM code.  KS0-KS3 hold
	 * four CTR keystream blocks, INP0-INP3 the four data blocks in
	 * flight, and K0-K9/KK/KL/KM the AES round keys.  Note KS0-KS3 and
	 * K4-K7 reuse registers aliased above (XL2 etc.) — the GHASH and
	 * AES phases of each iteration do not keep them live simultaneously.
	 */
	KS0		.req	v8
	KS1		.req	v9
	KS2		.req	v10
	KS3		.req	v11

	INP0		.req	v21
	INP1		.req	v22
	INP2		.req	v23
	INP3		.req	v24

	K0		.req	v25
	K1		.req	v26
	K2		.req	v27
	K3		.req	v28
	K4		.req	v12
	K5		.req	v13
	K6		.req	v4
	K7		.req	v5
	K8		.req	v14
	K9		.req	v15
	KK		.req	v29
	KL		.req	v30
	KM		.req	v31
201
	/*
	 * Load AES round keys from \rk into K0-K5 and the final three
	 * (KK/KL/KM) for a key schedule of \rounds rounds.  K6-K9 are
	 * loaded lazily by the users, as their location depends on the
	 * key size.  Clobbers \tmp.
	 */
	.macro		load_round_keys, rounds, rk, tmp
	add		\tmp, \rk, #64
	ld1		{K0.4s-K3.4s}, [\rk]
	ld1		{K4.4s-K5.4s}, [\tmp]
	add		\tmp, \rk, \rounds, lsl #4	// 16 bytes per round key
	sub		\tmp, \tmp, #32			// last-but-two round key
	ld1		{KK.4s-KM.4s}, [\tmp]
	.endm
210
	/* One AES encryption round (AddRoundKey+SubBytes+ShiftRows, MixColumns) */
	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm
215
	/* One AES round applied to four independent states (4-way interleave) */
	.macro		enc_qround, s0, s1, s2, s3, key
	enc_round	\s0, \key
	enc_round	\s1, \key
	enc_round	\s2, \key
	enc_round	\s3, \key
	.endm
222
223	.macro		enc_block, state, rounds, rk, tmp
224	add		\tmp, \rk, #96
225	ld1		{K6.4s-K7.4s}, [\tmp], #32
226	.irp		key, K0, K1, K2, K3, K4 K5
227	enc_round	\state, \key
228	.endr
229
230	tbnz		\rounds, #2, .Lnot128_\@
231.Lout256_\@:
232	enc_round	\state, K6
233	enc_round	\state, K7
234
235.Lout192_\@:
236	enc_round	\state, KK
237	aese		\state\().16b, KL.16b
238	eor		\state\().16b, \state\().16b, KM.16b
239
240	.subsection	1
241.Lnot128_\@:
242	ld1		{K8.4s-K9.4s}, [\tmp], #32
243	enc_round	\state, K6
244	enc_round	\state, K7
245	ld1		{K6.4s-K7.4s}, [\tmp]
246	enc_round	\state, K8
247	enc_round	\state, K9
248	tbz		\rounds, #1, .Lout192_\@
249	b		.Lout256_\@
250	.previous
251	.endm
252
	.align		6
	/*
	 * Body shared by pmull_gcm_encrypt/decrypt (\enc selects direction).
	 * Register arguments (see the prototypes at the SYM_FUNC_START
	 * sites): x0 = bytes, x1 = dst, x2 = src, x3 = h[4][2], x4 = dg[],
	 * x5 = ctr[], x6 = rk[], x7 = rounds; further args on the stack.
	 * Processes up to 4 blocks per iteration: CTR-encrypt with
	 * pmull_gcm_enc_4x, GHASH the ciphertext with pmull_gcm_ghash_4x
	 * (before encryption when decrypting, after when encrypting), then
	 * computes/verifies the tag if requested.
	 */
	.macro		pmull_gcm_do_crypt, enc
	frame_push	1

	load_round_keys	x7, x6, x8

	ld1		{SHASH.2d}, [x3], #16		// h[0]
	ld1		{HH.2d-HH4.2d}, [x3]		// h[1]..h[3]

	// fold SHASH/HH and HH3/HH4 for the Karatsuba middle products
	trn1		SHASH2.2d, SHASH.2d, HH.2d
	trn2		T1.2d, SHASH.2d, HH.2d
	eor		SHASH2.16b, SHASH2.16b, T1.16b

	trn1		HH34.2d, HH3.2d, HH4.2d
	trn2		T1.2d, HH3.2d, HH4.2d
	eor		HH34.16b, HH34.16b, T1.16b

	ld1		{XL.2d}, [x4]			// current GHASH digest

	cbz		x0, 3f				// tag only?

	ldr		w8, [x5, #12]			// load lower counter
CPU_LE(	rev		w8, w8		)

0:	mov		w9, #4				// max blocks per round
	add		x10, x0, #0xf
	lsr		x10, x10, #4			// remaining blocks

	subs		x0, x0, #64
	csel		w9, w10, w9, mi			// final partial pass: w9 = remaining
	add		w8, w8, w9			// advance lower counter

	bmi		1f				// < 64 bytes left: tail loads
	ld1		{INP0.16b-INP3.16b}, [x2], #64
	.subsection	1
	/*
	 * Populate the four input registers right to left with up to 63 bytes
	 * of data, using overlapping loads to avoid branches.
	 *
	 *                INP0     INP1     INP2     INP3
	 *  1 byte     |        |        |        |x       |
	 * 16 bytes    |        |        |        |xxxxxxxx|
	 * 17 bytes    |        |        |xxxxxxxx|x       |
	 * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
	 * etc etc
	 *
	 * Note that this code may read up to 15 bytes before the start of
	 * the input. It is up to the calling code to ensure this is safe if
	 * this happens in the first iteration of the loop (i.e., when the
	 * input size is < 16 bytes)
	 */
1:	mov		x15, #16
	ands		x19, x0, #0xf			// x19 = bytes in final block
	csel		x19, x19, x15, ne		// (16 if an exact multiple)
	adr_l		x17, .Lpermute_table + 16

	sub		x11, x15, x19			// bytes of padding needed
	add		x12, x17, x11
	sub		x17, x17, x11
	ld1		{T1.16b}, [x12]			// right-align permute vector
	sub		x10, x1, x11			// rewound dst/src, used only
	sub		x11, x2, x11			// when total input < 16 bytes

	cmp		x0, #-16
	csel		x14, x15, xzr, gt		// post-increment for INP0
	cmp		x0, #-32
	csel		x15, x15, xzr, gt		// post-increment for INP1
	cmp		x0, #-48
	csel		x16, x19, xzr, gt		// post-increment for INP2
	csel		x1, x1, x10, gt
	csel		x2, x2, x11, gt

	ld1		{INP0.16b}, [x2], x14
	ld1		{INP1.16b}, [x2], x15
	ld1		{INP2.16b}, [x2], x16
	ld1		{INP3.16b}, [x2]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// shift tail into position
	b		2f
	.previous

2:	.if		\enc == 0
	bl		pmull_gcm_ghash_4x		// decrypt: hash ciphertext first
	.endif

	bl		pmull_gcm_enc_4x

	tbnz		x0, #63, 6f			// partial final blocks?
	st1		{INP0.16b-INP3.16b}, [x1], #64
	.if		\enc == 1
	bl		pmull_gcm_ghash_4x		// encrypt: hash ciphertext after
	.endif
	bne		0b				// flags still from 'subs' above

3:	ldr		x10, [sp, #.Lframe_local_offset]
	cbz		x10, 5f				// output tag?

	ld1		{INP3.16b}, [x10]		// load lengths[]
	mov		w9, #1
	bl		pmull_gcm_ghash_4x		// hash the length block

	mov		w11, #(0x1 << 24)		// BE '1U'
	ld1		{KS0.16b}, [x5]
	mov		KS0.s[3], w11			// J0 = IV || BE(1)

	enc_block	KS0, x7, x6, x12		// E(K, J0)

	ext		XL.16b, XL.16b, XL.16b, #8
	rev64		XL.16b, XL.16b			// digest back to byte order
	eor		XL.16b, XL.16b, KS0.16b		// tag = GHASH ^ E(K, J0)

	.if		\enc == 1
	st1		{XL.16b}, [x10]			// store tag
	.else
	ldp		x11, x12, [sp, #40]		// load tag pointer and authsize
	adr_l		x17, .Lpermute_table
	ld1		{KS0.16b}, [x11]		// load supplied tag
	add		x17, x17, x12
	ld1		{KS1.16b}, [x17]		// load permute vector

	cmeq		XL.16b, XL.16b, KS0.16b		// compare tags
	mvn		XL.16b, XL.16b			// -1 for fail, 0 for pass
	tbl		XL.16b, {XL.16b}, KS1.16b	// keep authsize bytes only
	sminv		b0, XL.16b			// signed minimum across XL
	smov		w0, v0.b[0]			// return b0
	.endif

4:	frame_pop
	ret

5:
CPU_LE(	rev		w8, w8		)
	str		w8, [x5, #12]			// store lower counter
	st1		{XL.2d}, [x4]			// store digest
	b		4b

	/* store up to 63 bytes of output from the final iteration */
6:	ld1		{T1.16b-T2.16b}, [x17], #32	// permute vectors
	sub		x17, x17, x19, lsl #1

	cmp		w9, #1
	beq		7f				// single (partial) block?
	.subsection	1
7:	ld1		{INP2.16b}, [x1]		// merge with existing dst bytes
	tbx		INP2.16b, {INP3.16b}, T1.16b
	mov		INP3.16b, INP2.16b
	b		8f
	.previous

	st1		{INP0.16b}, [x1], x14
	st1		{INP1.16b}, [x1], x15
	st1		{INP2.16b}, [x1], x16
	tbl		INP3.16b, {INP3.16b}, T1.16b	// align final block
	tbx		INP3.16b, {INP2.16b}, T2.16b	// keep preceding dst bytes
8:	st1		{INP3.16b}, [x1]

	.if		\enc == 1
	ld1		{T1.16b}, [x17]
	tbl		INP3.16b, {INP3.16b}, T1.16b	// clear non-data bits
	bl		pmull_gcm_ghash_4x
	.endif
	b		3b
	.endm
414
415	/*
416	 * void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
417	 *			  u64 const h[4][2], u64 dg[], u8 ctr[],
418	 *			  u32 const rk[], int rounds, u8 tag[])
419	 */
420SYM_FUNC_START(pmull_gcm_encrypt)
421	pmull_gcm_do_crypt	1
422SYM_FUNC_END(pmull_gcm_encrypt)
423
424	/*
425	 * int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
426	 *			 u64 const h[4][2], u64 dg[], u8 ctr[],
427	 *			 u32 const rk[], int rounds, const u8 l[],
428	 *			 const u8 tag[], u64 authsize)
429	 */
430SYM_FUNC_START(pmull_gcm_decrypt)
431	pmull_gcm_do_crypt	0
432SYM_FUNC_END(pmull_gcm_decrypt)
433
/*
 * Fold up to four blocks (INP0-INP3, count in w9, right-aligned in INP3
 * for partial batches) into the GHASH digest in XL, using h^4..h^1 so
 * all four products can be combined before a single reduction.
 * Preserves w9 and the INPx registers; clobbers T1/T2/TT3/TT4 and the
 * X*/X*2 accumulators.  Flags are not modified (callers rely on this
 * across the bl).
 */
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
	movi		MASK.16b, #0xe1			// field polynomial
	shl		MASK.2d, MASK.2d, #57

	rev64		T1.16b, INP0.16b		// byte-swap into GHASH order
	rev64		T2.16b, INP1.16b
	rev64		TT3.16b, INP2.16b
	rev64		TT4.16b, INP3.16b

	ext		XL.16b, XL.16b, XL.16b, #8

	tbz		w9, #2, 0f			// <4 blocks?
	.subsection	1
	/* <4 blocks: zero the accumulators and enter at the right rung */
0:	movi		XH2.16b, #0
	movi		XM2.16b, #0
	movi		XL2.16b, #0

	tbz		w9, #0, 1f			// 2 blocks?
	tbz		w9, #1, 2f			// 1 block?

	eor		T2.16b, T2.16b, XL.16b		// 3 blocks: digest into INP1
	ext		T1.16b, T2.16b, T2.16b, #8
	b		.Lgh3

1:	eor		TT3.16b, TT3.16b, XL.16b	// 2 blocks: digest into INP2
	ext		T2.16b, TT3.16b, TT3.16b, #8
	b		.Lgh2

2:	eor		TT4.16b, TT4.16b, XL.16b	// 1 block: digest into INP3
	ext		IN1.16b, TT4.16b, TT4.16b, #8
	b		.Lgh1
	.previous

	/* full batch: (digest ^ INP0) x h^4 */
	eor		T1.16b, T1.16b, XL.16b
	ext		IN1.16b, T1.16b, T1.16b, #8

	pmull2		XH2.1q, HH4.2d, IN1.2d		// a1 * b1
	eor		T1.16b, T1.16b, IN1.16b
	pmull		XL2.1q, HH4.1d, IN1.1d		// a0 * b0
	pmull2		XM2.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)

	ext		T1.16b, T2.16b, T2.16b, #8
.Lgh3:	eor		T2.16b, T2.16b, T1.16b		// INP1 x h^3
	pmull2		XH.1q, HH3.2d, T1.2d		// a1 * b1
	pmull		XL.1q, HH3.1d, T1.1d		// a0 * b0
	pmull		XM.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:	eor		TT3.16b, TT3.16b, T2.16b	// INP2 x h^2
	pmull2		XH.1q, HH.2d, T2.2d		// a1 * b1
	pmull		XL.1q, HH.1d, T2.1d		// a0 * b0
	pmull2		XM.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)

	eor		XH2.16b, XH2.16b, XH.16b
	eor		XL2.16b, XL2.16b, XL.16b
	eor		XM2.16b, XM2.16b, XM.16b

	ext		IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:	eor		TT4.16b, TT4.16b, IN1.16b	// INP3 x h^1
	pmull		XL.1q, SHASH.1d, IN1.1d		// a0 * b0
	pmull2		XH.1q, SHASH.2d, IN1.2d		// a1 * b1
	pmull		XM.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)

	eor		XH.16b, XH.16b, XH2.16b		// combine partial products
	eor		XL.16b, XL.16b, XL2.16b
	eor		XM.16b, XM.16b, XM2.16b

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p64

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b		// final folded digest in XL

	ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
516
/*
 * Generate four blocks of AES-CTR keystream and XOR them into
 * INP0-INP3.  x5 points at the counter block (upper 96 bits = IV),
 * w8 is the lower counter value AFTER this batch (the caller already
 * added the block count), x6 = round keys, x7 = rounds.  The four AES
 * states are interleaved so the aese/aesmc pipelines stay busy.
 * Does not modify flags (the caller's 'bne' depends on this).
 */
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
	ld1		{KS0.16b}, [x5]			// load upper counter
	sub		w10, w8, #4			// reconstruct the four
	sub		w11, w8, #3			// counter values for this
	sub		w12, w8, #2			// batch from the advanced
	sub		w13, w8, #1			// value in w8
	rev		w10, w10			// CTR counter is big-endian
	rev		w11, w11
	rev		w12, w12
	rev		w13, w13
	mov		KS1.16b, KS0.16b
	mov		KS2.16b, KS0.16b
	mov		KS3.16b, KS0.16b
	ins		KS0.s[3], w10			// set lower counter
	ins		KS1.s[3], w11
	ins		KS2.s[3], w12
	ins		KS3.s[3], w13

	add		x10, x6, #96			// round key pointer
	ld1		{K6.4s-K7.4s}, [x10], #32
	.irp		key, K0, K1, K2, K3, K4, K5
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

	tbnz		x7, #2, .Lnot128		// rounds & 4 => 192/256
	.subsection	1
.Lnot128:
	ld1		{K8.4s-K9.4s}, [x10], #32
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	ld1		{K6.4s-K7.4s}, [x10]		// remaining round keys
	.irp		key, K8, K9
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr
	tbz		x7, #1, .Lout192		// rounds & 2 clear => 192
	b		.Lout256
	.previous

.Lout256:
	.irp		key, K6, K7
	enc_qround	KS0, KS1, KS2, KS3, \key
	.endr

.Lout192:
	enc_qround	KS0, KS1, KS2, KS3, KK

	aese		KS0.16b, KL.16b			// final round: no MixColumns
	aese		KS1.16b, KL.16b
	aese		KS2.16b, KL.16b
	aese		KS3.16b, KL.16b

	eor		KS0.16b, KS0.16b, KM.16b	// add the last round key
	eor		KS1.16b, KS1.16b, KM.16b
	eor		KS2.16b, KS2.16b, KM.16b
	eor		KS3.16b, KS3.16b, KM.16b

	eor		INP0.16b, INP0.16b, KS0.16b	// XOR keystream into data
	eor		INP1.16b, INP1.16b, KS1.16b
	eor		INP2.16b, INP2.16b, KS2.16b
	eor		INP3.16b, INP3.16b, KS3.16b

	ret
SYM_FUNC_END(pmull_gcm_enc_4x)
581
	/*
	 * Sliding-window permute table for tbl/tbx: 16 bytes of 0xff
	 * (tbl produces zero for these indices), the identity permutation
	 * 0x0-0xf, then the same pattern repeated.  Indexing at
	 * (.Lpermute_table + 16) +/- n yields vectors that shift or mask a
	 * block by n bytes, as used by the partial-block load/store paths.
	 */
	.section	".rodata", "a"
	.align		6
.Lpermute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.previous
594