/* xref: /linux/lib/crypto/arm/ghash-neon-core.S (revision 71e59795c9f65a30416ed719b4b4da585df3903a) */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON vmull.p8 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

	.fpu		neon

	@ Quadword aliases for the GHASH state and inputs.
	SHASH		.req	q0	@ hash key h
	T1		.req	q1	@ scratch / current input block
	XL		.req	q2	@ running digest (low product half)
	XM		.req	q3	@ Karatsuba middle product
	XH		.req	q4	@ high product half
	IN1		.req	q4	@ aliases q4 (same register as XH)

	@ Doubleword (64-bit half) views of the registers above.
	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	@ Scratch registers clobbered by __pmull_p8.
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9

	@ Byte-rotated copies of the key halves, precomputed in
	@ pmull_ghash_update_p8 so __pmull_p8 can skip its vext.8 rotations.
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	SHASH2_p8	.req	d28	@ SHASH_L ^ SHASH_H (Karatsuba middle-term key)

	@ Constant masks used by __pmull_p8 to trim partial products.
	k16		.req	d29	@ 0x000000000000ffff per 64-bit lane
	k32		.req	d30	@ 0x00000000ffffffff per 64-bit lane
	k48		.req	d31	@ 0x0000ffffffffffff per 64-bit lane

	T2		.req	q7	@ aliases q7 (same register as t2q)

	.text
66
67	/*
68	 * This implementation of 64x64 -> 128 bit polynomial multiplication
69	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
70	 * "Fast Software Polynomial Multiplication on ARM Processors Using
71	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
72	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
73	 *
74	 * It has been slightly tweaked for in-order performance, and to allow
75	 * 'rq' to overlap with 'ad' or 'bd'.
76	 */
77	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
78	vext.8		t0l, \ad, \ad, #1	@ A1
79	.ifc		\b1, t4l
80	vext.8		t4l, \bd, \bd, #1	@ B1
81	.endif
82	vmull.p8	t0q, t0l, \bd		@ F = A1*B
83	vext.8		t1l, \ad, \ad, #2	@ A2
84	vmull.p8	t4q, \ad, \b1		@ E = A*B1
85	.ifc		\b2, t3l
86	vext.8		t3l, \bd, \bd, #2	@ B2
87	.endif
88	vmull.p8	t1q, t1l, \bd		@ H = A2*B
89	vext.8		t2l, \ad, \ad, #3	@ A3
90	vmull.p8	t3q, \ad, \b2		@ G = A*B2
91	veor		t0q, t0q, t4q		@ L = E + F
92	.ifc		\b3, t4l
93	vext.8		t4l, \bd, \bd, #3	@ B3
94	.endif
95	vmull.p8	t2q, t2l, \bd		@ J = A3*B
96	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
97	veor		t1q, t1q, t3q		@ M = G + H
98	.ifc		\b4, t3l
99	vext.8		t3l, \bd, \bd, #4	@ B4
100	.endif
101	vmull.p8	t4q, \ad, \b3		@ I = A*B3
102	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
103	vmull.p8	t3q, \ad, \b4		@ K = A*B4
104	vand		t0h, t0h, k48
105	vand		t1h, t1h, k32
106	veor		t2q, t2q, t4q		@ N = I + J
107	veor		t0l, t0l, t0h
108	veor		t1l, t1l, t1h
109	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
110	vand		t2h, t2h, k16
111	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
112	vmov.i64	t3h, #0
113	vext.8		t0q, t0q, t0q, #15
114	veor		t2l, t2l, t2h
115	vext.8		t1q, t1q, t1q, #14
116	vmull.p8	\rq, \ad, \bd		@ D = A*B
117	vext.8		t2q, t2q, t2q, #13
118	vext.8		t3q, t3q, t3q, #12
119	veor		t0q, t0q, t1q
120	veor		t2q, t2q, t3q
121	veor		\rq, \rq, t0q
122	veor		\rq, \rq, t2q
123	.endm
124
125	.macro		__pmull_reduce_p8
126	veor		XL_H, XL_H, XM_L
127	veor		XH_L, XH_L, XM_H
128
129	vshl.i64	T1, XL, #57
130	vshl.i64	T2, XL, #62
131	veor		T1, T1, T2
132	vshl.i64	T2, XL, #63
133	veor		T1, T1, T2
134	veor		XL_H, XL_H, T1_L
135	veor		XH_L, XH_L, T1_H
136
137	vshr.u64	T1, XL, #1
138	veor		XH, XH, XL
139	veor		XL, XL, T1
140	vshr.u64	T1, T1, #6
141	vshr.u64	XL, XL, #1
142	.endm
143
144	.macro		vrev64_if_be	a
145#ifdef CONFIG_CPU_BIG_ENDIAN
146	vrev64.8	\a, \a
147#endif
148	.endm
149
150	.macro		ghash_update
151	vld1.64		{XL}, [r1]
152	vrev64_if_be	XL
153
1540:
155	vld1.8		{T1}, [r2]!
156	subs		r0, r0, #1
157
158	/* multiply XL by SHASH in GF(2^128) */
159	vrev64.8	T1, T1
160
161	vext.8		IN1, T1, T1, #8
162	veor		T1_L, T1_L, XL_H
163	veor		XL, XL, IN1
164
165	__pmull_p8	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
166	veor		T1, T1, XL
167	__pmull_p8	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
168	__pmull_p8	XM, T1_L, SHASH2_p8			@ (a1+a0)(b1+b0)
169
170	veor		T1, XL, XH
171	veor		XM, XM, T1
172
173	__pmull_reduce_p8
174
175	veor		T1, T1, XH
176	veor		XL, XL, T1
177
178	bne		0b
179	.endm
180
181	/*
182	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
183	 *			      const u8 *src,
184	 *			      const struct polyval_elem *h)
185	 */
186ENTRY(pmull_ghash_update_p8)
187	vld1.64		{SHASH}, [r3]
188	vrev64_if_be	SHASH
189	veor		SHASH2_p8, SHASH_L, SHASH_H
190
191	vext.8		s1l, SHASH_L, SHASH_L, #1
192	vext.8		s2l, SHASH_L, SHASH_L, #2
193	vext.8		s3l, SHASH_L, SHASH_L, #3
194	vext.8		s4l, SHASH_L, SHASH_L, #4
195	vext.8		s1h, SHASH_H, SHASH_H, #1
196	vext.8		s2h, SHASH_H, SHASH_H, #2
197	vext.8		s3h, SHASH_H, SHASH_H, #3
198	vext.8		s4h, SHASH_H, SHASH_H, #4
199
200	vmov.i64	k16, #0xffff
201	vmov.i64	k32, #0xffffffff
202	vmov.i64	k48, #0xffffffffffff
203
204	ghash_update
205	vrev64_if_be	XL
206	vst1.64		{XL}, [r1]
207
208	bx		lr
209ENDPROC(pmull_ghash_update_p8)
210