// xref: /linux/lib/crypto/arm64/sm3-neon-core.S (revision 9f69f52b462cdaed83b782d0408ce9286f054f92)
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * sm3-neon-core.S - SM3 secure hash using NEON instructions
 *
 * Linux/arm64 port of the libgcrypt SM3 implementation for AArch64
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

14/* Context structure */
15
16#define state_h0 0
17#define state_h1 4
18#define state_h2 8
19#define state_h3 12
20#define state_h4 16
21#define state_h5 20
22#define state_h6 24
23#define state_h7 28
24
25/* Stack structure */
26
27#define STACK_W_SIZE        (32 * 2 * 3)
28
29#define STACK_W             (0)
30#define STACK_SIZE          (STACK_W + STACK_W_SIZE)
31
32/* Register macros */
33
34#define RSTATE x0
35#define RDATA  x1
36#define RNBLKS x2
37#define RKPTR  x28
38#define RFRAME x29
39
40#define ra w3
41#define rb w4
42#define rc w5
43#define rd w6
44#define re w7
45#define rf w8
46#define rg w9
47#define rh w10
48
49#define t0 w11
50#define t1 w12
51#define t2 w13
52#define t3 w14
53#define t4 w15
54#define t5 w16
55#define t6 w17
56
57#define k_even w19
58#define k_odd w20
59
60#define addr0 x21
61#define addr1 x22
62
63#define s0 w23
64#define s1 w24
65#define s2 w25
66#define s3 w26
67
68#define W0 v0
69#define W1 v1
70#define W2 v2
71#define W3 v3
72#define W4 v4
73#define W5 v5
74
75#define XTMP0 v6
76#define XTMP1 v7
77#define XTMP2 v16
78#define XTMP3 v17
79#define XTMP4 v18
80#define XTMP5 v19
81#define XTMP6 v20
82
83/* Helper macros. */
84
85#define _(...) /*_*/
86
87#define clear_vec(x) \
88	movi	x.8h, #0;
89
90#define rolw(o, a, n) \
91	ror	o, a, #(32 - n);
92
93/* Round function macros. */
94
95#define GG1_1(x, y, z, o, t) \
96	eor	o, x, y;
97#define GG1_2(x, y, z, o, t) \
98	eor	o, o, z;
99#define GG1_3(x, y, z, o, t)
100
101#define FF1_1(x, y, z, o, t) GG1_1(x, y, z, o, t)
102#define FF1_2(x, y, z, o, t)
103#define FF1_3(x, y, z, o, t) GG1_2(x, y, z, o, t)
104
105#define GG2_1(x, y, z, o, t) \
106	bic	o, z, x;
107#define GG2_2(x, y, z, o, t) \
108	and	t, y, x;
109#define GG2_3(x, y, z, o, t) \
110	eor	o, o, t;
111
112#define FF2_1(x, y, z, o, t) \
113	eor	o, x, y;
114#define FF2_2(x, y, z, o, t) \
115	and	t, x, y; \
116	and	o, o, z;
117#define FF2_3(x, y, z, o, t) \
118	eor	o, o, t;
119
120#define R(i, a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
121	K_LOAD(round);                                                        \
122	ldr	t5, [sp, #(wtype##_W1_ADDR(round, widx))];                    \
123	rolw(t0, a, 12);                              /* rol(a, 12) => t0 */  \
124      IOP(1, iop_param);                                                      \
125	FF##i##_1(a, b, c, t1, t2);                                           \
126	ldr	t6, [sp, #(wtype##_W1W2_ADDR(round, widx))];                  \
127	add	k, k, e;                                                      \
128      IOP(2, iop_param);                                                      \
129	GG##i##_1(e, f, g, t3, t4);                                           \
130	FF##i##_2(a, b, c, t1, t2);                                           \
131      IOP(3, iop_param);                                                      \
132	add	k, k, t0;                                                     \
133	add	h, h, t5;                                                     \
134	add	d, d, t6;                     /* w1w2 + d => d */             \
135      IOP(4, iop_param);                                                      \
136	rolw(k, k, 7);                        /* rol (t0 + e + t), 7) => k */ \
137	GG##i##_2(e, f, g, t3, t4);                                           \
138	add	h, h, k;                      /* h + w1 + k => h */           \
139      IOP(5, iop_param);                                                      \
140	FF##i##_3(a, b, c, t1, t2);                                           \
141	eor	t0, t0, k;                    /* k ^ t0 => t0 */              \
142	GG##i##_3(e, f, g, t3, t4);                                           \
143	add	d, d, t1;                     /* FF(a,b,c) + d => d */        \
144      IOP(6, iop_param);                                                      \
145	add	t3, t3, h;                    /* GG(e,f,g) + h => t3 */       \
146	rolw(b, b, 9);                        /* rol(b, 9) => b */            \
147	eor	h, t3, t3, ror #(32-9);                                       \
148      IOP(7, iop_param);                                                      \
149	add	d, d, t0;                     /* t0 + d => d */               \
150	rolw(f, f, 19);                       /* rol(f, 19) => f */           \
151      IOP(8, iop_param);                                                      \
152	eor	h, h, t3, ror #(32-17);       /* P0(t3) => h */
153
154#define R1(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
155	R(1, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
156
157#define R2(a, b, c, d, e, f, g, h, k, K_LOAD, round, widx, wtype, IOP, iop_param) \
158	R(2, ##a, ##b, ##c, ##d, ##e, ##f, ##g, ##h, ##k, K_LOAD, round, widx, wtype, IOP, iop_param)
159
160#define KL(round) \
161	ldp	k_even, k_odd, [RKPTR, #(4*(round))];
162
163/* Input expansion macros. */
164
165/* Byte-swapped input address. */
166#define IW_W_ADDR(round, widx, offs) \
167	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))
168
169/* Expanded input address. */
170#define XW_W_ADDR(round, widx, offs) \
171	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))
172
173/* Rounds 1-12, byte-swapped input block addresses. */
174#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 32)
175#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 48)
176
177/* Rounds 1-12, expanded input block addresses. */
178#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
179#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 16)
180
181/* Input block loading.
182 * Interleaving within round function needed for in-order CPUs. */
183#define LOAD_W_VEC_1_1() \
184	add	addr0, sp, #IW_W1_ADDR(0, 0);
185#define LOAD_W_VEC_1_2() \
186	add	addr1, sp, #IW_W1_ADDR(4, 0);
187#define LOAD_W_VEC_1_3() \
188	ld1	{W0.16b}, [RDATA], #16;
189#define LOAD_W_VEC_1_4() \
190	ld1	{W1.16b}, [RDATA], #16;
191#define LOAD_W_VEC_1_5() \
192	ld1	{W2.16b}, [RDATA], #16;
193#define LOAD_W_VEC_1_6() \
194	ld1	{W3.16b}, [RDATA], #16;
195#define LOAD_W_VEC_1_7() \
196	rev32	XTMP0.16b, W0.16b;
197#define LOAD_W_VEC_1_8() \
198	rev32	XTMP1.16b, W1.16b;
199#define LOAD_W_VEC_2_1() \
200	rev32	XTMP2.16b, W2.16b;
201#define LOAD_W_VEC_2_2() \
202	rev32	XTMP3.16b, W3.16b;
203#define LOAD_W_VEC_2_3() \
204	eor	XTMP4.16b, XTMP1.16b, XTMP0.16b;
205#define LOAD_W_VEC_2_4() \
206	eor	XTMP5.16b, XTMP2.16b, XTMP1.16b;
207#define LOAD_W_VEC_2_5() \
208	st1	{XTMP0.16b}, [addr0], #16;
209#define LOAD_W_VEC_2_6() \
210	st1	{XTMP4.16b}, [addr0]; \
211	add	addr0, sp, #IW_W1_ADDR(8, 0);
212#define LOAD_W_VEC_2_7() \
213	eor	XTMP6.16b, XTMP3.16b, XTMP2.16b;
214#define LOAD_W_VEC_2_8() \
215	ext	W0.16b, XTMP0.16b, XTMP0.16b, #8;  /* W0: xx, w0, xx, xx */
216#define LOAD_W_VEC_3_1() \
217	mov	W2.16b, XTMP1.16b;                 /* W2: xx, w6, w5, w4 */
218#define LOAD_W_VEC_3_2() \
219	st1	{XTMP1.16b}, [addr1], #16;
220#define LOAD_W_VEC_3_3() \
221	st1	{XTMP5.16b}, [addr1]; \
222	ext	W1.16b, XTMP0.16b, XTMP0.16b, #4;  /* W1: xx, w3, w2, w1 */
223#define LOAD_W_VEC_3_4() \
224	ext	W3.16b, XTMP1.16b, XTMP2.16b, #12; /* W3: xx, w9, w8, w7 */
225#define LOAD_W_VEC_3_5() \
226	ext	W4.16b, XTMP2.16b, XTMP3.16b, #8;  /* W4: xx, w12, w11, w10 */
227#define LOAD_W_VEC_3_6() \
228	st1	{XTMP2.16b}, [addr0], #16;
229#define LOAD_W_VEC_3_7() \
230	st1	{XTMP6.16b}, [addr0];
231#define LOAD_W_VEC_3_8() \
232	ext	W5.16b, XTMP3.16b, XTMP3.16b, #4;  /* W5: xx, w15, w14, w13 */
233
234#define LOAD_W_VEC_1(iop_num, ...) \
235	LOAD_W_VEC_1_##iop_num()
236#define LOAD_W_VEC_2(iop_num, ...) \
237	LOAD_W_VEC_2_##iop_num()
238#define LOAD_W_VEC_3(iop_num, ...) \
239	LOAD_W_VEC_3_##iop_num()
240
241/* Message scheduling. Note: 3 words per vector register.
242 * Interleaving within round function needed for in-order CPUs. */
243#define SCHED_W_1_1(round, w0, w1, w2, w3, w4, w5) \
244	/* Load (w[i - 16]) => XTMP0 */            \
245	/* Load (w[i - 13]) => XTMP5 */            \
246	ext	XTMP0.16b, w0.16b, w0.16b, #12;    /* XTMP0: w0, xx, xx, xx */
247#define SCHED_W_1_2(round, w0, w1, w2, w3, w4, w5) \
248	ext	XTMP5.16b, w1.16b, w1.16b, #12;
249#define SCHED_W_1_3(round, w0, w1, w2, w3, w4, w5) \
250	ext	XTMP0.16b, XTMP0.16b, w1.16b, #12; /* XTMP0: xx, w2, w1, w0 */
251#define SCHED_W_1_4(round, w0, w1, w2, w3, w4, w5) \
252	ext	XTMP5.16b, XTMP5.16b, w2.16b, #12;
253#define SCHED_W_1_5(round, w0, w1, w2, w3, w4, w5) \
254	/* w[i - 9] == w3 */                       \
255	/* W3 ^ XTMP0 => XTMP0 */                  \
256	eor	XTMP0.16b, XTMP0.16b, w3.16b;
257#define SCHED_W_1_6(round, w0, w1, w2, w3, w4, w5) \
258	/* w[i - 3] == w5 */                       \
259	/* rol(XMM5, 15) ^ XTMP0 => XTMP0 */       \
260	/* rol(XTMP5, 7) => XTMP1 */               \
261	add	addr0, sp, #XW_W1_ADDR((round), 0); \
262	shl	XTMP2.4s, w5.4s, #15;
263#define SCHED_W_1_7(round, w0, w1, w2, w3, w4, w5) \
264	shl	XTMP1.4s, XTMP5.4s, #7;
265#define SCHED_W_1_8(round, w0, w1, w2, w3, w4, w5) \
266	sri	XTMP2.4s, w5.4s, #(32-15);
267#define SCHED_W_2_1(round, w0, w1, w2, w3, w4, w5) \
268	sri	XTMP1.4s, XTMP5.4s, #(32-7);
269#define SCHED_W_2_2(round, w0, w1, w2, w3, w4, w5) \
270	eor	XTMP0.16b, XTMP0.16b, XTMP2.16b;
271#define SCHED_W_2_3(round, w0, w1, w2, w3, w4, w5) \
272	/* w[i - 6] == W4 */                       \
273	/* W4 ^ XTMP1 => XTMP1 */                  \
274	eor	XTMP1.16b, XTMP1.16b, w4.16b;
275#define SCHED_W_2_4(round, w0, w1, w2, w3, w4, w5) \
276	/* P1(XTMP0) ^ XTMP1 => W0 */              \
277	shl	XTMP3.4s, XTMP0.4s, #15;
278#define SCHED_W_2_5(round, w0, w1, w2, w3, w4, w5) \
279	shl	XTMP4.4s, XTMP0.4s, #23;
280#define SCHED_W_2_6(round, w0, w1, w2, w3, w4, w5) \
281	eor	w0.16b, XTMP1.16b, XTMP0.16b;
282#define SCHED_W_2_7(round, w0, w1, w2, w3, w4, w5) \
283	sri	XTMP3.4s, XTMP0.4s, #(32-15);
284#define SCHED_W_2_8(round, w0, w1, w2, w3, w4, w5) \
285	sri	XTMP4.4s, XTMP0.4s, #(32-23);
286#define SCHED_W_3_1(round, w0, w1, w2, w3, w4, w5) \
287	eor	w0.16b, w0.16b, XTMP3.16b;
288#define SCHED_W_3_2(round, w0, w1, w2, w3, w4, w5) \
289	/* Load (w[i - 3]) => XTMP2 */             \
290	ext	XTMP2.16b, w4.16b, w4.16b, #12;
291#define SCHED_W_3_3(round, w0, w1, w2, w3, w4, w5) \
292	eor	w0.16b, w0.16b, XTMP4.16b;
293#define SCHED_W_3_4(round, w0, w1, w2, w3, w4, w5) \
294	ext	XTMP2.16b, XTMP2.16b, w5.16b, #12;
295#define SCHED_W_3_5(round, w0, w1, w2, w3, w4, w5) \
296	/* W1 ^ W2 => XTMP3 */                     \
297	eor	XTMP3.16b, XTMP2.16b, w0.16b;
298#define SCHED_W_3_6(round, w0, w1, w2, w3, w4, w5)
299#define SCHED_W_3_7(round, w0, w1, w2, w3, w4, w5) \
300	st1	{XTMP2.16b-XTMP3.16b}, [addr0];
301#define SCHED_W_3_8(round, w0, w1, w2, w3, w4, w5)
302
303#define SCHED_W_W0W1W2W3W4W5_1(iop_num, round) \
304	SCHED_W_1_##iop_num(round, W0, W1, W2, W3, W4, W5)
305#define SCHED_W_W0W1W2W3W4W5_2(iop_num, round) \
306	SCHED_W_2_##iop_num(round, W0, W1, W2, W3, W4, W5)
307#define SCHED_W_W0W1W2W3W4W5_3(iop_num, round) \
308	SCHED_W_3_##iop_num(round, W0, W1, W2, W3, W4, W5)
309
310#define SCHED_W_W1W2W3W4W5W0_1(iop_num, round) \
311	SCHED_W_1_##iop_num(round, W1, W2, W3, W4, W5, W0)
312#define SCHED_W_W1W2W3W4W5W0_2(iop_num, round) \
313	SCHED_W_2_##iop_num(round, W1, W2, W3, W4, W5, W0)
314#define SCHED_W_W1W2W3W4W5W0_3(iop_num, round) \
315	SCHED_W_3_##iop_num(round, W1, W2, W3, W4, W5, W0)
316
317#define SCHED_W_W2W3W4W5W0W1_1(iop_num, round) \
318	SCHED_W_1_##iop_num(round, W2, W3, W4, W5, W0, W1)
319#define SCHED_W_W2W3W4W5W0W1_2(iop_num, round) \
320	SCHED_W_2_##iop_num(round, W2, W3, W4, W5, W0, W1)
321#define SCHED_W_W2W3W4W5W0W1_3(iop_num, round) \
322	SCHED_W_3_##iop_num(round, W2, W3, W4, W5, W0, W1)
323
324#define SCHED_W_W3W4W5W0W1W2_1(iop_num, round) \
325	SCHED_W_1_##iop_num(round, W3, W4, W5, W0, W1, W2)
326#define SCHED_W_W3W4W5W0W1W2_2(iop_num, round) \
327	SCHED_W_2_##iop_num(round, W3, W4, W5, W0, W1, W2)
328#define SCHED_W_W3W4W5W0W1W2_3(iop_num, round) \
329	SCHED_W_3_##iop_num(round, W3, W4, W5, W0, W1, W2)
330
331#define SCHED_W_W4W5W0W1W2W3_1(iop_num, round) \
332	SCHED_W_1_##iop_num(round, W4, W5, W0, W1, W2, W3)
333#define SCHED_W_W4W5W0W1W2W3_2(iop_num, round) \
334	SCHED_W_2_##iop_num(round, W4, W5, W0, W1, W2, W3)
335#define SCHED_W_W4W5W0W1W2W3_3(iop_num, round) \
336	SCHED_W_3_##iop_num(round, W4, W5, W0, W1, W2, W3)
337
338#define SCHED_W_W5W0W1W2W3W4_1(iop_num, round) \
339	SCHED_W_1_##iop_num(round, W5, W0, W1, W2, W3, W4)
340#define SCHED_W_W5W0W1W2W3W4_2(iop_num, round) \
341	SCHED_W_2_##iop_num(round, W5, W0, W1, W2, W3, W4)
342#define SCHED_W_W5W0W1W2W3W4_3(iop_num, round) \
343	SCHED_W_3_##iop_num(round, W5, W0, W1, W2, W3, W4)
344
345
346	/*
347	 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at 'data'.
348	 *
349	 * void sm3_neon_transform(struct sm3_block_state *state,
350	 *			   const u8 *data, size_t nblocks)
351	 */
352	.text
353.align 3
354SYM_FUNC_START(sm3_neon_transform)
355	ldp		ra, rb, [RSTATE, #0]
356	ldp		rc, rd, [RSTATE, #8]
357	ldp		re, rf, [RSTATE, #16]
358	ldp		rg, rh, [RSTATE, #24]
359
360	stp		x28, x29, [sp, #-16]!
361	stp		x19, x20, [sp, #-16]!
362	stp		x21, x22, [sp, #-16]!
363	stp		x23, x24, [sp, #-16]!
364	stp		x25, x26, [sp, #-16]!
365	mov		RFRAME, sp
366
367	sub		addr0, sp, #STACK_SIZE
368	adr_l		RKPTR, .LKtable
369	and		sp, addr0, #(~63)
370
371	/* Preload first block. */
372	LOAD_W_VEC_1(1, 0)
373	LOAD_W_VEC_1(2, 0)
374	LOAD_W_VEC_1(3, 0)
375	LOAD_W_VEC_1(4, 0)
376	LOAD_W_VEC_1(5, 0)
377	LOAD_W_VEC_1(6, 0)
378	LOAD_W_VEC_1(7, 0)
379	LOAD_W_VEC_1(8, 0)
380	LOAD_W_VEC_2(1, 0)
381	LOAD_W_VEC_2(2, 0)
382	LOAD_W_VEC_2(3, 0)
383	LOAD_W_VEC_2(4, 0)
384	LOAD_W_VEC_2(5, 0)
385	LOAD_W_VEC_2(6, 0)
386	LOAD_W_VEC_2(7, 0)
387	LOAD_W_VEC_2(8, 0)
388	LOAD_W_VEC_3(1, 0)
389	LOAD_W_VEC_3(2, 0)
390	LOAD_W_VEC_3(3, 0)
391	LOAD_W_VEC_3(4, 0)
392	LOAD_W_VEC_3(5, 0)
393	LOAD_W_VEC_3(6, 0)
394	LOAD_W_VEC_3(7, 0)
395	LOAD_W_VEC_3(8, 0)
396
397.balign 16
398.Loop:
399	/* Transform 0-3 */
400	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 0, 0, IW, _, 0)
401	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  1, 1, IW, _, 0)
402	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 2, 2, IW, _, 0)
403	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  3, 3, IW, _, 0)
404
405	/* Transform 4-7 + Precalc 12-14 */
406	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 4, 0, IW, _, 0)
407	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  5, 1, IW, _, 0)
408	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 6, 2, IW, SCHED_W_W0W1W2W3W4W5_1, 12)
409	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  7, 3, IW, SCHED_W_W0W1W2W3W4W5_2, 12)
410
411	/* Transform 8-11 + Precalc 12-17 */
412	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 8, 0, IW, SCHED_W_W0W1W2W3W4W5_3, 12)
413	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  9, 1, IW, SCHED_W_W1W2W3W4W5W0_1, 15)
414	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 10, 2, IW, SCHED_W_W1W2W3W4W5W0_2, 15)
415	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  11, 3, IW, SCHED_W_W1W2W3W4W5W0_3, 15)
416
417	/* Transform 12-14 + Precalc 18-20 */
418	R1(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 12, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 18)
419	R1(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  13, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 18)
420	R1(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 14, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 18)
421
422	/* Transform 15-17 + Precalc 21-23 */
423	R1(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  15, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 21)
424	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 16, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 21)
425	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  17, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 21)
426
427	/* Transform 18-20 + Precalc 24-26 */
428	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 18, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 24)
429	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  19, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 24)
430	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 20, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 24)
431
432	/* Transform 21-23 + Precalc 27-29 */
433	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  21, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 27)
434	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 22, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 27)
435	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  23, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 27)
436
437	/* Transform 24-26 + Precalc 30-32 */
438	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 24, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 30)
439	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  25, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 30)
440	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 26, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 30)
441
442	/* Transform 27-29 + Precalc 33-35 */
443	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  27, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 33)
444	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 28, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 33)
445	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  29, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 33)
446
447	/* Transform 30-32 + Precalc 36-38 */
448	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 30, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 36)
449	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  31, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 36)
450	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 32, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 36)
451
452	/* Transform 33-35 + Precalc 39-41 */
453	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  33, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 39)
454	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 34, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 39)
455	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  35, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 39)
456
457	/* Transform 36-38 + Precalc 42-44 */
458	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 36, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 42)
459	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  37, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 42)
460	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 38, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 42)
461
462	/* Transform 39-41 + Precalc 45-47 */
463	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  39, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 45)
464	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 40, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 45)
465	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  41, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 45)
466
467	/* Transform 42-44 + Precalc 48-50 */
468	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 42, 0, XW, SCHED_W_W0W1W2W3W4W5_1, 48)
469	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  43, 1, XW, SCHED_W_W0W1W2W3W4W5_2, 48)
470	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 44, 2, XW, SCHED_W_W0W1W2W3W4W5_3, 48)
471
472	/* Transform 45-47 + Precalc 51-53 */
473	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  45, 0, XW, SCHED_W_W1W2W3W4W5W0_1, 51)
474	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 46, 1, XW, SCHED_W_W1W2W3W4W5W0_2, 51)
475	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  47, 2, XW, SCHED_W_W1W2W3W4W5W0_3, 51)
476
477	/* Transform 48-50 + Precalc 54-56 */
478	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 48, 0, XW, SCHED_W_W2W3W4W5W0W1_1, 54)
479	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  49, 1, XW, SCHED_W_W2W3W4W5W0W1_2, 54)
480	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 50, 2, XW, SCHED_W_W2W3W4W5W0W1_3, 54)
481
482	/* Transform 51-53 + Precalc 57-59 */
483	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  51, 0, XW, SCHED_W_W3W4W5W0W1W2_1, 57)
484	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 52, 1, XW, SCHED_W_W3W4W5W0W1W2_2, 57)
485	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  53, 2, XW, SCHED_W_W3W4W5W0W1W2_3, 57)
486
487	/* Transform 54-56 + Precalc 60-62 */
488	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 54, 0, XW, SCHED_W_W4W5W0W1W2W3_1, 60)
489	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  55, 1, XW, SCHED_W_W4W5W0W1W2W3_2, 60)
490	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 56, 2, XW, SCHED_W_W4W5W0W1W2W3_3, 60)
491
492	/* Transform 57-59 + Precalc 63 */
493	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  57, 0, XW, SCHED_W_W5W0W1W2W3W4_1, 63)
494	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 58, 1, XW, SCHED_W_W5W0W1W2W3W4_2, 63)
495	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  59, 2, XW, SCHED_W_W5W0W1W2W3W4_3, 63)
496
497	/* Transform 60 */
498	R2(ra, rb, rc, rd, re, rf, rg, rh, k_even, KL, 60, 0, XW, _, _)
499	subs		RNBLKS, RNBLKS, #1
500	b.eq		.Lend
501
502	/* Transform 61-63 + Preload next block */
503	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, LOAD_W_VEC_1, _)
504	ldp		s0, s1, [RSTATE, #0]
505	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, LOAD_W_VEC_2, _)
506	ldp		s2, s3, [RSTATE, #8]
507	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, LOAD_W_VEC_3, _)
508
509	/* Update the chaining variables. */
510	eor		ra, ra, s0
511	eor		rb, rb, s1
512	ldp		s0, s1, [RSTATE, #16]
513	eor		rc, rc, s2
514	ldp		k_even, k_odd, [RSTATE, #24]
515	eor		rd, rd, s3
516	eor		re, re, s0
517	stp		ra, rb, [RSTATE, #0]
518	eor		rf, rf, s1
519	stp		rc, rd, [RSTATE, #8]
520	eor		rg, rg, k_even
521	stp		re, rf, [RSTATE, #16]
522	eor		rh, rh, k_odd
523	stp		rg, rh, [RSTATE, #24]
524	b		.Loop
525
526.Lend:
527	/* Transform 61-63 */
528	R2(rd, ra, rb, rc, rh, re, rf, rg, k_odd,  _,  61, 1, XW, _, _)
529	ldp		s0, s1, [RSTATE, #0]
530	R2(rc, rd, ra, rb, rg, rh, re, rf, k_even, KL, 62, 2, XW, _, _)
531	ldp		s2, s3, [RSTATE, #8]
532	R2(rb, rc, rd, ra, rf, rg, rh, re, k_odd,  _,  63, 0, XW, _, _)
533
534	/* Update the chaining variables. */
535	eor		ra, ra, s0
536	clear_vec(W0)
537	eor		rb, rb, s1
538	clear_vec(W1)
539	ldp		s0, s1, [RSTATE, #16]
540	clear_vec(W2)
541	eor		rc, rc, s2
542	clear_vec(W3)
543	ldp		k_even, k_odd, [RSTATE, #24]
544	clear_vec(W4)
545	eor		rd, rd, s3
546	clear_vec(W5)
547	eor		re, re, s0
548	clear_vec(XTMP0)
549	stp		ra, rb, [RSTATE, #0]
550	clear_vec(XTMP1)
551	eor		rf, rf, s1
552	clear_vec(XTMP2)
553	stp		rc, rd, [RSTATE, #8]
554	clear_vec(XTMP3)
555	eor		rg, rg, k_even
556	clear_vec(XTMP4)
557	stp		re, rf, [RSTATE, #16]
558	clear_vec(XTMP5)
559	eor		rh, rh, k_odd
560	clear_vec(XTMP6)
561	stp		rg, rh, [RSTATE, #24]
562
563	/* Clear message expansion area */
564	add		addr0, sp, #STACK_W
565	st1		{W0.16b-W3.16b}, [addr0], #64
566	st1		{W0.16b-W3.16b}, [addr0], #64
567	st1		{W0.16b-W3.16b}, [addr0]
568
569	mov		sp, RFRAME
570
571	ldp		x25, x26, [sp], #16
572	ldp		x23, x24, [sp], #16
573	ldp		x21, x22, [sp], #16
574	ldp		x19, x20, [sp], #16
575	ldp		x28, x29, [sp], #16
576
577	ret
578SYM_FUNC_END(sm3_neon_transform)
579
580
581	.section	".rodata", "a"
582
583	.align 4
584.LKtable:
585	.long 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb
586	.long 0x9cc45197, 0x3988a32f, 0x7311465e, 0xe6228cbc
587	.long 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce
588	.long 0xc451979c, 0x88a32f39, 0x11465e73, 0x228cbce6
589	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
590	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
591	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
592	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
593	.long 0x7a879d8a, 0xf50f3b14, 0xea1e7629, 0xd43cec53
594	.long 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d
595	.long 0x879d8a7a, 0x0f3b14f5, 0x1e7629ea, 0x3cec53d4
596	.long 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43
597	.long 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c
598	.long 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce
599	.long 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec
600	.long 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5
601