// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */
5
#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>
9
.text

/*
 * OP_4REG - apply one three-operand instruction to four register lanes.
 *
 *	op:	 instruction mnemonic (used below with add.w, xor and rotri.w)
 *	d0..d3:	 registers that are both the destination and the first source
 *	s0..s3:	 second operand of each lane; a register, or an immediate when
 *		 \op takes one (rotri.w is used with rotate amounts)
 *
 * Expands to d[n] = d[n] \op s[n] for n = 0..3, in that order. Nothing other
 * than d0..d3 is written.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm
18
/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count must be non-zero: it is only tested after a block has been
 * written, so a count of 0 is not handled as "write nothing".
 *
 * On return the counter at a2 has been advanced by the number of blocks
 * produced. The key at a1 is only read. a0 and a3 are modified (a0 ends up
 * past the last byte written, a3 ends up 0), a4-a7 and t0-t8 are clobbered,
 * and s0-s9 are saved on entry and restored on exit.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* No frame pointer is kept here, so fp serves as a tenth saved register, s9 */
#define s9		fp

/*
 * Register roles:
 *	output, key, counter, nblocks	the four arguments
 *	i				double-round loop counter
 *	state0..state15			the 16-word ChaCha20 working state
 *	cnt_lo, cnt_hi			current 64-bit block counter, as two words
 *	copy0..copy3			the "expand 32-byte k" constant words
 */
#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7
#define copy3		t8

/* Packs to be used with OP_4REG: the four rows of the 4x4 state matrix */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/*
 * Rows 1, 2 and 3 rotated left by 1, 2 and 3 words. Pairing these with line0
 * makes OP_4REG work on the diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13)
 * and (3,4,9,14) without moving any data between registers.
 */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts for OP_4REG. rotri.w rotates right, so ChaCha20's left
 * rotations by 16, 12, 8 and 7 are written as 32 - n: 16, 20, 24 and 25.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 *
	 * The frame holds ten registers; its (negative) size is masked with
	 * STACK_ALIGN to keep sp aligned.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* copy[0,1,2,3] = "expand 32-byte k", kept for the whole call */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32
	li.w		copy3, 0x6b206574

	/* Load the 64-bit counter once; it lives in registers until the end */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

	/* One iteration per 64-byte output block */
.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	move		state3, copy3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 (the all-zero nonce) */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: four quarter-rounds in parallel, one per column */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round: the same quarter-rounds, applied to the diagonals */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * Feed-forward: add the initial state to the permuted state and store
	 * the sums as the output block.
	 */

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers  */

	/* state[0,1,2,3] = key[0,1,2,3], the first 16 bytes of the key */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = key[4,5,6,7], the last 16 bytes of the key */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 *
	 * cnt_lo/cnt_hi still hold this block's counter (it is incremented
	 * below), and adding the zero nonce words is a no-op, so state[14,15]
	 * are stored as they are.
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter
	 *
	 * sltui yields 1 only when cnt_lo is 0, i.e. when the low word just
	 * wrapped around; that carry is added into the high word.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and
	 * state[0,...,9] are s0-s9 those we'll restore in the epilogue, so we
	 * only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	/* Restore s0-s9; this also overwrites state[0,...,9] */
	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	/* Undo exactly the adjustment made in the prologue */
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
254