/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32
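/* MASK_U32 masks a byte count down to whole 32-bit words within a block
 * (0x3c keeps 4 * number_of_full_words, at most 60). STACK_SIZE is the
 * room needed for the eight callee-saved registers $s0-$s7 (8 * 4 bytes).
 */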

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch the original value in memory.
 * It must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used to handle the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7
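/* Note: IS_UNALIGNED shares $s7 with SAVED_CA. This is safe because the
 * alignment flag is only tested while full blocks are processed; once the
 * code reaches the final partial block and sets SAVED_CA, the flag is
 * never read again.
 */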

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define	CPU_TO_LE32(n) \
	wsbh	n, n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif
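/* On big-endian CPUs, CPU_TO_LE32 byte-swaps a word into the little-endian
 * order ChaCha requires: wsbh swaps the bytes within each halfword and
 * rotr 16 swaps the halfwords, together reversing all four bytes. On
 * little-endian CPUs it is a no-op. MSB/LSB give the byte offsets of the
 * most/least significant byte of a word, as needed by the lwl/lwr and
 * swl/swr unaligned accesses below.
 */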

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)
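/* PLUS_ONE(x) expands to the literal value x + 1: the preprocessor cannot
 * do arithmetic inside token pasting, so each value is spelled out above.
 * The store labels are numbered by the count of words they write, e.g.
 * .Lchacha_mips_xor_aligned_4_b writes the first 4 words of the block.
 */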

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);
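/* STORE_UNALIGNED reads and writes each word with lwl/lwr and swl/swr
 * pairs, which handle arbitrarily aligned addresses on pre-R6 MIPS. Both
 * macros add the saved state word to Xn before XORing with the input;
 * word 12 (the block counter) is taken from NONCE_0 instead of being
 * reloaded from memory.
 */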

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not a multiple of 4.
 * X15 is free to store Xn.
 * Every jump table entry must be the same size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder
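/* Each jump table entry is exactly two instructions (8 bytes): a branch
 * into the store sequence plus a delay-slot addu that precomputes SAVED_X,
 * the keystream word used for the trailing 1-3 bytes. The fixed entry size
 * lets the dispatch code below index the table by the number of full words.
 */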

#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotr	X(V), 32 - S; \
	rotr	X(W), 32 - S; \
	rotr	X(Y), 32 - S; \
	rotr	X(Z), 32 - S;
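/* AXR performs one add-xor-rotate step of four ChaCha quarter rounds in
 * parallel. For each of the four lanes it computes, in C-like notation:
 *
 *	X(A) += X(K);  X(V) = rol32(X(V) ^ X(A), S);
 *
 * MIPS32r2 only has a rotate-right instruction, so the left-rotate by S
 * is expressed as rotr by (32 - S).
 */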

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)
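	/* With the o32 calling convention the first four arguments (state,
	 * dst, src, bytes) arrive in $a0-$a3; the fifth (nrounds) is passed
	 * on the stack, at 16($sp) on entry.
	 */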

	addiu	$sp, -STACK_SIZE

	/* Return if bytes = 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test whether IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

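	/* Each pass below is one double round: the first four AXR groups
	 * perform the four column quarter rounds, the last four the diagonal
	 * quarter rounds, so $at (the round count) is decremented by 2 per
	 * iteration.
	 */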
.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is the src/dst data unaligned? If so, jump. */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Reload the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0: no full block remains. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Placed here to fill the delay slot. */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle the last bytes. */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to the right location in the state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6
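	/* $at is 4 * full_words and each jump table entry is 8 bytes, so
	 * inserting $at one bit position to the left scales it by 2, giving
	 * the byte offset (full_words * 8) of the wanted entry.
	 */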

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Reload the number of rounds here to fill the delay slot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0: no full block remains. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to the right location in the state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Placed here to fill the delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
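	/* Entered with $at = byte offset of the partial word (4 * full words)
	 * and BYTES = negative count of the 1-3 remaining bytes. SAVED_X
	 * holds the keystream word for those bytes; rotr by 8 steps through
	 * it one byte at a time, low byte first.
	 */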
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store the remaining byte count as a negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE
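/* hchacha_block_arch takes no IN/BYTES arguments, so X12-X15 can be
 * remapped to argument and temporary registers, leaving $s6 (X11) as the
 * only callee-saved register to preserve. X15 aliases STATE itself, which
 * is why the loads below fill it from offset 60 last.
 */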

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at