xref: /linux/arch/loongarch/vdso/vgetrandom-chacha.S (revision 4f9786035f9e519db41375818e1d0b5f20da2f10)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * OP_4REG - apply one three-operand instruction to four operand pairs.
 *
 *	op:      instruction mnemonic to emit
 *	d0..d3:  registers used as both destination and first source
 *	s0..s3:  second source operand for the matching d register; the
 *	         callers in this file pass either registers or, for
 *	         rotri.w, immediate rotate amounts
 *
 * Emits "op dN, dN, sN" for N = 0, 1, 2, 3, in that order.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count must be non-zero: it is decremented before it is tested,
 * so a value of 0 would not terminate the block loop as intended.
 *
 * The only stack usage is the save area for s0-s9; key material and cipher
 * state are never stored there. The updated counter is written back through
 * a2 once, after the last block has been produced.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer, so fp serves as a tenth saved register */
#define s9		fp

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7
#define copy3		t8

/* Packs to be used with OP_4REG: the four rows of the 4x4 state matrix */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/*
 * Rows 1, 2 and 3 rotated left by 1, 2 and 3 positions. Paired with line0
 * they line up the diagonals (0,5,10,15), (1,6,11,12), (2,7,8,13) and
 * (3,4,9,14) without moving any data between registers.
 */
#define line1_perm	state5, state6, state7, state4
#define line2_perm	state10, state11, state8, state9
#define line3_perm	state15, state12, state13, state14

#define copy		copy0, copy1, copy2, copy3

/* Immediate rotate amounts, repeated four times for OP_4REG */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* copy[0,1,2,3] = "expand 32-byte k", kept for the whole call */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32
	li.w		copy3, 0x6b206574

	/* The counter lives in registers until all blocks are done */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	move		state3, copy3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/*
	 * odd round: four quarter rounds on the columns, done in parallel.
	 * rotri.w rotates right, so 16/20/24/25 are left rotations by
	 * 16/12/8/7.
	 */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round: same sequence on the diagonals via the _perm packs */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers  */

	/*
	 * The initial key words were overwritten by the permutation, so they
	 * are reloaded from memory for the final addition.
	 */

	/* state[0,1,2,3] = lo32(key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = hi32(key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: state0 becomes 1 only when cnt_lo wrapped around to 0,
	 * which is the carry into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As of now copy[0,1,2,3] just contains "expand 32-byte k" and
	 * state[0,...,9] are s0-s9, which we'll restore in the epilogue, so we
	 * only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	/* Undo the prologue's adjustment by negating the same expression */
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)