/* xref: /linux/lib/crypto/mips/chacha-core.S (revision 22375adaa0d9fbba9646c8e2b099c6e87c97bfae) */
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

/*
 * MASK_U32:            ANDed with the byte count of a partial block to keep
 *                      only the whole 32-bit words, as a byte offset (0..60).
 * CHACHA20_BLOCK_SIZE: bytes consumed from IN / written to OUT per block.
 * STACK_SIZE:          stack frame size; chacha_crypt_arch stores the eight
 *                      registers $s0-$s7 (8 * 4 bytes) in it.
 */
#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

/*
 * Register allocation for the 16 working state words, one register per word.
 * X11-X15 live in $s registers, which chacha_crypt_arch saves on entry and
 * restores on exit. X12-X15 are #undef'd and remapped further down in this
 * file before hchacha_block_arch.
 */
#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
/* T(n) / X(n) paste a literal index onto the name so macros can pick Tn / Xn. */
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments of chacha_crypt_arch */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] (state word 12, at offset 48) is kept in a register while the
 * blocks are generated instead of being updated in memory each time.
 * It must be incremented every loop iteration and is written back to
 * 48(STATE) once, on exit.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit so we don't leak clear data.
 * They are used for handling the last bytes, which are not a multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

/* IS_UNALIGNED shares $s7 with SAVED_CA: SAVED_CA is only loaded on the
 * final, partial block, after IS_UNALIGNED has been tested for the last time.
 */
#define IS_UNALIGNED	$s7

/*
 * Endianness helpers.
 *
 * MSB / LSB are the byte offsets added to a word address for the
 * lwl/swl (MSB) and lwr/swr (LSB) halves of an unaligned word access.
 *
 * CPU_TO_LE32(n) converts register n between CPU and little-endian byte
 * order in place. On big endian it swaps the bytes inside each halfword
 * (wsbh) and then swaps the two halfwords (rotr 16); on little endian it
 * expands to nothing.
 */
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define	CPU_TO_LE32(n) \
	wsbh	n, n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define CPU_TO_LE32(n)
#endif

/*
 * FOR_EACH_WORD(x): expand macro x once per state word index, 0 up to 15.
 * Used to emit the jump tables in ascending order.
 */
#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

/*
 * FOR_EACH_WORD_REV(x): same as FOR_EACH_WORD but from 15 down to 0.
 * Used to emit the store sequence, so that entering it part-way through
 * processes only the remaining low-numbered words.
 */
#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

/*
 * PLUS_ONE(x) yields x + 1 as a literal token (for x = 0..15) so that it can
 * be pasted into a label name; the preprocessor cannot paste the result of
 * an arithmetic expression.
 */
#define PLUS_ONE_0	 1
#define PLUS_ONE_1	 2
#define PLUS_ONE_2	 3
#define PLUS_ONE_3	 4
#define PLUS_ONE_4	 5
#define PLUS_ONE_5	 6
#define PLUS_ONE_6	 7
#define PLUS_ONE_7	 8
#define PLUS_ONE_8	 9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
/* CONCAT3 pastes three tokens; the extra level lets the arguments expand first. */
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

/*
 * STORE_UNALIGNED(x): finish keystream word x and XOR it into the output,
 * for IN/OUT pointers that are not word aligned.
 *
 * Defines the label .Lchacha_mips_xor_unaligned_<x+1>_b in front of the code,
 * i.e. the entry point to use when x+1 whole words remain to be written.
 * The code then:
 *   - adds the original state word x to X(x) (read from STATE, or taken
 *     from NONCE_0 for word 12),
 *   - converts the sum to little-endian byte order,
 *   - XORs it with input word x (loaded with lwl/lwr),
 *   - stores the result to output word x (with swl/swr).
 * Clobbers T0, T1 and X(x).
 */
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

/*
 * STORE_ALIGNED(x): same as STORE_UNALIGNED(x) but with plain lw/sw accesses,
 * for word-aligned IN and OUT. The label defined is
 * .Lchacha_mips_xor_aligned_<x+1>_b.
 * Clobbers T0, T1 and X(x).
 */
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size.
 *
 * Entry x is used when x whole words remain. It is exactly two instructions
 * (8 bytes), which is what the "ins T0, $at, 1, 6" index computation in the
 * callers relies on:
 *   - a branch to .Lchacha_mips_xor_aligned_<x>_b, which stores words
 *     x-1 .. 0, and
 *   - in its delay slot (hence .set noreorder), SAVED_X = X(x) + original
 *     state word x, i.e. the keystream word covering the trailing 1-3 bytes.
 *     The original state word comes from SAVED_CA, which the caller loaded
 *     from STATE, except for word 12 where NONCE_0 is used instead.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/* Same as JMPTBL_ALIGNED, but branching into the STORE_UNALIGNED sequence. */
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

/*
 * AXR: one Add-Xor-Rotate step of the ChaCha quarter round, applied to four
 * independent quarter rounds at once (interleaved so the four dependency
 * chains can overlap):
 *
 *   X(A) += X(K);  X(V) ^= X(A);  X(V) = rotate-left(X(V), S)
 *   ... and likewise for (B, L, W), (C, M, Y) and (D, N, Z).
 *
 * S is the left-rotate amount; it is emitted as a right rotate by 32 - S.
 */
#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotr	X(V), 32 - S; \
	rotr	X(W), 32 - S; \
	rotr	X(Y), 32 - S; \
	rotr	X(Z), 32 - S;

.text
.set	reorder
/* $at is used as a working register below, so the assembler must not use it. */
.set	noat

/*
 * chacha_crypt_arch
 *
 * XOR BYTES bytes of IN with the ChaCha keystream and write them to OUT.
 *
 * In:
 *   $a0 (STATE)  pointer to the 16-word state; word 12 (offset 48) is the
 *                per-block counter
 *   $a1 (OUT)    destination buffer
 *   $a2 (IN)     source buffer
 *   $a3 (BYTES)  number of bytes; 0 returns immediately
 *   16($sp)      number of rounds, read from the caller's frame on entry.
 *                The round loop subtracts 2 per pass and exits only on
 *                exactly zero, so this must be a positive even number.
 * Out:
 *   48(STATE) is advanced by one per block generated (a trailing partial
 *   block counts as one). The rest of the state is only read.
 * Clobbers:
 *   $at, $v0, $v1, $t0-$t9, $a1-$a3.
 *   $s0-$s7 are used but saved and restored on the stack.
 */
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return bytes = 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	/* First block: skip the pointer/counter advance at the loop head. */
	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	/* Advance to the next block. */
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	/* Load a fresh working copy of the state into X0-X15. */
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	/* Word 12 comes from the register copy of the counter, not memory. */
	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	/* Each pass is one double round: $at counts the remaining rounds. */
	addiu	$at, -2
	/* Column round */
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	/* Diagonal round */
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	/* BYTES now holds (remaining bytes - 64): negative means partial block. */
	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	/* Full block: add the state, XOR with IN and store, words 15 down to 0. */
	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count, 0..60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset:
	 * inserting $at at bit 1 gives $at * 2, i.e. 8 bytes per table entry.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE: the original state word for the partial word */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value (0, -1, -2 or -3) */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	/* Full block: add the state, XOR with IN and store, words 15 down to 0. */
	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state */
	/* NOTE(review): this store looks redundant, since every path from here
	 * reaches .Lchacha_mips_xor_done, which stores the incremented value to
	 * the same location - confirm before removing.
	 */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
	/* Jump table entry 0 (no whole words to store) lands here as well. */
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	/* Trailing 1-3 bytes. $at is the byte offset just past the whole words,
	 * BYTES is minus the number of trailing bytes, and SAVED_X holds the
	 * keystream word, consumed one low byte at a time.
	 */
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	rotr	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS (as a byte count, 0..60) */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset:
	 * inserting $at at bit 1 gives $at * 2, i.e. 8 bytes per table entry.
	 */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE: the original state word for the partial word */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value (0, -1, -2 or -3) */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments of hchacha_block_arch
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

/*
 * Remap X12-X15 away from the $s registers used by chacha_crypt_arch, so
 * that hchacha_block_arch only has one register (X11) to save and restore.
 * X15 reuses the STATE register itself, which is why word 15 must be the
 * last word loaded from the state.
 */
#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

/* $at holds X13 below, so the assembler must not use it. */
.set noat

/*
 * hchacha_block_arch
 *
 * Run the ChaCha rounds over a 16-word state and write words 0-3 and 12-15
 * of the result (32 bytes) to OUT. Unlike chacha_crypt_arch, the input state
 * is not added back in after the rounds.
 *
 * In:
 *   $a0 (STATE)  pointer to the 16-word input state (only read)
 *   $a1 (OUT)    pointer to the 8-word output
 *   $a2          number of rounds. The loop subtracts 2 per pass and exits
 *                only on exactly zero, so this must be a positive even number.
 * Clobbers:
 *   $at, $v0, $v1, $t0-$t9, $a0, $a2, $a3.
 *   $s6 (X11) is used but saved and restored on the stack.
 */
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)
	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	/* X15 is the STATE register: this load overwrites the pointer. */
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	/* Each pass is one double round: $a2 counts the remaining rounds. */
	addiu	$a2, -2
	/* Column round */
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	/* Diagonal round */
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	/* Output is the first and the last row of the state. */
	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3,  12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at