xref: /linux/arch/x86/crypto/twofish-x86_64-asm_64-3way.S (revision 4f9786035f9e519db41375818e1d0b5f20da2f10)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Twofish Cipher 3-way parallel algorithm (x86_64)
4 *
5 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 */
7
8#include <linux/linkage.h>
9#include <linux/cfi_types.h>
10
11.file "twofish-x86_64-asm-3way.S"
12.text
13
/*
 * Byte offsets into the crypto context addressed through CTX.
 *
 *  s0..s3  lookup tables, 1024 bytes apart; the round code indexes them with
 *          a zero-extended byte scaled by 4 and reads 32-bit entries
 *  w       whitening words, XORed into the data 64 bits at a time at
 *          w + 4*m for m = 0, 2, 4, 6
 *  k       round subkeys, two 32-bit words per round at k + 4*(2*n) and
 *          k + 4*(2*n + 1)
 *
 * NOTE(review): presumably these have to match the layout of the C-side
 * context structure - confirm against its definition before changing a value.
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128
21
22/**********************************************************************
23  3-way twofish
24 **********************************************************************/
/*
 * Register roles. Several aliases deliberately overlap:
 *
 *  - RIO and RT0 are both %rdx, so the I/O pointer is lost as soon as the
 *    rounds start using RT0; the entry points reload RIO from the stack.
 *  - RCD0..2 and RX0..2 are both %r8..%r10, so the C/D halves are parked in
 *    the CD0..CD2 stack slots (see push_cd) while the rounds run.
 *  - RAB2 (%rcx) and RT1 (%rsi) are argument registers on entry; the entry
 *    points push those arguments before the first use of the aliases.
 *  - RAB1 (%rbx), RY1 (%r12) and RY2 (%r13) are saved and restored by the
 *    entry points.
 */

/* context pointer used by every table, whitening and subkey access */
#define CTX %rdi
/* base pointer handed to inpack3/outunpack3 */
#define RIO %rdx

/* RABn: 64-bit working register of block n (n = 0..2) */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

/* 32-bit views of RABn */
#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

/* bits 8..15 of RABn; second table index in do16bit_ror */
#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

/* bits 0..7 of RABn; first table index in do16bit_ror */
#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/* stack slots filled by push_cd(): CDn holds what was pushed from RCDn */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)

# used only before/after all rounds
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

# used only during rounds
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

/* 32-bit views of RXn */
#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

/* RYn: second per-block accumulator used during rounds */
#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

/* 32-bit views of RYn */
#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/* scratch registers: table indices and the swap temporary */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

/* low byte of RT1; tested as the "xor output" flag after the rounds */
#define RT1bl %sil
77
/*
 * Two table lookups driven by the low 16 bits of 'ab', followed by a rotate
 * that brings the next 16 bits into position.
 *
 *  rot       right-rotate count applied to the 64-bit 'ab' once both index
 *            bytes have been copied out
 *  op1, op2  mnemonic stems (mov or xor); the 'l' suffix is appended here
 *  T0, T1    table offsets inside the context; T0 is indexed by bits 0..7 of
 *            'ab', T1 by bits 8..15
 *  tmp1      64-bit register receiving the bits 8..15 index
 *  tmp2      64-bit register receiving the bits 0..7 index
 *  dst       64-bit register whose 32-bit view is combined with both entries
 *
 * tmp2 may be the same register as dst: when op1 is mov the index is consumed
 * by the very load that overwrites it, and op2 only needs tmp1.
 * Every caller in this file passes RT0 (%rdx) as tmp1. The source of that
 * move is %ah/%bh/%ch, which cannot be encoded together with a REX-only
 * register such as %r8d, so tmp1 has to stay a legacy register.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;
84
/*
 * Exchange the contents of 'reg' and 'slot' through 'scratch'.
 * The round code passes a working register as 'reg' and one of the CD0..CD2
 * stack slots as 'slot'; 'scratch' is clobbered.
 */
#define swap_ab_with_cd(reg, slot, scratch) \
	movq slot, scratch; \
	movq reg, slot; \
	movq scratch, reg;
89
/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three blocks n = 0..2, with b0..b7 the bytes of ab##n from
 * least to most significant, this computes into the 32-bit views of x##n and
 * y##n:
 *
 *	x = Tx0[b0] ^ Tx1[b1] ^ Tx2[b2] ^ Tx3[b3]
 *	y = Ty1[b4] ^ Ty2[b5] ^ Ty3[b6] ^ Ty0[b7]
 *
 * The rotate counts per block add up to 32 + 48 + 32 + 16 = 128, a multiple
 * of 64, so ab##n is back to its original value when it is exchanged with
 * cd##n at the end. After the macro, ab##n holds what was in cd##n and
 * cd##n holds the value the lookups were taken from.
 *
 * The first pass uses x##n / y##n themselves as index registers because the
 * mov load overwrites them anyway; the second pass has to use RT1 because
 * x##n / y##n already hold partial results. RT0 and RT1 are clobbered.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);
117
/*
 * Finish one encryption round for a single block.
 *
 *  ab    64-bit register to update. encrypt_round3 invokes this after
 *        g1g2_3 has done its swap, so the register holds the value that
 *        was parked in the CD slot before the round.
 *  x, y  64-bit names of the two g1g2_3 outputs (x##d / y##d are used)
 *  n     round number; selects the subkey words k[2n] and k[2n+1]
 *
 * With 32-bit wrap-around arithmetic:
 *
 *	x += y; y += x;
 *	x += k[2n]; y += k[2n+1];
 *	low32(ab)  = ror32(low32(ab) ^ x, 1)
 *	high32(ab) = rol32(high32(ab), 1) ^ y
 *
 * The closing orq can merge x into ab because shlq left the low half of ab
 * zero and the 32-bit rorl left the upper half of x zero. x and y are
 * clobbered.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;
130
/*
 * Finish one decryption round for a single block.
 *
 *  ba    64-bit register to update. decrypt_round3 invokes this after
 *        g1g2_3 has done its swap, so the register holds the value that
 *        was parked in the CD slot before the round.
 *  x, y  64-bit names of the two g1g2_3 outputs (x##d / y##d are used)
 *  n     round number; selects the subkey words k[2n] and k[2n+1]
 *
 * With 32-bit wrap-around arithmetic:
 *
 *	x += y; y += x;
 *	x += k[2n]; y += k[2n+1];
 *	low32(ba)  = ror32(low32(ba) ^ y, 1)
 *	high32(ba) = rol32(high32(ba), 1) ^ x
 *
 * Compared with enc_round_end the roles of x and y in the final mixing are
 * exchanged. The closing orq relies on shlq having cleared the low half of
 * ba and on the 32-bit rorl having cleared the upper half of y. x and y are
 * clobbered.
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;
143
/*
 * One encryption round across all three blocks.
 *
 *  cur    register-name prefix of the working set (cur##0 .. cur##2)
 *  spill  name prefix of the parked set (spill##0 .. spill##2)
 *  rnd    round number forwarded to enc_round_end for subkey selection
 *
 * g1g2_3 leaves its results in RX0..2 / RY0..2 and exchanges the working
 * registers with the parked set; enc_round_end then folds the results into
 * each working register.
 */
#define encrypt_round3(cur, spill, rnd) \
	g1g2_3(cur, spill, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(cur ## 0, RX0, RY0, rnd); \
	enc_round_end(cur ## 1, RX1, RY1, rnd); \
	enc_round_end(cur ## 2, RX2, RY2, rnd);
150
/*
 * One decryption round across all three blocks.
 *
 *  cur    register-name prefix of the working set (cur##0 .. cur##2)
 *  spill  name prefix of the parked set (spill##0 .. spill##2)
 *  rnd    round number forwarded to dec_round_end for subkey selection
 *
 * Relative to encrypt_round3 the table offsets handed to g1g2_3 are rotated
 * and its two output prefixes are passed as RY, RX instead of RX, RY;
 * dec_round_end still receives RXn as x and RYn as y.
 */
#define decrypt_round3(cur, spill, rnd) \
	g1g2_3(cur, spill, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(cur ## 0, RX0, RY0, rnd); \
	dec_round_end(cur ## 1, RX1, RY1, rnd); \
	dec_round_end(cur ## 2, RX2, RY2, rnd);
157
/*
 * One encryption cycle: rounds 2*n and 2*n + 1, in that order, across all
 * three blocks.
 *
 *  ab, cd  name prefixes forwarded unchanged to encrypt_round3
 *  n       cycle number
 *
 * n is parenthesised before it is multiplied so that an expression argument
 * (for example "i+1") cannot bind to the "*2" by operator precedence.
 */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);
161
/*
 * One decryption cycle: rounds 2*n + 1 and 2*n, in that order (the reverse
 * of encrypt_cycle3), across all three blocks.
 *
 *  ba, dc  name prefixes forwarded unchanged to decrypt_round3
 *  n       cycle number
 *
 * n is parenthesised before it is multiplied so that an expression argument
 * (for example "i+1") cannot bind to the "*2" by operator precedence.
 */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));
165
/*
 * Park RCD0..RCD2 on the stack so that %r8..%r10 can be reused as RX0..RX2
 * during the rounds. They are pushed in the order 2, 1, 0 so that RCD0 ends
 * up at 0x0(%rsp), RCD1 at 0x8(%rsp) and RCD2 at 0x10(%rsp), which is where
 * CD0..CD2 point. The slots stay valid only while %rsp is unchanged.
 */
#define push_cd()	\
	pushq RCD2;	\
	pushq RCD1;	\
	pushq RCD0;

/*
 * Reload RCD0..RCD2 from the CD0..CD2 slots, in the reverse of the push_cd()
 * order, releasing the 24 bytes of stack.
 */
#define pop_cd()	\
	popq RCD0;	\
	popq RCD1;	\
	popq RCD2;
175
/*
 * Load one 64-bit half of three consecutive 16-byte blocks and apply input
 * whitening.
 *
 *  in  base register of the input data
 *  n   32-bit word index of the half inside a block (0 or 2 in this file);
 *      block i is read at byte offset 4*(4*i + n)
 *  xy  register-name prefix; xy##0 .. xy##2 receive the three halves
 *  m   32-bit word index into the whitening words; the 64 bits at w + 4*m
 *      are XORed into every half
 *
 * m is parenthesised, like n, so that an expression argument cannot bind to
 * the "4*" by operator precedence.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;
185
/*
 * Apply output whitening to one 64-bit half of three blocks and write the
 * halves to three consecutive 16-byte blocks.
 *
 *  op   mnemonic stem for the store, suffixed with 'q' here: mov overwrites
 *       the destination, xor combines the result with what is already there
 *  out  base register of the output data
 *  n    32-bit word index of the half inside a block; block i is written at
 *       byte offset 4*(4*i + n)
 *  xy   register-name prefix; xy##0 .. xy##2 are whitened in place, so their
 *       previous contents are lost
 *  m    32-bit word index into the whitening words; the 64 bits at w + 4*m
 *       are XORed into every half
 *
 * m is parenthesised, like n, so that an expression argument cannot bind to
 * the "4*" by operator precedence.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);
195
/*
 * Encryption input: for each of the three blocks at RIO, load bytes 0..7
 * into RABn whitened with w words 0..1, and bytes 8..15 into RCDn whitened
 * with w words 2..3.
 */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/*
 * Encryption output: the halves are written swapped relative to how they
 * were loaded. RABn is whitened with w words 6..7 and goes to bytes 8..15 of
 * block n; RCDn is whitened with w words 4..5 and goes to bytes 0..7.
 * 'op' is mov to overwrite the destination or xor to combine with it.
 */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);
203
/*
 * Decryption input: for each of the three blocks at RIO, load bytes 0..7
 * into RABn whitened with w words 4..5, and bytes 8..15 into RCDn whitened
 * with w words 6..7. Every register is then rotated by 32 bits, which
 * exchanges its two 32-bit halves.
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: exchange the 32-bit halves of every register again,
 * then write RCDn whitened with w words 0..1 to bytes 0..7 of block n and
 * RABn whitened with w words 2..3 to bytes 8..15. The stores always
 * overwrite the destination (mov).
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);
223
/*
 * Encrypt three consecutive 16-byte blocks (48 bytes are read from src and
 * 48 bytes are written to dst), either storing the result or XORing it into
 * what dst already contains.
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags; %rbx, %r12 and
 * %r13 are used but restored. No return value is set up.
 */
SYM_TYPED_FUNC_START(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	/* preserved for the caller; used below as RY2, RY1 and RAB1 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/*
	 * Both arguments live in registers that are about to be reused:
	 * %rcx is RAB2 and %rsi is RT1.
	 */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	/* last use of src: RIO shares %rdx with RT0 */
	inpack_enc3();

	/* park C/D halves in CD0..CD2; %r8-%r10 become RX0..RX2 */
	push_cd();
	/* 8 cycles of 2 rounds each */
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	/* RIO now points at the output, RT1 holds the xor flag */
	popq RIO; /* dst */
	popq RT1; /* bool xor */

	/* only the low byte of the flag is examined */
	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	/* flag clear: overwrite dst */
	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;

.L__enc_xor3:
	/* flag set: XOR the result into the existing contents of dst */
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(__twofish_enc_blk_3way)
272
/*
 * Decrypt three consecutive 16-byte blocks: 48 bytes are read from src and
 * 48 bytes are written to dst.
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags; %rbx, %r12 and
 * %r13 are used but restored. No return value is set up.
 */
SYM_TYPED_FUNC_START(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	/* preserved for the caller; used below as RY2, RY1 and RAB1 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rsi is reused as RT1 during the rounds */
	pushq %rsi; /* dst */

	/* last use of src: RIO shares %rdx with RT0 */
	inpack_dec3();

	/* park C/D halves in CD0..CD2; %r8-%r10 become RX0..RX2 */
	push_cd();
	/* 8 cycles of 2 rounds each, in the reverse of the encryption order */
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	/* RIO now points at the output */
	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(twofish_dec_blk_3way)
307