xref: /linux/arch/x86/crypto/sm3-avx-asm_64.S (revision 7ae9fb1b7ecbb5d85d07857943f677fd1a559b18)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM3 AVX accelerated transform.
 * specified in: https://datatracker.ietf.org/doc/html/draft-sca-cfrg-sm3-02
 *
 * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM3 AVX/BMI2 accelerated work by libgcrypt at:
 *  https://gnupg.org/software/libgcrypt/index.html
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* Context structure */

/*
 * Byte offsets of the eight 32-bit chaining words h0..h7, relative to the
 * state pointer held in RSTATE.  They are used as displacements, e.g.
 * "movl state_h0(RSTATE), a".
 */
#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16
#define state_h5 20
#define state_h6 24
#define state_h7 28

/* Constants */

/*
 * Round constant macros.
 *
 * K<j> is the per-round constant already rotated left by (j mod 32):
 * rounds 0-15 rotate 0x79cc4519, rounds 16-63 rotate 0x7a879d8a (hence
 * K48..K63 repeat K16..K31).
 *
 * The values are written as signed 32-bit decimals because R() pastes them
 * in as the displacement of a "leal", which must fit a signed 32-bit field.
 * The unsigned hexadecimal form is given in the trailing comment.
 */

#define K0   2043430169  /* 0x79cc4519 */
#define K1   -208106958  /* 0xf3988a32 */
#define K2   -416213915  /* 0xe7311465 */
#define K3   -832427829  /* 0xce6228cb */
#define K4  -1664855657  /* 0x9cc45197 */
#define K5    965255983  /* 0x3988a32f */
#define K6   1930511966  /* 0x7311465e */
#define K7   -433943364  /* 0xe6228cbc */
#define K8   -867886727  /* 0xcc451979 */
#define K9  -1735773453  /* 0x988a32f3 */
#define K10   823420391  /* 0x311465e7 */
#define K11  1646840782  /* 0x6228cbce */
#define K12 -1001285732  /* 0xc451979c */
#define K13 -2002571463  /* 0x88a32f39 */
#define K14   289824371  /* 0x11465e73 */
#define K15   579648742  /* 0x228cbce6 */
#define K16 -1651869049  /* 0x9d8a7a87 */
#define K17   991229199  /* 0x3b14f50f */
#define K18  1982458398  /* 0x7629ea1e */
#define K19  -330050500  /* 0xec53d43c */
#define K20  -660100999  /* 0xd8a7a879 */
#define K21 -1320201997  /* 0xb14f50f3 */
#define K22  1654563303  /* 0x629ea1e7 */
#define K23  -985840690  /* 0xc53d43ce */
#define K24 -1971681379  /* 0x8a7a879d */
#define K25   351604539  /* 0x14f50f3b */
#define K26   703209078  /* 0x29ea1e76 */
#define K27  1406418156  /* 0x53d43cec */
#define K28 -1482130984  /* 0xa7a879d8 */
#define K29  1330705329  /* 0x4f50f3b1 */
#define K30 -1633556638  /* 0x9ea1e762 */
#define K31  1027854021  /* 0x3d43cec5 */
#define K32  2055708042  /* 0x7a879d8a */
#define K33  -183551212  /* 0xf50f3b14 */
#define K34  -367102423  /* 0xea1e7629 */
#define K35  -734204845  /* 0xd43cec53 */
#define K36 -1468409689  /* 0xa879d8a7 */
#define K37  1358147919  /* 0x50f3b14f */
#define K38 -1578671458  /* 0xa1e7629e */
#define K39  1137624381  /* 0x43cec53d */
#define K40 -2019718534  /* 0x879d8a7a */
#define K41   255530229  /* 0x0f3b14f5 */
#define K42   511060458  /* 0x1e7629ea */
#define K43  1022120916  /* 0x3cec53d4 */
#define K44  2044241832  /* 0x79d8a7a8 */
#define K45  -206483632  /* 0xf3b14f50 */
#define K46  -412967263  /* 0xe7629ea1 */
#define K47  -825934525  /* 0xcec53d43 */
#define K48 -1651869049  /* 0x9d8a7a87 */
#define K49   991229199  /* 0x3b14f50f */
#define K50  1982458398  /* 0x7629ea1e */
#define K51  -330050500  /* 0xec53d43c */
#define K52  -660100999  /* 0xd8a7a879 */
#define K53 -1320201997  /* 0xb14f50f3 */
#define K54  1654563303  /* 0x629ea1e7 */
#define K55  -985840690  /* 0xc53d43ce */
#define K56 -1971681379  /* 0x8a7a879d */
#define K57   351604539  /* 0x14f50f3b */
#define K58   703209078  /* 0x29ea1e76 */
#define K59  1406418156  /* 0x53d43cec */
#define K60 -1482130984  /* 0xa7a879d8 */
#define K61  1330705329  /* 0x4f50f3b1 */
#define K62 -1633556638  /* 0x9ea1e762 */
#define K63  1027854021  /* 0x3d43cec5 */

/* Register macros */

/* Function arguments: state pointer, input pointer, remaining block count. */
#define RSTATE %rdi
#define RDATA  %rsi
#define RNBLKS %rdx

/*
 * 32-bit scratch registers used inside the round macros.
 * t1 is %ebx, which is callee-saved; sm3_transform_avx spills %rbx.
 */
#define t0 %eax
#define t1 %ebx
#define t2 %ecx

/*
 * The eight 32-bit working variables of the compression function.
 * e..h live in %r12d-%r15d, which are callee-saved and are spilled by
 * sm3_transform_avx alongside %rbx.
 */
#define a %r8d
#define b %r9d
#define c %r10d
#define d %r11d
#define e %r12d
#define f %r13d
#define g %r14d
#define h %r15d

/* Sliding window of message words, three words per register. */
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
#define W4 %xmm4
#define W5 %xmm5

/* Vector temporaries for input loading and message scheduling. */
#define XTMP0 %xmm6
#define XTMP1 %xmm7
#define XTMP2 %xmm8
#define XTMP3 %xmm9
#define XTMP4 %xmm10
#define XTMP5 %xmm11
#define XTMP6 %xmm12

/* Holds the .Lbe32mask byte-swap shuffle control for the whole function. */
#define BSWAP_REG %xmm15

/* Stack structure */

/*
 * Layout of the 64-byte-aligned scratch frame, as offsets from %rsp:
 *
 *   STACK_W         message-word area: three 64-byte groups, each made of a
 *                   32-byte W slot followed by a 32-byte W^W' slot (see
 *                   IW_W_ADDR / XW_W_ADDR).  Only the low 16 bytes of each
 *                   slot are stored to.
 *   STACK_REG_SAVE  spill area for %rbx and %r12-%r15 (5 * 8 bytes used).
 */
#define STACK_W_SIZE        (32 * 2 * 3)
#define STACK_REG_SAVE_SIZE (64)

#define STACK_W             (0)
#define STACK_REG_SAVE      (STACK_W + STACK_W_SIZE)
#define STACK_SIZE          (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)

/* Instruction helpers. */

/* reg = rol32(reg, v), in place. */
#define roll2(v, reg)		\
	roll $(v), reg;

/* dst = rol32(src, v), done as a copy followed by an in-place rotate. */
#define roll3mov(v, src, dst)	\
	movl src, dst;		\
	roll $(v), dst;

/*
 * dst = rol32(src, v), done as a single non-destructive BMI2 rorx
 * (rotate right by 32 - v).
 */
#define roll3(v, src, dst)	\
	rorxl $(32-(v)), src, dst;

/* out = out + a, computed with lea. */
#define addl2(a, out)		\
	leal (a, out), out;

/* Round function macros. */

/*
 * All four macros take inputs x, y, z, write the result to o and may use t
 * as a scratch register.  o and t must not alias x, y or z.
 */

/* o = x ^ y ^ z (t is unused). */
#define GG1(x, y, z, o, t)	\
	movl x, o;		\
	xorl y, o;		\
	xorl z, o;

/* FF1 is the same three-way XOR as GG1. */
#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)

/*
 * o = (x & y) | (~x & z).
 * The two terms can never both have the same bit set, so they are combined
 * with an addition (lea) instead of an OR.
 */
#define GG2(x, y, z, o, t)	\
	andnl z, x, o;		\
	movl y, t;		\
	andl x, t;		\
	addl2(t, o);

/*
 * o = (x & y) | (x & z) | (y & z), the bitwise majority,
 * computed as ((x ^ y) & z) ^ (x & y).
 */
#define FF2(x, y, z, o, t)	\
	movl y, o;		\
	xorl x, o;		\
	movl y, t;		\
	andl x, t;		\
	andl z, o;		\
	xorl t, o;

/*
 * One SM3 round.
 *
 *   i       1 or 2: selects FF1/GG1 or FF2/GG2
 *   a..h    the working variables, in the register order for this round
 *   round   round number, pasted onto "K" to pick the round constant
 *   widx    index of the 32-bit word inside the stack slot
 *   wtype   IW or XW: selects the IW_ or XW_ stack address macros
 *
 * On exit:
 *   d = FF(a, b, c) + d + SS2 + W'[round]          (TT1, the next "a")
 *   h = P0(GG(e, f, g) + h + SS1 + W[round])       (the next "e")
 *   b = rol(b, 9), f = rol(f, 19)
 * where SS1 = rol(rol(a, 12) + e + K<round>, 7) and SS2 = SS1 ^ rol(a, 12).
 * a, c, e and g are left untouched; the caller renames the variables by
 * permuting the argument order on the next invocation.
 *
 * Clobbers t0, t1, t2 and the flags.
 */
#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype)		\
	/* rol(a, 12) => t0 */						\
	roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
	/* rol((t0 + e + K<round>), 7) => t1 (SS1) */			\
	leal K##round(t0, e, 1), t1;					\
	roll2(7, t1);							\
	/* h + w1 => h */						\
	addl wtype##_W1_ADDR(round, widx), h;				\
	/* h + t1 => h */						\
	addl2(t1, h);							\
	/* t1 ^ t0 => t0 (SS2) */					\
	xorl t1, t0;							\
	/* w1w2 + d => d */						\
	addl wtype##_W1W2_ADDR(round, widx), d;				\
	/* FF##i(a,b,c) => t1 */					\
	FF##i(a, b, c, t1, t2);						\
	/* d + t1 => d */						\
	addl2(t1, d);							\
	/* GG##i(e,f,g) => t2 */					\
	GG##i(e, f, g, t2, t1);						\
	/* h + t2 => h */						\
	addl2(t2, h);							\
	/* rol (f, 19) => f */						\
	roll2(19, f);							\
	/* d + t0 => d */						\
	addl2(t0, d);							\
	/* rol (b, 9) => b */						\
	roll2(9, b);							\
	/* P0(h) => h, i.e. h ^ rol(h, 9) ^ rol(h, 17) */		\
	roll3(9, h, t2);						\
	roll3(17, h, t1);						\
	xorl t2, h;							\
	xorl t1, h;

/* Round using FF1/GG1 (invoked for rounds 0-15). */
#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
	R(1, a, b, c, d, e, f, g, h, round, widx, wtype)

/* Round using FF2/GG2 (invoked for rounds 16-63). */
#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
	R(2, a, b, c, d, e, f, g, h, round, widx, wtype)

/* Input expansion macros. */

/*
 * Byte-swapped input address.
 * Rounds 0-3, 4-7 and 8-11 use the 64-byte groups 0, 1 and 2 of the stack
 * W area; offs selects the slot inside the group and widx the 32-bit word.
 */
#define IW_W_ADDR(round, widx, offs) \
	(STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)

/*
 * Expanded input address.
 * Each run of three rounds starting at round 12 alternates between the
 * 64-byte groups 0 and 1 of the stack W area.
 */
#define XW_W_ADDR(round, widx, offs) \
	(STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)

/* Rounds 0-11, byte-swapped input block addresses (W and W ^ W'). */
#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 0)
#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)

/* Rounds 12-63, expanded input block addresses (W and W ^ W'). */
#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)

/* Input block loading. */

/*
 * Load one 64-byte block from RDATA, byte-swap every 32-bit word and store
 * the words for rounds 0-7 (plus their XOR with the word four positions
 * later) to the stack.  Advances RDATA by 64.
 * Leaves w0..w15 in XTMP0..XTMP3 and w8..w11 ^ w12..w15 in XTMP6 for
 * LOAD_W_XMM_2() and LOAD_W_XMM_3().
 */
#define LOAD_W_XMM_1()							\
	vmovdqu 0*16(RDATA), XTMP0; /* XTMP0: w3, w2, w1, w0 */		\
	vmovdqu 1*16(RDATA), XTMP1; /* XTMP1: w7, w6, w5, w4 */		\
	vmovdqu 2*16(RDATA), XTMP2; /* XTMP2: w11, w10, w9, w8 */	\
	vmovdqu 3*16(RDATA), XTMP3; /* XTMP3: w15, w14, w13, w12 */	\
	vpshufb BSWAP_REG, XTMP0, XTMP0;				\
	vpshufb BSWAP_REG, XTMP1, XTMP1;				\
	vpshufb BSWAP_REG, XTMP2, XTMP2;				\
	vpshufb BSWAP_REG, XTMP3, XTMP3;				\
	vpxor XTMP0, XTMP1, XTMP4;					\
	vpxor XTMP1, XTMP2, XTMP5;					\
	vpxor XTMP2, XTMP3, XTMP6;					\
	leaq 64(RDATA), RDATA;						\
	vmovdqa XTMP0, IW_W1_ADDR(0, 0);				\
	vmovdqa XTMP4, IW_W1W2_ADDR(0, 0);				\
	vmovdqa XTMP1, IW_W1_ADDR(4, 0);				\
	vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);

/*
 * Store the words for rounds 8-11.
 * Requires XTMP2 and XTMP6 as left by LOAD_W_XMM_1().
 */
#define LOAD_W_XMM_2()				\
	vmovdqa XTMP2, IW_W1_ADDR(8, 0);	\
	vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);

/*
 * Repack w0..w15 from XTMP0..XTMP3 into W0..W5, three words per register,
 * as expected by the SCHED_W_* macros.  "xx" marks a lane that is not used.
 */
#define LOAD_W_XMM_3()							\
	vpshufd $0b00000000, XTMP0, W0; /* W0: xx, w0, xx, xx */	\
	vpshufd $0b11111001, XTMP0, W1; /* W1: xx, w3, w2, w1 */	\
	vmovdqa XTMP1, W2;              /* W2: xx, w6, w5, w4 */	\
	vpalignr $12, XTMP1, XTMP2, W3; /* W3: xx, w9, w8, w7 */	\
	vpalignr $8, XTMP2, XTMP3, W4;  /* W4: xx, w12, w11, w10 */	\
	vpshufd $0b11111001, XTMP3, W5; /* W5: xx, w15, w14, w13 */

/*
 * Message scheduling. Note: 3 words per XMM register.
 *
 * SCHED_W_0, SCHED_W_1 and SCHED_W_2 are three consecutive stages of one
 * scheduling step and must be invoked in that order with the same
 * arguments.  w0..w5 are the six window registers, oldest first.  The step
 * overwrites w0 with the three newly expanded words and stores, for rounds
 * round..round+2, the message words and their XOR with the new words to the
 * XW_ stack slots.  Clobbers XTMP0-XTMP6.
 */

/* Stage 0: gather w[i - 16] and w[i - 13], start the P1 argument. */
#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5)			\
	/* Load (w[i - 16]) => XTMP0 */					\
	vpshufd $0b10111111, w0, XTMP0;					\
	vpalignr $12, XTMP0, w1, XTMP0; /* XTMP0: xx, w2, w1, w0 */	\
	/* Load (w[i - 13]) => XTMP1 */					\
	vpshufd $0b10111111, w1, XTMP1;					\
	vpalignr $12, XTMP1, w2, XTMP1;					\
	/* w[i - 9] == w3 */						\
	/* w3 ^ XTMP0 => XTMP0 */					\
	vpxor w3, XTMP0, XTMP0;

/* Stage 1: finish the expansion and write the new words to w0. */
#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5)	\
	/* w[i - 3] == w5 */				\
	/* rol(w5, 15) ^ XTMP0 => XTMP0 */		\
	vpslld $15, w5, XTMP2;				\
	vpsrld $(32-15), w5, XTMP3;			\
	vpxor XTMP2, XTMP3, XTMP3;			\
	vpxor XTMP3, XTMP0, XTMP0;			\
	/* rol(XTMP1, 7) => XTMP1 */			\
	vpslld $7, XTMP1, XTMP5;			\
	vpsrld $(32-7), XTMP1, XTMP1;			\
	vpxor XTMP5, XTMP1, XTMP1;			\
	/* w[i - 6] == w4 */				\
	/* w4 ^ XTMP1 => XTMP1 */			\
	vpxor w4, XTMP1, XTMP1;				\
	/* P1(XTMP0) ^ XTMP1 => w0, with */		\
	/* P1(x) = x ^ rol(x, 15) ^ rol(x, 23) */	\
	vpslld $15, XTMP0, XTMP5;			\
	vpsrld $(32-15), XTMP0, XTMP6;			\
	vpslld $23, XTMP0, XTMP2;			\
	vpsrld $(32-23), XTMP0, XTMP3;			\
	vpxor XTMP0, XTMP1, XTMP1;			\
	vpxor XTMP6, XTMP5, XTMP5;			\
	vpxor XTMP3, XTMP2, XTMP2;			\
	vpxor XTMP2, XTMP5, XTMP5;			\
	vpxor XTMP5, XTMP1, w0;

/* Stage 2: store W and W ^ W' for rounds round..round+2 to the stack. */
#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5)	\
	/* W1 in XTMP4 (last word of w4, then w5) */	\
	vpshufd $0b10111111, w4, XTMP4;			\
	vpalignr $12, XTMP4, w5, XTMP4;			\
	vmovdqa XTMP4, XW_W1_ADDR((round), 0);		\
	/* W1 ^ W2 => XTMP1 */				\
	vpxor w0, XTMP4, XTMP1;				\
	vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * vpshufb control that reverses the byte order inside each 32-bit lane.
 * It is loaded into BSWAP_REG and applied to every input block.
 */
.Lbe32mask:
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f

.text

/*
 * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
 *
 * void sm3_transform_avx(struct sm3_state *state,
 *                        const u8 *data, int nblocks);
 *
 * nblocks must be at least 1: the loop processes a block before it tests
 * the remaining count.
 *
 * Uses %rax, %rcx, %rdx, %rsi, %r8-%r11 and the vector registers as
 * scratch; %rbx and %r12-%r15 are saved and restored.  All vector registers
 * and the on-stack message words are zeroed before returning.
 */
SYM_TYPED_FUNC_START(sm3_transform_avx)
	/* input:
	 *	%rdi: state (eight 32-bit chaining words)
	 *	%rsi: data (64*nblocks bytes)
	 *	%edx: nblocks
	 */
	vzeroupper;

	pushq %rbp;
	movq %rsp, %rbp;

	/*
	 * nblocks is a 32-bit int, so the upper half of %rdx is not defined
	 * by the calling convention.  Zero-extend it, because the loop
	 * counts with the full 64-bit RNBLKS.
	 */
	movl %edx, %edx;

	/* Carve out the scratch frame; 64-byte alignment for vmovdqa. */
	subq $STACK_SIZE, %rsp;
	andq $(~63), %rsp;

	movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
	movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
	movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
	movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
	movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);

	vmovdqa .Lbe32mask(%rip), BSWAP_REG;

	/* Get the values of the chaining variables. */
	movl state_h0(RSTATE), a;
	movl state_h1(RSTATE), b;
	movl state_h2(RSTATE), c;
	movl state_h3(RSTATE), d;
	movl state_h4(RSTATE), e;
	movl state_h5(RSTATE), f;
	movl state_h6(RSTATE), g;
	movl state_h7(RSTATE), h;

.align 16
.Loop:
	/* Load data part1. */
	LOAD_W_XMM_1();

	/* One block consumed; the count is tested at the bottom of the loop. */
	leaq -1(RNBLKS), RNBLKS;

	/* Transform 0-3 + Load data part2. */
	R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
	R1(d, a, b, c, h, e, f, g, 1, 1, IW);
	R1(c, d, a, b, g, h, e, f, 2, 2, IW);
	R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();

	/* Transform 4-7 + Precalc 12-14. */
	R1(a, b, c, d, e, f, g, h, 4, 0, IW);
	R1(d, a, b, c, h, e, f, g, 5, 1, IW);
	R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
	R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);

	/* Transform 8-11 + Precalc 12-17. */
	R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
	R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
	R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
	R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);

	/* Transform 12-14 + Precalc 18-20 */
	R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
	R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
	R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);

	/* Transform 15-17 + Precalc 21-23 */
	R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
	R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
	R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);

	/* Transform 18-20 + Precalc 24-26 */
	R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
	R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
	R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);

	/* Transform 21-23 + Precalc 27-29 */
	R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
	R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);

	/* Transform 24-26 + Precalc 30-32 */
	R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
	R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
	R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);

	/* Transform 27-29 + Precalc 33-35 */
	R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
	R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
	R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);

	/* Transform 30-32 + Precalc 36-38 */
	R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
	R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
	R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);

	/* Transform 33-35 + Precalc 39-41 */
	R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
	R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
	R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);

	/* Transform 36-38 + Precalc 42-44 */
	R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
	R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
	R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);

	/* Transform 39-41 + Precalc 45-47 */
	R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
	R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
	R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);

	/* Transform 42-44 + Precalc 48-50 */
	R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
	R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
	R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);

	/* Transform 45-47 + Precalc 51-53 */
	R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
	R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
	R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);

	/* Transform 48-50 + Precalc 54-56 */
	R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
	R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
	R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);

	/* Transform 51-53 + Precalc 57-59 */
	R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
	R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
	R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);

	/* Transform 54-56 + Precalc 60-62 */
	R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
	R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
	R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);

	/* Transform 57-59 + Precalc 63 */
	R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 58, 1, XW);
	R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);

	/* Transform 60-62 + Precalc 63 */
	R2(a, b, c, d, e, f, g, h, 60, 0, XW);
	R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
	R2(c, d, a, b, g, h, e, f, 62, 2, XW);

	/* Transform 63 */
	R2(b, c, d, a, f, g, h, e, 63, 0, XW);

	/* Update the chaining variables: new state = old state ^ a..h. */
	xorl state_h0(RSTATE), a;
	xorl state_h1(RSTATE), b;
	xorl state_h2(RSTATE), c;
	xorl state_h3(RSTATE), d;
	movl a, state_h0(RSTATE);
	movl b, state_h1(RSTATE);
	movl c, state_h2(RSTATE);
	movl d, state_h3(RSTATE);
	xorl state_h4(RSTATE), e;
	xorl state_h5(RSTATE), f;
	xorl state_h6(RSTATE), g;
	xorl state_h7(RSTATE), h;
	movl e, state_h4(RSTATE);
	movl f, state_h5(RSTATE);
	movl g, state_h6(RSTATE);
	movl h, state_h7(RSTATE);

	/* Any blocks left? a..h already hold the updated state. */
	testq RNBLKS, RNBLKS;
	jne .Loop;

	/* Clear every vector register; %xmm0 is zero from here on. */
	vzeroall;

	movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
	movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
	movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
	movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
	movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;

	/*
	 * Wipe the message words from the stack.  These six 16-byte slots
	 * are the only parts of the W area that were ever written.
	 */
	vmovdqa %xmm0, IW_W1_ADDR(0, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
	vmovdqa %xmm0, IW_W1_ADDR(4, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
	vmovdqa %xmm0, IW_W1_ADDR(8, 0);
	vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);

	movq %rbp, %rsp;
	popq %rbp;
	RET;
SYM_FUNC_END(sm3_transform_avx)