1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 * Implement AES algorithm in Intel AES-NI instructions.
4 *
5 * The white paper of AES-NI instructions can be downloaded from:
6 *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 *
8 * Copyright (C) 2008, Intel Corp.
9 *    Author: Huang Ying <ying.huang@intel.com>
10 *            Vinodh Gopal <vinodh.gopal@intel.com>
11 *            Kahraman Akdemir
12 *
13 * Copyright (c) 2010, Intel Corporation.
14 *
15 * Ported x86_64 version to x86:
16 *    Author: Mathias Krause <minipli@googlemail.com>
17 */
18
19#include <linux/linkage.h>
20#include <linux/objtool.h>
21#include <asm/frame.h>
22
/*
 * SSE register roles.  STATE1..STATE4 hold up to four blocks that are
 * processed in parallel; STATE is simply an alias for the first one.
 */
#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
/* Input blocks; IN3/IN4 are only referenced from x86_64-only code paths. */
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
/* KEY receives the current round key; IV holds the IV, counter or tweak. */
#define KEY	%xmm2
#define IV	%xmm3

/* Counter helpers used by _aesni_inc_init/_aesni_inc (x86_64 only). */
#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

/* XTS tweak multiplication mask; note that it shares %xmm7 with IN2. */
#define GF128MUL_MASK %xmm7

#ifdef __x86_64__
/*
 * 64-bit build: the entry points use their arguments directly in these
 * registers (ctx, dst, src, len, iv => KEYP, OUTP, INP, LEN, IVP).
 * UKEYP aliases OUTP because aesni_set_key takes the user key as its
 * second argument.  T1/T2 are scratch; TKEYP (T1) is clobbered by the
 * _aesni_enc and _aesni_dec helpers.
 */
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
/*
 * 32-bit build: each entry point loads its stack arguments into these
 * registers itself.  OUTP lives in %eax and there is no T2/TCTR_LOW.
 */
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif
67
/*
 * _key_expansion_256a (also reachable as _key_expansion_128): internal ABI
 * input:
 *	%xmm0:	previous round key
 *	%xmm1:	aeskeygenassist result for this round
 *	%xmm4:	scratch, cleared once by aesni_set_key before the first call
 *	TKEYP:	destination of the new round key
 * output:
 *	%xmm0:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	/* fold the shifted copies of the previous key into %xmm0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	/* broadcast dword 3 of the assist value and mix it in */
	pshufd $0b11111111, %xmm1, %xmm1
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)		# emit the round key
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_ALIAS_LOCAL(_key_expansion_128, _key_expansion_256a)
80
/*
 * _key_expansion_192a: internal ABI
 * input:
 *	%xmm0:	previous key words (full register)
 *	%xmm1:	aeskeygenassist result for this round
 *	%xmm2:	remaining key words (loaded with movq by aesni_set_key)
 *	%xmm4:	scratch, cleared once by aesni_set_key before the first call
 *	TKEYP:	destination of the next two round keys
 * output:
 *	%xmm0, %xmm2:	updated key words
 *	two round keys stored at (TKEYP) and 0x10(TKEYP); TKEYP += 0x20
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5, %xmm6
 */
SYM_FUNC_START_LOCAL(_key_expansion_192a)
	/* snapshot the old %xmm2 before anything is derived from it */
	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5

	/* expand %xmm0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0

	/* expand %xmm2 from the new top dword of %xmm0 */
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	/* assemble and store the two 128-bit round keys */
	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)
104
/*
 * _key_expansion_192b: internal ABI
 * input:
 *	%xmm0, %xmm2:	key words left behind by _key_expansion_192a
 *	%xmm1:		aeskeygenassist result for this round
 *	%xmm4:		scratch carried over from the previous expansion step
 *	TKEYP:		destination of the next round key
 * output:
 *	%xmm0, %xmm2:	updated key words
 *	one round key (%xmm0) stored at (TKEYP); TKEYP += 0x10
 * changed:
 *	%xmm1, %xmm3, %xmm4, %xmm5
 */
SYM_FUNC_START_LOCAL(_key_expansion_192b)
	/* snapshot the old %xmm2, shifted up by one dword */
	movaps %xmm2, %xmm5
	pslldq $4, %xmm5

	/* expand %xmm0 */
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of the assist value
	pxor %xmm1, %xmm0

	/* expand %xmm2 from the new top dword of %xmm0 */
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)		# emit the round key
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)
123
/*
 * _key_expansion_256b: internal ABI
 * input:
 *	%xmm2:	round key from two steps back (the half being replaced)
 *	%xmm1:	aeskeygenassist result computed from %xmm0
 *	%xmm4:	scratch carried over from the previous expansion step
 *	TKEYP:	destination of the new round key
 * output:
 *	%xmm2:	new round key, also stored at (TKEYP)
 *	TKEYP:	advanced by 16 bytes
 * changed:
 *	%xmm1, %xmm4
 */
SYM_FUNC_START_LOCAL(_key_expansion_256b)
	/* fold the shifted copies of the old key into %xmm2 */
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	/* broadcast dword 2 of the assist value and mix it in */
	pshufd $0b10101010, %xmm1, %xmm1
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)		# emit the round key
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)
135
/*
 * void aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                    unsigned int key_len)
 *
 * Expands in_key into ctx.  The encryption round keys are written from
 * ctx+0 upwards, key_len is stored at ctx+480, and the decryption
 * schedule is written at ctx+240: it is the encryption schedule in
 * reverse order, with aesimc applied to every key except the first and
 * the last.  Only the low byte of key_len is examined: below 24 selects
 * the 128-bit path, exactly 24 the 192-bit path, anything else 256-bit.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movl %edx, 480(KEYP)		# remember the key length in ctx
	pxor %xmm4, %xmm4		# the _key_expansion_* helpers expect 0 here
	movups (UKEYP), %xmm0		# first 16 bytes of the user key ...
	movaps %xmm0, (KEYP)		# ... are round key 0
	lea 0x10(KEYP), TKEYP		# next round key slot
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192

	/* 256-bit key: the second 16 key bytes are round key 1 */
	movups 0x10(UKEYP), %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	.irp rcon, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20	/* rounds 1-6 */
	aeskeygenassist $\rcon, %xmm2, %xmm1
	call _key_expansion_256a
	aeskeygenassist $\rcon, %xmm0, %xmm1
	call _key_expansion_256b
	.endr
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7 needs only the "a" half
	call _key_expansion_256a
	jmp .Ldec_key

.Lenc_key192:
	/* 192-bit key: 8 more key bytes; the a/b helpers alternate */
	movq 0x10(UKEYP), %xmm2
	aeskeygenassist $0x1, %xmm2, %xmm1
	call _key_expansion_192a		# round 1
	aeskeygenassist $0x2, %xmm2, %xmm1
	call _key_expansion_192b		# round 2
	aeskeygenassist $0x4, %xmm2, %xmm1
	call _key_expansion_192a		# round 3
	aeskeygenassist $0x8, %xmm2, %xmm1
	call _key_expansion_192b		# round 4
	aeskeygenassist $0x10, %xmm2, %xmm1
	call _key_expansion_192a		# round 5
	aeskeygenassist $0x20, %xmm2, %xmm1
	call _key_expansion_192b		# round 6
	aeskeygenassist $0x40, %xmm2, %xmm1
	call _key_expansion_192a		# round 7
	aeskeygenassist $0x80, %xmm2, %xmm1
	call _key_expansion_192b		# round 8
	jmp .Ldec_key

.Lenc_key128:
	/* 128-bit key: ten identical expansion steps, one per round constant */
	.irp rcon, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
	aeskeygenassist $\rcon, %xmm0, %xmm1
	call _key_expansion_128
	.endr

.Ldec_key:
	/*
	 * Build the decryption schedule at ctx+240.  First swap the two end
	 * keys: the first encryption key becomes the last decryption key and
	 * the last encryption key becomes the first decryption key.
	 */
	sub $0x10, TKEYP		# TKEYP -> last encryption round key
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP			# source cursor: second encryption key
	lea 240-16(TKEYP), UKEYP	# destination cursor: second-to-last dec key
.align 4
.Ldec_key_loop:
	/* the source walks upwards, the destination downwards */
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)
249
/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts the single 16-byte block at src into dst using the
 * encryption schedule at the start of ctx.
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# plaintext block (may be unaligned)
	movl 480(KEYP), KLEN		# key length stored by aesni_set_key
	call _aesni_enc1
	movups STATE, (OUTP)		# ciphertext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)
273
/*
 * _aesni_enc1:		internal ABI
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128-, 192- or 256-bit path)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is positioned so that the last ten rounds always sit at offsets
 * -0x20..0x70; the longer key sizes run 2 or 4 extra rounds first.
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags alive for je
	je .Lenc192
	add $0x20, TKEYP
	.irp off, -0x60, -0x50		/* 256-bit only */
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
.align 4
.Lenc192:
	.irp off, -0x40, -0x30		/* 192- and 256-bit */
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
.align 4
.Lenc128:
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE		# last round
	RET
SYM_FUNC_END(_aesni_enc1)
330
/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		pointer to the encryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128-, 192- or 256-bit path)
 *	STATE1..4:	four initial states (input)
 * output:
 *	STATE1..4:	four final states (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same round structure as _aesni_enc1, with every round key applied to
 * four independent blocks.
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags alive for je
	je .L4enc192
	add $0x20, TKEYP
	.irp off, -0x60, -0x50		/* 256-bit only */
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
.L4enc192:
	.irp off, -0x40, -0x30		/* 192- and 256-bit */
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
.L4enc128:
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesenc KEY, STATE1
	aesenc KEY, STATE2
	aesenc KEY, STATE3
	aesenc KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE1		# last round
	aesenclast KEY, STATE2
	aesenclast KEY, STATE3
	aesenclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_enc4)
438
/*
 * void aesni_dec(const void *ctx, u8 *dst, const u8 *src)
 *
 * Decrypts the single 16-byte block at src into dst using the
 * decryption schedule that aesni_set_key placed at ctx+240.
 */
SYM_FUNC_START(aesni_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movups (INP), STATE		# ciphertext block (may be unaligned)
	mov 480(KEYP), KLEN		# key length, read before KEYP moves
	add $240, KEYP			# switch to the decryption schedule
	call _aesni_dec1
	movups STATE, (OUTP)		# plaintext block
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_dec)
463
/*
 * _aesni_dec1:		internal ABI
 * input:
 *	KEYP:		pointer to the decryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128-, 192- or 256-bit path)
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * TKEYP is positioned so that the last ten rounds always sit at offsets
 * -0x20..0x70; the longer key sizes run 2 or 4 extra rounds first.
 */
SYM_FUNC_START_LOCAL(_aesni_dec1)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE			# round 0
	cmp $24, KLEN
	jb .Ldec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags alive for je
	je .Ldec192
	add $0x20, TKEYP
	.irp off, -0x60, -0x50		/* 256-bit only */
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
.align 4
.Ldec192:
	.irp off, -0x40, -0x30		/* 192- and 256-bit */
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
.align 4
.Ldec128:
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE
	.endr
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE		# last round
	RET
SYM_FUNC_END(_aesni_dec1)
520
/*
 * _aesni_dec4:	internal ABI
 * input:
 *	KEYP:		pointer to the decryption round keys
 *	KLEN:		key length in bytes (compared with 24 to pick the
 *			128-, 192- or 256-bit path)
 *	STATE1..4:	four initial states (input)
 * output:
 *	STATE1..4:	four final states (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 *
 * Same round structure as _aesni_dec1, with every round key applied to
 * four independent blocks.
 */
SYM_FUNC_START_LOCAL(_aesni_dec4)
	movaps (KEYP), KEY		# round key 0
	lea 0x30(KEYP), TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	cmp $24, KLEN
	jb .L4dec128
	lea 0x20(TKEYP), TKEYP		# lea keeps the cmp flags alive for je
	je .L4dec192
	add $0x20, TKEYP
	.irp off, -0x60, -0x50		/* 256-bit only */
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec192:
	.irp off, -0x40, -0x30		/* 192- and 256-bit */
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
.align 4
.L4dec128:
	.irp off, -0x20, -0x10, 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60
	movaps \off(TKEYP), KEY
	aesdec KEY, STATE1
	aesdec KEY, STATE2
	aesdec KEY, STATE3
	aesdec KEY, STATE4
	.endr
	movaps 0x70(TKEYP), KEY
	aesdeclast KEY, STATE1		# last round
	aesdeclast KEY, STATE2
	aesdeclast KEY, STATE3
	aesdeclast KEY, STATE4
	RET
SYM_FUNC_END(_aesni_dec4)
628
/*
 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len)
 *
 * ECB-encrypts every complete 16-byte block of src into dst.  Blocks
 * are handled four at a time while at least 64 bytes remain, then one
 * at a time.  A trailing partial block (len % 16) is ignored.
 *
 * len is unsigned, so every length test uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN		# check length
	jz .Lecb_enc_ret
	mov 480(KEYP), KLEN
	cmp $16, LEN
	jb .Lecb_enc_ret
	cmp $64, LEN
	jb .Lecb_enc_loop1
.align 4
.Lecb_enc_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_enc4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_enc_loop4	# unsigned: at least 4 more blocks
	cmp $16, LEN
	jb .Lecb_enc_ret
.align 4
.Lecb_enc_loop1:
	movups (INP), STATE1
	call _aesni_enc1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_enc_loop1	# unsigned: at least 1 more block
.Lecb_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_enc)
688
/*
 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len);
 *
 * ECB-decrypts every complete 16-byte block of src into dst using the
 * decryption schedule at ctx+240.  Blocks are handled four at a time
 * while at least 64 bytes remain, then one at a time.  A trailing
 * partial block (len % 16) is ignored.
 *
 * len is unsigned, so every length test uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ecb_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+24)(%esp), INP	# src
	movl (FRAME_OFFSET+28)(%esp), LEN	# len
#endif
	test LEN, LEN
	jz .Lecb_dec_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# switch to the decryption schedule
	cmp $16, LEN
	jb .Lecb_dec_ret
	cmp $64, LEN
	jb .Lecb_dec_loop1
.align 4
.Lecb_dec_loop4:
	movups (INP), STATE1
	movups 0x10(INP), STATE2
	movups 0x20(INP), STATE3
	movups 0x30(INP), STATE4
	call _aesni_dec4
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lecb_dec_loop4	# unsigned: at least 4 more blocks
	cmp $16, LEN
	jb .Lecb_dec_ret
.align 4
.Lecb_dec_loop1:
	movups (INP), STATE1
	call _aesni_dec1
	movups STATE1, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lecb_dec_loop1	# unsigned: at least 1 more block
.Lecb_dec_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_ecb_dec)
749
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypts every complete 16-byte block of src into dst.  The
 * chaining value is read from iv and the last ciphertext block is
 * written back to iv on return.  Nothing is done, and iv is left
 * untouched, when len < 16.
 *
 * len is unsigned, so every length test uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN
	movups (IVP), STATE	# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN	# load input
	pxor IN, STATE		# chain: plaintext ^ previous ciphertext
	call _aesni_enc1
	movups STATE, (OUTP)	# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop	# unsigned: at least 1 more block
	movups STATE, (IVP)	# hand the last ciphertext block back as iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_enc)
793
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypts every complete 16-byte block of src into dst using the
 * decryption schedule at ctx+240.  Blocks are handled four at a time
 * while at least 64 bytes remain, then one at a time.  The last
 * ciphertext block consumed is written back to iv.  Nothing is done,
 * and iv is left untouched, when len < 16.
 *
 * len is unsigned, so every length test uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN
	add $240, KEYP		# switch to the decryption schedule
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/* only two IN registers here: reuse them for blocks 3 and 4 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	/* ciphertext blocks 1 and 2 were overwritten above: reload them */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4	# unsigned: at least 4 more blocks
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV		# this ciphertext block chains into the next
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1	# unsigned: at least 1 more block
.Lcbc_dec_ret:
	movups IV, (IVP)
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)
886
/*
 * void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Ciphertext-stealing tail of CBC encryption.  Reads the 16 bytes at
 * src and the 16 bytes ending at src+len, and writes 16 bytes at
 * dst+len-16 followed by 16 bytes at dst.
 * NOTE(review): this presumes 16 < len <= 32 - confirm against the caller.
 * iv is read but not written back.
 */
SYM_FUNC_START(aesni_cts_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), STATE		# IVP is recycled as a table pointer below
	mov 480(KEYP), KLEN
	sub $16, LEN			# LEN = size of the final block
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + LEN
	sub LEN, IVP			# IVP = table + 32 - LEN
	movups (T1), %xmm4		# mask: move the head of a block to its tail
	movups (IVP), %xmm5		# mask: move the tail to the head, zero the rest

	movups (INP), IN1		# first block
	add LEN, INP
	movups (INP), IN2		# last 16 input bytes (overlaps the first block)

	pxor IN1, STATE
	call _aesni_enc1		# clobbers KEY and T1 only

	pshufb %xmm5, IN2		# final block, zero padded to 16 bytes
	pxor STATE, IN2			# chain with the first ciphertext block
	pshufb %xmm4, STATE
	add OUTP, LEN			# LEN = dst + size of the final block
	movups STATE, (LEN)		# stolen ciphertext head lands past dst+16

	movaps IN2, STATE
	call _aesni_enc1
	movups STATE, (OUTP)		# overwrites the zero bytes stored above

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_enc)
943
/*
 * void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
 *			  size_t len, u8 *iv)
 *
 * Ciphertext-stealing tail of CBC decryption, using the decryption
 * schedule at ctx+240.  Reads the 16 bytes at src and the 16 bytes
 * ending at src+len, and writes 16 bytes at dst+len-16 followed by
 * 16 bytes at dst.
 * NOTE(review): this presumes 16 < len <= 32 - confirm against the caller.
 * iv is read but not written back.
 */
SYM_FUNC_START(aesni_cts_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	movups (IVP), IV		# IVP is recycled as a table pointer below
	mov 480(KEYP), KLEN
	add $240, KEYP			# switch to the decryption schedule
	sub $16, LEN			# LEN = size of the final block
	lea 32(T1), IVP
	add LEN, T1			# T1  = table + LEN
	sub LEN, IVP			# IVP = table + 32 - LEN
	movups (T1), %xmm4		# mask: move the head of a block to its tail

	movups (INP), STATE		# first ciphertext block
	add LEN, INP
	movups (INP), IN1		# last 16 input bytes (overlaps the first block)

	call _aesni_dec1		# clobbers KEY and T1 only
	movaps STATE, IN2		# keep the whole decrypted block for the blend
	pshufb %xmm4, STATE
	pxor IN1, STATE

	add OUTP, LEN			# LEN = dst + size of the final block
	movups STATE, (LEN)		# final plaintext bytes land past dst+16

	movups (IVP), %xmm0		# pblendvb takes its mask implicitly from %xmm0
	pshufb %xmm0, IN1		# final ciphertext bytes moved to the head
	pblendvb IN2, IN1		# remaining bytes come from the decrypted block
	movaps IN1, STATE
	call _aesni_dec1

	pxor IV, STATE
	movups STATE, (OUTP)		# overwrites the scratch bytes stored above

#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cts_cbc_dec)
1004
.pushsection .rodata
.align 16
/*
 * pshufb masks for ciphertext stealing: the identity permutation with 16
 * bytes of 0x80 (which pshufb turns into zero bytes) on either side.  A
 * 16-byte load at offset n, or at offset 32 - n, gives a mask that moves
 * n bytes from one end of a block to the other and zeroes the rest.
 */
.Lcts_permute_table:
	.fill		16, 1, 0x80
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.fill		16, 1, 0x80
#ifdef __x86_64__
/* pshufb mask that reverses the 16 bytes of a register */
.Lbswap_mask:
	.byte		0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte		0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
#endif
.popsection
1019
1020#ifdef __x86_64__
/*
 * _aesni_inc_init:	internal ABI
 *	prepare the registers that _aesni_inc works on
 * input:
 *	IV:		counter block, big endian
 * output:
 *	CTR:		IV byte-swapped to little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		the constant 1 in the low qword, upper qword zero
 *	BSWAP_MASK:	byte-reversal mask for pshufb
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1; movq clears the upper qword
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# big endian -> little endian
	movq CTR, TCTR_LOW		# shadow copy used for carry detection
	RET
SYM_FUNC_END(_aesni_inc_init)
1041
/*
 * _aesni_inc:		internal ABI
 *	Add 1 to the big-endian counter held in IV
 * input:
 *	CTR:		current counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *	INC:		1, little endian
 *	BSWAP_MASK:	byte-reversal mask
 * output:
 *	IV:		incremented counter, big endian
 * changed:
 *	CTR:		incremented counter, little endian
 *	TCTR_LOW:	low qword of CTR
 *
 * paddq cannot carry between qwords, so the general-purpose shadow of
 * the low qword is incremented alongside it and its carry flag decides
 * whether the high qword needs bumping as well.
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	add $1, TCTR_LOW		# CF set when the low qword wraps
	paddq INC, CTR			# SSE add, leaves the flags alone
	jnc .Linc_low
	pslldq $8, INC			# move the 1 into the high qword ...
	paddq INC, CTR
	psrldq $8, INC			# ... and put it back afterwards
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	RET
SYM_FUNC_END(_aesni_inc)
1069
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode en/decryption of every complete 16-byte block of src into
 * dst (x86_64 only).  iv holds the big-endian counter block; it is
 * incremented once per block and the next unused value is written back.
 * Nothing is done, and iv is left untouched, when len < 16.
 *
 * len is unsigned, so every length test uses the unsigned condition
 * codes (jb/jae).
 */
SYM_FUNC_START(aesni_ctr_enc)
	ANNOTATE_NOENDBR
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* four consecutive counter values become the four cipher inputs */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	/* keystream ^ input */
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4	# unsigned: at least 4 more blocks
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1	# unsigned: at least 1 more block
.Lctr_enc_ret:
	movups IV, (IVP)	# next unused counter value
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)
1133
1134#endif
1135
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
/* low qword 0x87, high qword 0x01: see _aesni_gf128mul_x_ble */
.Lgf128mul_x_ble_mask:
	.quad 0x0000000000000087, 0x0000000000000001
.previous
1141
/*
 * _aesni_gf128mul_x_ble: Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	KEY:	== temporary value
 *
 * paddq doubles the two qwords separately, so the bits shifted out are
 * reconstructed by hand: pshufd places IV dword 3 above the 0x87 mask
 * and IV dword 1 above the 0x01 mask, psrad spreads each sign bit over
 * its dword, and pand keeps 0x87 (top bit of the high qword was set)
 * and/or 0x01 (top bit of the low qword was set) for the final xor.
 */
.macro _aesni_gf128mul_x_ble
	pshufd $0x13, IV, KEY		# sample the sign dwords before IV changes
	psrad $31, KEY
	pand GF128MUL_MASK, KEY
	paddq IV, IV			# shift each qword left by one bit
	pxor KEY, IV
.endm
1159
/*
 * _aesni_xts_crypt: body shared by aesni_xts_enc (enc = 1) and
 * aesni_xts_dec (enc = 0).
 *
 * The tweak is read from (IVP) and advanced with _aesni_gf128mul_x_ble
 * after every block.  Blocks are processed four at a time while at least
 * 64 bytes remain, then one at a time.  When the data ends on a block
 * boundary the next tweak is written back to (IVP).  A trailing partial
 * block is handled with ciphertext stealing; that path recycles IVP as a
 * table pointer and returns without writing the tweak back.
 */
.macro	_aesni_xts_crypt	enc
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
#else
	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
#endif
	movups (IVP), IV

	mov 480(KEYP), KLEN
.if !\enc
	add $240, KEYP			/* switch to the decryption schedule */

	/*
	 * Decryption with a partial final block: hold back one full block
	 * so that it is handled together with the partial one.
	 */
	test $15, LEN
	jz .Lxts_loop4\@
	sub $16, LEN
.endif

.Lxts_loop4\@:
	sub $64, LEN
	jl .Lxts_1x\@			/* fewer than four blocks left */

	/*
	 * For each of the four blocks: STATEn = input ^ tweak.  The tweak is
	 * parked in the destination buffer so that it can be fetched again
	 * after the cipher call.
	 */
	movdqa IV, STATE1
	movdqu 0x00(INP), IN
	pxor IN, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE2
	movdqu 0x10(INP), IN
	pxor IN, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE3
	movdqu 0x20(INP), IN
	pxor IN, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble
	movdqa IV, STATE4
	movdqu 0x30(INP), IN
	pxor IN, STATE4
	movdqu IV, 0x30(OUTP)

.if \enc
	call _aesni_enc4
.else
	call _aesni_dec4
.endif

	/* fetch each parked tweak, apply it, and store the real output */
	movdqu 0x00(OUTP), IN
	pxor IN, STATE1
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), IN
	pxor IN, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), IN
	pxor IN, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), IN
	pxor IN, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble		/* tweak for the block after these four */

	add $64, INP
	add $64, OUTP
	test LEN, LEN
	jnz .Lxts_loop4\@

.Lxts_ret_iv\@:
	movups IV, (IVP)		/* hand the next tweak back to the caller */

.Lxts_ret\@:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET

.Lxts_1x\@:
	add $64, LEN			/* undo the subtraction done in the 4x loop */
	jz .Lxts_ret_iv\@
.if \enc
	/*
	 * Fewer than 16 bytes left: steal from the last block produced by
	 * the 4x loop, which is still in STATE4.
	 */
	sub $16, LEN
	jl .Lxts_cts4\@
.endif

.Lxts_loop1\@:
	movdqu (INP), STATE
.if \enc
	pxor IV, STATE
	call _aesni_enc1
.else
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			/* this was the held-back block */
	pxor IV, STATE
	call _aesni_dec1
.endif
	pxor IV, STATE
	_aesni_gf128mul_x_ble

	test LEN, LEN
	jz .Lxts_out\@

.if \enc
	add $16, INP
	sub $16, LEN
	jl .Lxts_cts1\@			/* only a partial block follows */
.endif

	movdqu STATE, (OUTP)
	add $16, OUTP
	jmp .Lxts_loop1\@

.Lxts_out\@:
	movdqu STATE, (OUTP)
	jmp .Lxts_ret_iv\@

.if \enc
.Lxts_cts4\@:
	movdqa STATE4, STATE		/* last block output by the 4x loop */
	sub $16, OUTP			/* step back to where it was stored */
.Lxts_cts1\@:
.else
.Lxts_cts1\@:
	/*
	 * The held-back full block is decrypted with the tweak after the
	 * current one; the current tweak is kept in STATE4 for the final
	 * combined block.
	 */
	movdqa IV, STATE4
	_aesni_gf128mul_x_ble

	pxor IV, STATE
	call _aesni_dec1
	pxor IV, STATE
.endif
	/*
	 * Ciphertext stealing.  STATE holds the processed last full block
	 * and LEN holds (size of the partial block) - 16.
	 */
#ifndef __x86_64__
	lea .Lcts_permute_table, T1
#else
	lea .Lcts_permute_table(%rip), T1
#endif
	add LEN, INP		/* rewind input pointer */
	add $16, LEN		/* # bytes in final block */
	movups (INP), IN1	/* the 16 bytes that end at the end of the input */

	mov T1, IVP		/* IVP is recycled as a second table pointer */
	add $32, IVP
	add LEN, T1		/* T1  = table + # bytes */
	sub LEN, IVP		/* IVP = table + 32 - # bytes */
	add OUTP, LEN		/* LEN = OUTP + # bytes */

	movups (T1), %xmm4
	movaps STATE, IN2	/* keep the whole block; %xmm0 is reloaded below */
	pshufb %xmm4, STATE	/* head of the block moved to the tail, rest zero */
	movups STATE, (LEN)	/* partial output block lands past OUTP+16 */

	movups (IVP), %xmm0	/* pblendvb takes its mask implicitly from %xmm0 */
	pshufb %xmm0, IN1	/* partial input bytes moved to the head */
	pblendvb IN2, IN1	/* remaining bytes come from the saved block */
	movaps IN1, STATE

.if \enc
	pxor IV, STATE
	call _aesni_enc1
	pxor IV, STATE
.else
	pxor STATE4, STATE
	call _aesni_dec1
	pxor STATE4, STATE
.endif

	movups STATE, (OUTP)	/* full block; overwrites the zero bytes stored above */
	jmp .Lxts_ret\@		/* IVP no longer points at iv, so skip the store */
.endm
1348
/*
 * void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS encryption entry point: the whole body is the _aesni_xts_crypt
 * macro expanded with enc = 1.
 */
SYM_FUNC_START(aesni_xts_enc)
	_aesni_xts_crypt	1
SYM_FUNC_END(aesni_xts_enc)
1356
/*
 * void aesni_xts_dec(const struct crypto_aes_ctx *ctx, u8 *dst,
 *		      const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS decryption entry point: the whole body is the _aesni_xts_crypt
 * macro expanded with enc = 0, which selects the decryption schedule
 * at ctx+240.
 */
SYM_FUNC_START(aesni_xts_dec)
	_aesni_xts_crypt	0
SYM_FUNC_END(aesni_xts_dec)
1364