// SPDX-License-Identifier: Apache-2.0
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+128 bytes of shared
# table]. The GHASH function also features a so-called "528B" variant
# utilizing an additional 256+16 bytes of per-key storage [+512 bytes
# of shared table]. Performance results are for this streamed GHASH
# subroutine and are expressed in cycles per processed byte, less is
# better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	the comparison is not completely fair, because the C results
#	are for the vanilla "256B" implementation, while the assembler
#	results are for the "528B" one;-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as Opteron's;

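# As a rough orientation, one step of the 4-bit method in C-like
# pseudocode (a hedged sketch with illustrative names, not this
# module's exact code; Htable holds H*i for every 4-bit value i,
# 16 entries of 16 bytes = the 256-byte per-key table, and the
# shared reduction table is kept at .Lrem_4bit below):
#
#	rem = Z & 0xf;			/* nibble shifted out... */
#	Z >>= 4;			/* ...by the 4-bit shift */
#	Z.hi ^= rem_4bit[rem];		/* fold it back in mod poly */
#	Z ^= Htable[nibble];		/* add H * next input nibble */
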
# May 2010
#
# Added a PCLMULQDQ version performing at 2.02 cycles per processed
# byte. See ghash-x86.pl for background information and details about
# the coding techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of the Intel Open Source Technology
# Centre.

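# For reference, PCLMULQDQ computes one 64x64 -> 128-bit carry-less
# product; in C it is exposed as the _mm_clmulepi64_si128() intrinsic
# from <wmmintrin.h>. A minimal sketch (variable names are ours):
#
#	#include <wmmintrin.h>
#
#	/* imm bits 0 and 4 pick which 64-bit half of each source
#	 * participates; 0x00 multiplies the two low halves. */
#	__m128i prod = _mm_clmulepi64_si128(a, b, 0x00);
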
# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor there. Then why increase it here?
# The critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down aggregated multiplication operations, triplets of independent
# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
# less sense to aggregate more multiplications than it takes to
# perform the remaining non-multiplication operations. 2x is a
# near-optimal coefficient for contemporary Intel CPUs (hence the
# modest improvement coefficients below), but not for Bulldozer: its
# logical SIMD operations are twice as slow as Intel's, so the
# critical path is longer. A CPU with a higher pclmulqdq issue rate
# would also benefit from a higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

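# The "triplets of independent pclmulqdq's" above are Karatsuba's
# decomposition of one 128x128-bit carry-less multiplication into
# three 64x64-bit ones. A hedged intrinsics sketch of a single
# multiplication (names are illustrative, not this module's code):
#
#	__m128i lo  = _mm_clmulepi64_si128(a, b, 0x00);	/* A0*B0 */
#	__m128i hi  = _mm_clmulepi64_si128(a, b, 0x11);	/* A1*B1 */
#	/* 0x4E swaps the 64-bit halves, as pshufd $78 does below */
#	__m128i as  = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4E));
#	__m128i bs  = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4E));
#	__m128i mid = _mm_clmulepi64_si128(as, bs, 0x00);
#	/* (A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0 = A1*B0 ^ A0*B1 */
#	mid = _mm_xor_si128(mid, _mm_xor_si128(lo, hi));
#	lo  = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
#	hi  = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
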
# March 2013
#
# The 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy Bridge and Ivy Bridge can execute it, the code
# performs sub-optimally on them in comparison to the version
# mentioned above. But thanks to Ilya Albrekht and Max Locktyukhin of
# Intel Corp. we know that it performs at 0.41 cycles per byte on a
# Haswell processor, at 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

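# The reduction in both paths folds the 256-bit product back to 128
# bits modulo the GHASH polynomial by multiplying the low half with
# the constant kept at .L0x1c2_polynomial below and rotating, twice
# in a row. A hedged intrinsics sketch of one fold (illustrative
# names; compare the vpclmulqdq $0x10,(%r10),... pairs below):
#
#	/* imm 0x10 selects x's low qword and poly's high qword */
#	__m128i t = _mm_clmulepi64_si128(x, poly, 0x10);
#	x = _mm_shuffle_epi32(x, 0x4E);	/* like vpalignr $8,x,x */
#	x = _mm_xor_si128(x, t);
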
# Generated once from
# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
# and modified for ICP. Modifications are kept to a bare minimum to
# ease later upstream merges.

#if defined(__x86_64__) && defined(HAVE_AVX) && \
    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)

#define _ASM
#include <sys/asm_linkage.h>

.text

/* Windows userland links with OpenSSL */
#if !defined (_WIN32) || defined (_KERNEL)
ENTRY_ALIGN(gcm_gmult_clmul, 16)

.cfi_startproc
	ENDBR

.L_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	.Lbswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197		# pshufb %xmm5,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0	# pclmulqdq $0x00,%xmm2,%xmm0
.byte	102,15,58,68,202,17	# pclmulqdq $0x11,%xmm2,%xmm1
.byte	102,15,58,68,220,0	# pclmulqdq $0x00,%xmm4,%xmm3
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197		# pshufb %xmm5,%xmm0
	movdqu	%xmm0,(%rdi)
	RET
.cfi_endproc
SET_SIZE(gcm_gmult_clmul)
#endif /* !_WIN32 || _KERNEL */

ENTRY_ALIGN(gcm_init_htab_avx, 32)
.cfi_startproc
	ENDBR
	vzeroupper

	vmovdqu	(%rsi),%xmm2
	// KCF/ICP stores H in network byte order with the hi qword first
	// so we need to swap all bytes, not the 2 qwords.
	vmovdqu	.Lbswap_mask(%rip),%xmm4
	vpshufb	%xmm4,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	.L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	.Linit_start_avx
.balign	32
.Linit_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	.Linit_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	RET
.cfi_endproc
SET_SIZE(gcm_init_htab_avx)

#if !defined (_WIN32) || defined (_KERNEL)
ENTRY_ALIGN(gcm_gmult_avx, 32)
.cfi_startproc
	ENDBR
	jmp	.L_gmult_clmul
.cfi_endproc
SET_SIZE(gcm_gmult_avx)

ENTRY_ALIGN(gcm_ghash_avx, 32)
.cfi_startproc
	ENDBR
	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	.L0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	.Lbswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	.Lshort_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	.Ltail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	.Loop8x_avx

.balign	32
.Loop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	.Loop8x_avx

	addq	$0x80,%rcx
	jmp	.Ltail_no_xor_avx

.balign	32
.Lshort_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	.Ltail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	.Ltail_avx

.balign	32
.Ltail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	.Lshort_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	RET
.cfi_endproc
SET_SIZE(gcm_ghash_avx)

#endif /* !_WIN32 || _KERNEL */

SECTION_STATIC
.balign	64
.Lbswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long	7,0,7,0
.L7_mask_poly:
.long	7,0,450,0
.balign	64
SET_OBJ(.Lrem_4bit)
.Lrem_4bit:
.long	0,0,0,471859200,0,943718400,0,610271232
.long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
.long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
.long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
SET_OBJ(.Lrem_8bit)
.Lrem_8bit:
.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.balign	64

/* Mark the stack non-executable. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */