Lines matching "64-bit":
/* SPDX-License-Identifier: GPL-2.0-or-later */
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
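Apart from the byte-reflection mask, each 16-byte entry at these offsets holds a pair of fold constants: powers of x reduced modulo the CRC's generator polynomial G, one pair per fold distance. As a rough illustration of the kind of value involved (this is not the kernel's constant generator, and the real table also folds in the bit-order and alignment adjustments described further down), x^k mod G for an msb-first CRC-n could be computed like this:

#include <stdint.h>

/*
 * Illustrative sketch only: compute x^k mod G for an msb-first CRC-n
 * with n <= 64.  'g' is G with its x^n term dropped (the usual n-bit
 * representation of the generator polynomial).
 */
static uint64_t xpow_mod_g(unsigned int k, uint64_t g, unsigned int n)
{
	uint64_t mask = (n < 64) ? (1ULL << n) - 1 : ~0ULL;
	uint64_t r = 1;				/* the polynomial x^0 */

	while (k--) {
		unsigned int carry = (r >> (n - 1)) & 1;

		r = (r << 1) & mask;		/* multiply by x */
		if (carry)
			r ^= g;			/* cancel the new x^n term */
	}
	return r;
}

For a fold distance of D bits, the two constants are roughly x^(D+64) mod G and x^D mod G (with small bit-order-dependent exponent offsets), so the FOLD_ACROSS_512_BITS entry, for example, corresponds to k values around 512.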
// corresponding non-VEX instruction plus any needed moves. The supported
// - Two-arg [src, dst], where the non-VEX format is the same.
// - Three-arg [src1, src2, dst] where the non-VEX format is
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
.ifnb \arg3 // Three-arg [src1, src2, dst]
.else // Two-arg [src, dst]
// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
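In intrinsics terms, the per-lane byte reflection that \bswap_mask provides for msb-first CRCs is a single pshufb with descending byte indices. A minimal sketch for one 128-bit lane (the mask constant is spelled out here rather than loaded from the constants table):

#include <immintrin.h>

/* Reverse the 16 bytes of one 128-bit lane, as msb-first CRCs require. */
static __m128i bswap_lane(__m128i data)
{
	const __m128i bswap_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
						8, 9, 10, 11, 12, 13, 14, 15);

	return _mm_shuffle_epi8(data, bswap_mask);
}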
.if \vl < 64
.if \vl < 64
.if \vl < 64
// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
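As a rough C-intrinsics model (not the kernel code itself): _mm_clmulepi64_si128 is the pclmulqdq intrinsic, and its imm8 selects one qword from each source, which is what \src1_terms and \src2_terms control here; for the intrinsic, imm8 bit 0 picks the low (0) or high (1) qword of the first operand and bit 4 does the same for the second. Two such multiplications plus an XOR form the basic fold step used throughout this template. Which power of x each 64-bit constant represents (roughly x^(D+64) mod G and x^D mod G for a fold distance of D bits, adjusted for bit order) and which qword of the constant register it sits in are properties of the constants table that this sketch simply assumes:

#include <immintrin.h>		/* needs a compiler flag such as -mpclmul */

/*
 * Fold 'acc' forward across D bits of message into 'data': each 64-bit
 * half of 'acc' is carry-lessly multiplied by its fold constant, and both
 * 128-bit products are XORed into 'data'.  Assumes the constant for acc's
 * low qword sits in the low qword of 'consts' and likewise for the high
 * qword; the real table layout also encodes the msb/lsb-first adjustments.
 */
static __m128i fold_vec128(__m128i acc, __m128i data, __m128i consts)
{
	__m128i lo = _mm_clmulepi64_si128(acc, consts, 0x00); /* acc.lo * consts.lo */
	__m128i hi = _mm_clmulepi64_si128(acc, consts, 0x11); /* acc.hi * consts.hi */

	return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
}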
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// into all 128-bit lanes of the vector register CONSTS.
_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// if the CRC processes the most significant bit of each byte first, i.e. maps
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
// If \vl == 64 && \avx_level == 512, the generated code requires:
.elseif VL == 64
// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
// guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
// 32-bit support, assuming -mregparm=3 and not including support for
// CRC-64 (which would use both eax and edx to pass the crc parameter).
// Define aliases for some local variables. V0-V5 are used without
// within the first 8 vector registers keeps the code 32-bit SSE
// compatible and reduces the size of 64-bit SSE code slightly.
// Create a 128-bit vector that contains the initial CRC in the end
// representing the high-order polynomial coefficients, and the rest 0.
// If the CRC is msb-first, also load the byte-reflection table.
_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
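A minimal intrinsics sketch of this step for an msb-first CRC-32: move the initial CRC into a vector register, then shift it to the byte positions that hold the high-order coefficients, which is what the pslldq above does (lsb-first CRCs leave it at the low end and skip the shift):

#include <stdint.h>
#include <immintrin.h>

/* Place a 32-bit initial CRC at the high-order end of a 128-bit lane. */
static __m128i init_crc_vec_msb32(uint32_t init_crc)
{
	__m128i v = _mm_cvtsi32_si128((int)init_crc);	/* CRC in bits 0..31 */

	return _mm_slli_si128(v, (128 - 32) / 8);	/* byte-shift left by 12 */
}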
// appropriate end of the first 128-bit lane of data. If LEN < VL, then
.if VL == 64
cmp $4*VL-1, LEN
cmp $2*VL-1, LEN32
sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
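// (When VL=32, 4*VL is 128: +128 would need a 32-bit immediate, while -128
// still fits in a sign-extended 8-bit immediate, so the negated forms encode
// more compactly.)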
// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
// 4 vectors of data and write the result back to V0-V3.
cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
sub $-4*VL, BUF
add $-4*VL, LEN
cmp $4*VL-1, LEN
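For orientation, a hedged C sketch of the same loop structure for VL = 16 (plain PCLMULQDQ), reusing fold_vec128() from the earlier sketch. The 'fold_512_consts' argument is assumed to hold the fold-across-512-bits constants, since each accumulator is folded 4*VL = 64 bytes (512 bits) forward per iteration:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static __m128i fold_vec128(__m128i acc, __m128i data, __m128i consts);

/*
 * v[0..3] were initialized from the first 64 bytes of data (with the
 * initial CRC XORed into the appropriate end) and *pp points just past
 * those bytes, with *plen holding the remaining length.
 */
static void fold_main_loop(const uint8_t **pp, size_t *plen,
			   __m128i v[4], __m128i fold_512_consts)
{
	const uint8_t *p = *pp;
	size_t len = *plen;

	while (len >= 4 * 16) {
		v[0] = fold_vec128(v[0],
				   _mm_loadu_si128((const __m128i *)(p + 0)),
				   fold_512_consts);
		v[1] = fold_vec128(v[1],
				   _mm_loadu_si128((const __m128i *)(p + 16)),
				   fold_512_consts);
		v[2] = fold_vec128(v[2],
				   _mm_loadu_si128((const __m128i *)(p + 32)),
				   fold_512_consts);
		v[3] = fold_vec128(v[3],
				   _mm_loadu_si128((const __m128i *)(p + 48)),
				   fold_512_consts);
		p += 64;
		len -= 64;
	}
	*pp = p;
	*plen = len;
}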
sub $-2*VL, BUF
.if VL == 64
// Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
// Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
_load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by LEN bytes.
// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
// i.e. left-shift by LEN bytes.
// C1 = floor(A / x^(128 - 8*LEN))
// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
// i.e. left-shift by 16-LEN bytes.
// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by 16-LEN bytes.
// bytes (reflected if msb-first). The blend mask is the shuffle table
vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
// Fold C1 into C2 and store the 128-bit result in %xmm0.
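A hedged C-intrinsics sketch of this computation for the lsb-first case (msb-first additionally byte-reflects the loaded data). The 48-byte shuffle-table layout used below is inferred from the index vectors described in the comments above, and 'buf' is assumed to point at the len (1 to 15) remaining bytes with at least 16 - len already-processed bytes before it, matching the -16(BUF,LEN) load:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

/*
 * Compute C1 = floor(A / x^(128 - 8*len)) and
 *         C2 = (A*x^(8*len) + B) mod x^128
 * for an lsb-first CRC, where B is the polynomial of the final 'len'
 * data bytes.  C1 is subsequently folded into C2 across 128 bits (two
 * carry-less multiplies plus an XOR, as in fold_vec128() above).
 */
static void partial_block_sketch(__m128i A, const uint8_t *buf, size_t len,
				 __m128i *C1, __m128i *C2)
{
	/* Assumed layout: 16 bytes of 0xff, then 0..15, then 16 more 0xff. */
	static const uint8_t shuf_table[48] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	};
	/* Last 16 bytes of the message; the first 16-len of them were
	 * already folded into A and are masked out by the blend below. */
	__m128i last16 = _mm_loadu_si128((const __m128i *)(buf + len - 16));
	/* [len, ..., 15, -1 x len]: right-shift A by len bytes. */
	__m128i rshuf = _mm_loadu_si128((const __m128i *)&shuf_table[16 + len]);
	/* [-1 x (16-len), 0, ..., len-1]: left-shift A by 16-len bytes. */
	__m128i lshuf = _mm_loadu_si128((const __m128i *)&shuf_table[len]);

	*C1 = _mm_shuffle_epi8(A, lshuf);
	/* A*x^(8*len) mod x^128, then blend in the final len data bytes;
	 * they land exactly where the shuffle produced zeros, since those
	 * mask bytes have the high bit set and so select the second source. */
	*C2 = _mm_blendv_epi8(_mm_shuffle_epi8(A, rshuf), last16, rshuf);
}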
// Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
// t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
// x^n * (%xmm0 mod x^64)
// Store t0 * x^(64-n) in %xmm0. I.e., actually do:
// %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
// x^64 * (%xmm0 mod x^64)
// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
// select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
// each pclmulqdq when using lsb-first order), is identical to the
_cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
_cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
// Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
// The '* x' makes it so the result is floor(t1 / x^64) rather than
// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
// can be extracted much more easily in the next step. In the lsb-first
// case the '* x' happens implicitly. In the msb-first case it must be
// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
// the multiplication by the x^64 term is handled using a pxor. The
// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
_cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
// crc := t0 - (G * floor(t0 / G))
// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
// crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
// Furthermore, since the resulting CRC is n-bit, if mod x^n is
// multiplier in 64 bits in most cases. This gives the following:
// %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
// crc := (%xmm0 / x^(64-n)) mod x^n
// In the lsb-first case, each pclmulqdq implicitly introduces
// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
// For lsb-first CRCs where n=64, the extra factor of x cannot be as
// easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
.if LSB_CRC && \n == 64
_cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
.else // \n == 64 && !LSB_CRC
_cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
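To make the reduction above concrete, here is a hedged scalar C model for an msb-first CRC-32 (n = 32). It follows the same sequence of steps but skips the x^(64-n) alignment trick, the '* x' adjustment and all lsb-first details, so the constants it takes are the plain quantities named in the comments; the helper and parameter names are illustrative, not the kernel's:

#include <stdint.h>

/* Scalar 64x64 -> 128 carry-less multiply: what one pclmulqdq computes. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

/*
 * Compute A * x^32 mod G for an msb-first CRC-32, where A is the 128-bit
 * polynomial a_hi*x^64 + a_lo (bit i of each word = coefficient of x^i).
 *
 *   x96_mod_g = x^(64+n) mod G            (n = 32)
 *   x95_div_g = floor(x^(63+n) / G)
 *   g_low     = G - x^n (the usual 32-bit representation of G)
 */
static uint32_t crc32_barrett_msb(uint64_t a_hi, uint64_t a_lo,
				  uint64_t x96_mod_g, uint64_t x95_div_g,
				  uint32_t g_low)
{
	uint64_t hi, lo, t0_hi, t0_lo, t0_div_xn, q;

	/* t0 := (x^(64+n) mod G) * floor(A / x^64) + x^n * (A mod x^64).
	 * t0 is congruent to A*x^n mod G and has degree < 64 + n. */
	clmul64(a_hi, x96_mod_g, &hi, &lo);
	t0_lo = lo ^ (a_lo << 32);
	t0_hi = hi ^ (a_lo >> 32);

	/* q := floor(t0 / G), computed as
	 * floor(floor(t0 / x^n) * floor(x^(63+n) / G) / x^63). */
	t0_div_xn = (t0_hi << 32) | (t0_lo >> 32);
	clmul64(t0_div_xn, x95_div_g, &hi, &lo);
	q = (hi << 1) | (lo >> 63);

	/* crc := t0 - q*G.  Only the low n bits matter, so G's x^n term
	 * can be dropped: crc = (t0 mod x^n) ^ ((q * (G - x^n)) mod x^n). */
	clmul64(q, g_low, &hi, &lo);
	return (uint32_t)(t0_lo ^ lo);
}

The vector code above computes the same three carry-less products; the byte shifts and the extra pxor only deal with where the intermediate terms physically sit and with a constant that would not otherwise fit in 64 bits.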
_crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \