Lines Matching +full:64 +full:bit

76 // Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
89 // is msb-first, use \bswap_mask to reflect the bytes within each 128-bit lane.
91 .if \vl < 64
103 .if \vl < 64
110 .if \vl < 64
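For msb-first CRCs, the \bswap_mask mentioned a few lines up is used to reflect the data bytes within each 128-bit lane before the carry-less multiplications. Below is a minimal user-space sketch of that byte reversal using the pshufb intrinsic; the shuffle mask is the generic byte-reverse control written from scratch for illustration, not copied from the kernel's constant.

```c
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8; compile with -mssse3 */

int main(void)
{
    /* Shuffle-control byte j selects source byte 15 - j, i.e. a full byte
     * reversal of the 128-bit lane (_mm_set_epi8 lists bytes 15 down to 0). */
    const __m128i bswap_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                            8, 9, 10, 11, 12, 13, 14, 15);
    uint8_t in[16], out[16];

    for (int i = 0; i < 16; i++)
        in[i] = (uint8_t)i;

    __m128i v = _mm_loadu_si128((const __m128i *)in);
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(v, bswap_mask));

    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]);           /* prints 15 14 13 ... 1 0 */
    printf("\n");
    return 0;
}
```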
118 // The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
122 // The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
126 // Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
127 // \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
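The LO64_TERMS / HI64_TERMS selection described above maps onto the immediate byte of a pclmulqdq instruction, which picks one 64-bit half of each source operand and multiplies them carry-lessly. A small sketch of that behaviour through the C intrinsic (compile with -mpclmul); the input values are arbitrary and chosen only to make the carry-less result easy to read.

```c
#include <stdio.h>
#include <stdint.h>
#include <wmmintrin.h>   /* PCLMULQDQ intrinsic */

int main(void)
{
    /* The immediate selects a qword of each source: bit 0 for one operand,
     * bit 4 for the other. */
    __m128i a = _mm_set_epi64x(0x3 /* hi */, 0x5 /* lo */);
    __m128i b = _mm_set_epi64x(0x7 /* hi */, 0x9 /* lo */);
    uint64_t r[2];

    /* imm 0x00: low qword * low qword.  Carry-less 101b * 1001b = 101101b. */
    _mm_storeu_si128((__m128i *)r, _mm_clmulepi64_si128(a, b, 0x00));
    printf("lo*lo = 0x%llx\n", (unsigned long long)r[0]);   /* 0x2d */

    /* imm 0x11: high qword * high qword.  Carry-less 11b * 111b = 1001b,
     * i.e. 9 rather than the integer product 21, since additions are XORs. */
    _mm_storeu_si128((__m128i *)r, _mm_clmulepi64_si128(a, b, 0x11));
    printf("hi*hi = 0x%llx\n", (unsigned long long)r[0]);   /* 0x9 */
    return 0;
}
```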
164 // into all 128-bit lanes of the vector register CONSTS.
193 // 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
196 // \lsb_crc is 1 if the CRC processes the least significant bit of each byte
198 // if the CRC processes the most significant bit of each byte first, i.e. maps
201 // \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
212 // If \vl == 64 && \avx_level == 512, the generated code requires:
229 .elseif VL == 64
238 // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
252 // 32-bit support, assuming -mregparm=3 and not including support for
253 // CRC-64 (which would use both eax and edx to pass the crc parameter).
264 // within the first 8 vector registers keeps the code 32-bit SSE
265 // compatible and reduces the size of 64-bit SSE code slightly.
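Regarding the \lsb_crc parameter documented above: the two settings correspond to the two classic bit-at-a-time formulations of a CRC. A scalar reference sketch in plain C for CRC-32 (generator 0x04C11DB7) follows; the init/xorout conventions applied in the lsb-first call are the usual CRC-32 ones and are an illustration choice, not something taken from this file.

```c
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* msb-first: bit 7 of each byte is consumed first. */
static uint32_t crc32_msb_first(uint32_t crc, const uint8_t *p, size_t len)
{
    while (len--) {
        crc ^= (uint32_t)*p++ << 24;
        for (int i = 0; i < 8; i++)
            crc = (crc & 0x80000000) ? (crc << 1) ^ 0x04C11DB7 : crc << 1;
    }
    return crc;
}

/* lsb-first: bit 0 of each byte is consumed first, so the generator appears
 * bit-reflected (0xEDB88320). */
static uint32_t crc32_lsb_first(uint32_t crc, const uint8_t *p, size_t len)
{
    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320 : crc >> 1;
    }
    return crc;
}

int main(void)
{
    const uint8_t msg[] = "123456789";

    /* With init 0xFFFFFFFF and a final inversion, the lsb-first form is the
     * familiar CRC-32 whose check value for "123456789" is 0xCBF43926. */
    printf("lsb-first: 0x%08x\n", ~crc32_lsb_first(0xFFFFFFFF, msg, 9));
    printf("msb-first: 0x%08x\n", crc32_msb_first(0, msg, 9));
    return 0;
}
```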
282 // Create a 128-bit vector that contains the initial CRC in the end
296 // appropriate end of the first 128-bit lane of data. If LEN < VL, then
302 .if VL == 64
378 .if VL == 64
379 // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
387 // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
398 // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
443 // Fold C1 into C2 and store the 128-bit result in %xmm0.
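The folds above, and the final reduction that follows, are driven by precomputed constants of the form x^k mod G. Generating such a constant only needs bit-at-a-time polynomial arithmetic; below is a sketch for an msb-first CRC of width up to 32. The exponents printed (x^128 mod G and x^192 mod G) are the textbook pair for folding a 128-bit accumulator forward by one 128-bit block and are chosen for illustration, not read out of the kernel's tables.

```c
#include <stdio.h>
#include <stdint.h>

/* x^k mod G for an msb-first CRC of width n <= 32, where g holds G without
 * its leading x^n term (0x04C11DB7 for CRC-32).  Multiply by x once per
 * step and reduce whenever an x^n term appears. */
static uint32_t xpow_mod_g(unsigned k, uint32_t g, unsigned n)
{
    uint32_t mask = (n == 32) ? 0xFFFFFFFFu : (1u << n) - 1;
    uint32_t r = 1;                            /* x^0 */

    while (k--) {
        uint32_t carry = (r >> (n - 1)) & 1;   /* coefficient of x^(n-1) */

        r = (r << 1) & mask;                   /* multiply by x */
        if (carry)
            r ^= g;                            /* x^n == g (mod G) */
    }
    return r;
}

int main(void)
{
    printf("x^128 mod G = 0x%08x\n", xpow_mod_g(128, 0x04C11DB7, 32));
    printf("x^192 mod G = 0x%08x\n", xpow_mod_g(192, 0x04C11DB7, 32));
    return 0;
}
```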
447 // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
448 // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
451 // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
453 // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
454 // x^n * (%xmm0 mod x^64)
456 // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
458 // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
459 // x^64 * (%xmm0 mod x^64)
461 // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
463 // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
464 // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
471 _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
473 _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
477 // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
485 // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
489 // The '* x' makes it so the result is floor(t1 / x^64) rather than
493 // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
494 // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
495 // the multiplication by the x^64 term is handled using a pxor. The
496 // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
500 _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
502 // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
509 // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
511 // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
513 // Furthermore, since the resulting CRC is n-bit, if mod x^n is
516 // multiplier in 64 bits in most cases. This gives the following:
518 // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
519 // crc := (%xmm0 / x^(64-n)) mod x^n
524 // For lsb-first CRCs where n=64, the extra factor of x cannot be as
531 .if LSB_CRC && \n == 64
540 _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
550 .else // \n == 64 && !LSB_CRC
551 _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
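As a scalar cross-check for the reduction above: the quantities floor(t0 / G) and t0 mod G that the final pclmulqdq steps produce can also be obtained by plain polynomial long division. The sketch below (assuming GCC/Clang's unsigned __int128) does exactly that for an msb-first CRC-32; it is a reference for sanity-checking constants, not a substitute for the Barrett form used here.

```c
#include <stdio.h>
#include <stdint.h>

/* floor(t / G) and t mod G for a polynomial t of degree < 128 over GF(2),
 * by bitwise long division. */
typedef unsigned __int128 u128;

static u128 poly_divmod(u128 t, u128 g, unsigned n, u128 *quot)
{
    u128 q = 0;

    for (int k = 127; k >= (int)n; k--) {
        if ((t >> k) & 1) {
            t ^= g << (k - n);         /* subtract (XOR) G * x^(k-n) */
            q ^= (u128)1 << (k - n);
        }
    }
    if (quot)
        *quot = q;
    return t;                          /* t mod G, degree < n */
}

int main(void)
{
    /* CRC-32 generator with its x^32 term written out explicitly. */
    u128 g = ((u128)1 << 32) | 0x04C11DB7;
    /* An arbitrary 64-bit message polynomial M; its raw msb-first CRC-32
     * (no init/xorout conventions) is (M * x^32) mod G. */
    u128 t = (u128)0x0123456789abcdefULL << 32;
    u128 q;
    uint32_t rem = (uint32_t)poly_divmod(t, g, 32, &q);

    printf("floor(t / G) = 0x%llx\n", (unsigned long long)q);
    printf("t mod G      = 0x%08x\n", rem);
    return 0;
}
```

The bit-at-a-time loop costs one step per degree of t, which is exactly the work the two-multiplication Barrett sequence described above avoids.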
575 _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \