x86/crypto/crct10dif-pcl-asm_64.S

2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 #  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
107 	# bit order match the polynomial coefficient order.
137 	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
138 	# bytes xmm0-7 into them, storing the result back into xmm0-7.
148 	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
168 	add	$128-16, len
201 	movdqu	-16(buf, len), %xmm1
204 	# xmm2 = high-order part of second chunk: xmm7 left-shifted by 'len' bytes.
210 	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
214 	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
215 	# then '16-len' bytes from xmm2 (high-order bytes).
226 	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
228 	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
232 	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
233 	# whose low 48 bits are 0.
235 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
239 	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
240 	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
244 	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
247 	# Load G(x) and floor(x^48 / G(x)).
252 	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
255 	psrlq	$48, %xmm0
303 	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
304 	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
307 	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))
326 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
328 # 0x80} XOR the index vector to shift right by '16 - len' bytes.