x86/crypto/crct10dif-pcl-asm_64.S

2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
15 # COPYING in the main directory of this source tree, or the
18 # Redistribution and use in source and binary forms, with or without
27 #   documentation and/or other materials provided with the
31 #   contributors may be used to endorse or promote products derived from
36 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
40 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
50 #  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
57 #define		init_crc	%edi
58 #define		buf		%rsi
59 #define		len		%rdx  macro
61 #define		FOLD_CONSTS	%xmm10
62 #define		BSWAP_MASK	%xmm11
93 # u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len);
95 # Assumes len >= 16.
102 	cmp	$256, len
134 	sub	$256, len
136 	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 	# bytes xmm0-7 into them, storing the result back into xmm0-7.
144 	sub	$128, len
147 	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
167 	add	$128-16, len
181 	sub	$16, len
187 	add	$16, len
191 	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
193 	# this without needing a fold constant for each possible 'len', redivide
194 	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
200 	movdqu	-16(buf, len), %xmm1
203 	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
205 	sub	len, %rax
209 	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 	# then '16-len' bytes from xmm2 (high-order bytes).
225 	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
231 	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
238 	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
276 	cmp	$16, len
277 	je	.Lreduce_final_16_bytes		# len == 16
278 	sub	$32, len
279 	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
280 	add	$16, len
281 	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
326 # is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.