Lines Matching +full:32 +full:- +full:bits
2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
64 # Fold reg1, reg2 into the next 32 data bytes, storing the result back into
125 # XOR the first 16 data *bits* with the initial CRC value.
136 # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 # bytes xmm0-7 into them, storing the result back into xmm0-7.
140 fold_32_bytes 32, %xmm2, %xmm3
147 # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
155 # Fold across 32 bytes.
167 add $128-16, len
200 movdqu -16(buf, len), %xmm1
203 # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
209 # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 # then '16-len' bytes from xmm2 (high-order bytes).
225 # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
230 # Fold the high 64 bits into the low 64 bits, while also multiplying by
231 # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
232 # whose low 48 bits are 0.
234 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
236 pxor %xmm0, %xmm7 # + low bits * x^64
238 # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
239 # value congruent to x^64 * M(x) and whose low 48 bits are 0.
241 pand .Lmask2(%rip), %xmm0 # zero high 32 bits
242 psrldq $12, %xmm7 # extract high 32 bits
243 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
244 pxor %xmm0, %xmm7 # + low bits
251 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
252 psrlq $32, %xmm7 # /= x^32
255 pxor %xmm7, %xmm0 # + low 16 nonzero bits
256 # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
270 # XOR the first 16 data *bits* with the initial CRC value.
278 sub $32, len
279 jge .Lfold_16_bytes_loop # 32 <= len <= 255
323 .section .rodata.cst32.byteshift_table, "aM", @progbits, 32
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.