Lines Matching +full:len +full:- +full:or +full:- +full:define

2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
15 # COPYING in the main directory of this source tree, or the
18 # Redistribution and use in source and binary forms, with or without
27 # documentation and/or other materials provided with the
31 # contributors may be used to endorse or promote products derived from
36 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
40 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
41 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
42 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
43 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
44 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
50 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
57 #define init_crc %edi
58 #define buf %rsi
59 #define len %rdx
61 #define FOLD_CONSTS %xmm10
62 #define BSWAP_MASK %xmm11
93 # u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
95 # Assumes len >= 16.
102 cmp $256, len
134 sub $256, len
136 # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137 # bytes xmm0-7 into them, storing the result back into xmm0-7.
144 sub $128, len
147 # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
167 add $128-16, len
181 sub $16, len
187 add $16, len
191 # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
193 # this without needing a fold constant for each possible 'len', redivide
194 # the bytes into a first chunk of 'len' bytes and a second chunk of 16
200 movdqu -16(buf, len), %xmm1
203 # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
205 sub len, %rax
209 # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
213 # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214 # then '16-len' bytes from xmm2 (high-order bytes).
225 # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
231 # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
238 # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
276 cmp $16, len
277 je .Lreduce_final_16_bytes # len == 16
278 sub $32, len
279 jge .Lfold_16_bytes_loop # 32 <= len <= 255
280 add $16, len
281 jmp .Lhandle_partial_segment # 17 <= len <= 31
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
326 # is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.