Lines matching "64-bit":
/* SPDX-License-Identifier: GPL-2.0-or-later */
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next
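Apart from the byte-reflection mask, each 16-byte entry at these offsets holds a pair of fold constants: powers of x reduced modulo the CRC's generator polynomial G, one pair per fold distance. As a rough illustration of the kind of value involved (this is not the kernel's constant generator, and the real table also folds in the bit-order and alignment adjustments described further down), x^k mod G for an msb-first CRC-n could be computed like this:

#include <stdint.h>

/*
 * Illustrative sketch only: compute x^k mod G for an msb-first CRC-n
 * with n <= 64.  'g' is G with its x^n term dropped (the usual n-bit
 * representation of the generator polynomial).
 */
static uint64_t xpow_mod_g(unsigned int k, uint64_t g, unsigned int n)
{
	uint64_t mask = (n < 64) ? (1ULL << n) - 1 : ~0ULL;
	uint64_t r = 1;				/* the polynomial x^0 */

	while (k--) {
		unsigned int carry = (r >> (n - 1)) & 1;

		r = (r << 1) & mask;		/* multiply by x */
		if (carry)
			r ^= g;			/* cancel the new x^n term */
	}
	return r;
}

For a fold distance of D bits, the two constants are roughly x^(D+64) mod G and x^D mod G (with small bit-order-dependent exponent offsets), so the FOLD_ACROSS_512_BITS entry, for example, corresponds to k values around 512.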
// corresponding non-VEX instruction plus any needed moves. The supported
// - Two-arg [src, dst], where the non-VEX format is the same.
// - Three-arg [src1, src2, dst] where the non-VEX format is
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
.ifnb \arg3 // Three-arg [src1, src2, dst]
.else // Two-arg [src, dst]
// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
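In intrinsics terms, the per-lane byte reflection that \bswap_mask provides for msb-first CRCs is a single pshufb with descending byte indices. A minimal sketch for one 128-bit lane (the mask constant is spelled out here rather than loaded from the constants table):

#include <immintrin.h>

/* Reverse the 16 bytes of one 128-bit lane, as msb-first CRCs require. */
static __m128i bswap_lane(__m128i data)
{
	const __m128i bswap_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
						8, 9, 10, 11, 12, 13, 14, 15);

	return _mm_shuffle_epi8(data, bswap_mask);
}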
.if \vl < 64
.if \vl < 64
.if \vl < 64
// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
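As a rough C-intrinsics model (not the kernel code itself): _mm_clmulepi64_si128 is the pclmulqdq intrinsic, and its imm8 selects one qword from each source, which is what \src1_terms and \src2_terms control here; for the intrinsic, imm8 bit 0 picks the low (0) or high (1) qword of the first operand and bit 4 does the same for the second. Two such multiplications plus an XOR form the basic fold step used throughout this template. Which power of x each 64-bit constant represents (roughly x^(D+64) mod G and x^D mod G for a fold distance of D bits, adjusted for bit order) and which qword of the constant register it sits in are properties of the constants table that this sketch simply assumes:

#include <immintrin.h>		/* needs a compiler flag such as -mpclmul */

/*
 * Fold 'acc' forward across D bits of message into 'data': each 64-bit
 * half of 'acc' is carry-lessly multiplied by its fold constant, and both
 * 128-bit products are XORed into 'data'.  Assumes the constant for acc's
 * low qword sits in the low qword of 'consts' and likewise for the high
 * qword; the real table layout also encodes the msb/lsb-first adjustments.
 */
static __m128i fold_vec128(__m128i acc, __m128i data, __m128i consts)
{
	__m128i lo = _mm_clmulepi64_si128(acc, consts, 0x00); /* acc.lo * consts.lo */
	__m128i hi = _mm_clmulepi64_si128(acc, consts, 0x11); /* acc.hi * consts.hi */

	return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
}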
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// into all 128-bit lanes of the vector register CONSTS.
_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// if the CRC processes the most significant bit of each byte first, i.e. maps
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
// If \vl == 64 && \avx_level == 512, the generated code requires:
.elseif VL == 64
// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
// guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
// 32-bit support, assuming -mregparm=3 and not including support for
// CRC-64 (which would use both eax and edx to pass the crc parameter).
// Define aliases for some local variables. V0-V5 are used without
// within the first 8 vector registers keeps the code 32-bit SSE
// compatible and reduces the size of 64-bit SSE code slightly.
// Create a 128-bit vector that contains the initial CRC in the end
// representing the high-order polynomial coefficients, and the rest 0.
// If the CRC is msb-first, also load the byte-reflection table.
_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
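A minimal intrinsics sketch of this step for an msb-first CRC-32: move the initial CRC into a vector register, then shift it to the byte positions that hold the high-order coefficients, which is what the pslldq above does (lsb-first CRCs leave it at the low end and skip the shift):

#include <stdint.h>
#include <immintrin.h>

/* Place a 32-bit initial CRC at the high-order end of a 128-bit lane. */
static __m128i init_crc_vec_msb32(uint32_t init_crc)
{
	__m128i v = _mm_cvtsi32_si128((int)init_crc);	/* CRC in bits 0..31 */

	return _mm_slli_si128(v, (128 - 32) / 8);	/* byte-shift left by 12 */
}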
// appropriate end of the first 128-bit lane of data. If LEN < VL, then
.if VL == 64
cmp $4*VL-1, LEN
cmp $2*VL-1, LEN32
sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32
add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32
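// (When VL=32, 4*VL is 128: +128 would need a 32-bit immediate, while -128
// still fits in a sign-extended 8-bit immediate, so the negated forms encode
// more compactly.)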
// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
// 4 vectors of data and write the result back to V0-V3.
cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32
sub $-4*VL, BUF
add $-4*VL, LEN
cmp $4*VL-1, LEN
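For orientation, a hedged C sketch of the same loop structure for VL = 16 (plain PCLMULQDQ), reusing fold_vec128() from the earlier sketch. The 'fold_512_consts' argument is assumed to hold the fold-across-512-bits constants, since each accumulator is folded 4*VL = 64 bytes (512 bits) forward per iteration:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static __m128i fold_vec128(__m128i acc, __m128i data, __m128i consts);

/*
 * v[0..3] were initialized from the first 64 bytes of data (with the
 * initial CRC XORed into the appropriate end) and *pp points just past
 * those bytes, with *plen holding the remaining length.
 */
static void fold_main_loop(const uint8_t **pp, size_t *plen,
			   __m128i v[4], __m128i fold_512_consts)
{
	const uint8_t *p = *pp;
	size_t len = *plen;

	while (len >= 4 * 16) {
		v[0] = fold_vec128(v[0],
				   _mm_loadu_si128((const __m128i *)(p + 0)),
				   fold_512_consts);
		v[1] = fold_vec128(v[1],
				   _mm_loadu_si128((const __m128i *)(p + 16)),
				   fold_512_consts);
		v[2] = fold_vec128(v[2],
				   _mm_loadu_si128((const __m128i *)(p + 32)),
				   fold_512_consts);
		v[3] = fold_vec128(v[3],
				   _mm_loadu_si128((const __m128i *)(p + 48)),
				   fold_512_consts);
		p += 64;
		len -= 64;
	}
	*pp = p;
	*plen = len;
}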
sub $-2*VL, BUF
.if VL == 64
// Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
// Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
_load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
// lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by LEN bytes.
// msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
// i.e. left-shift by LEN bytes.
// C1 = floor(A / x^(128 - 8*LEN))
// lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
// i.e. left-shift by 16-LEN bytes.
// msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
// i.e. right-shift by 16-LEN bytes.
// bytes (reflected if msb-first). The blend mask is the shuffle table
vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
// Fold C1 into C2 and store the 128-bit result in %xmm0.
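A hedged C-intrinsics sketch of this computation for the lsb-first case (msb-first additionally byte-reflects the loaded data). The 48-byte shuffle-table layout used below is inferred from the index vectors described in the comments above, and 'buf' is assumed to point at the len (1 to 15) remaining bytes with at least 16 - len already-processed bytes before it, matching the -16(BUF,LEN) load:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

/*
 * Compute C1 = floor(A / x^(128 - 8*len)) and
 *         C2 = (A*x^(8*len) + B) mod x^128
 * for an lsb-first CRC, where B is the polynomial of the final 'len'
 * data bytes.  C1 is subsequently folded into C2 across 128 bits (two
 * carry-less multiplies plus an XOR, as in fold_vec128() above).
 */
static void partial_block_sketch(__m128i A, const uint8_t *buf, size_t len,
				 __m128i *C1, __m128i *C2)
{
	/* Assumed layout: 16 bytes of 0xff, then 0..15, then 16 more 0xff. */
	static const uint8_t shuf_table[48] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	};
	/* Last 16 bytes of the message; the first 16-len of them were
	 * already folded into A and are masked out by the blend below. */
	__m128i last16 = _mm_loadu_si128((const __m128i *)(buf + len - 16));
	/* [len, ..., 15, -1 x len]: right-shift A by len bytes. */
	__m128i rshuf = _mm_loadu_si128((const __m128i *)&shuf_table[16 + len]);
	/* [-1 x (16-len), 0, ..., len-1]: left-shift A by 16-len bytes. */
	__m128i lshuf = _mm_loadu_si128((const __m128i *)&shuf_table[len]);

	*C1 = _mm_shuffle_epi8(A, lshuf);
	/* A*x^(8*len) mod x^128, then blend in the final len data bytes;
	 * they land exactly where the shuffle produced zeros, since those
	 * mask bytes have the high bit set and so select the second source. */
	*C2 = _mm_blendv_epi8(_mm_shuffle_epi8(A, rshuf), last16, rshuf);
}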
// Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
// t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
// x^n * (%xmm0 mod x^64)
// Store t0 * x^(64-n) in %xmm0. I.e., actually do:
// %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
// x^64 * (%xmm0 mod x^64)
// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
// select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
// each pclmulqdq when using lsb-first order), is identical to the
_cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
_cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
// Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
// The '* x' makes it so the result is floor(t1 / x^64) rather than
// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
// can be extracted much more easily in the next step. In the lsb-first
// case the '* x' happens implicitly. In the msb-first case it must be
// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
// the multiplication by the x^64 term is handled using a pxor. The
// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
_cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
// crc := t0 - (G * floor(t0 / G))
// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
// crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
// Furthermore, since the resulting CRC is n-bit, if mod x^n is
// multiplier in 64 bits in most cases. This gives the following:
// %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
// crc := (%xmm0 / x^(64-n)) mod x^n
// In the lsb-first case, each pclmulqdq implicitly introduces
// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
// For lsb-first CRCs where n=64, the extra factor of x cannot be as
// easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
.if LSB_CRC && \n == 64
_cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
.else // \n == 64 && !LSB_CRC
_cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
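To make the reduction above concrete, here is a hedged scalar C model for an msb-first CRC-32 (n = 32). It follows the same sequence of steps but skips the x^(64-n) alignment trick, the '* x' adjustment and all lsb-first details, so the constants it takes are the plain quantities named in the comments; the helper and parameter names are illustrative, not the kernel's:

#include <stdint.h>

/* Scalar 64x64 -> 128 carry-less multiply: what one pclmulqdq computes. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;

	for (int i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;
			if (i)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}

/*
 * Compute A * x^32 mod G for an msb-first CRC-32, where A is the 128-bit
 * polynomial a_hi*x^64 + a_lo (bit i of each word = coefficient of x^i).
 *
 *   x96_mod_g = x^(64+n) mod G            (n = 32)
 *   x95_div_g = floor(x^(63+n) / G)
 *   g_low     = G - x^n (the usual 32-bit representation of G)
 */
static uint32_t crc32_barrett_msb(uint64_t a_hi, uint64_t a_lo,
				  uint64_t x96_mod_g, uint64_t x95_div_g,
				  uint32_t g_low)
{
	uint64_t hi, lo, t0_hi, t0_lo, t0_div_xn, q;

	/* t0 := (x^(64+n) mod G) * floor(A / x^64) + x^n * (A mod x^64).
	 * t0 is congruent to A*x^n mod G and has degree < 64 + n. */
	clmul64(a_hi, x96_mod_g, &hi, &lo);
	t0_lo = lo ^ (a_lo << 32);
	t0_hi = hi ^ (a_lo >> 32);

	/* q := floor(t0 / G), computed as
	 * floor(floor(t0 / x^n) * floor(x^(63+n) / G) / x^63). */
	t0_div_xn = (t0_hi << 32) | (t0_lo >> 32);
	clmul64(t0_div_xn, x95_div_g, &hi, &lo);
	q = (hi << 1) | (lo >> 63);

	/* crc := t0 - q*G.  Only the low n bits matter, so G's x^n term
	 * can be dropped: crc = (t0 mod x^n) ^ ((q * (G - x^n)) mod x^n). */
	clmul64(q, g_low, &hi, &lo);
	return (uint32_t)(t0_lo ^ lo);
}

The vector code above computes the same three carry-less products; the byte shifts and the extra pxor only deal with where the intermediate terms physically sit and with a constant that would not otherwise fit in 64 bits.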
_crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \