Lines Matching +full:64 +full:bit

76 // Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
89 // is msb-first, use \bswap_mask to reflect the bytes within each 128-bit lane.
91 .if \vl < 64
103 .if \vl < 64
110 .if \vl < 64
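For msb-first CRCs, the \bswap_mask mentioned a few lines up is used to reflect the data bytes within each 128-bit lane before the carry-less multiplications. Below is a minimal user-space sketch of that byte reversal using the pshufb intrinsic; the shuffle mask is the generic byte-reverse control written from scratch for illustration, not copied from the kernel's constant.

```c
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8; compile with -mssse3 */

int main(void)
{
    /* Shuffle-control byte j selects source byte 15 - j, i.e. a full byte
     * reversal of the 128-bit lane (_mm_set_epi8 lists bytes 15 down to 0). */
    const __m128i bswap_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                            8, 9, 10, 11, 12, 13, 14, 15);
    uint8_t in[16], out[16];

    for (int i = 0; i < 16; i++)
        in[i] = (uint8_t)i;

    __m128i v = _mm_loadu_si128((const __m128i *)in);
    _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(v, bswap_mask));

    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]);           /* prints 15 14 13 ... 1 0 */
    printf("\n");
    return 0;
}
```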
118 // The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
122 // The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
126 // Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
127 // \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
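The LO64_TERMS / HI64_TERMS selection described above maps onto the immediate byte of a pclmulqdq instruction, which picks one 64-bit half of each source operand and multiplies them carry-lessly. A small sketch of that behaviour through the C intrinsic (compile with -mpclmul); the input values are arbitrary and chosen only to make the carry-less result easy to read.

```c
#include <stdio.h>
#include <stdint.h>
#include <wmmintrin.h>   /* PCLMULQDQ intrinsic */

int main(void)
{
    /* The immediate selects a qword of each source: bit 0 for one operand,
     * bit 4 for the other. */
    __m128i a = _mm_set_epi64x(0x3 /* hi */, 0x5 /* lo */);
    __m128i b = _mm_set_epi64x(0x7 /* hi */, 0x9 /* lo */);
    uint64_t r[2];

    /* imm 0x00: low qword * low qword.  Carry-less 101b * 1001b = 101101b. */
    _mm_storeu_si128((__m128i *)r, _mm_clmulepi64_si128(a, b, 0x00));
    printf("lo*lo = 0x%llx\n", (unsigned long long)r[0]);   /* 0x2d */

    /* imm 0x11: high qword * high qword.  Carry-less 11b * 111b = 1001b,
     * i.e. 9 rather than the integer product 21, since additions are XORs. */
    _mm_storeu_si128((__m128i *)r, _mm_clmulepi64_si128(a, b, 0x11));
    printf("hi*hi = 0x%llx\n", (unsigned long long)r[0]);   /* 0x9 */
    return 0;
}
```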
164 // into all 128-bit lanes of the vector register CONSTS.
193 // 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
196 // \lsb_crc is 1 if the CRC processes the least significant bit of each byte
198 // if the CRC processes the most significant bit of each byte first, i.e. maps
201 // \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
212 // If \vl == 64 && \avx_level == 512, the generated code requires:
229 .elseif VL == 64
238 // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
252 // 32-bit support, assuming -mregparm=3 and not including support for
253 // CRC-64 (which would use both eax and edx to pass the crc parameter).
264 // within the first 8 vector registers keeps the code 32-bit SSE
265 // compatible and reduces the size of 64-bit SSE code slightly.
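Regarding the \lsb_crc parameter documented above: the two settings correspond to the two classic bit-at-a-time formulations of a CRC. A scalar reference sketch in plain C for CRC-32 (generator 0x04C11DB7) follows; the init/xorout conventions applied in the lsb-first call are the usual CRC-32 ones and are an illustration choice, not something taken from this file.

```c
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* msb-first: bit 7 of each byte is consumed first. */
static uint32_t crc32_msb_first(uint32_t crc, const uint8_t *p, size_t len)
{
    while (len--) {
        crc ^= (uint32_t)*p++ << 24;
        for (int i = 0; i < 8; i++)
            crc = (crc & 0x80000000) ? (crc << 1) ^ 0x04C11DB7 : crc << 1;
    }
    return crc;
}

/* lsb-first: bit 0 of each byte is consumed first, so the generator appears
 * bit-reflected (0xEDB88320). */
static uint32_t crc32_lsb_first(uint32_t crc, const uint8_t *p, size_t len)
{
    while (len--) {
        crc ^= *p++;
        for (int i = 0; i < 8; i++)
            crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320 : crc >> 1;
    }
    return crc;
}

int main(void)
{
    const uint8_t msg[] = "123456789";

    /* With init 0xFFFFFFFF and a final inversion, the lsb-first form is the
     * familiar CRC-32 whose check value for "123456789" is 0xCBF43926. */
    printf("lsb-first: 0x%08x\n", ~crc32_lsb_first(0xFFFFFFFF, msg, 9));
    printf("msb-first: 0x%08x\n", crc32_msb_first(0, msg, 9));
    return 0;
}
```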
282 // Create a 128-bit vector that contains the initial CRC in the end
296 // appropriate end of the first 128-bit lane of data. If LEN < VL, then
302 .if VL == 64
378 .if VL == 64
379 // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
387 // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
398 // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
443 // Fold C1 into C2 and store the 128-bit result in %xmm0.
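The folds above, and the final reduction that follows, are driven by precomputed constants of the form x^k mod G. Generating such a constant only needs bit-at-a-time polynomial arithmetic; below is a sketch for an msb-first CRC of width up to 32. The exponents printed (x^128 mod G and x^192 mod G) are the textbook pair for folding a 128-bit accumulator forward by one 128-bit block and are chosen for illustration, not read out of the kernel's tables.

```c
#include <stdio.h>
#include <stdint.h>

/* x^k mod G for an msb-first CRC of width n <= 32, where g holds G without
 * its leading x^n term (0x04C11DB7 for CRC-32).  Multiply by x once per
 * step and reduce whenever an x^n term appears. */
static uint32_t xpow_mod_g(unsigned k, uint32_t g, unsigned n)
{
    uint32_t mask = (n == 32) ? 0xFFFFFFFFu : (1u << n) - 1;
    uint32_t r = 1;                            /* x^0 */

    while (k--) {
        uint32_t carry = (r >> (n - 1)) & 1;   /* coefficient of x^(n-1) */

        r = (r << 1) & mask;                   /* multiply by x */
        if (carry)
            r ^= g;                            /* x^n == g (mod G) */
    }
    return r;
}

int main(void)
{
    printf("x^128 mod G = 0x%08x\n", xpow_mod_g(128, 0x04C11DB7, 32));
    printf("x^192 mod G = 0x%08x\n", xpow_mod_g(192, 0x04C11DB7, 32));
    return 0;
}
```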
447 // Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
448 // polynomial stored in %xmm0 (using either lsb-first or msb-first bit
451 // First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
453 // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
454 // x^n * (%xmm0 mod x^64)
456 // Store t0 * x^(64-n) in %xmm0. I.e., actually do:
458 // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
459 // x^64 * (%xmm0 mod x^64)
461 // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
463 // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
464 // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
471 _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
473 _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64)
477 // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).
485 // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
489 // The '* x' makes it so the result is floor(t1 / x^64) rather than
493 // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
494 // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
495 // the multiplication by the x^64 term is handled using a pxor. The
496 // pxor causes the low 64 terms of t1 to be wrong, but they are unused.
500 _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n)
502 // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).
509 // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
511 // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
513 // Furthermore, since the resulting CRC is n-bit, if mod x^n is
516 // multiplier in 64 bits in most cases. This gives the following:
518 // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
519 // crc := (%xmm0 / x^(64-n)) mod x^n
524 // For lsb-first CRCs where n=64, the extra factor of x cannot be as
531 .if LSB_CRC && \n == 64
540 _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64
550 .else // \n == 64 && !LSB_CRC
551 _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64
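As a scalar cross-check for the reduction above: the quantities floor(t0 / G) and t0 mod G that the final pclmulqdq steps produce can also be obtained by plain polynomial long division. The sketch below (assuming GCC/Clang's unsigned __int128) does exactly that for an msb-first CRC-32; it is a reference for sanity-checking constants, not a substitute for the Barrett form used here.

```c
#include <stdio.h>
#include <stdint.h>

/* floor(t / G) and t mod G for a polynomial t of degree < 128 over GF(2),
 * by bitwise long division. */
typedef unsigned __int128 u128;

static u128 poly_divmod(u128 t, u128 g, unsigned n, u128 *quot)
{
    u128 q = 0;

    for (int k = 127; k >= (int)n; k--) {
        if ((t >> k) & 1) {
            t ^= g << (k - n);         /* subtract (XOR) G * x^(k-n) */
            q ^= (u128)1 << (k - n);
        }
    }
    if (quot)
        *quot = q;
    return t;                          /* t mod G, degree < n */
}

int main(void)
{
    /* CRC-32 generator with its x^32 term written out explicitly. */
    u128 g = ((u128)1 << 32) | 0x04C11DB7;
    /* An arbitrary 64-bit message polynomial M; its raw msb-first CRC-32
     * (no init/xorout conventions) is (M * x^32) mod G. */
    u128 t = (u128)0x0123456789abcdefULL << 32;
    u128 q;
    uint32_t rem = (uint32_t)poly_divmod(t, g, 32, &q);

    printf("floor(t / G) = 0x%llx\n", (unsigned long long)q);
    printf("t mod G      = 0x%08x\n", rem);
    return 0;
}
```

The bit-at-a-time loop costs one step per degree of t, which is exactly the work the two-multiplication Barrett sequence described above avoids.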
575 _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=512; \