1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 /*
3 * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are
4 * instantiated by crc-pclmul-template.S
5 *
6 * Copyright 2025 Google LLC
7 *
8 * Author: Eric Biggers <ebiggers@google.com>
9 */
10 #ifndef _CRC_PCLMUL_TEMPLATE_H
11 #define _CRC_PCLMUL_TEMPLATE_H
12
13 #include <asm/cpufeatures.h>
14 #include <asm/simd.h>
15 #include <crypto/internal/simd.h>
16 #include <linux/static_call.h>
17 #include "crc-pclmul-consts.h"
18
/*
 * Declare the three [V]PCLMULQDQ CRC function variants (SSE, AVX2, AVX-512)
 * that crc-pclmul-template.S instantiates for a given CRC flavor, and define
 * a static call for dispatching to whichever variant the CPU supports.
 * The static call initially targets the baseline SSE implementation; callers
 * are expected to retarget it at init time if AVX2/AVX-512 are available.
 *
 * @prefix: identifier prefix shared by the three assembly functions
 * @crc_t:  integer type of the CRC value (e.g. u16, u32, u64)
 */
#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \
crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \
			  const void *consts_ptr); \
crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \
			    const void *consts_ptr); \
crc_t prefix##_vpclmul_avx512(crc_t crc, const u8 *p, size_t len, \
			      const void *consts_ptr); \
DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse)
27
have_vpclmul(void)28 static inline bool have_vpclmul(void)
29 {
30 return boot_cpu_has(X86_FEATURE_VPCLMULQDQ) &&
31 boot_cpu_has(X86_FEATURE_AVX2) &&
32 cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL);
33 }
34
have_avx512(void)35 static inline bool have_avx512(void)
36 {
37 return boot_cpu_has(X86_FEATURE_AVX512BW) &&
38 boot_cpu_has(X86_FEATURE_AVX512VL) &&
39 !boot_cpu_has(X86_FEATURE_PREFER_YMM) &&
40 cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL);
41 }
42
43 /*
44 * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16
45 * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD.
46 *
47 * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions.
48 * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(),
49 * varying by CPU and factors such as which parts of the "FPU" state userspace
50 * has touched, which could result in a larger cutoff being better. Indeed, a
51 * larger cutoff is usually better for a *single* message. However, the
52 * overhead of the FPU section gets amortized if multiple FPU sections get
53 * executed before returning to userspace, since the XSAVE and XRSTOR occur only
54 * once. Considering that and the fact that the [V]PCLMULQDQ code is lighter on
55 * the dcache than the table-based code is, a 16-byte cutoff seems to work well.
56 */
57 #define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \
58 do { \
59 if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \
60 crypto_simd_usable()) { \
61 const void *consts_ptr; \
62 \
63 consts_ptr = (consts).fold_across_128_bits_consts; \
64 kernel_fpu_begin(); \
65 crc = static_call(prefix##_pclmul)((crc), (p), (len), \
66 consts_ptr); \
67 kernel_fpu_end(); \
68 return crc; \
69 } \
70 } while (0)
71
72 #endif /* _CRC_PCLMUL_TEMPLATE_H */
73