/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
 * file must define the following parameters to specify the type of CRC:
 *
 *    crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *    LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
 *               mapping between bits and polynomial coefficients
 *             1 for a lsb (least-significant-bit) first CRC, i.e. reflected
 *               mapping between bits and polynomial coefficients
 */
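
/*
 * For illustration only (a hypothetical includer, not part of this file): a
 * reflected (lsb-first) 32-bit CRC could be instantiated roughly as follows.
 * Only the two macro names are required by this template; everything else
 * about the includer is an assumption.  The includer must also supply a
 * struct crc_clmul_consts holding the precomputed constants used below.
 *
 *    #define crc_t    u32
 *    #define LSB_CRC  1
 *    #include "crc-clmul-template.h"
 */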

#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */

static inline unsigned long clmul(unsigned long a, unsigned long b)
{
        unsigned long res;

        asm(".option push\n"
            ".option arch,+zbc\n"
            "clmul %0, %1, %2\n"
            ".option pop\n"
            : "=r" (res) : "r" (a), "r" (b));
        return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
        unsigned long res;

        asm(".option push\n"
            ".option arch,+zbc\n"
            "clmulh %0, %1, %2\n"
            ".option pop\n"
            : "=r" (res) : "r" (a), "r" (b));
        return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
        unsigned long res;

        asm(".option push\n"
            ".option arch,+zbc\n"
            "clmulr %0, %1, %2\n"
            ".option pop\n"
            : "=r" (res) : "r" (a), "r" (b));
        return res;
}
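
/*
 * Note on the Zbc multiplies above (per the RISC-V Zbc specification; the
 * numeric example is purely illustrative): the full carryless product of two
 * BITS_PER_LONG-bit values is (2*BITS_PER_LONG - 1) bits wide.  clmul()
 * returns its low BITS_PER_LONG bits, clmulh() its high BITS_PER_LONG bits,
 * and clmulr() bits (2*BITS_PER_LONG - 2) down to (BITS_PER_LONG - 1).  For
 * example, clmul(0b101, 0b11) = 0b101 ^ 0b1010 = 0b1111, since the shifted
 * partial products are XORed rather than added.
 */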

/*
 * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
 * polynomial whose bit order matches the CRC's bit order.
 */
#ifdef CONFIG_64BIT
# if LSB_CRC
#  define crc_load_long(x)	le64_to_cpup(x)
# else
#  define crc_load_long(x)	be64_to_cpup(x)
# endif
#else
# if LSB_CRC
#  define crc_load_long(x)	le32_to_cpup(x)
# else
#  define crc_load_long(x)	be32_to_cpup(x)
# endif
#endif
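
/*
 * For example (illustrative, assuming a 64-bit little-endian kernel and an
 * lsb-first CRC): crc_load_long() is then a plain load, so the first message
 * byte lands in bits 0..7 of the long.  In the lsb-first convention those low
 * bits hold the highest-order polynomial terms, which is the required order.
 */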

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
        return msgpoly ^ crc;
#else
        return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}
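
/*
 * Illustrative example (hypothetical parameters): for a 16-bit msb-first CRC
 * on a 64-bit machine, the shift above is 64 - 16 = 48, so @crc is XORed into
 * bits 48..63 of @msgpoly.  With the big-endian load from crc_load_long(),
 * those bits hold the first message bytes, i.e. the high-order terms.
 */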

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
        unsigned long tmp;

        /*
         * First step of Barrett reduction with integrated multiplication by
         * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
         * which G needs to be multiplied to cancel out the x^n and higher
         * terms of msgpoly * x^n.  Do it using the following formula:
         *
         * msb-first:
         *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
         * lsb-first:
         *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
         *
         * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
         * which fits a long exactly.  Using any lower power of x there would
         * not carry enough precision through the calculation, while using any
         * higher power of x would require extra instructions to handle a wider
         * multiplication.  In the msb-first case, using this power of x
         * results in needing a floored division by x^(BITS_PER_LONG-1), which
         * matches what clmulr produces.  In the lsb-first case, a factor of x
         * gets implicitly introduced by each carryless multiplication (shown
         * as '* x' above), and the floored division instead needs to be by
         * x^BITS_PER_LONG which matches what clmul produces.
         */
#if LSB_CRC
        tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
        tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

        /*
         * Second step of Barrett reduction:
         *
         *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
         *
         * This reduces (msgpoly * x^n) modulo G by adding the appropriate
         * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
         * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
         * terms in the first place, it is more efficient to do the equivalent:
         *
         *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
         *
         * In the lsb-first case further modify it to the following which
         * avoids a shift, as the crc ends up in the physically low n bits from
         * clmulr:
         *
         *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
         *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
         *
         * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
         * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
         * cast of the result to crc_t is essential, as it applies the mod x^n!
         */
#if LSB_CRC
        return clmulr(tmp, consts->barrett_reduction_const_2);
#else
        return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}

/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
                      const struct crc_clmul_consts *consts)
{
        return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
                         const struct crc_clmul_consts *consts)
{
        unsigned long msgpoly;
        size_t i;

        /* Assemble the data bytes into a partial long, in the CRC's bit order. */
#if LSB_CRC
        msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
        for (i = 1; i < len; i++)
                msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
        msgpoly = p[0];
        for (i = 1; i < len; i++)
                msgpoly = (msgpoly << 8) ^ p[i];
#endif

        /*
         * If the data is at least as long as the CRC, the CRC overlaps the
         * data completely: XOR it in, aligned with the first data byte, and
         * reduce.
         */
        if (len >= sizeof(crc_t)) {
#if LSB_CRC
                msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
                msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
                return crc_clmul_long(msgpoly, consts);
        }
        /*
         * Otherwise only the high-order 8*len coefficients of the CRC overlap
         * the data.  XOR those in and reduce, then XOR in the remaining
         * low-order part of the CRC times x^(8*len), which stays below x^n and
         * therefore needs no reduction.
         */
#if LSB_CRC
        msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
        return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
        msgpoly ^= crc >> (CRC_BITS - 8*len);
        return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}

static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
          const struct crc_clmul_consts *consts)
{
        size_t align;

        /* This implementation assumes that the CRC fits in an unsigned long. */
        BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

        /* If the buffer is not long-aligned, align it. */
        align = (unsigned long)p % sizeof(unsigned long);
        if (align && len) {
                align = min(sizeof(unsigned long) - align, len);
                crc = crc_clmul_update_partial(crc, p, align, consts);
                p += align;
                len -= align;
        }

        if (len >= 4 * sizeof(unsigned long)) {
                unsigned long m0, m1;

                m0 = crc_clmul_prep(crc, crc_load_long(p));
                m1 = crc_load_long(p + sizeof(unsigned long));
                p += 2 * sizeof(unsigned long);
                len -= 2 * sizeof(unsigned long);
                /*
                 * Main loop.  Each iteration starts with a message polynomial
                 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
                 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
                 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
                 * "folds" that back into a congruent (modulo G) value that
                 * uses just m0 and m1 again.  This is done by multiplying m0
                 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
                 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
                 * then adding the results to m2 and m3 as appropriate.  Each
                 * such multiplication produces a result twice the length of a
                 * long, which in RISC-V is two instructions clmul and clmulh.
                 *
                 * This could be changed to fold across more than 2 longs at a
                 * time if there is a CPU that can take advantage of it.
                 */
                do {
                        unsigned long p0, p1, p2, p3;

                        p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
                        p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
                        p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
                        p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
                        m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
                        m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
                             crc_load_long(p + sizeof(unsigned long));

                        p += 2 * sizeof(unsigned long);
                        len -= 2 * sizeof(unsigned long);
                } while (len >= 2 * sizeof(unsigned long));

                crc = crc_clmul_long(m0, consts);
                crc = crc_clmul_update_long(crc, m1, consts);
        }

        while (len >= sizeof(unsigned long)) {
                crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
                p += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }

        if (len)
                crc = crc_clmul_update_partial(crc, p, len, consts);

        return crc;
}
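
/*
 * Illustrative (hypothetical) use by an includer of this template; the wrapper
 * and constants names below are examples only, not defined by this file:
 *
 *    static u32 example_crc32_lsb(u32 crc, const void *p, size_t len)
 *    {
 *            return crc_clmul(crc, p, len, &example_crc32_lsb_consts);
 *    }
 */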