1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * GF(2^128) polynomial hashing: GHASH and POLYVAL
4 *
5 * Copyright 2025 Google LLC
6 */
7
8 #include <crypto/gf128hash.h>
9 #include <linux/export.h>
10 #include <linux/module.h>
11 #include <linux/string.h>
12 #include <linux/unaligned.h>
13
14 /*
15 * GHASH and POLYVAL are almost-XOR-universal hash functions. They interpret
16 * the message as the coefficients of a polynomial in the finite field GF(2^128)
17 * and evaluate that polynomial at a secret point.
18 *
19 * Neither GHASH nor POLYVAL is a cryptographic hash function. They should be
20 * used only by algorithms that are specifically designed to use them.
21 *
22 * GHASH is the older variant, defined as part of GCM in NIST SP 800-38D
23 * (https://nvlpubs.nist.gov/nistpubs/legacy/sp/nistspecialpublication800-38d.pdf).
24 * GHASH is hard to implement directly, due to its backwards mapping between
 * bits and polynomial coefficients. GHASH implementations typically pre- and
 * post-process the inputs and outputs (mainly by byte-swapping) to convert the
27 * GHASH computation into an equivalent computation over a different,
28 * easier-to-use representation of GF(2^128).
29 *
30 * POLYVAL is a newer GF(2^128) polynomial hash, originally defined as part of
31 * AES-GCM-SIV (https://datatracker.ietf.org/doc/html/rfc8452) and also used by
32 * HCTR2 (https://eprint.iacr.org/2021/1441.pdf). It uses that easier-to-use
33 * field representation directly, eliminating the data conversion steps.
34 *
35 * This file provides library APIs for GHASH and POLYVAL. These APIs can
36 * delegate to either a generic implementation or an architecture-optimized
37 * implementation. Due to the mathematical relationship between GHASH and
38 * POLYVAL, in some cases code for one is reused with the other.
39 *
40 * For the generic implementation, we don't use the traditional table approach
41 * to GF(2^128) multiplication. That approach is not constant-time and requires
42 * a lot of memory. Instead, we use a different approach which emulates
43 * carryless multiplication using standard multiplications by spreading the data
44 * bits apart using "holes". This allows the carries to spill harmlessly. This
45 * approach is borrowed from BoringSSL, which in turn credits BearSSL's
46 * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
47 * "holes" trick and a presentation by Shay Gueron
48 * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
49 * 256-bit => 128-bit reduction algorithm.
50 */
51
52 #ifdef CONFIG_ARCH_SUPPORTS_INT128
53
/*
 * Do a 64 x 64 => 128 bit carryless multiplication, emulated using ordinary
 * integer multiplications with the data bits spread apart by "holes" (see the
 * file header comment).  Constant-time: no data-dependent branches or table
 * lookups.
 */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
	 *
	 * Using one term every 5 bits would work.  However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column.  Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110;	/* low nibble of @a masked off */
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/*
	 * Multiply the low 4 bits of @a by @b.  '-(bit)' expands a single bit
	 * into an all-ones or all-zeroes mask, keeping this branchless.
	 */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	/* Bits shifted out of the low 64 bits above land in the high half. */
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/*
	 * Add all the intermediate products together.  Each cI contributes
	 * only the bit positions congruent to I (mod 4); the masks discard
	 * the carry bits that spilled into the "holes".
	 */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
108
109 #else /* CONFIG_ARCH_SUPPORTS_INT128 */
110
/*
 * Do a 32 x 32 => 64 bit carryless multiplication, emulated using ordinary
 * integer multiplications with the data bits spread apart by "holes" (see the
 * file header comment).  Constant-time: no data-dependent branches or table
 * lookups.  Used only when the architecture lacks 128-bit integer support.
 */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner.  The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 * (Unlike the 64-bit case, no bits need to be handled separately.)
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/*
	 * Add all the intermediate products together.  The masks keep only
	 * the bit positions each term is responsible for, discarding the
	 * carry bits that spilled into the "holes".
	 */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}
145
146 /* Do a 64 x 64 => 128 bit carryless multiplication. */
clmul64(u64 a,u64 b,u64 * out_lo,u64 * out_hi)147 static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
148 {
149 u32 a_lo = (u32)a;
150 u32 a_hi = a >> 32;
151 u32 b_lo = (u32)b;
152 u32 b_hi = b >> 32;
153
154 /* Karatsuba multiplication */
155 u64 lo = clmul32(a_lo, b_lo);
156 u64 hi = clmul32(a_hi, b_hi);
157 u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
158
159 *out_lo = lo ^ (mi << 32);
160 *out_hi = hi ^ (mi >> 32);
161 }
162 #endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
163
/*
 * Compute @a = @a * @b * x^-128 in the POLYVAL field.
 *
 * @a and @b each hold a 128-bit field element as two little-endian 64-bit
 * halves.  The multiplication by x^-128 is part of POLYVAL's definition; as
 * noted below, it falls out of returning the high 128 bits of the reduced
 * 256-bit product.
 */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
	 * the 256-bit product in @c0 (low) through @c3 (high).  Note that
	 * XORing the __le64 halves before le64_to_cpu() is fine, since
	 * byte-swapping distributes over XOR.
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
	 * of which cancels out 64 bits.  Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 *	(c0, c1, c2) = (0,
	 *			c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *			c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 *	(c1, c2, c3) = (0,
	 *			c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *			c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
214
ghash_blocks_generic(struct polyval_elem * acc,const struct polyval_elem * key,const u8 * data,size_t nblocks)215 static void __maybe_unused ghash_blocks_generic(struct polyval_elem *acc,
216 const struct polyval_elem *key,
217 const u8 *data, size_t nblocks)
218 {
219 do {
220 acc->lo ^=
221 cpu_to_le64(get_unaligned_be64((__be64 *)(data + 8)));
222 acc->hi ^= cpu_to_le64(get_unaligned_be64((__be64 *)data));
223 polyval_mul_generic(acc, key);
224 data += GHASH_BLOCK_SIZE;
225 } while (--nblocks);
226 }
227
228 static void __maybe_unused
polyval_blocks_generic(struct polyval_elem * acc,const struct polyval_elem * key,const u8 * data,size_t nblocks)229 polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
230 const u8 *data, size_t nblocks)
231 {
232 do {
233 acc->lo ^= get_unaligned((__le64 *)data);
234 acc->hi ^= get_unaligned((__le64 *)(data + 8));
235 polyval_mul_generic(acc, key);
236 data += POLYVAL_BLOCK_SIZE;
237 } while (--nblocks);
238 }
239
/*
 * Convert the key from GHASH format to POLYVAL format.
 *
 * This reads the key as a 128-bit big-endian value, shifts the whole value
 * left by one bit (folding the bit shifted off the top back in via the
 * reduction constant (0xc2 << 56) and bit 0, branchlessly), and stores the
 * result as two little-endian 64-bit halves.  The extra shift compensates for
 * GHASH's and POLYVAL's differing field conventions (see the file header
 * comment); the exact mathematical mapping is defined by the two specs.
 */
static void __maybe_unused ghash_key_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
						struct polyval_elem *out)
{
	u64 hi = get_unaligned_be64(&in[0]);
	u64 lo = get_unaligned_be64(&in[8]);
	/* All-ones if the top bit of @hi is set, else zero. */
	u64 mask = (s64)hi >> 63;

	hi = (hi << 1) ^ (lo >> 63) ^ (mask & ((u64)0xc2 << 56));
	lo = (lo << 1) ^ (mask & 1);
	out->lo = cpu_to_le64(lo);
	out->hi = cpu_to_le64(hi);
}
253
254 /* Convert the accumulator from POLYVAL format to GHASH format. */
polyval_acc_to_ghash(const struct polyval_elem * in,u8 out[GHASH_BLOCK_SIZE])255 static void polyval_acc_to_ghash(const struct polyval_elem *in,
256 u8 out[GHASH_BLOCK_SIZE])
257 {
258 put_unaligned_be64(le64_to_cpu(in->hi), &out[0]);
259 put_unaligned_be64(le64_to_cpu(in->lo), &out[8]);
260 }
261
262 /* Convert the accumulator from GHASH format to POLYVAL format. */
ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],struct polyval_elem * out)263 static void __maybe_unused ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
264 struct polyval_elem *out)
265 {
266 out->lo = cpu_to_le64(get_unaligned_be64(&in[8]));
267 out->hi = cpu_to_le64(get_unaligned_be64(&in[0]));
268 }
269
270 #ifdef CONFIG_CRYPTO_LIB_GF128HASH_ARCH
271 #include "gf128hash.h" /* $(SRCARCH)/gf128hash.h */
272 #endif
273
/*
 * Expand a raw 16-byte GHASH key into the (possibly arch-specific) key
 * structure used by ghash_update().  The generic path simply converts the key
 * to the POLYVAL field representation, which is what the generic
 * multiplication code works in.
 */
void ghash_preparekey(struct ghash_key *key, const u8 raw_key[GHASH_BLOCK_SIZE])
{
#ifdef ghash_preparekey_arch
	ghash_preparekey_arch(key, raw_key);
#else
	ghash_key_to_polyval(raw_key, &key->h);
#endif
}
EXPORT_SYMBOL_GPL(ghash_preparekey);
283
/* Multiply the accumulator by the hash key once. */
static void ghash_mul(struct ghash_ctx *ctx)
{
#ifdef ghash_mul_arch
	/* The arch code provides a dedicated single-multiplication routine. */
	ghash_mul_arch(&ctx->acc, ctx->key);
#elif defined(ghash_blocks_arch)
	/*
	 * No dedicated routine, but the arch has a blocks routine: processing
	 * one all-zero block XORs nothing into the accumulator and then
	 * multiplies by the key, which is exactly the operation needed.
	 */
	static const u8 zeroes[GHASH_BLOCK_SIZE];

	ghash_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}
296
/*
 * Process @nblocks full blocks, dispatching to the arch-optimized
 * implementation when one is available.  nblocks is always >= 1.
 */
static void ghash_blocks(struct ghash_ctx *ctx, const u8 *data, size_t nblocks)
{
#ifdef ghash_blocks_arch
	ghash_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	ghash_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
306
/*
 * Feed @len bytes of message data into the GHASH computation.  Data need not
 * be block-aligned; partial blocks are buffered (XORed directly into the
 * accumulator, with @ctx->partial tracking how many bytes are pending).
 */
void ghash_update(struct ghash_ctx *ctx, const u8 *data, size_t len)
{
	/* First, try to complete a previously buffered partial block. */
	if (unlikely(ctx->partial)) {
		size_t n = min(len, GHASH_BLOCK_SIZE - ctx->partial);

		len -= n;
		/*
		 * The accumulator is kept in POLYVAL (byte-reversed) format,
		 * so GHASH message byte i lands at accumulator byte 15 - i.
		 */
		while (n--)
			ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - ctx->partial++] ^=
				*data++;
		if (ctx->partial < GHASH_BLOCK_SIZE)
			return;
		/* The buffered block is now complete; fold it in. */
		ghash_mul(ctx);
	}
	/* Process all remaining full blocks directly from @data. */
	if (len >= GHASH_BLOCK_SIZE) {
		size_t nblocks = len / GHASH_BLOCK_SIZE;

		ghash_blocks(ctx, data, nblocks);
		data += len & ~(GHASH_BLOCK_SIZE - 1);
		len &= GHASH_BLOCK_SIZE - 1;
	}
	/* Buffer any trailing partial block, again byte-reversed. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(ghash_update);
332
/*
 * Finish the GHASH computation: fold in any buffered partial block (which is
 * implicitly zero-padded, since only the buffered bytes were XORed in),
 * convert the accumulator back to GHASH byte order, and wipe the context,
 * which contains key-dependent data.
 */
void ghash_final(struct ghash_ctx *ctx, u8 out[GHASH_BLOCK_SIZE])
{
	if (unlikely(ctx->partial))
		ghash_mul(ctx);
	polyval_acc_to_ghash(&ctx->acc, out);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(ghash_final);
341
/*
 * Expand a raw 16-byte POLYVAL key into the (possibly arch-specific) key
 * structure used by polyval_update().  The generic path just copies the raw
 * key, since POLYVAL already uses the field representation the generic
 * multiplication code works in.
 */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
#ifdef polyval_preparekey_arch
	polyval_preparekey_arch(key, raw_key);
#else
	memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
#endif
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
352
353 /*
354 * polyval_mul_generic() and polyval_blocks_generic() take the key as a
355 * polyval_elem rather than a polyval_key, so that arch-optimized
356 * implementations with a different key format can use it as a fallback (if they
357 * have H^1 stored somewhere in their struct). Thus, the following dispatch
358 * code is needed to pass the appropriate key argument.
359 */
360
/* Multiply the accumulator by the hash key once. */
static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef polyval_mul_arch
	/* The arch code provides a dedicated single-multiplication routine. */
	polyval_mul_arch(&ctx->acc, ctx->key);
#elif defined(polyval_blocks_arch)
	/*
	 * No dedicated routine, but the arch has a blocks routine: processing
	 * one all-zero block XORs nothing into the accumulator and then
	 * multiplies by the key, which is exactly the operation needed.
	 */
	static const u8 zeroes[POLYVAL_BLOCK_SIZE];

	polyval_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
#else
	polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}
373
/*
 * Process @nblocks full blocks, dispatching to the arch-optimized
 * implementation when one is available.  nblocks is always >= 1.
 */
static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifdef polyval_blocks_arch
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
384
/*
 * Feed @len bytes of message data into the POLYVAL computation.  Data need
 * not be block-aligned; partial blocks are XORed directly into the
 * accumulator, with @ctx->partial tracking how many bytes are pending.
 */
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	size_t i;

	/* First, try to complete a previously buffered partial block. */
	if (unlikely(ctx->partial)) {
		size_t fill = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		for (i = 0; i < fill; i++)
			ctx->acc.bytes[ctx->partial + i] ^= data[i];
		ctx->partial += fill;
		data += fill;
		len -= fill;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		/* The buffered block is now complete; fold it in. */
		polyval_mul(ctx);
	}
	/* Process all remaining full blocks directly from @data. */
	if (len >= POLYVAL_BLOCK_SIZE) {
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += nblocks * POLYVAL_BLOCK_SIZE;
		len -= nblocks * POLYVAL_BLOCK_SIZE;
	}
	/* Buffer any trailing partial block. */
	for (i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);
409
/*
 * Finish the POLYVAL computation: fold in any buffered partial block (which
 * is implicitly zero-padded, since only the buffered bytes were XORed in),
 * copy out the accumulator (already in POLYVAL byte order), and wipe the
 * context, which contains key-dependent data.
 */
void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);
418
419 #ifdef gf128hash_mod_init_arch
/*
 * Run the arch-specific initialization hook (presumably CPU feature
 * detection used to select the optimized code paths — see the arch header).
 */
static int __init gf128hash_mod_init(void)
{
	gf128hash_mod_init_arch();
	return 0;
}
subsys_initcall(gf128hash_mod_init);

/* An empty exit function is needed so that the module can be unloaded. */
static void __exit gf128hash_mod_exit(void)
{
}
module_exit(gf128hash_mod_exit);
431 #endif
432
433 MODULE_DESCRIPTION("GF(2^128) polynomial hashing: GHASH and POLYVAL");
434 MODULE_LICENSE("GPL");
435