xref: /linux/lib/crypto/gf128hash.c (revision c417e7045b70345f59643fb2db67b0e7fbd7fbd0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * GF(2^128) polynomial hashing: GHASH and POLYVAL
4  *
5  * Copyright 2025 Google LLC
6  */
7 
8 #include <crypto/gf128hash.h>
9 #include <linux/export.h>
10 #include <linux/module.h>
11 #include <linux/string.h>
12 #include <linux/unaligned.h>
13 
14 /*
15  * GHASH and POLYVAL are almost-XOR-universal hash functions.  They interpret
16  * the message as the coefficients of a polynomial in the finite field GF(2^128)
17  * and evaluate that polynomial at a secret point.
18  *
19  * Neither GHASH nor POLYVAL is a cryptographic hash function.  They should be
20  * used only by algorithms that are specifically designed to use them.
21  *
22  * GHASH is the older variant, defined as part of GCM in NIST SP 800-38D
23  * (https://nvlpubs.nist.gov/nistpubs/legacy/sp/nistspecialpublication800-38d.pdf).
24  * GHASH is hard to implement directly, due to its backwards mapping between
25  * bits and polynomial coefficients.  GHASH implementations typically pre and
26  * post-process the inputs and outputs (mainly by byte-swapping) to convert the
27  * GHASH computation into an equivalent computation over a different,
28  * easier-to-use representation of GF(2^128).
29  *
30  * POLYVAL is a newer GF(2^128) polynomial hash, originally defined as part of
31  * AES-GCM-SIV (https://datatracker.ietf.org/doc/html/rfc8452) and also used by
32  * HCTR2 (https://eprint.iacr.org/2021/1441.pdf).  It uses that easier-to-use
33  * field representation directly, eliminating the data conversion steps.
34  *
35  * This file provides library APIs for GHASH and POLYVAL.  These APIs can
36  * delegate to either a generic implementation or an architecture-optimized
37  * implementation.  Due to the mathematical relationship between GHASH and
38  * POLYVAL, in some cases code for one is reused with the other.
39  *
40  * For the generic implementation, we don't use the traditional table approach
41  * to GF(2^128) multiplication.  That approach is not constant-time and requires
42  * a lot of memory.  Instead, we use a different approach which emulates
43  * carryless multiplication using standard multiplications by spreading the data
44  * bits apart using "holes".  This allows the carries to spill harmlessly.  This
45  * approach is borrowed from BoringSSL, which in turn credits BearSSL's
46  * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
47  * "holes" trick and a presentation by Shay Gueron
48  * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
49  * 256-bit => 128-bit reduction algorithm.
50  */
51 
52 #ifdef CONFIG_ARCH_SUPPORTS_INT128
53 
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
	 *
	 * Using one term every 5 bits would work.  However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column.  Then handle those 4 bits separately.
	 */

	/* @a split into 4 interleaved bit sets, with its low nibble cleared */
	u64 a0 = a & 0x1111111111111110;
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	/* @b split into 4 interleaved bit sets (all 64 bits kept) */
	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/*
	 * Multiply the high 60 bits of @a by @b.
	 *
	 * Each product a_i * b_j contributes carryless-product bits only at
	 * positions congruent to (i + j) mod 4, so the 16 products are grouped
	 * into four sums by that value; the "holes" between terms absorb the
	 * integer-addition carries, which get masked off below.
	 */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/*
	 * Multiply the low 4 bits of @a by @b.  -(bit) is an all-ones mask
	 * when the bit is set, so e_i = @b if bit i of @a is set, else 0;
	 * this stays constant-time.
	 */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	/* Bits of e1..e3 shifted past position 63 land in the high half. */
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/* Add all the intermediate products together. */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
108 
109 #else /* CONFIG_ARCH_SUPPORTS_INT128 */
110 
111 /* Do a 32 x 32 => 64 bit carryless multiplication. */
clmul32(u32 a,u32 b)112 static u64 clmul32(u32 a, u32 b)
113 {
114 	/*
115 	 * With 32-bit multiplicands and one term every 4 bits, there are up to
116 	 * 32 / 4 = 8 one bits per column when each multiplication is written
117 	 * out as a series of additions in the schoolbook manner.  The value 8
118 	 * fits in 4 bits, so the carries don't overflow into the next term.
119 	 */
120 	u32 a0 = a & 0x11111111;
121 	u32 a1 = a & 0x22222222;
122 	u32 a2 = a & 0x44444444;
123 	u32 a3 = a & 0x88888888;
124 
125 	u32 b0 = b & 0x11111111;
126 	u32 b1 = b & 0x22222222;
127 	u32 b2 = b & 0x44444444;
128 	u32 b3 = b & 0x88888888;
129 
130 	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
131 		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
132 	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
133 		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
134 	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
135 		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
136 	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
137 		 (a2 * (u64)b1) ^ (a3 * (u64)b0);
138 
139 	/* Add all the intermediate products together. */
140 	return (c0 & 0x1111111111111111) ^
141 	       (c1 & 0x2222222222222222) ^
142 	       (c2 & 0x4444444444444444) ^
143 	       (c3 & 0x8888888888888888);
144 }
145 
146 /* Do a 64 x 64 => 128 bit carryless multiplication. */
clmul64(u64 a,u64 b,u64 * out_lo,u64 * out_hi)147 static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
148 {
149 	u32 a_lo = (u32)a;
150 	u32 a_hi = a >> 32;
151 	u32 b_lo = (u32)b;
152 	u32 b_hi = b >> 32;
153 
154 	/* Karatsuba multiplication */
155 	u64 lo = clmul32(a_lo, b_lo);
156 	u64 hi = clmul32(a_hi, b_hi);
157 	u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
158 
159 	*out_lo = lo ^ (mi << 32);
160 	*out_hi = hi ^ (mi >> 32);
161 }
162 #endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
163 
/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 *
	 * Note: XOR commutes with byte-swapping, so le64_to_cpu() of the XOR
	 * of the halves equals the XOR of the le64_to_cpu()'d halves.
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	/* Karatsuba middle term: (lo+hi)*(lo'+hi') - lo*lo' - hi*hi' */
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	/* Fold the middle term into the middle 128 bits of the product. */
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
	 * of which cancels out 64 bits.  Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 * (c0, c1, c2) = (0,
	 *                 c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 * (c1, c2, c3) = (0,
	 *                 c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *		   c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
214 
ghash_blocks_generic(struct polyval_elem * acc,const struct polyval_elem * key,const u8 * data,size_t nblocks)215 static void __maybe_unused ghash_blocks_generic(struct polyval_elem *acc,
216 						const struct polyval_elem *key,
217 						const u8 *data, size_t nblocks)
218 {
219 	do {
220 		acc->lo ^=
221 			cpu_to_le64(get_unaligned_be64((__be64 *)(data + 8)));
222 		acc->hi ^= cpu_to_le64(get_unaligned_be64((__be64 *)data));
223 		polyval_mul_generic(acc, key);
224 		data += GHASH_BLOCK_SIZE;
225 	} while (--nblocks);
226 }
227 
228 static void __maybe_unused
polyval_blocks_generic(struct polyval_elem * acc,const struct polyval_elem * key,const u8 * data,size_t nblocks)229 polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
230 		       const u8 *data, size_t nblocks)
231 {
232 	do {
233 		acc->lo ^= get_unaligned((__le64 *)data);
234 		acc->hi ^= get_unaligned((__le64 *)(data + 8));
235 		polyval_mul_generic(acc, key);
236 		data += POLYVAL_BLOCK_SIZE;
237 	} while (--nblocks);
238 }
239 
240 /* Convert the key from GHASH format to POLYVAL format. */
ghash_key_to_polyval(const u8 in[GHASH_BLOCK_SIZE],struct polyval_elem * out)241 static void __maybe_unused ghash_key_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
242 						struct polyval_elem *out)
243 {
244 	u64 hi = get_unaligned_be64(&in[0]);
245 	u64 lo = get_unaligned_be64(&in[8]);
246 	u64 mask = (s64)hi >> 63;
247 
248 	hi = (hi << 1) ^ (lo >> 63) ^ (mask & ((u64)0xc2 << 56));
249 	lo = (lo << 1) ^ (mask & 1);
250 	out->lo = cpu_to_le64(lo);
251 	out->hi = cpu_to_le64(hi);
252 }
253 
254 /* Convert the accumulator from POLYVAL format to GHASH format. */
polyval_acc_to_ghash(const struct polyval_elem * in,u8 out[GHASH_BLOCK_SIZE])255 static void polyval_acc_to_ghash(const struct polyval_elem *in,
256 				 u8 out[GHASH_BLOCK_SIZE])
257 {
258 	put_unaligned_be64(le64_to_cpu(in->hi), &out[0]);
259 	put_unaligned_be64(le64_to_cpu(in->lo), &out[8]);
260 }
261 
262 /* Convert the accumulator from GHASH format to POLYVAL format. */
ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],struct polyval_elem * out)263 static void __maybe_unused ghash_acc_to_polyval(const u8 in[GHASH_BLOCK_SIZE],
264 						struct polyval_elem *out)
265 {
266 	out->lo = cpu_to_le64(get_unaligned_be64(&in[8]));
267 	out->hi = cpu_to_le64(get_unaligned_be64(&in[0]));
268 }
269 
270 #ifdef CONFIG_CRYPTO_LIB_GF128HASH_ARCH
271 #include "gf128hash.h" /* $(SRCARCH)/gf128hash.h */
272 #endif
273 
/*
 * Prepare a GHASH key from its raw 16 bytes.  Delegates to the
 * arch-optimized key setup when one is configured; otherwise just converts
 * the key from GHASH format to the POLYVAL field-element representation
 * used by the generic code.
 */
void ghash_preparekey(struct ghash_key *key, const u8 raw_key[GHASH_BLOCK_SIZE])
{
#ifdef ghash_preparekey_arch
	ghash_preparekey_arch(key, raw_key);
#else
	ghash_key_to_polyval(raw_key, &key->h);
#endif
}
EXPORT_SYMBOL_GPL(ghash_preparekey);
283 
ghash_mul(struct ghash_ctx * ctx)284 static void ghash_mul(struct ghash_ctx *ctx)
285 {
286 #ifdef ghash_mul_arch
287 	ghash_mul_arch(&ctx->acc, ctx->key);
288 #elif defined(ghash_blocks_arch)
289 	static const u8 zeroes[GHASH_BLOCK_SIZE];
290 
291 	ghash_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
292 #else
293 	polyval_mul_generic(&ctx->acc, &ctx->key->h);
294 #endif
295 }
296 
/* nblocks is always >= 1. */
static void ghash_blocks(struct ghash_ctx *ctx, const u8 *data, size_t nblocks)
{
	/* Dispatch to the arch-optimized block loop if one is configured. */
#ifdef ghash_blocks_arch
	ghash_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	ghash_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
306 
void ghash_update(struct ghash_ctx *ctx, const u8 *data, size_t len)
{
	/*
	 * GHASH byte order is reversed relative to POLYVAL, so buffered
	 * bytes are XORed into the accumulator starting from its last byte.
	 */
	if (unlikely(ctx->partial)) {
		/* Top up the buffered partial block first. */
		size_t take = min(len, GHASH_BLOCK_SIZE - ctx->partial);

		len -= take;
		for (; take; take--) {
			ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - ctx->partial] ^=
				*data++;
			ctx->partial++;
		}
		if (ctx->partial != GHASH_BLOCK_SIZE)
			return;
		ghash_mul(ctx);
	}
	if (len >= GHASH_BLOCK_SIZE) {
		/* Process all complete blocks. */
		size_t nblocks = len / GHASH_BLOCK_SIZE;

		ghash_blocks(ctx, data, nblocks);
		data += nblocks * GHASH_BLOCK_SIZE;
		len -= nblocks * GHASH_BLOCK_SIZE;
	}
	/* Buffer any remaining tail bytes for the next call. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[GHASH_BLOCK_SIZE - 1 - i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(ghash_update);
332 
void ghash_final(struct ghash_ctx *ctx, u8 out[GHASH_BLOCK_SIZE])
{
	/* A buffered partial block still needs one final multiplication. */
	if (unlikely(ctx->partial != 0))
		ghash_mul(ctx);
	/* Convert the POLYVAL-format accumulator back into GHASH format. */
	polyval_acc_to_ghash(&ctx->acc, out);
	/* Wipe key pointer, accumulator, and buffered state. */
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(ghash_final);
341 
/*
 * Prepare a POLYVAL key from its raw 16 bytes.  Delegates to the
 * arch-optimized key setup when one is configured; otherwise the raw bytes
 * are already in the POLYVAL field-element format, so just copy them.
 */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
#ifdef polyval_preparekey_arch
	polyval_preparekey_arch(key, raw_key);
#else
	memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE);
#endif
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
352 
353 /*
354  * polyval_mul_generic() and polyval_blocks_generic() take the key as a
355  * polyval_elem rather than a polyval_key, so that arch-optimized
356  * implementations with a different key format can use it as a fallback (if they
357  * have H^1 stored somewhere in their struct).  Thus, the following dispatch
358  * code is needed to pass the appropriate key argument.
359  */
360 
polyval_mul(struct polyval_ctx * ctx)361 static void polyval_mul(struct polyval_ctx *ctx)
362 {
363 #ifdef polyval_mul_arch
364 	polyval_mul_arch(&ctx->acc, ctx->key);
365 #elif defined(polyval_blocks_arch)
366 	static const u8 zeroes[POLYVAL_BLOCK_SIZE];
367 
368 	polyval_blocks_arch(&ctx->acc, ctx->key, zeroes, 1);
369 #else
370 	polyval_mul_generic(&ctx->acc, &ctx->key->h);
371 #endif
372 }
373 
/* nblocks is always >= 1. */
static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
	/* Dispatch to the arch-optimized block loop if one is configured. */
#ifdef polyval_blocks_arch
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
384 
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	if (unlikely(ctx->partial)) {
		/* Top up the buffered partial block first. */
		size_t take = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		len -= take;
		for (; take; take--)
			ctx->acc.bytes[ctx->partial++] ^= *data++;
		if (ctx->partial != POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}
	if (len >= POLYVAL_BLOCK_SIZE) {
		/* Process all complete blocks. */
		size_t nblocks = len / POLYVAL_BLOCK_SIZE;

		polyval_blocks(ctx, data, nblocks);
		data += nblocks * POLYVAL_BLOCK_SIZE;
		len -= nblocks * POLYVAL_BLOCK_SIZE;
	}
	/* Buffer any remaining tail bytes for the next call. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);
409 
void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	/* A buffered partial block still needs one final multiplication. */
	if (unlikely(ctx->partial != 0))
		polyval_mul(ctx);
	/* The accumulator is already in POLYVAL output format. */
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	/* Wipe key pointer, accumulator, and buffered state. */
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);
418 
419 #ifdef gf128hash_mod_init_arch
/*
 * Run the architecture-specific one-time initialization (only compiled when
 * the arch header defines gf128hash_mod_init_arch()).
 */
static int __init gf128hash_mod_init(void)
{
	gf128hash_mod_init_arch();
	return 0;
}
subsys_initcall(gf128hash_mod_init);
426 
/* Intentionally empty; defined so the module can be unloaded. */
static void __exit gf128hash_mod_exit(void)
{
}
module_exit(gf128hash_mod_exit);
431 #endif
432 
433 MODULE_DESCRIPTION("GF(2^128) polynomial hashing: GHASH and POLYVAL");
434 MODULE_LICENSE("GPL");
435