1*3d176751SEric Biggers // SPDX-License-Identifier: GPL-2.0-or-later
2*3d176751SEric Biggers /*
3*3d176751SEric Biggers * POLYVAL library functions
4*3d176751SEric Biggers *
5*3d176751SEric Biggers * Copyright 2025 Google LLC
6*3d176751SEric Biggers */
7*3d176751SEric Biggers
8*3d176751SEric Biggers #include <crypto/polyval.h>
9*3d176751SEric Biggers #include <linux/export.h>
10*3d176751SEric Biggers #include <linux/module.h>
11*3d176751SEric Biggers #include <linux/string.h>
12*3d176751SEric Biggers #include <linux/unaligned.h>
13*3d176751SEric Biggers
14*3d176751SEric Biggers /*
15*3d176751SEric Biggers * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
16*3d176751SEric Biggers * interprets the message as the coefficients of a polynomial in GF(2^128) and
17*3d176751SEric Biggers * evaluates that polynomial at a secret point. POLYVAL has a simple
18*3d176751SEric Biggers * mathematical relationship with GHASH, but it uses a better field convention
19*3d176751SEric Biggers * which makes it easier and faster to implement.
20*3d176751SEric Biggers *
21*3d176751SEric Biggers * POLYVAL is not a cryptographic hash function, and it should be used only by
22*3d176751SEric Biggers * algorithms that are specifically designed to use it.
23*3d176751SEric Biggers *
24*3d176751SEric Biggers * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
25*3d176751SEric Biggers * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
26*3d176751SEric Biggers *
27*3d176751SEric Biggers * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
28*3d176751SEric Biggers * (https://eprint.iacr.org/2021/1441.pdf).
29*3d176751SEric Biggers *
30*3d176751SEric Biggers * This file provides a library API for POLYVAL. This API can delegate to
31*3d176751SEric Biggers * either a generic implementation or an architecture-optimized implementation.
32*3d176751SEric Biggers *
33*3d176751SEric Biggers * For the generic implementation, we don't use the traditional table approach
34*3d176751SEric Biggers * to GF(2^128) multiplication. That approach is not constant-time and requires
35*3d176751SEric Biggers * a lot of memory. Instead, we use a different approach which emulates
36*3d176751SEric Biggers * carryless multiplication using standard multiplications by spreading the data
37*3d176751SEric Biggers * bits apart using "holes". This allows the carries to spill harmlessly. This
38*3d176751SEric Biggers * approach is borrowed from BoringSSL, which in turn credits BearSSL's
39*3d176751SEric Biggers * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
40*3d176751SEric Biggers * "holes" trick and a presentation by Shay Gueron
41*3d176751SEric Biggers * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
42*3d176751SEric Biggers * 256-bit => 128-bit reduction algorithm.
43*3d176751SEric Biggers */
44*3d176751SEric Biggers
45*3d176751SEric Biggers #ifdef CONFIG_ARCH_SUPPORTS_INT128
46*3d176751SEric Biggers
/*
 * Do a 64 x 64 => 128 bit carryless multiplication.  Constant-time: only
 * multiplications, masks, shifts, and XORs are used; there are no
 * data-dependent branches or table lookups.
 */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
	/*
	 * With 64-bit multiplicands and one term every 4 bits, there would be
	 * up to 64 / 4 = 16 one bits per column when each multiplication is
	 * written out as a series of additions in the schoolbook manner.
	 * Unfortunately, that doesn't work since the value 16 is 1 too large to
	 * fit in 4 bits.  Carries would sometimes overflow into the next term.
	 *
	 * Using one term every 5 bits would work.  However, that would cost
	 * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
	 *
	 * Instead, mask off 4 bits from one multiplicand, giving a max of 15
	 * one bits per column.  Then handle those 4 bits separately.
	 */
	u64 a0 = a & 0x1111111111111110; /* the a# masks clear a's low 4 bits */
	u64 a1 = a & 0x2222222222222220;
	u64 a2 = a & 0x4444444444444440;
	u64 a3 = a & 0x8888888888888880;

	u64 b0 = b & 0x1111111111111111;
	u64 b1 = b & 0x2222222222222222;
	u64 b2 = b & 0x4444444444444444;
	u64 b3 = b & 0x8888888888888888;

	/* Multiply the high 60 bits of @a by @b. */
	u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
		  (a2 * (u128)b2) ^ (a3 * (u128)b1);
	u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
		  (a2 * (u128)b3) ^ (a3 * (u128)b2);
	u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
		  (a2 * (u128)b0) ^ (a3 * (u128)b3);
	u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
		  (a2 * (u128)b1) ^ (a3 * (u128)b0);

	/*
	 * Multiply the low 4 bits of @a by @b.  Each '-(bit)' expression
	 * expands a single bit of @a into an all-ones or all-zeros mask, so
	 * each e# is either @b or 0, again without branching.
	 */
	u64 e0 = -(a & 1) & b;
	u64 e1 = -((a >> 1) & 1) & b;
	u64 e2 = -((a >> 2) & 1) & b;
	u64 e3 = -((a >> 3) & 1) & b;
	u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
	/* Bits of e1..e3 shifted out of the low 64 bits land in the high half. */
	u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);

	/*
	 * Add all the intermediate products together.  Masking each c# with
	 * the same bit pattern as its terms keeps only the bit positions that
	 * belong to that term and discards the carries that spilled into the
	 * "holes".
	 */
	*out_lo = (((u64)c0) & 0x1111111111111111) ^
		  (((u64)c1) & 0x2222222222222222) ^
		  (((u64)c2) & 0x4444444444444444) ^
		  (((u64)c3) & 0x8888888888888888) ^ extra_lo;
	*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
		  (((u64)(c1 >> 64)) & 0x2222222222222222) ^
		  (((u64)(c2 >> 64)) & 0x4444444444444444) ^
		  (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
101*3d176751SEric Biggers
102*3d176751SEric Biggers #else /* CONFIG_ARCH_SUPPORTS_INT128 */
103*3d176751SEric Biggers
/*
 * Do a 32 x 32 => 64 bit carryless multiplication.  Constant-time: only
 * multiplications, masks, and XORs are used; there are no data-dependent
 * branches or table lookups.
 */
static u64 clmul32(u32 a, u32 b)
{
	/*
	 * With 32-bit multiplicands and one term every 4 bits, there are up to
	 * 32 / 4 = 8 one bits per column when each multiplication is written
	 * out as a series of additions in the schoolbook manner.  The value 8
	 * fits in 4 bits, so the carries don't overflow into the next term.
	 */
	u32 a0 = a & 0x11111111;
	u32 a1 = a & 0x22222222;
	u32 a2 = a & 0x44444444;
	u32 a3 = a & 0x88888888;

	u32 b0 = b & 0x11111111;
	u32 b1 = b & 0x22222222;
	u32 b2 = b & 0x44444444;
	u32 b3 = b & 0x88888888;

	u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
		 (a2 * (u64)b2) ^ (a3 * (u64)b1);
	u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
		 (a2 * (u64)b3) ^ (a3 * (u64)b2);
	u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
		 (a2 * (u64)b0) ^ (a3 * (u64)b3);
	u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
		 (a2 * (u64)b1) ^ (a3 * (u64)b0);

	/*
	 * Add all the intermediate products together.  Masking each c# with
	 * the same bit pattern as its terms keeps only the bit positions that
	 * belong to that term and discards the carries that spilled into the
	 * "holes".
	 */
	return (c0 & 0x1111111111111111) ^
	       (c1 & 0x2222222222222222) ^
	       (c2 & 0x4444444444444444) ^
	       (c3 & 0x8888888888888888);
}
138*3d176751SEric Biggers
139*3d176751SEric Biggers /* Do a 64 x 64 => 128 bit carryless multiplication. */
clmul64(u64 a,u64 b,u64 * out_lo,u64 * out_hi)140*3d176751SEric Biggers static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
141*3d176751SEric Biggers {
142*3d176751SEric Biggers u32 a_lo = (u32)a;
143*3d176751SEric Biggers u32 a_hi = a >> 32;
144*3d176751SEric Biggers u32 b_lo = (u32)b;
145*3d176751SEric Biggers u32 b_hi = b >> 32;
146*3d176751SEric Biggers
147*3d176751SEric Biggers /* Karatsuba multiplication */
148*3d176751SEric Biggers u64 lo = clmul32(a_lo, b_lo);
149*3d176751SEric Biggers u64 hi = clmul32(a_hi, b_hi);
150*3d176751SEric Biggers u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
151*3d176751SEric Biggers
152*3d176751SEric Biggers *out_lo = lo ^ (mi << 32);
153*3d176751SEric Biggers *out_hi = hi ^ (mi >> 32);
154*3d176751SEric Biggers }
155*3d176751SEric Biggers #endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
156*3d176751SEric Biggers
/*
 * Compute @a = @a * @b * x^-128 in the POLYVAL field.  The implicit x^-128
 * factor is inherent in how the 256-bit product is reduced (see the final
 * step below); POLYVAL's field convention is designed around it.
 */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
	u64 c0, c1, c2, c3, mi0, mi1;

	/*
	 * Carryless-multiply @a by @b using Karatsuba multiplication.  Store
	 * the 256-bit product in @c0 (low) through @c3 (high).
	 *
	 * Note: taking le64_to_cpu() of the XOR of two __le64 values is fine,
	 * since XOR commutes with byte-swapping.
	 */
	clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
	clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
	clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
		&mi0, &mi1);
	/* Fold the Karatsuba middle term into the middle 128 bits. */
	mi0 ^= c0 ^ c2;
	mi1 ^= c1 ^ c3;
	c1 ^= mi0;
	c2 ^= mi1;

	/*
	 * Cancel out the low 128 bits of the product by adding multiples of
	 * G(x) = x^128 + x^127 + x^126 + x^121 + 1.  Do this in two steps, each
	 * of which cancels out 64 bits.  Note that we break G(x) into three
	 * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
	 */

	/*
	 * First, add G(x) times c0 as follows:
	 *
	 *	(c0, c1, c2) = (0,
	 *			c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
	 *			c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
	c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);

	/*
	 * Second, add G(x) times the new c1:
	 *
	 *	(c1, c2, c3) = (0,
	 *			c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
	 *			c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
	 */
	c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
	c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);

	/* Return (c2, c3).  This implicitly multiplies by x^-128. */
	a->lo = cpu_to_le64(c2);
	a->hi = cpu_to_le64(c3);
}
207*3d176751SEric Biggers
208*3d176751SEric Biggers static void __maybe_unused
polyval_blocks_generic(struct polyval_elem * acc,const struct polyval_elem * key,const u8 * data,size_t nblocks)209*3d176751SEric Biggers polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
210*3d176751SEric Biggers const u8 *data, size_t nblocks)
211*3d176751SEric Biggers {
212*3d176751SEric Biggers do {
213*3d176751SEric Biggers acc->lo ^= get_unaligned((__le64 *)data);
214*3d176751SEric Biggers acc->hi ^= get_unaligned((__le64 *)(data + 8));
215*3d176751SEric Biggers polyval_mul_generic(acc, key);
216*3d176751SEric Biggers data += POLYVAL_BLOCK_SIZE;
217*3d176751SEric Biggers } while (--nblocks);
218*3d176751SEric Biggers }
219*3d176751SEric Biggers
220*3d176751SEric Biggers /* Include the arch-optimized implementation of POLYVAL, if one is available. */
221*3d176751SEric Biggers #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
222*3d176751SEric Biggers #include "polyval.h" /* $(SRCARCH)/polyval.h */
/*
 * polyval_preparekey() - Prepare a POLYVAL key for use.
 * @key: (output) the prepared key structure
 * @raw_key: the raw 16-byte POLYVAL key
 *
 * Only built when CONFIG_CRYPTO_LIB_POLYVAL_ARCH is set; it delegates to the
 * arch-optimized key preparation, which may use a key format different from
 * the generic one.  Otherwise polyval_preparekey() is an inline function
 * (presumably defined in <crypto/polyval.h> -- see the #endif comment below).
 */
void polyval_preparekey(struct polyval_key *key,
			const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
	polyval_preparekey_arch(key, raw_key);
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
229*3d176751SEric Biggers #endif /* Else, polyval_preparekey() is an inline function. */
230*3d176751SEric Biggers
231*3d176751SEric Biggers /*
232*3d176751SEric Biggers * polyval_mul_generic() and polyval_blocks_generic() take the key as a
233*3d176751SEric Biggers * polyval_elem rather than a polyval_key, so that arch-optimized
234*3d176751SEric Biggers * implementations with a different key format can use it as a fallback (if they
235*3d176751SEric Biggers * have H^1 stored somewhere in their struct). Thus, the following dispatch
236*3d176751SEric Biggers * code is needed to pass the appropriate key argument.
237*3d176751SEric Biggers */
238*3d176751SEric Biggers
polyval_mul(struct polyval_ctx * ctx)239*3d176751SEric Biggers static void polyval_mul(struct polyval_ctx *ctx)
240*3d176751SEric Biggers {
241*3d176751SEric Biggers #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
242*3d176751SEric Biggers polyval_mul_arch(&ctx->acc, ctx->key);
243*3d176751SEric Biggers #else
244*3d176751SEric Biggers polyval_mul_generic(&ctx->acc, &ctx->key->h);
245*3d176751SEric Biggers #endif
246*3d176751SEric Biggers }
247*3d176751SEric Biggers
/*
 * Hash @nblocks full blocks of @data into the accumulator, dispatching to
 * the arch-optimized implementation when one is compiled in and to the
 * generic implementation otherwise.
 */
static void polyval_blocks(struct polyval_ctx *ctx,
			   const u8 *data, size_t nblocks)
{
#ifndef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
	polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#else
	polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#endif
}
257*3d176751SEric Biggers
/*
 * polyval_update() - Add more data to a POLYVAL computation.
 * @ctx: the POLYVAL context
 * @data: the data to hash
 * @len: length of @data in bytes; any length is allowed
 *
 * Data is XOR-ed into the accumulator as it arrives; only once a full
 * 16-byte block has accumulated is the multiplication by the key done.
 * ctx->partial tracks how many bytes of the current block are buffered.
 */
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
	/* Top up a previously buffered partial block, and flush it if full. */
	if (unlikely(ctx->partial)) {
		size_t fill = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);

		for (size_t i = 0; i < fill; i++)
			ctx->acc.bytes[ctx->partial + i] ^= data[i];
		ctx->partial += fill;
		data += fill;
		len -= fill;
		if (ctx->partial < POLYVAL_BLOCK_SIZE)
			return;
		polyval_mul(ctx);
	}

	/* Hash all remaining complete blocks. */
	if (len >= POLYVAL_BLOCK_SIZE) {
		polyval_blocks(ctx, data, len / POLYVAL_BLOCK_SIZE);
		data += len - (len % POLYVAL_BLOCK_SIZE);
		len %= POLYVAL_BLOCK_SIZE;
	}

	/* Buffer any trailing partial block by XOR-ing it into the acc. */
	for (size_t i = 0; i < len; i++)
		ctx->acc.bytes[i] ^= data[i];
	ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);
282*3d176751SEric Biggers
/*
 * polyval_final() - Finish computing a POLYVAL hash value.
 * @ctx: the POLYVAL context
 * @out: (output) the resulting 16-byte POLYVAL hash value
 *
 * Flushes any buffered partial block, copies out the accumulator, and wipes
 * the context.
 */
void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
	/* A buffered partial block was already XOR-ed into the accumulator
	 * by polyval_update(); it just still needs the key multiplication. */
	if (unlikely(ctx->partial))
		polyval_mul(ctx);
	memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
	/* Wipe the context, which may contain key-dependent state.  Use
	 * memzero_explicit() so the compiler cannot optimize the wipe away. */
	memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);
291*3d176751SEric Biggers
292*3d176751SEric Biggers #ifdef polyval_mod_init_arch
/*
 * Module init: run the arch-specific one-time setup (presumably CPU-feature
 * detection -- see the arch's polyval.h for what polyval_mod_init_arch()
 * actually does).  Only built when the arch header defines
 * polyval_mod_init_arch.
 */
static int __init polyval_mod_init(void)
{
	polyval_mod_init_arch();
	return 0;
}
subsys_initcall(polyval_mod_init);
299*3d176751SEric Biggers
/*
 * NOTE(review): intentionally empty exit handler; presumably present so that
 * the module remains unloadable, per usual kernel module conventions.
 */
static void __exit polyval_mod_exit(void)
{
}
module_exit(polyval_mod_exit);
304*3d176751SEric Biggers #endif
305*3d176751SEric Biggers
306*3d176751SEric Biggers MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
307*3d176751SEric Biggers MODULE_LICENSE("GPL");
308