xref: /src/sys/contrib/openzfs/module/icp/algs/modes/gcm.c (revision 80aae8a3f8aa70712930664572be9e6885dc0be7)
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/cmn_err.h>
28 #include <modes/modes.h>
29 #include <sys/crypto/common.h>
30 #include <sys/crypto/icp.h>
31 #include <sys/crypto/impl.h>
32 #include <sys/byteorder.h>
33 #include <sys/simd.h>
34 #include <modes/gcm_impl.h>
35 #ifdef CAN_USE_GCM_ASM
36 #include <aes/aes_impl.h>
37 #include <modes/gcm_asm_rename_funcs.h>
38 #endif
39 
/*
 * GHASH(c, d, t, o): fold one block into the running GHASH value.
 * XORs block 'd' into (c)->gcm_ghash, then multiplies the result by the
 * hash subkey H in GF(2^128) using implementation 'o', storing into 't'.
 */
#define	GHASH(c, d, t, o) \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
	(uint64_t *)(void *)(t));
44 
45 /* Select GCM implementation */
46 #define	IMPL_FASTEST	(UINT32_MAX)
47 #define	IMPL_CYCLE	(UINT32_MAX-1)
48 #ifdef CAN_USE_GCM_ASM
49 #define	IMPL_AVX	(UINT32_MAX-2)
50 #if CAN_USE_GCM_ASM >= 2
51 #define	IMPL_AVX2	(UINT32_MAX-3)
52 #endif
53 #endif
54 #define	GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
55 static uint32_t icp_gcm_impl = IMPL_FASTEST;
56 static uint32_t user_sel_impl = IMPL_FASTEST;
57 
#ifdef CAN_USE_GCM_ASM
/* Does the architecture we run on support the MOVBE instruction? */
boolean_t gcm_avx_can_use_movbe = B_FALSE;
/*
 * Whether to use the optimized openssl gcm and ghash implementations.
 */
static gcm_impl gcm_impl_used = GCM_IMPL_GENERIC;
#define	GCM_IMPL_USED	(*(volatile gcm_impl *)&gcm_impl_used)

extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *);

static inline boolean_t gcm_avx_will_work(void);
static inline boolean_t gcm_avx2_will_work(void);
static inline void gcm_use_impl(gcm_impl impl);
static inline gcm_impl gcm_toggle_impl(void);

static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
    crypto_data_t *, size_t);

static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
    size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */
82 
83 /*
84  * Encrypt multiple blocks of data in GCM mode.  Decrypt for GCM mode
85  * is done in another function.
86  */
87 int
gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))88 gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
89     crypto_data_t *out, size_t block_size,
90     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
91     void (*copy_block)(uint8_t *, uint8_t *),
92     void (*xor_block)(uint8_t *, uint8_t *))
93 {
94 #ifdef CAN_USE_GCM_ASM
95 	if (ctx->impl != GCM_IMPL_GENERIC)
96 		return (gcm_mode_encrypt_contiguous_blocks_avx(
97 		    ctx, data, length, out, block_size));
98 #endif
99 
100 	const gcm_impl_ops_t *gops;
101 	size_t remainder = length;
102 	size_t need = 0;
103 	uint8_t *datap = (uint8_t *)data;
104 	uint8_t *blockp;
105 	uint8_t *lastp;
106 	void *iov_or_mp;
107 	offset_t offset;
108 	uint8_t *out_data_1;
109 	uint8_t *out_data_2;
110 	size_t out_data_1_len;
111 	uint64_t counter;
112 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
113 
114 	if (length + ctx->gcm_remainder_len < block_size) {
115 		/* accumulate bytes here and return */
116 		memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
117 		    datap,
118 		    length);
119 		ctx->gcm_remainder_len += length;
120 		if (ctx->gcm_copy_to == NULL) {
121 			ctx->gcm_copy_to = datap;
122 		}
123 		return (CRYPTO_SUCCESS);
124 	}
125 
126 	crypto_init_ptrs(out, &iov_or_mp, &offset);
127 
128 	gops = gcm_impl_get_ops();
129 	do {
130 		/* Unprocessed data from last call. */
131 		if (ctx->gcm_remainder_len > 0) {
132 			need = block_size - ctx->gcm_remainder_len;
133 
134 			if (need > remainder)
135 				return (CRYPTO_DATA_LEN_RANGE);
136 
137 			memcpy(&((uint8_t *)ctx->gcm_remainder)
138 			    [ctx->gcm_remainder_len], datap, need);
139 
140 			blockp = (uint8_t *)ctx->gcm_remainder;
141 		} else {
142 			blockp = datap;
143 		}
144 
145 		/*
146 		 * Increment counter. Counter bits are confined
147 		 * to the bottom 32 bits of the counter block.
148 		 */
149 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
150 		counter = htonll(counter + 1);
151 		counter &= counter_mask;
152 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
153 
154 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
155 		    (uint8_t *)ctx->gcm_tmp);
156 		xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
157 
158 		lastp = (uint8_t *)ctx->gcm_tmp;
159 
160 		ctx->gcm_processed_data_len += block_size;
161 
162 		crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
163 		    &out_data_1_len, &out_data_2, block_size);
164 
165 		/* copy block to where it belongs */
166 		if (out_data_1_len == block_size) {
167 			copy_block(lastp, out_data_1);
168 		} else {
169 			memcpy(out_data_1, lastp, out_data_1_len);
170 			if (out_data_2 != NULL) {
171 				memcpy(out_data_2,
172 				    lastp + out_data_1_len,
173 				    block_size - out_data_1_len);
174 			}
175 		}
176 		/* update offset */
177 		out->cd_offset += block_size;
178 
179 		/* add ciphertext to the hash */
180 		GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
181 
182 		/* Update pointer to next block of data to be processed. */
183 		if (ctx->gcm_remainder_len != 0) {
184 			datap += need;
185 			ctx->gcm_remainder_len = 0;
186 		} else {
187 			datap += block_size;
188 		}
189 
190 		remainder = (size_t)&data[length] - (size_t)datap;
191 
192 		/* Incomplete last block. */
193 		if (remainder > 0 && remainder < block_size) {
194 			memcpy(ctx->gcm_remainder, datap, remainder);
195 			ctx->gcm_remainder_len = remainder;
196 			ctx->gcm_copy_to = datap;
197 			goto out;
198 		}
199 		ctx->gcm_copy_to = NULL;
200 
201 	} while (remainder > 0);
202 out:
203 	return (CRYPTO_SUCCESS);
204 }
205 
206 int
gcm_encrypt_final(gcm_ctx_t * ctx,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))207 gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
208     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
209     void (*copy_block)(uint8_t *, uint8_t *),
210     void (*xor_block)(uint8_t *, uint8_t *))
211 {
212 	(void) copy_block;
213 #ifdef CAN_USE_GCM_ASM
214 	if (ctx->impl != GCM_IMPL_GENERIC)
215 		return (gcm_encrypt_final_avx(ctx, out, block_size));
216 #endif
217 
218 	const gcm_impl_ops_t *gops;
219 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
220 	uint8_t *ghash, *macp = NULL;
221 	int i, rv;
222 
223 	if (out->cd_length <
224 	    (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
225 		return (CRYPTO_DATA_LEN_RANGE);
226 	}
227 
228 	gops = gcm_impl_get_ops();
229 	ghash = (uint8_t *)ctx->gcm_ghash;
230 
231 	if (ctx->gcm_remainder_len > 0) {
232 		uint64_t counter;
233 		uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
234 
235 		/*
236 		 * Here is where we deal with data that is not a
237 		 * multiple of the block size.
238 		 */
239 
240 		/*
241 		 * Increment counter.
242 		 */
243 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
244 		counter = htonll(counter + 1);
245 		counter &= counter_mask;
246 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
247 
248 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
249 		    (uint8_t *)ctx->gcm_tmp);
250 
251 		macp = (uint8_t *)ctx->gcm_remainder;
252 		memset(macp + ctx->gcm_remainder_len, 0,
253 		    block_size - ctx->gcm_remainder_len);
254 
255 		/* XOR with counter block */
256 		for (i = 0; i < ctx->gcm_remainder_len; i++) {
257 			macp[i] ^= tmpp[i];
258 		}
259 
260 		/* add ciphertext to the hash */
261 		GHASH(ctx, macp, ghash, gops);
262 
263 		ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
264 	}
265 
266 	ctx->gcm_len_a_len_c[1] =
267 	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
268 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
269 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
270 	    (uint8_t *)ctx->gcm_J0);
271 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
272 
273 	if (ctx->gcm_remainder_len > 0) {
274 		rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
275 		if (rv != CRYPTO_SUCCESS)
276 			return (rv);
277 	}
278 	out->cd_offset += ctx->gcm_remainder_len;
279 	ctx->gcm_remainder_len = 0;
280 	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
281 	if (rv != CRYPTO_SUCCESS)
282 		return (rv);
283 	out->cd_offset += ctx->gcm_tag_len;
284 
285 	return (CRYPTO_SUCCESS);
286 }
287 
288 /*
289  * This will only deal with decrypting the last block of the input that
290  * might not be a multiple of block length.
291  */
292 static void
gcm_decrypt_incomplete_block(gcm_ctx_t * ctx,size_t block_size,size_t index,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))293 gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
294     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
295     void (*xor_block)(uint8_t *, uint8_t *))
296 {
297 	uint8_t *datap, *outp, *counterp;
298 	uint64_t counter;
299 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
300 	int i;
301 
302 	/*
303 	 * Increment counter.
304 	 * Counter bits are confined to the bottom 32 bits
305 	 */
306 	counter = ntohll(ctx->gcm_cb[1] & counter_mask);
307 	counter = htonll(counter + 1);
308 	counter &= counter_mask;
309 	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
310 
311 	datap = (uint8_t *)ctx->gcm_remainder;
312 	outp = &((ctx->gcm_pt_buf)[index]);
313 	counterp = (uint8_t *)ctx->gcm_tmp;
314 
315 	/* authentication tag */
316 	memset((uint8_t *)ctx->gcm_tmp, 0, block_size);
317 	memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len);
318 
319 	/* add ciphertext to the hash */
320 	GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
321 
322 	/* decrypt remaining ciphertext */
323 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
324 
325 	/* XOR with counter block */
326 	for (i = 0; i < ctx->gcm_remainder_len; i++) {
327 		outp[i] = datap[i] ^ counterp[i];
328 	}
329 }
330 
331 int
gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t * ctx,char * data,size_t length,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))332 gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
333     crypto_data_t *out, size_t block_size,
334     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
335     void (*copy_block)(uint8_t *, uint8_t *),
336     void (*xor_block)(uint8_t *, uint8_t *))
337 {
338 	(void) out, (void) block_size, (void) encrypt_block, (void) copy_block,
339 	    (void) xor_block;
340 	size_t new_len;
341 	uint8_t *new;
342 
343 	/*
344 	 * Copy contiguous ciphertext input blocks to plaintext buffer.
345 	 * Ciphertext will be decrypted in the final.
346 	 */
347 	if (length > 0) {
348 		new_len = ctx->gcm_pt_buf_len + length;
349 		new = vmem_alloc(new_len, KM_SLEEP);
350 		if (new == NULL) {
351 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
352 			ctx->gcm_pt_buf = NULL;
353 			return (CRYPTO_HOST_MEMORY);
354 		}
355 
356 		if (ctx->gcm_pt_buf != NULL) {
357 			memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
358 			vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
359 		} else {
360 			ASSERT0(ctx->gcm_pt_buf_len);
361 		}
362 
363 		ctx->gcm_pt_buf = new;
364 		ctx->gcm_pt_buf_len = new_len;
365 		memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data,
366 		    length);
367 		ctx->gcm_processed_data_len += length;
368 	}
369 
370 	ctx->gcm_remainder_len = 0;
371 	return (CRYPTO_SUCCESS);
372 }
373 
374 int
gcm_decrypt_final(gcm_ctx_t * ctx,crypto_data_t * out,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))375 gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
376     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
377     void (*xor_block)(uint8_t *, uint8_t *))
378 {
379 #ifdef CAN_USE_GCM_ASM
380 	if (ctx->impl != GCM_IMPL_GENERIC)
381 		return (gcm_decrypt_final_avx(ctx, out, block_size));
382 #endif
383 
384 	const gcm_impl_ops_t *gops;
385 	size_t pt_len;
386 	size_t remainder;
387 	uint8_t *ghash;
388 	uint8_t *blockp;
389 	uint8_t *cbp;
390 	uint64_t counter;
391 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
392 	int processed = 0, rv;
393 
394 	ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
395 
396 	gops = gcm_impl_get_ops();
397 	pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
398 	ghash = (uint8_t *)ctx->gcm_ghash;
399 	blockp = ctx->gcm_pt_buf;
400 	remainder = pt_len;
401 	while (remainder > 0) {
402 		/* Incomplete last block */
403 		if (remainder < block_size) {
404 			memcpy(ctx->gcm_remainder, blockp, remainder);
405 			ctx->gcm_remainder_len = remainder;
406 			/*
407 			 * not expecting anymore ciphertext, just
408 			 * compute plaintext for the remaining input
409 			 */
410 			gcm_decrypt_incomplete_block(ctx, block_size,
411 			    processed, encrypt_block, xor_block);
412 			ctx->gcm_remainder_len = 0;
413 			goto out;
414 		}
415 		/* add ciphertext to the hash */
416 		GHASH(ctx, blockp, ghash, gops);
417 
418 		/*
419 		 * Increment counter.
420 		 * Counter bits are confined to the bottom 32 bits
421 		 */
422 		counter = ntohll(ctx->gcm_cb[1] & counter_mask);
423 		counter = htonll(counter + 1);
424 		counter &= counter_mask;
425 		ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
426 
427 		cbp = (uint8_t *)ctx->gcm_tmp;
428 		encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
429 
430 		/* XOR with ciphertext */
431 		xor_block(cbp, blockp);
432 
433 		processed += block_size;
434 		blockp += block_size;
435 		remainder -= block_size;
436 	}
437 out:
438 	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
439 	GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
440 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
441 	    (uint8_t *)ctx->gcm_J0);
442 	xor_block((uint8_t *)ctx->gcm_J0, ghash);
443 
444 	/* compare the input authentication tag with what we calculated */
445 	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
446 		/* They don't match */
447 		return (CRYPTO_INVALID_MAC);
448 	} else {
449 		rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
450 		if (rv != CRYPTO_SUCCESS)
451 			return (rv);
452 		out->cd_offset += pt_len;
453 	}
454 	return (CRYPTO_SUCCESS);
455 }
456 
457 static int
gcm_validate_args(CK_AES_GCM_PARAMS * gcm_param)458 gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
459 {
460 	size_t tag_len;
461 
462 	/*
463 	 * Check the length of the authentication tag (in bits).
464 	 */
465 	tag_len = gcm_param->ulTagBits;
466 	switch (tag_len) {
467 	case 32:
468 	case 64:
469 	case 96:
470 	case 104:
471 	case 112:
472 	case 120:
473 	case 128:
474 		break;
475 	default:
476 		return (CRYPTO_MECHANISM_PARAM_INVALID);
477 	}
478 
479 	if (gcm_param->ulIvLen == 0)
480 		return (CRYPTO_MECHANISM_PARAM_INVALID);
481 
482 	return (CRYPTO_SUCCESS);
483 }
484 
485 static void
gcm_format_initial_blocks(const uint8_t * iv,ulong_t iv_len,gcm_ctx_t * ctx,size_t block_size,void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))486 gcm_format_initial_blocks(const uint8_t *iv, ulong_t iv_len,
487     gcm_ctx_t *ctx, size_t block_size,
488     void (*copy_block)(uint8_t *, uint8_t *),
489     void (*xor_block)(uint8_t *, uint8_t *))
490 {
491 	const gcm_impl_ops_t *gops;
492 	uint8_t *cb;
493 	ulong_t remainder = iv_len;
494 	ulong_t processed = 0;
495 	uint8_t *datap, *ghash;
496 	uint64_t len_a_len_c[2];
497 
498 	gops = gcm_impl_get_ops();
499 	ghash = (uint8_t *)ctx->gcm_ghash;
500 	cb = (uint8_t *)ctx->gcm_cb;
501 	if (iv_len == 12) {
502 		memcpy(cb, iv, 12);
503 		cb[12] = 0;
504 		cb[13] = 0;
505 		cb[14] = 0;
506 		cb[15] = 1;
507 		/* J0 will be used again in the final */
508 		copy_block(cb, (uint8_t *)ctx->gcm_J0);
509 	} else {
510 		/* GHASH the IV */
511 		do {
512 			if (remainder < block_size) {
513 				memset(cb, 0, block_size);
514 				memcpy(cb, &(iv[processed]), remainder);
515 				datap = (uint8_t *)cb;
516 				remainder = 0;
517 			} else {
518 				datap = (uint8_t *)(&(iv[processed]));
519 				processed += block_size;
520 				remainder -= block_size;
521 			}
522 			GHASH(ctx, datap, ghash, gops);
523 		} while (remainder > 0);
524 
525 		len_a_len_c[0] = 0;
526 		len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
527 		GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
528 
529 		/* J0 will be used again in the final */
530 		copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
531 	}
532 }
533 
534 static int
gcm_init(gcm_ctx_t * ctx,const uint8_t * iv,size_t iv_len,const uint8_t * auth_data,size_t auth_data_len,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))535 gcm_init(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
536     const uint8_t *auth_data, size_t auth_data_len, size_t block_size,
537     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
538     void (*copy_block)(uint8_t *, uint8_t *),
539     void (*xor_block)(uint8_t *, uint8_t *))
540 {
541 	const gcm_impl_ops_t *gops;
542 	uint8_t *ghash, *datap, *authp;
543 	size_t remainder, processed;
544 
545 	/* encrypt zero block to get subkey H */
546 	memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H));
547 	encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
548 	    (uint8_t *)ctx->gcm_H);
549 
550 	gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
551 	    copy_block, xor_block);
552 
553 	gops = gcm_impl_get_ops();
554 	authp = (uint8_t *)ctx->gcm_tmp;
555 	ghash = (uint8_t *)ctx->gcm_ghash;
556 	memset(authp, 0, block_size);
557 	memset(ghash, 0, block_size);
558 
559 	processed = 0;
560 	remainder = auth_data_len;
561 	do {
562 		if (remainder < block_size) {
563 			/*
564 			 * There's not a block full of data, pad rest of
565 			 * buffer with zero
566 			 */
567 
568 			if (auth_data != NULL) {
569 				memset(authp, 0, block_size);
570 				memcpy(authp, &(auth_data[processed]),
571 				    remainder);
572 			} else {
573 				ASSERT0(remainder);
574 			}
575 
576 			datap = (uint8_t *)authp;
577 			remainder = 0;
578 		} else {
579 			datap = (uint8_t *)(&(auth_data[processed]));
580 			processed += block_size;
581 			remainder -= block_size;
582 		}
583 
584 		/* add auth data to the hash */
585 		GHASH(ctx, datap, ghash, gops);
586 
587 	} while (remainder > 0);
588 
589 	return (CRYPTO_SUCCESS);
590 }
591 
592 /*
593  * Init the GCM context struct. Handle the cycle and avx implementations here.
594  */
595 int
gcm_init_ctx(gcm_ctx_t * gcm_ctx,char * param,size_t block_size,int (* encrypt_block)(const void *,const uint8_t *,uint8_t *),void (* copy_block)(uint8_t *,uint8_t *),void (* xor_block)(uint8_t *,uint8_t *))596 gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
597     size_t block_size, int (*encrypt_block)(const void *, const uint8_t *,
598     uint8_t *), void (*copy_block)(uint8_t *, uint8_t *),
599     void (*xor_block)(uint8_t *, uint8_t *))
600 {
601 	CK_AES_GCM_PARAMS *gcm_param;
602 	int rv = CRYPTO_SUCCESS;
603 	size_t tag_len, iv_len;
604 
605 	if (param != NULL) {
606 		gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
607 
608 		/* GCM mode. */
609 		if ((rv = gcm_validate_args(gcm_param)) != 0) {
610 			return (rv);
611 		}
612 		gcm_ctx->gcm_flags |= GCM_MODE;
613 
614 		size_t tbits = gcm_param->ulTagBits;
615 		tag_len = CRYPTO_BITS2BYTES(tbits);
616 		iv_len = gcm_param->ulIvLen;
617 
618 		gcm_ctx->gcm_tag_len = tag_len;
619 		gcm_ctx->gcm_processed_data_len = 0;
620 
621 		/* these values are in bits */
622 		gcm_ctx->gcm_len_a_len_c[0]
623 		    = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
624 	} else {
625 		return (CRYPTO_MECHANISM_PARAM_INVALID);
626 	}
627 
628 	const uint8_t *iv = (const uint8_t *)gcm_param->pIv;
629 	const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
630 	size_t aad_len = gcm_param->ulAADLen;
631 
632 #ifdef CAN_USE_GCM_ASM
633 	boolean_t needs_bswap =
634 	    ((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
635 
636 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
637 		gcm_ctx->impl = GCM_IMPL_USED;
638 	} else {
639 		/*
640 		 * Handle the "cycle" implementation by creating different
641 		 * contexts, one per implementation.
642 		 */
643 		gcm_ctx->impl = gcm_toggle_impl();
644 
645 		/* The AVX impl. doesn't handle byte swapped key schedules. */
646 		if (needs_bswap == B_TRUE) {
647 			gcm_ctx->impl = GCM_IMPL_GENERIC;
648 		}
649 		/*
650 		 * If this is an AVX context, use the MOVBE and the BSWAP
651 		 * variants alternately.
652 		 */
653 		if (gcm_ctx->impl == GCM_IMPL_AVX &&
654 		    zfs_movbe_available() == B_TRUE) {
655 			(void) atomic_toggle_boolean_nv(
656 			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
657 		}
658 	}
659 	/*
660 	 * We don't handle byte swapped key schedules in the avx code path,
661 	 * still they could be created by the aes generic implementation.
662 	 * Make sure not to use them since we'll corrupt data if we do.
663 	 */
664 	if (gcm_ctx->impl != GCM_IMPL_GENERIC && needs_bswap == B_TRUE) {
665 		gcm_ctx->impl = GCM_IMPL_GENERIC;
666 
667 		cmn_err_once(CE_WARN,
668 		    "ICP: Can't use the aes generic or cycle implementations "
669 		    "in combination with the gcm avx or avx2-vaes "
670 		    "implementation!");
671 		cmn_err_once(CE_WARN,
672 		    "ICP: Falling back to a compatible implementation, "
673 		    "aes-gcm performance will likely be degraded.");
674 		cmn_err_once(CE_WARN,
675 		    "ICP: Choose at least the x86_64 aes implementation to "
676 		    "restore performance.");
677 	}
678 
679 	/*
680 	 * AVX implementations use Htable with sizes depending on
681 	 * implementation.
682 	 */
683 	if (gcm_ctx->impl != GCM_IMPL_GENERIC) {
684 		rv = gcm_init_avx(gcm_ctx, iv, iv_len, aad, aad_len,
685 		    block_size);
686 	}
687 	else
688 #endif /* ifdef CAN_USE_GCM_ASM */
689 	if (gcm_init(gcm_ctx, iv, iv_len, aad, aad_len, block_size,
690 	    encrypt_block, copy_block, xor_block) != CRYPTO_SUCCESS) {
691 		rv = CRYPTO_MECHANISM_PARAM_INVALID;
692 	}
693 
694 	return (rv);
695 }
696 
697 void *
gcm_alloc_ctx(int kmflag)698 gcm_alloc_ctx(int kmflag)
699 {
700 	gcm_ctx_t *gcm_ctx;
701 
702 	if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
703 		return (NULL);
704 
705 	gcm_ctx->gcm_flags = GCM_MODE;
706 	return (gcm_ctx);
707 }
708 
709 /* GCM implementation that contains the fastest methods */
710 static gcm_impl_ops_t gcm_fastest_impl = {
711 	.name = "fastest"
712 };
713 
714 /* All compiled in implementations */
715 static const gcm_impl_ops_t *gcm_all_impl[] = {
716 	&gcm_generic_impl,
717 #if defined(__x86_64) && HAVE_SIMD(PCLMULQDQ)
718 	&gcm_pclmulqdq_impl,
719 #endif
720 };
721 
722 /* Indicate that benchmark has been completed */
723 static boolean_t gcm_impl_initialized = B_FALSE;
724 
725 /* Hold all supported implementations */
726 static size_t gcm_supp_impl_cnt = 0;
727 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
728 
729 /*
730  * Returns the GCM operations for encrypt/decrypt/key setup.  When a
731  * SIMD implementation is not allowed in the current context, then
732  * fallback to the fastest generic implementation.
733  */
734 const gcm_impl_ops_t *
gcm_impl_get_ops(void)735 gcm_impl_get_ops(void)
736 {
737 	if (!kfpu_allowed())
738 		return (&gcm_generic_impl);
739 
740 	const gcm_impl_ops_t *ops = NULL;
741 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
742 
743 	switch (impl) {
744 	case IMPL_FASTEST:
745 		ASSERT(gcm_impl_initialized);
746 		ops = &gcm_fastest_impl;
747 		break;
748 	case IMPL_CYCLE:
749 		/* Cycle through supported implementations */
750 		ASSERT(gcm_impl_initialized);
751 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
752 		static size_t cycle_impl_idx = 0;
753 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
754 		ops = gcm_supp_impl[idx];
755 		break;
756 #ifdef CAN_USE_GCM_ASM
757 	case IMPL_AVX:
758 #if CAN_USE_GCM_ASM >= 2
759 	case IMPL_AVX2:
760 #endif
761 		/*
762 		 * Make sure that we return a valid implementation while
763 		 * switching to the avx implementation since there still
764 		 * may be unfinished non-avx contexts around.
765 		 */
766 		ops = &gcm_generic_impl;
767 		break;
768 #endif
769 	default:
770 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
771 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
772 		if (impl < ARRAY_SIZE(gcm_all_impl))
773 			ops = gcm_supp_impl[impl];
774 		break;
775 	}
776 
777 	ASSERT3P(ops, !=, NULL);
778 
779 	return (ops);
780 }
781 
782 /*
783  * Initialize all supported implementations.
784  */
785 void
gcm_impl_init(void)786 gcm_impl_init(void)
787 {
788 	gcm_impl_ops_t *curr_impl;
789 	int i, c;
790 
791 	/* Move supported implementations into gcm_supp_impls */
792 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
793 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
794 
795 		if (curr_impl->is_supported())
796 			gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
797 	}
798 	gcm_supp_impl_cnt = c;
799 
800 	/*
801 	 * Set the fastest implementation given the assumption that the
802 	 * hardware accelerated version is the fastest.
803 	 */
804 #if defined(__x86_64) && HAVE_SIMD(PCLMULQDQ)
805 	if (gcm_pclmulqdq_impl.is_supported()) {
806 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
807 		    sizeof (gcm_fastest_impl));
808 	} else
809 #endif
810 	{
811 		memcpy(&gcm_fastest_impl, &gcm_generic_impl,
812 		    sizeof (gcm_fastest_impl));
813 	}
814 
815 	strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
816 
817 #ifdef CAN_USE_GCM_ASM
818 	/*
819 	 * Use the avx implementation if it's available and the implementation
820 	 * hasn't changed from its default value of fastest on module load.
821 	 */
822 #if CAN_USE_GCM_ASM >= 2
823 	if (gcm_avx2_will_work()) {
824 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
825 			gcm_use_impl(GCM_IMPL_AVX2);
826 		}
827 	} else
828 #endif
829 	if (gcm_avx_will_work()) {
830 #if HAVE_SIMD(MOVBE)
831 		if (zfs_movbe_available() == B_TRUE) {
832 			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
833 		}
834 #endif
835 		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
836 			gcm_use_impl(GCM_IMPL_AVX);
837 		}
838 	}
839 #endif
840 	/* Finish initialization */
841 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
842 	gcm_impl_initialized = B_TRUE;
843 }
844 
845 static const struct {
846 	const char *name;
847 	uint32_t sel;
848 } gcm_impl_opts[] = {
849 		{ "cycle",	IMPL_CYCLE },
850 		{ "fastest",	IMPL_FASTEST },
851 #ifdef CAN_USE_GCM_ASM
852 		{ "avx",	IMPL_AVX },
853 		{ "avx2-vaes",	IMPL_AVX2 },
854 #endif
855 };
856 
857 /*
858  * Function sets desired gcm implementation.
859  *
860  * If we are called before init(), user preference will be saved in
861  * user_sel_impl, and applied in later init() call. This occurs when module
862  * parameter is specified on module load. Otherwise, directly update
863  * icp_gcm_impl.
864  *
865  * @val		Name of gcm implementation to use
866  * @param	Unused.
867  */
868 int
gcm_impl_set(const char * val)869 gcm_impl_set(const char *val)
870 {
871 	int err = -EINVAL;
872 	char req_name[GCM_IMPL_NAME_MAX];
873 	uint32_t impl = GCM_IMPL_READ(user_sel_impl);
874 	size_t i;
875 
876 	/* sanitize input */
877 	i = strnlen(val, GCM_IMPL_NAME_MAX);
878 	if (i == 0 || i >= GCM_IMPL_NAME_MAX)
879 		return (err);
880 
881 	strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
882 	while (i > 0 && isspace(req_name[i-1]))
883 		i--;
884 	req_name[i] = '\0';
885 
886 	/* Check mandatory options */
887 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
888 #ifdef CAN_USE_GCM_ASM
889 #if CAN_USE_GCM_ASM >= 2
890 		/* Ignore avx implementation if it won't work. */
891 		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
892 		    !gcm_avx2_will_work()) {
893 			continue;
894 		}
895 #endif
896 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
897 			continue;
898 		}
899 #endif
900 		if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
901 			impl = gcm_impl_opts[i].sel;
902 			err = 0;
903 			break;
904 		}
905 	}
906 
907 	/* check all supported impl if init() was already called */
908 	if (err != 0 && gcm_impl_initialized) {
909 		/* check all supported implementations */
910 		for (i = 0; i < gcm_supp_impl_cnt; i++) {
911 			if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
912 				impl = i;
913 				err = 0;
914 				break;
915 			}
916 		}
917 	}
918 #ifdef CAN_USE_GCM_ASM
919 	/*
920 	 * Use the avx implementation if available and the requested one is
921 	 * avx or fastest.
922 	 */
923 #if CAN_USE_GCM_ASM >= 2
924 	if (gcm_avx2_will_work() == B_TRUE &&
925 	    (impl == IMPL_AVX2 || impl == IMPL_FASTEST)) {
926 		gcm_use_impl(GCM_IMPL_AVX2);
927 	} else
928 #endif
929 	if (gcm_avx_will_work() == B_TRUE &&
930 	    (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
931 		gcm_use_impl(GCM_IMPL_AVX);
932 	} else {
933 		gcm_use_impl(GCM_IMPL_GENERIC);
934 	}
935 #endif
936 
937 	if (err == 0) {
938 		if (gcm_impl_initialized)
939 			atomic_swap_32(&icp_gcm_impl, impl);
940 		else
941 			atomic_swap_32(&user_sel_impl, impl);
942 	}
943 
944 	return (err);
945 }
946 
947 #if defined(_KERNEL) && defined(__linux__)
948 
949 static int
icp_gcm_impl_set(const char * val,zfs_kernel_param_t * kp)950 icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
951 {
952 	return (gcm_impl_set(val));
953 }
954 
955 static int
icp_gcm_impl_get(char * buffer,zfs_kernel_param_t * kp)956 icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
957 {
958 	int i, cnt = 0;
959 	char *fmt;
960 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
961 
962 	/* list mandatory options */
963 	for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
964 #ifdef CAN_USE_GCM_ASM
965 		/* Ignore avx implementation if it won't work. */
966 #if CAN_USE_GCM_ASM >= 2
967 		if (gcm_impl_opts[i].sel == IMPL_AVX2 &&
968 		    !gcm_avx2_will_work()) {
969 			continue;
970 		}
971 #endif
972 		if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
973 			continue;
974 		}
975 #endif
976 		fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
977 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
978 		    gcm_impl_opts[i].name);
979 	}
980 
981 	/* list all supported implementations */
982 	for (i = 0; i < gcm_supp_impl_cnt; i++) {
983 		fmt = (i == impl) ? "[%s] " : "%s ";
984 		cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt,
985 		    gcm_supp_impl[i]->name);
986 	}
987 
988 	return (cnt);
989 }
990 
991 module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
992     NULL, 0644);
993 MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
#endif /* defined(_KERNEL) && defined(__linux__) */
995 
996 #ifdef CAN_USE_GCM_ASM
997 #define	GCM_BLOCK_LEN 16
998 /*
999  * The openssl asm routines are 6x aggregated and need that many bytes
1000  * at minimum.
1001  */
1002 #define	GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
1003 #define	GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
1004 /*
1005  * Ensure the chunk size is reasonable since we are allocating a
1006  * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts.
1007  */
1008 #define	GCM_AVX_MAX_CHUNK_SIZE \
1009 	(((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
1010 
1011 /* Clear the FPU registers since they hold sensitive internal state. */
1012 #define	clear_fpu_regs() clear_fpu_regs_avx()
1013 
1014 #define	gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
1015 
1016 /* Get the chunk size module parameter. */
1017 #define	GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
1018 
1019 /*
1020  * Module parameter: number of bytes to process at once while owning the FPU.
1021  * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is
1022  * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES.
1023  */
1024 static uint32_t gcm_avx_chunk_size =
1025 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1026 
1027 /*
1028  * GCM definitions: uint128_t is copied from include/crypto/modes.h
1029  * Avoiding u128 because it is already defined in kernel sources.
1030  */
1031 typedef struct {
1032     uint64_t hi, lo;
1033 } uint128_t;
1034 
1035 extern void ASMABI clear_fpu_regs_avx(void);
1036 extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst);
1037 extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr,
1038     const uint32_t pt[4], uint32_t ct[4]);
1039 
1040 extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
1041 #if CAN_USE_GCM_ASM >= 2
1042 extern void ASMABI gcm_init_vpclmulqdq_avx2(uint128_t Htable[16],
1043     const uint64_t H[2]);
1044 #endif
1045 extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
1046     const uint8_t *in, size_t len);
1047 #if CAN_USE_GCM_ASM >= 2
1048 extern void ASMABI gcm_ghash_vpclmulqdq_avx2(uint64_t ghash[2],
1049     const uint64_t *Htable, const uint8_t *in, size_t len);
1050 #endif
/*
 * Update the running GHASH in ctx->gcm_ghash over len bytes of in,
 * dispatching to the asm routine selected for this context at init time.
 * All callers in this file invoke it between kfpu_begin()/kfpu_end() and
 * pass whole GCM blocks.  An unknown ctx->impl is a programming error.
 */
static inline void GHASH_AVX(gcm_ctx_t *ctx, const uint8_t *in, size_t len)
{
	switch (ctx->impl) {
#if CAN_USE_GCM_ASM >= 2
		case GCM_IMPL_AVX2:
			gcm_ghash_vpclmulqdq_avx2(ctx->gcm_ghash,
			    (const uint64_t *)ctx->gcm_Htable, in, len);
			break;
#endif

		case GCM_IMPL_AVX:
			gcm_ghash_avx(ctx->gcm_ghash,
			    (const uint64_t *)ctx->gcm_Htable, in, len);
			break;

		default:
			VERIFY(B_FALSE);
	}
}
1070 
1071 typedef size_t ASMABI aesni_gcm_encrypt_impl(const uint8_t *, uint8_t *,
1072     size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
1073 extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
1074     const void *, uint64_t *, uint64_t *);
1075 #if CAN_USE_GCM_ASM >= 2
1076 extern void ASMABI aes_gcm_enc_update_vaes_avx2(const uint8_t *in,
1077     uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
1078     const uint128_t Htable[16], uint8_t Xi[16]);
1079 #endif
1080 
1081 typedef size_t ASMABI aesni_gcm_decrypt_impl(const uint8_t *, uint8_t *,
1082     size_t, const void *, uint64_t *, const uint64_t *Htable, uint64_t *);
1083 extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
1084     const void *, uint64_t *, uint64_t *);
1085 #if CAN_USE_GCM_ASM >= 2
1086 extern void ASMABI aes_gcm_dec_update_vaes_avx2(const uint8_t *in,
1087     uint8_t *out, size_t len, const void *key, const uint8_t ivec[16],
1088     const uint128_t Htable[16], uint8_t Xi[16]);
1089 #endif
1090 
1091 static inline boolean_t
gcm_avx2_will_work(void)1092 gcm_avx2_will_work(void)
1093 {
1094 	return (kfpu_allowed() &&
1095 	    zfs_avx2_available() && zfs_vaes_available() &&
1096 	    zfs_vpclmulqdq_available());
1097 }
1098 
1099 static inline boolean_t
gcm_avx_will_work(void)1100 gcm_avx_will_work(void)
1101 {
1102 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
1103 	return (kfpu_allowed() &&
1104 	    zfs_avx_available() && zfs_aes_available() &&
1105 	    zfs_pclmulqdq_available());
1106 }
1107 
1108 static inline void
gcm_use_impl(gcm_impl impl)1109 gcm_use_impl(gcm_impl impl)
1110 {
1111 	switch (impl) {
1112 #if CAN_USE_GCM_ASM >= 2
1113 		case GCM_IMPL_AVX2:
1114 			if (gcm_avx2_will_work() == B_TRUE) {
1115 				atomic_swap_32(&gcm_impl_used, impl);
1116 				return;
1117 			}
1118 
1119 			zfs_fallthrough;
1120 #endif
1121 
1122 		case GCM_IMPL_AVX:
1123 			if (gcm_avx_will_work() == B_TRUE) {
1124 				atomic_swap_32(&gcm_impl_used, impl);
1125 				return;
1126 			}
1127 
1128 			zfs_fallthrough;
1129 
1130 		default:
1131 			atomic_swap_32(&gcm_impl_used, GCM_IMPL_GENERIC);
1132 	}
1133 }
1134 
1135 static inline boolean_t
gcm_impl_will_work(gcm_impl impl)1136 gcm_impl_will_work(gcm_impl impl)
1137 {
1138 	switch (impl) {
1139 #if CAN_USE_GCM_ASM >= 2
1140 		case GCM_IMPL_AVX2:
1141 			return (gcm_avx2_will_work());
1142 #endif
1143 
1144 		case GCM_IMPL_AVX:
1145 			return (gcm_avx_will_work());
1146 
1147 		default:
1148 			return (B_TRUE);
1149 	}
1150 }
1151 
/*
 * Advance gcm_impl_used to the next implementation that works on this
 * machine, wrapping modulo GCM_IMPL_MAX; used by the "cycle" selection.
 * Returns the newly selected implementation.
 */
static inline gcm_impl
gcm_toggle_impl(void)
{
	gcm_impl current_impl, new_impl;
	do { /* handle races */
		current_impl = atomic_load_32(&gcm_impl_used);
		new_impl = current_impl;
		while (B_TRUE) { /* skip incompatible implementations */
			new_impl = (new_impl + 1) % GCM_IMPL_MAX;
			if (gcm_impl_will_work(new_impl)) {
				break;
			}
		}

	} while (atomic_cas_32(&gcm_impl_used, current_impl, new_impl) !=
	    current_impl);

	return (new_impl);
}
1171 
1172 
1173 /* Increment the GCM counter block by n. */
static inline void
gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
{
	/* Mask selecting the 32-bit big-endian counter in gcm_cb[1]. */
	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
	uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);

	/* Add n, wrap at 32 bits and splice back, leaving the IV bytes. */
	counter = htonll(counter + n);
	counter &= counter_mask;
	ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
}
1184 
/*
 * Adapter giving the openssl aesni_gcm_encrypt() routine the common
 * aesni_gcm_encrypt_impl signature; that asm routine takes no separate
 * Htable argument, so it is ignored here.
 */
static size_t aesni_gcm_encrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_encrypt(in, out, len, key, iv, Xip));
}
1192 
1193 #if CAN_USE_GCM_ASM >= 2
1194 // kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four
1195 // bits of a |size_t|.
1196 // This is from boringssl/crypto/fipsmodule/aes/gcm.cc.inc
1197 static const size_t kSizeTWithoutLower4Bits = (size_t)-16;
1198 
1199 /* The following CRYPTO methods are from boringssl/crypto/internal.h */
/* Reverse the byte order of a 32-bit value. */
static inline uint32_t CRYPTO_bswap4(uint32_t x) {
	return (((x & 0x000000ffU) << 24) |
	    ((x & 0x0000ff00U) << 8) |
	    ((x & 0x00ff0000U) >> 8) |
	    (x >> 24));
}
1203 
CRYPTO_load_u32_be(const void * in)1204 static inline uint32_t CRYPTO_load_u32_be(const void *in) {
1205 	uint32_t v;
1206 	memcpy(&v, in, sizeof (v));
1207 	return (CRYPTO_bswap4(v));
1208 }
1209 
/*
 * Store a 32-bit value to a possibly unaligned buffer, byte-swapped
 * (big-endian store on the little-endian x86 hosts this file targets).
 */
static inline void CRYPTO_store_u32_be(void *out, uint32_t v) {
	const uint32_t swapped = __builtin_bswap32(v);

	memcpy(out, &swapped, sizeof (swapped));
}
1214 
/*
 * Adapter matching aesni_gcm_encrypt_impl for the boringssl AVX2 routine:
 * encrypt only whole 16-byte blocks (len is rounded down; the caller
 * handles any tail) and advance the 32-bit big-endian counter stored in
 * iv bytes 12..15 by the number of blocks consumed.  Returns the number
 * of bytes processed.
 */
static size_t aesni_gcm_encrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	/* Drop the sub-block tail; only full GCM blocks go to the asm. */
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	/* Keep the counter in step with the blocks just encrypted. */
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
1227 #endif /* if CAN_USE_GCM_ASM >= 2 */
1228 
1229 /*
1230  * Encrypt multiple blocks of data in GCM mode.
1231  * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
1232  * if possible. While processing a chunk the FPU is "locked".
1233  */
static int
gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
    size_t length, crypto_data_t *out, size_t block_size)
{
	size_t bleft = length;		/* input bytes left to process */
	size_t need = 0;		/* bytes needed to fill the remainder */
	size_t done = 0;		/* bytes handled by the asm routine */
	uint8_t *datap = (uint8_t *)data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	/* Pick the bulk asm routine matching this context's impl. */
	aesni_gcm_encrypt_impl *encrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_encrypt_avx2 :
#endif
	    aesni_gcm_encrypt_avx;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint64_t *ghash = ctx->gcm_ghash;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *cb = ctx->gcm_cb;
	uint8_t *ct_buf = NULL;		/* bounce buffer for ciphertext */
	uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);
	/*
	 * If the last call left an incomplete block, try to fill
	 * it first.
	 */
	if (ctx->gcm_remainder_len > 0) {
		need = block_size - ctx->gcm_remainder_len;
		if (length < need) {
			/* Accumulate bytes here and return. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, length);

			ctx->gcm_remainder_len += length;
			if (ctx->gcm_copy_to == NULL) {
				ctx->gcm_copy_to = datap;
			}
			return (CRYPTO_SUCCESS);
		} else {
			/* Complete incomplete block. */
			memcpy((uint8_t *)ctx->gcm_remainder +
			    ctx->gcm_remainder_len, datap, need);

			ctx->gcm_copy_to = NULL;
		}
	}

	/* Allocate a buffer to encrypt to if there is enough input. */
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		ct_buf = vmem_alloc(chunk_size, KM_SLEEP);
		if (ct_buf == NULL) {
			return (CRYPTO_HOST_MEMORY);
		}
	}

	/* If we completed an incomplete block, encrypt and write it out. */
	if (ctx->gcm_remainder_len > 0) {
		kfpu_begin();
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		clear_fpu_regs();
		kfpu_end();
		/*
		 * NOTE(review): rv is not checked here; a failure of this
		 * put would be masked by later assignments to rv.
		 */
		rv = crypto_put_output_data(tmp, out, block_size);
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		bleft -= need;
		datap += need;
		ctx->gcm_remainder_len = 0;
	}

	/* Do the bulk encryption in chunk_size blocks. */
	for (; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		done = encrypt_blocks(
		    datap, ct_buf, chunk_size, key, cb, htable, ghash);

		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			rv = CRYPTO_FAILED;
			goto out_nofpu;
		}
		rv = crypto_put_output_data(ct_buf, out, chunk_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out_nofpu;
		}
		out->cd_offset += chunk_size;
		datap += chunk_size;
		ctx->gcm_processed_data_len += chunk_size;
	}
	/* Check if we are already done. */
	if (bleft == 0) {
		goto out_nofpu;
	}
	/* Bulk encrypt the remaining data. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
		done = encrypt_blocks(datap, ct_buf, bleft, key, cb, htable,
		    ghash);
		if (done == 0) {
			rv = CRYPTO_FAILED;
			goto out;
		}
		rv = crypto_put_output_data(ct_buf, out, done);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += done;
		ctx->gcm_processed_data_len += done;
		datap += done;
		bleft -= done;

	}
	/* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
	while (bleft > 0) {
		if (bleft < block_size) {
			/* Save the tail for the next call or for final. */
			memcpy(ctx->gcm_remainder, datap, bleft);
			ctx->gcm_remainder_len = bleft;
			ctx->gcm_copy_to = datap;
			goto out;
		}
		/* Encrypt, hash and write out. */
		aes_encrypt_intel(key->encr_ks.ks32, key->nr,
		    (const uint32_t *)cb, (uint32_t *)tmp);

		gcm_xor_avx(datap, tmp);
		GHASH_AVX(ctx, tmp, block_size);
		rv = crypto_put_output_data(tmp, out, block_size);
		if (rv != CRYPTO_SUCCESS) {
			goto out;
		}
		out->cd_offset += block_size;
		gcm_incr_counter_block(ctx);
		ctx->gcm_processed_data_len += block_size;
		datap += block_size;
		bleft -= block_size;
	}
out:
	clear_fpu_regs();
	kfpu_end();
out_nofpu:
	if (ct_buf != NULL) {
		vmem_free(ct_buf, chunk_size);
	}
	return (rv);
}
1388 
1389 /*
1390  * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual
1391  * incomplete last block. Encrypt the ICB. Calculate the tag and write it out.
1392  */
static int
gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
	uint32_t *J0 = (uint32_t *)ctx->gcm_J0;	/* initial counter block */
	uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
	size_t rem_len = ctx->gcm_remainder_len;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	/*
	 * NOTE(review): casting keysched back to aes_key_t assumes encr_ks
	 * is the first member of aes_key_t — confirm in aes_impl.h.
	 */
	int aes_rounds = ((aes_key_t *)keysched)->nr;
	int rv;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* Output must hold the remainder plus the authentication tag. */
	if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
		return (CRYPTO_DATA_LEN_RANGE);
	}

	kfpu_begin();
	/* Pad last incomplete block with zeros, encrypt and hash. */
	if (rem_len > 0) {
		uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
		const uint32_t *cb = (uint32_t *)ctx->gcm_cb;

		aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
		memset(remainder + rem_len, 0, block_size - rem_len);
		/* XOR only the valid bytes with the keystream. */
		for (int i = 0; i < rem_len; i++) {
			remainder[i] ^= tmp[i];
		}
		GHASH_AVX(ctx, remainder, block_size);
		ctx->gcm_processed_data_len += rem_len;
		/* No need to increment counter_block, it's the last block. */
	}
	/* Finish tag. */
	ctx->gcm_len_a_len_c[1] =
	    htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
	GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
	/* Tag = GHASH XOR E(K, J0). */
	aes_encrypt_intel(keysched, aes_rounds, J0, J0);

	gcm_xor_avx((uint8_t *)J0, ghash);
	clear_fpu_regs();
	kfpu_end();

	/* Output remainder. */
	if (rem_len > 0) {
		rv = crypto_put_output_data(remainder, out, rem_len);
		if (rv != CRYPTO_SUCCESS)
			return (rv);
	}
	out->cd_offset += rem_len;
	ctx->gcm_remainder_len = 0;
	/* Write out the authentication tag. */
	rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	out->cd_offset += ctx->gcm_tag_len;
	return (CRYPTO_SUCCESS);
}
1452 
/*
 * Adapter giving the openssl aesni_gcm_decrypt() routine the common
 * aesni_gcm_decrypt_impl signature; that asm routine takes no separate
 * Htable argument, so it is ignored here.
 */
static size_t aesni_gcm_decrypt_avx(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	(void) Htable;
	return (aesni_gcm_decrypt(in, out, len, key, iv, Xip));
}
1460 
1461 #if CAN_USE_GCM_ASM >= 2
/*
 * Adapter matching aesni_gcm_decrypt_impl for the boringssl AVX2 routine:
 * decrypt only whole 16-byte blocks (len is rounded down; the caller
 * handles any tail) and advance the 32-bit big-endian counter stored in
 * iv bytes 12..15 by the number of blocks consumed.  Returns the number
 * of bytes processed.
 */
static size_t aesni_gcm_decrypt_avx2(const uint8_t *in, uint8_t *out,
    size_t len, const void *key, uint64_t *iv, const uint64_t *Htable,
    uint64_t *Xip)
{
	uint8_t *ivec = (uint8_t *)iv;
	/* Drop the sub-block tail; only full GCM blocks go to the asm. */
	len &= kSizeTWithoutLower4Bits;
	aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec,
	    (const uint128_t *)Htable, (uint8_t *)Xip);
	/* Keep the counter in step with the blocks just decrypted. */
	CRYPTO_store_u32_be(&ivec[12],
	    CRYPTO_load_u32_be(&ivec[12]) + len / 16);
	return (len);
}
1474 #endif /* if CAN_USE_GCM_ASM >= 2 */
1475 
1476 /*
1477  * Finalize decryption: We just have accumulated crypto text, so now we
1478  * decrypt it here inplace.
1479  */
static int
gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
{
	ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
	ASSERT3U(block_size, ==, 16);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	/* Pick the bulk asm routine matching this context's impl. */
	aesni_gcm_decrypt_impl *decrypt_blocks =
#if CAN_USE_GCM_ASM >= 2
	    ctx->impl == GCM_IMPL_AVX2 ?
	    aesni_gcm_decrypt_avx2 :
#endif
	    aesni_gcm_decrypt_avx;
	/* The accumulated ciphertext ends with the authentication tag. */
	size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
	uint8_t *datap = ctx->gcm_pt_buf;
	const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
	uint32_t *cb = (uint32_t *)ctx->gcm_cb;
	uint64_t *htable = ctx->gcm_Htable;
	uint64_t *ghash = ctx->gcm_ghash;
	uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
	int rv = CRYPTO_SUCCESS;
	size_t bleft, done;

	/*
	 * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
	 * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
	 * GCM_AVX_MIN_DECRYPT_BYTES.
	 */
	for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
		kfpu_begin();
		/* Decrypt in place inside the accumulated buffer. */
		done = decrypt_blocks(datap, datap, chunk_size,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		clear_fpu_regs();
		kfpu_end();
		if (done != chunk_size) {
			return (CRYPTO_FAILED);
		}
		datap += done;
	}
	/* Decrypt remainder, which is less than chunk size, in one go. */
	kfpu_begin();
	if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
		done = decrypt_blocks(datap, datap, bleft,
		    (const void *)key, ctx->gcm_cb, htable, ghash);
		if (done == 0) {
			clear_fpu_regs();
			kfpu_end();
			return (CRYPTO_FAILED);
		}
		datap += done;
		bleft -= done;
	}
	ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);

	/*
	 * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain,
	 * decrypt them block by block.
	 */
	while (bleft > 0) {
		/* Incomplete last block. */
		if (bleft < block_size) {
			uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;

			/* Zero pad for GHASH, then XOR with the keystream. */
			memset(lastb, 0, block_size);
			memcpy(lastb, datap, bleft);
			/* The GCM processing. */
			GHASH_AVX(ctx, lastb, block_size);
			aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
			for (size_t i = 0; i < bleft; i++) {
				datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
			}
			break;
		}
		/* The GCM processing. */
		GHASH_AVX(ctx, datap, block_size);
		aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
		gcm_xor_avx((uint8_t *)tmp, datap);
		gcm_incr_counter_block(ctx);

		datap += block_size;
		bleft -= block_size;
	}
	/*
	 * NOTE(review): rv is still CRYPTO_SUCCESS at this point (nothing
	 * above assigns it after initialization), so this check is dead.
	 */
	if (rv != CRYPTO_SUCCESS) {
		clear_fpu_regs();
		kfpu_end();
		return (rv);
	}
	/* Decryption done, finish the tag. */
	ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
	GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
	aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
	    (uint32_t *)ctx->gcm_J0);

	/* Tag = GHASH XOR E(K, J0). */
	gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);

	/* We are done with the FPU, restore its state. */
	clear_fpu_regs();
	kfpu_end();

	/* Compare the input authentication tag with what we calculated. */
	if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
		/* They don't match. */
		return (CRYPTO_INVALID_MAC);
	}
	rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
	if (rv != CRYPTO_SUCCESS) {
		return (rv);
	}
	out->cd_offset += pt_len;
	return (CRYPTO_SUCCESS);
}
1593 
1594 /*
 * Initialize the GCM params H, Htable and the counter block. Save the
1596  * initial counter block.
1597  */
static int
gcm_init_avx(gcm_ctx_t *ctx, const uint8_t *iv, size_t iv_len,
    const uint8_t *auth_data, size_t auth_data_len, size_t block_size)
{
	uint8_t *cb = (uint8_t *)ctx->gcm_cb;
	uint64_t *H = ctx->gcm_H;
	const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
	int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
	const uint8_t *datap = auth_data;
	size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
	size_t bleft;

	ASSERT(block_size == GCM_BLOCK_LEN);
	ASSERT3S(((aes_key_t *)ctx->gcm_keysched)->ops->needs_byteswap, ==,
	    B_FALSE);

	/* The two implementations need differently sized hash tables. */
	size_t htab_len = 0;
#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		/*
		 * BoringSSL's API specifies uint128_t[16] for htab; but only
		 * uint128_t[12] are used.
		 * See https://github.com/google/boringssl/blob/
		 * 813840dd094f9e9c1b00a7368aa25e656554221f1/crypto/fipsmodule/
		 * modes/asm/aes-gcm-avx2-x86_64.pl#L198-L200
		 */
		htab_len = (2 * 8 * sizeof (uint128_t));
	} else
#endif /* CAN_USE_GCM_ASM >= 2 */
	{
		htab_len = (2 * 6 * sizeof (uint128_t));
	}

	ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP);
	if (ctx->gcm_Htable == NULL) {
		return (CRYPTO_HOST_MEMORY);
	}

	/* Init H (encrypt zero block) and create the initial counter block. */
	memset(H, 0, sizeof (ctx->gcm_H));
	kfpu_begin();
	aes_encrypt_intel(keysched, aes_rounds,
	    (const uint32_t *)H, (uint32_t *)H);

	/* Expand H into the implementation-specific hash table. */
#if CAN_USE_GCM_ASM >= 2
	if (ctx->impl == GCM_IMPL_AVX2) {
		gcm_init_vpclmulqdq_avx2((uint128_t *)ctx->gcm_Htable, H);
	} else
#endif /* if CAN_USE_GCM_ASM >= 2 */
	{
		gcm_init_htab_avx(ctx->gcm_Htable, H);
	}

	if (iv_len == 12) {
		/* Standard 96-bit IV: counter block is IV || 0^31 || 1. */
		memcpy(cb, iv, 12);
		cb[12] = 0;
		cb[13] = 0;
		cb[14] = 0;
		cb[15] = 1;
		/* We need the ICB later. */
		memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0));
	} else {
		/*
		 * Most consumers use 12 byte IVs, so it's OK to use the
		 * original routines for other IV sizes, just avoid nesting
		 * kfpu_begin calls.
		 */
		clear_fpu_regs();
		kfpu_end();
		gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
		    aes_copy_block, aes_xor_block);
		kfpu_begin();
	}

	memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash));

	/* Openssl post increments the counter, adjust for that. */
	gcm_incr_counter_block(ctx);

	/* Ghash AAD in chunk_size blocks. */
	for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
		GHASH_AVX(ctx, datap, chunk_size);
		datap += chunk_size;
		/* Briefly release the FPU between chunks. */
		clear_fpu_regs();
		kfpu_end();
		kfpu_begin();
	}
	/* Ghash the remainder and handle possible incomplete GCM block. */
	if (bleft > 0) {
		size_t incomp = bleft % block_size;

		bleft -= incomp;
		if (bleft > 0) {
			GHASH_AVX(ctx, datap, bleft);
			datap += bleft;
		}
		if (incomp > 0) {
			/* Zero pad and hash incomplete last block. */
			uint8_t *authp = (uint8_t *)ctx->gcm_tmp;

			memset(authp, 0, block_size);
			memcpy(authp, datap, incomp);
			GHASH_AVX(ctx, authp, block_size);
		}
	}
	clear_fpu_regs();
	kfpu_end();
	return (CRYPTO_SUCCESS);
}
1707 
1708 #if defined(_KERNEL)
1709 static int
icp_gcm_avx_set_chunk_size(const char * buf,zfs_kernel_param_t * kp)1710 icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
1711 {
1712 	unsigned long val;
1713 	char val_rounded[16];
1714 	int error = 0;
1715 
1716 	error = kstrtoul(buf, 0, &val);
1717 	if (error)
1718 		return (error);
1719 
1720 	val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
1721 
1722 	if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
1723 		return (-EINVAL);
1724 
1725 	snprintf(val_rounded, 16, "%u", (uint32_t)val);
1726 	error = param_set_uint(val_rounded, kp);
1727 	return (error);
1728 }
1729 
1730 module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
1731     param_get_uint, &gcm_avx_chunk_size, 0644);
1732 
1733 MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
1734 	"How many bytes to process while owning the FPU");
1735 
#endif /* defined(_KERNEL) */
1737 #endif /* ifdef CAN_USE_GCM_ASM */
1738