1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * xsave/xrstor support.
4  *
5  * Author: Suresh Siddha <suresh.b.siddha@intel.com>
6  */
7 #include <linux/bitops.h>
8 #include <linux/compat.h>
9 #include <linux/cpu.h>
10 #include <linux/mman.h>
11 #include <linux/nospec.h>
12 #include <linux/pkeys.h>
13 #include <linux/seq_file.h>
14 #include <linux/proc_fs.h>
15 #include <linux/vmalloc.h>
16 #include <linux/coredump.h>
17 
18 #include <asm/fpu/api.h>
19 #include <asm/fpu/regset.h>
20 #include <asm/fpu/signal.h>
21 #include <asm/fpu/xcr.h>
22 
23 #include <asm/cpuid.h>
24 #include <asm/tlbflush.h>
25 #include <asm/prctl.h>
26 #include <asm/elf.h>
27 
28 #include <uapi/asm/elf.h>
29 
30 #include "context.h"
31 #include "internal.h"
32 #include "legacy.h"
33 #include "xstate.h"
34 
/*
 * Iterate @bit over the bits set in @mask, starting the search at
 * FIRST_EXTENDED_XFEATURE.
 *
 * Note that this expands to TWO statements: an assignment to @bit followed
 * by a for_each_set_bit_from() loop. It must therefore not be used as the
 * unbraced body of an if/else or of another loop. @mask has to be an lvalue
 * because its address is taken and reinterpreted as an unsigned long bitmap.
 */
#define for_each_extended_xfeature(bit, mask)				\
	(bit) = FIRST_EXTENDED_XFEATURE;				\
	for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask))
38 
/*
 * Human readable names of the xstate components, indexed by xfeature
 * number (the bit position in an xfeature mask).
 *
 * Although we spell it out in here, the Processor Trace
 * xfeature is completely unused.  We use other mechanisms
 * to save/restore PT state in Linux.
 *
 * The last entry doubles as the fallback name: cpu_has_xfeatures() clamps
 * any larger index to it.
 *
 * Both the strings and the pointer table itself are read-only.
 */
static const char * const xfeature_names[] =
{
	"x87 floating point registers",
	"SSE registers",
	"AVX registers",
	"MPX bounds registers",
	"MPX CSR",
	"AVX-512 opmask",
	"AVX-512 Hi256",
	"AVX-512 ZMM_Hi256",
	"Processor Trace (unused)",
	"Protection Keys User registers",
	"PASID state",
	"Control-flow User registers",
	"Control-flow Kernel registers (unused)",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"unknown xstate feature",
	"AMX Tile config",
	"AMX Tile data",
	"unknown xstate feature",
};
67 
/*
 * CPUID feature bit that each xfeature depends on, indexed by xfeature
 * number. Slots without an initializer are zero.
 *
 * fpu__init_system_xstate() walks this table and clears every xfeature
 * whose slot is zero (XFEATURE_FP excepted, because X86_FEATURE_FPU itself
 * is 0) or whose CPUID feature bit is not set on the boot CPU.
 */
static unsigned short xsave_cpuid_features[] __initdata = {
	[XFEATURE_FP]				= X86_FEATURE_FPU,
	[XFEATURE_SSE]				= X86_FEATURE_XMM,
	[XFEATURE_YMM]				= X86_FEATURE_AVX,
	[XFEATURE_BNDREGS]			= X86_FEATURE_MPX,
	[XFEATURE_BNDCSR]			= X86_FEATURE_MPX,
	[XFEATURE_OPMASK]			= X86_FEATURE_AVX512F,
	[XFEATURE_ZMM_Hi256]			= X86_FEATURE_AVX512F,
	[XFEATURE_Hi16_ZMM]			= X86_FEATURE_AVX512F,
	[XFEATURE_PT_UNIMPLEMENTED_SO_FAR]	= X86_FEATURE_INTEL_PT,
	[XFEATURE_PKRU]				= X86_FEATURE_OSPKE,
	[XFEATURE_PASID]			= X86_FEATURE_ENQCMD,
	[XFEATURE_CET_USER]			= X86_FEATURE_SHSTK,
	[XFEATURE_XTILE_CFG]			= X86_FEATURE_AMX_TILE,
	[XFEATURE_XTILE_DATA]			= X86_FEATURE_AMX_TILE,
};
84 
/*
 * Per-xfeature layout cache, indexed by xfeature number and filled in by
 * setup_xstate_cache().
 *
 * Offsets and sizes start out as -1, which is all-ones in these unsigned
 * arrays. setup_xstate_cache() leaves the offset of a supervisor feature
 * at that value.
 */
static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
	{ [ 0 ... XFEATURE_MAX - 1] = -1};
/* Raw ECX value of the feature's CPUID_LEAF_XSTATE sub-leaf. */
static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init;

/* Bits tested in xstate_flags[]: */
#define XSTATE_FLAG_SUPERVISOR	BIT(0)
#define XSTATE_FLAG_ALIGNED64	BIT(1)
93 
94 /*
95  * Return whether the system supports a given xfeature.
96  *
97  * Also return the name of the (most advanced) feature that the caller requested:
98  */
cpu_has_xfeatures(u64 xfeatures_needed,const char ** feature_name)99 int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
100 {
101 	u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features;
102 
103 	if (unlikely(feature_name)) {
104 		long xfeature_idx, max_idx;
105 		u64 xfeatures_print;
106 		/*
107 		 * So we use FLS here to be able to print the most advanced
108 		 * feature that was requested but is missing. So if a driver
109 		 * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the
110 		 * missing AVX feature - this is the most informative message
111 		 * to users:
112 		 */
113 		if (xfeatures_missing)
114 			xfeatures_print = xfeatures_missing;
115 		else
116 			xfeatures_print = xfeatures_needed;
117 
118 		xfeature_idx = fls64(xfeatures_print)-1;
119 		max_idx = ARRAY_SIZE(xfeature_names)-1;
120 		xfeature_idx = min(xfeature_idx, max_idx);
121 
122 		*feature_name = xfeature_names[xfeature_idx];
123 	}
124 
125 	if (xfeatures_missing)
126 		return 0;
127 
128 	return 1;
129 }
130 EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
131 
xfeature_is_aligned64(int xfeature_nr)132 static bool xfeature_is_aligned64(int xfeature_nr)
133 {
134 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64;
135 }
136 
xfeature_is_supervisor(int xfeature_nr)137 static bool xfeature_is_supervisor(int xfeature_nr)
138 {
139 	return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR;
140 }
141 
/*
 * Return the byte offset of @xfeature inside an xsave buffer whose
 * compacted layout, if in use, is described by @xcomp_bv.
 */
static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature)
{
	unsigned int pos, nr;

	/* The legacy FP/SSE features always use the cached fixed offsets. */
	if (xfeature <= XFEATURE_SSE)
		return xstate_offsets[xfeature];

	/* So does every other feature in the non-compacted format. */
	if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED))
		return xstate_offsets[xfeature];

	/*
	 * Compacted format: the position depends on which components are
	 * present according to the xcomp_bv header field. Accumulate the
	 * sizes of all components in front of the requested one, applying
	 * 64-byte alignment where a component asks for it.
	 */
	pos = FXSAVE_SIZE + XSAVE_HDR_SIZE;
	for_each_extended_xfeature(nr, xcomp_bv) {
		if (xfeature_is_aligned64(nr))
			pos = ALIGN(pos, 64);
		if (nr == xfeature)
			return pos;
		pos += xstate_sizes[nr];
	}

	return pos;
}
169 
170 /*
171  * Enable the extended processor state save/restore feature.
172  * Called once per CPU onlining.
173  */
fpu__init_cpu_xstate(void)174 void fpu__init_cpu_xstate(void)
175 {
176 	if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features)
177 		return;
178 
179 	cr4_set_bits(X86_CR4_OSXSAVE);
180 
181 	/*
182 	 * Must happen after CR4 setup and before xsetbv() to allow KVM
183 	 * lazy passthrough.  Write independent of the dynamic state static
184 	 * key as that does not work on the boot CPU. This also ensures
185 	 * that any stale state is wiped out from XFD. Reset the per CPU
186 	 * xfd cache too.
187 	 */
188 	if (cpu_feature_enabled(X86_FEATURE_XFD))
189 		xfd_set_state(init_fpstate.xfd);
190 
191 	/*
192 	 * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
193 	 * managed by XSAVE{C, OPT, S} and XRSTOR{S}.  Only XSAVE user
194 	 * states can be set here.
195 	 */
196 	xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
197 
198 	/*
199 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
200 	 */
201 	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
202 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
203 				     xfeatures_mask_independent());
204 	}
205 }
206 
xfeature_enabled(enum xfeature xfeature)207 static bool xfeature_enabled(enum xfeature xfeature)
208 {
209 	return fpu_kernel_cfg.max_features & BIT_ULL(xfeature);
210 }
211 
212 /*
213  * Record the offsets and sizes of various xstates contained
214  * in the XSAVE state memory layout.
215  */
setup_xstate_cache(void)216 static void __init setup_xstate_cache(void)
217 {
218 	u32 eax, ebx, ecx, edx, i;
219 	/* start at the beginning of the "extended state" */
220 	unsigned int last_good_offset = offsetof(struct xregs_state,
221 						 extended_state_area);
222 	/*
223 	 * The FP xstates and SSE xstates are legacy states. They are always
224 	 * in the fixed offsets in the xsave area in either compacted form
225 	 * or standard form.
226 	 */
227 	xstate_offsets[XFEATURE_FP]	= 0;
228 	xstate_sizes[XFEATURE_FP]	= offsetof(struct fxregs_state,
229 						   xmm_space);
230 
231 	xstate_offsets[XFEATURE_SSE]	= xstate_sizes[XFEATURE_FP];
232 	xstate_sizes[XFEATURE_SSE]	= sizeof_field(struct fxregs_state,
233 						       xmm_space);
234 
235 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
236 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
237 
238 		xstate_sizes[i] = eax;
239 		xstate_flags[i] = ecx;
240 
241 		/*
242 		 * If an xfeature is supervisor state, the offset in EBX is
243 		 * invalid, leave it to -1.
244 		 */
245 		if (xfeature_is_supervisor(i))
246 			continue;
247 
248 		xstate_offsets[i] = ebx;
249 
250 		/*
251 		 * In our xstate size checks, we assume that the highest-numbered
252 		 * xstate feature has the highest offset in the buffer.  Ensure
253 		 * it does.
254 		 */
255 		WARN_ONCE(last_good_offset > xstate_offsets[i],
256 			  "x86/fpu: misordered xstate at %d\n", last_good_offset);
257 
258 		last_good_offset = xstate_offsets[i];
259 	}
260 }
261 
262 /*
263  * Print out all the supported xstate features:
264  */
print_xstate_features(void)265 static void __init print_xstate_features(void)
266 {
267 	int i;
268 
269 	for (i = 0; i < XFEATURE_MAX; i++) {
270 		u64 mask = BIT_ULL(i);
271 		const char *name;
272 
273 		if (cpu_has_xfeatures(mask, &name))
274 			pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name);
275 	}
276 }
277 
/*
 * Warn if @nr is not a valid extended xfeature number, i.e. if it is below
 * FIRST_EXTENDED_XFEATURE or not below XFEATURE_MAX.
 *
 * This check is important because it is easy to get XSTATE_*
 * confused with XSTATE_BIT_*.
 *
 * @nr is parenthesized so that an expression argument is compared as a
 * whole.
 */
#define CHECK_XFEATURE(nr) do {				\
	WARN_ON((nr) < FIRST_EXTENDED_XFEATURE);	\
	WARN_ON((nr) >= XFEATURE_MAX);			\
} while (0)
286 
287 /*
288  * Print out xstate component offsets and sizes
289  */
print_xstate_offset_size(void)290 static void __init print_xstate_offset_size(void)
291 {
292 	int i;
293 
294 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
295 		pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n",
296 			i, xfeature_get_offset(fpu_kernel_cfg.max_features, i),
297 			i, xstate_sizes[i]);
298 	}
299 }
300 
301 /*
302  * This function is called only during boot time when x86 caps are not set
303  * up and alternative can not be used yet.
304  */
os_xrstor_booting(struct xregs_state * xstate)305 static __init void os_xrstor_booting(struct xregs_state *xstate)
306 {
307 	u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE;
308 	u32 lmask = mask;
309 	u32 hmask = mask >> 32;
310 	int err;
311 
312 	if (cpu_feature_enabled(X86_FEATURE_XSAVES))
313 		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
314 	else
315 		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
316 
317 	/*
318 	 * We should never fault when copying from a kernel buffer, and the FPU
319 	 * state we set at boot time should be valid.
320 	 */
321 	WARN_ON_FPU(err);
322 }
323 
/*
 * All supported features have either init state all zeros or are
 * handled in setup_init_fpu_buf() individually. This is an explicit
 * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
 * newly added supported features at build time and make people
 * actually look at the init state for the new feature.
 *
 * The BUILD_BUG_ON() in setup_init_fpu_buf() enforces that this list equals
 * XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED.
 */
#define XFEATURES_INIT_FPSTATE_HANDLED		\
	(XFEATURE_MASK_FP |			\
	 XFEATURE_MASK_SSE |			\
	 XFEATURE_MASK_YMM |			\
	 XFEATURE_MASK_OPMASK |			\
	 XFEATURE_MASK_ZMM_Hi256 |		\
	 XFEATURE_MASK_Hi16_ZMM	 |		\
	 XFEATURE_MASK_PKRU |			\
	 XFEATURE_MASK_BNDREGS |		\
	 XFEATURE_MASK_BNDCSR |			\
	 XFEATURE_MASK_PASID |			\
	 XFEATURE_MASK_CET_USER |		\
	 XFEATURE_MASK_XTILE)
344 
345 /*
346  * setup the xstate image representing the init state
347  */
setup_init_fpu_buf(void)348 static void __init setup_init_fpu_buf(void)
349 {
350 	BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
351 		      XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
352 		     XFEATURES_INIT_FPSTATE_HANDLED);
353 
354 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
355 		return;
356 
357 	print_xstate_features();
358 
359 	xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures);
360 
361 	/*
362 	 * Init all the features state with header.xfeatures being 0x0
363 	 */
364 	os_xrstor_booting(&init_fpstate.regs.xsave);
365 
366 	/*
367 	 * All components are now in init state. Read the state back so
368 	 * that init_fpstate contains all non-zero init state. This only
369 	 * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because
370 	 * those use the init optimization which skips writing data for
371 	 * components in init state.
372 	 *
373 	 * XSAVE could be used, but that would require to reshuffle the
374 	 * data when XSAVEC/S is available because XSAVEC/S uses xstate
375 	 * compaction. But doing so is a pointless exercise because most
376 	 * components have an all zeros init state except for the legacy
377 	 * ones (FP and SSE). Those can be saved with FXSAVE into the
378 	 * legacy area. Adding new features requires to ensure that init
379 	 * state is all zeroes or if not to add the necessary handling
380 	 * here.
381 	 */
382 	fxsave(&init_fpstate.regs.fxsave);
383 }
384 
xfeature_size(int xfeature_nr)385 int xfeature_size(int xfeature_nr)
386 {
387 	u32 eax, ebx, ecx, edx;
388 
389 	CHECK_XFEATURE(xfeature_nr);
390 	cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
391 	return eax;
392 }
393 
394 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
validate_user_xstate_header(const struct xstate_header * hdr,struct fpstate * fpstate)395 static int validate_user_xstate_header(const struct xstate_header *hdr,
396 				       struct fpstate *fpstate)
397 {
398 	/* No unknown or supervisor features may be set */
399 	if (hdr->xfeatures & ~fpstate->user_xfeatures)
400 		return -EINVAL;
401 
402 	/* Userspace must use the uncompacted format */
403 	if (hdr->xcomp_bv)
404 		return -EINVAL;
405 
406 	/*
407 	 * If 'reserved' is shrunken to add a new field, make sure to validate
408 	 * that new field here!
409 	 */
410 	BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
411 
412 	/* No reserved bits may be set */
413 	if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
414 		return -EINVAL;
415 
416 	return 0;
417 }
418 
__xstate_dump_leaves(void)419 static void __init __xstate_dump_leaves(void)
420 {
421 	int i;
422 	u32 eax, ebx, ecx, edx;
423 	static int should_dump = 1;
424 
425 	if (!should_dump)
426 		return;
427 	should_dump = 0;
428 	/*
429 	 * Dump out a few leaves past the ones that we support
430 	 * just in case there are some goodies up there
431 	 */
432 	for (i = 0; i < XFEATURE_MAX + 10; i++) {
433 		cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
434 		pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
435 			CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
436 	}
437 }
438 
/*
 * WARN_ONCE() wrapper for XSAVE consistency problems: if condition @x is
 * true, emit the warning (with the "XSAVE consistency problem: " prefix in
 * front of @fmt) and dump the CPUID xstate leaves via
 * __xstate_dump_leaves().
 */
#define XSTATE_WARN_ON(x, fmt, ...) do {					\
	if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) {	\
		__xstate_dump_leaves();						\
	}									\
} while (0)
444 
/*
 * Compare the CPU reported size @sz of xfeature @nr against
 * sizeof(@__struct). A mismatch triggers a WARN_ONCE() and a dump of the
 * CPUID xstate leaves.
 *
 * Note that the expression evaluates to true in either case: a size
 * mismatch is reported but not treated as a failure by the caller.
 */
#define XCHECK_SZ(sz, nr, __struct) ({					\
	if (WARN_ONCE(sz != sizeof(__struct),				\
	    "[%s]: struct is %zu bytes, cpu state %d bytes\n",		\
	    xfeature_names[nr], sizeof(__struct), sz)) {		\
		__xstate_dump_leaves();					\
	}								\
	true;								\
})
453 
454 
455 /**
456  * check_xtile_data_against_struct - Check tile data state size.
457  *
458  * Calculate the state size by multiplying the single tile size which is
459  * recorded in a C struct, and the number of tiles that the CPU informs.
460  * Compare the provided size with the calculation.
461  *
462  * @size:	The tile data state size
463  *
464  * Returns:	0 on success, -EINVAL on mismatch.
465  */
check_xtile_data_against_struct(int size)466 static int __init check_xtile_data_against_struct(int size)
467 {
468 	u32 max_palid, palid, state_size;
469 	u32 eax, ebx, ecx, edx;
470 	u16 max_tile;
471 
472 	/*
473 	 * Check the maximum palette id:
474 	 *   eax: the highest numbered palette subleaf.
475 	 */
476 	cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
477 
478 	/*
479 	 * Cross-check each tile size and find the maximum number of
480 	 * supported tiles.
481 	 */
482 	for (palid = 1, max_tile = 0; palid <= max_palid; palid++) {
483 		u16 tile_size, max;
484 
485 		/*
486 		 * Check the tile size info:
487 		 *   eax[31:16]:  bytes per title
488 		 *   ebx[31:16]:  the max names (or max number of tiles)
489 		 */
490 		cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
491 		tile_size = eax >> 16;
492 		max = ebx >> 16;
493 
494 		if (tile_size != sizeof(struct xtile_data)) {
495 			pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n",
496 			       __stringify(XFEATURE_XTILE_DATA),
497 			       sizeof(struct xtile_data), tile_size);
498 			__xstate_dump_leaves();
499 			return -EINVAL;
500 		}
501 
502 		if (max > max_tile)
503 			max_tile = max;
504 	}
505 
506 	state_size = sizeof(struct xtile_data) * max_tile;
507 	if (size != state_size) {
508 		pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n",
509 		       __stringify(XFEATURE_XTILE_DATA), state_size, size);
510 		__xstate_dump_leaves();
511 		return -EINVAL;
512 	}
513 	return 0;
514 }
515 
516 /*
517  * We have a C struct for each 'xstate'.  We need to ensure
518  * that our software representation matches what the CPU
519  * tells us about the state's size.
520  */
check_xstate_against_struct(int nr)521 static bool __init check_xstate_against_struct(int nr)
522 {
523 	/*
524 	 * Ask the CPU for the size of the state.
525 	 */
526 	int sz = xfeature_size(nr);
527 
528 	/*
529 	 * Match each CPU state with the corresponding software
530 	 * structure.
531 	 */
532 	switch (nr) {
533 	case XFEATURE_YMM:	  return XCHECK_SZ(sz, nr, struct ymmh_struct);
534 	case XFEATURE_BNDREGS:	  return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
535 	case XFEATURE_BNDCSR:	  return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
536 	case XFEATURE_OPMASK:	  return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
537 	case XFEATURE_ZMM_Hi256:  return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
538 	case XFEATURE_Hi16_ZMM:	  return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
539 	case XFEATURE_PKRU:	  return XCHECK_SZ(sz, nr, struct pkru_state);
540 	case XFEATURE_PASID:	  return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
541 	case XFEATURE_XTILE_CFG:  return XCHECK_SZ(sz, nr, struct xtile_cfg);
542 	case XFEATURE_CET_USER:	  return XCHECK_SZ(sz, nr, struct cet_user_state);
543 	case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
544 	default:
545 		XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
546 		return false;
547 	}
548 
549 	return true;
550 }
551 
/*
 * Calculate the size of an xsave buffer that holds the features in
 * @xfeatures, in compacted format if @compacted is true and in standard
 * format otherwise.
 *
 * The size is the offset of the topmost (highest numbered) feature plus its
 * size. If the mask contains nothing beyond the legacy FP/SSE features, or
 * is empty, sizeof(struct xregs_state) is returned.
 */
static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted)
{
	/*
	 * Signed on purpose: fls64() of an empty mask is 0, so topmost
	 * becomes -1 and is caught by the legacy check below instead of
	 * wrapping around to a huge array index.
	 */
	int topmost = fls64(xfeatures) - 1;
	unsigned int offset;

	if (topmost <= XFEATURE_SSE)
		return sizeof(struct xregs_state);

	/* Only index the cached arrays once topmost is known to be valid. */
	if (compacted)
		offset = xfeature_get_offset(xfeatures, topmost);
	else
		offset = xstate_offsets[topmost];

	return offset + xstate_sizes[topmost];
}
564 
565 /*
566  * This essentially double-checks what the cpu told us about
567  * how large the XSAVE buffer needs to be.  We are recalculating
568  * it to be safe.
569  *
570  * Independent XSAVE features allocate their own buffers and are not
571  * covered by these checks. Only the size of the buffer for task->fpu
572  * is checked here.
573  */
paranoid_xstate_size_valid(unsigned int kernel_size)574 static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
575 {
576 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
577 	bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES);
578 	unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
579 	int i;
580 
581 	for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
582 		if (!check_xstate_against_struct(i))
583 			return false;
584 		/*
585 		 * Supervisor state components can be managed only by
586 		 * XSAVES.
587 		 */
588 		if (!xsaves && xfeature_is_supervisor(i)) {
589 			XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
590 			return false;
591 		}
592 	}
593 	size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
594 	XSTATE_WARN_ON(size != kernel_size,
595 		       "size %u != kernel_size %u\n", size, kernel_size);
596 	return size == kernel_size;
597 }
598 
599 /*
600  * Get total size of enabled xstates in XCR0 | IA32_XSS.
601  *
602  * Note the SDM's wording here.  "sub-function 0" only enumerates
603  * the size of the *user* states.  If we use it to size a buffer
604  * that we use 'XSAVES' on, we could potentially overflow the
605  * buffer because 'XSAVES' saves system states too.
606  *
607  * This also takes compaction into account. So this works for
608  * XSAVEC as well.
609  */
get_compacted_size(void)610 static unsigned int __init get_compacted_size(void)
611 {
612 	unsigned int eax, ebx, ecx, edx;
613 	/*
614 	 * - CPUID function 0DH, sub-function 1:
615 	 *    EBX enumerates the size (in bytes) required by
616 	 *    the XSAVES instruction for an XSAVE area
617 	 *    containing all the state components
618 	 *    corresponding to bits currently set in
619 	 *    XCR0 | IA32_XSS.
620 	 *
621 	 * When XSAVES is not available but XSAVEC is (virt), then there
622 	 * are no supervisor states, but XSAVEC still uses compacted
623 	 * format.
624 	 */
625 	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
626 	return ebx;
627 }
628 
629 /*
630  * Get the total size of the enabled xstates without the independent supervisor
631  * features.
632  */
get_xsave_compacted_size(void)633 static unsigned int __init get_xsave_compacted_size(void)
634 {
635 	u64 mask = xfeatures_mask_independent();
636 	unsigned int size;
637 
638 	if (!mask)
639 		return get_compacted_size();
640 
641 	/* Disable independent features. */
642 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
643 
644 	/*
645 	 * Ask the hardware what size is required of the buffer.
646 	 * This is the size required for the task->fpu buffer.
647 	 */
648 	size = get_compacted_size();
649 
650 	/* Re-enable independent features so XSAVES will work on them again. */
651 	wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
652 
653 	return size;
654 }
655 
get_xsave_size_user(void)656 static unsigned int __init get_xsave_size_user(void)
657 {
658 	unsigned int eax, ebx, ecx, edx;
659 	/*
660 	 * - CPUID function 0DH, sub-function 0:
661 	 *    EBX enumerates the size (in bytes) required by
662 	 *    the XSAVE instruction for an XSAVE area
663 	 *    containing all the *user* state components
664 	 *    corresponding to bits currently set in XCR0.
665 	 */
666 	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
667 	return ebx;
668 }
669 
init_xstate_size(void)670 static int __init init_xstate_size(void)
671 {
672 	/* Recompute the context size for enabled features: */
673 	unsigned int user_size, kernel_size, kernel_default_size;
674 	bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED);
675 
676 	/* Uncompacted user space size */
677 	user_size = get_xsave_size_user();
678 
679 	/*
680 	 * XSAVES kernel size includes supervisor states and uses compacted
681 	 * format. XSAVEC uses compacted format, but does not save
682 	 * supervisor states.
683 	 *
684 	 * XSAVE[OPT] do not support supervisor states so kernel and user
685 	 * size is identical.
686 	 */
687 	if (compacted)
688 		kernel_size = get_xsave_compacted_size();
689 	else
690 		kernel_size = user_size;
691 
692 	kernel_default_size =
693 		xstate_calculate_size(fpu_kernel_cfg.default_features, compacted);
694 
695 	if (!paranoid_xstate_size_valid(kernel_size))
696 		return -EINVAL;
697 
698 	fpu_kernel_cfg.max_size = kernel_size;
699 	fpu_user_cfg.max_size = user_size;
700 
701 	fpu_kernel_cfg.default_size = kernel_default_size;
702 	fpu_user_cfg.default_size =
703 		xstate_calculate_size(fpu_user_cfg.default_features, false);
704 
705 	return 0;
706 }
707 
708 /*
709  * We enabled the XSAVE hardware, but something went wrong and
710  * we can not use it.  Disable it.
711  */
fpu__init_disable_system_xstate(unsigned int legacy_size)712 static void __init fpu__init_disable_system_xstate(unsigned int legacy_size)
713 {
714 	fpu_kernel_cfg.max_features = 0;
715 	cr4_clear_bits(X86_CR4_OSXSAVE);
716 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
717 
718 	/* Restore the legacy size.*/
719 	fpu_kernel_cfg.max_size = legacy_size;
720 	fpu_kernel_cfg.default_size = legacy_size;
721 	fpu_user_cfg.max_size = legacy_size;
722 	fpu_user_cfg.default_size = legacy_size;
723 
724 	/*
725 	 * Prevent enabling the static branch which enables writes to the
726 	 * XFD MSR.
727 	 */
728 	init_fpstate.xfd = 0;
729 
730 	fpstate_reset(&current->thread.fpu);
731 }
732 
/*
 * Enable and initialize the xsave feature.
 * Called once per system bootup.
 *
 * @legacy_size: size that is stored back into the kernel and user size
 *		 configuration if XSAVE has to be disabled again.
 *
 * Returns early, without touching any configuration, if the CPU has no FPU
 * or no XSAVE. Every error detected after that point jumps to out_disable,
 * which turns XSAVE off again via fpu__init_disable_system_xstate().
 */
void __init fpu__init_system_xstate(unsigned int legacy_size)
{
	unsigned int eax, ebx, ecx, edx;
	u64 xfeatures;
	int err;
	int i;

	if (!boot_cpu_has(X86_FEATURE_FPU)) {
		pr_info("x86/fpu: No FPU detected\n");
		return;
	}

	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
		pr_info("x86/fpu: x87 FPU will use %s\n",
			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
		return;
	}

	/*
	 * Find user xstates supported by the processor.
	 * Sub-leaf 0 provides the mask in EDX:EAX.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

	/*
	 * Find supervisor xstates supported by the processor.
	 * Sub-leaf 1 provides the mask in EDX:ECX.
	 */
	cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
	fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

	if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
		/*
		 * This indicates that something really unexpected happened
		 * with the enumeration.  Disable XSAVE and try to continue
		 * booting without it.  This is too early to BUG().
		 */
		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
		       fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/* Record the independent features before the mask is filtered below. */
	fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
					      XFEATURE_MASK_INDEPENDENT;

	/*
	 * Clear XSAVE features that are disabled in the normal CPUID.
	 */
	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
		unsigned short cid = xsave_cpuid_features[i];

		/* Careful: X86_FEATURE_FPU is 0! */
		if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid))
			fpu_kernel_cfg.max_features &= ~BIT_ULL(i);
	}

	/* Without XFD the dynamic user features cannot be used. */
	if (!cpu_feature_enabled(X86_FEATURE_XFD))
		fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Supervisor features are kept only when XSAVES is available. */
	if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;
	else
		fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED |
					XFEATURE_MASK_SUPERVISOR_SUPPORTED;

	/* The user configuration is the kernel one minus supervisor features. */
	fpu_user_cfg.max_features = fpu_kernel_cfg.max_features;
	fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED;

	/* Clean out dynamic features from default */
	fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features;
	fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	fpu_user_cfg.default_features = fpu_user_cfg.max_features;
	fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC;

	/* Store it for paranoia check at the end */
	xfeatures = fpu_kernel_cfg.max_features;

	/*
	 * Initialize the default XFD state in init_fpstate and enable the
	 * dynamic sizing mechanism if dynamic states are available.  The
	 * static key cannot be enabled here because this runs before
	 * jump_label_init(). This is delayed to an initcall.
	 */
	init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC;

	/* Set up compaction feature bit */
	if (cpu_feature_enabled(X86_FEATURE_XSAVEC) ||
	    cpu_feature_enabled(X86_FEATURE_XSAVES))
		setup_force_cpu_cap(X86_FEATURE_XCOMPACTED);

	/* Enable xstate instructions to be able to continue with initialization: */
	fpu__init_cpu_xstate();

	/* Cache size, offset and flags for initialization */
	setup_xstate_cache();

	err = init_xstate_size();
	if (err)
		goto out_disable;

	/* Reset the state for the current task */
	fpstate_reset(&current->thread.fpu);

	/*
	 * Update info used for ptrace frames; use standard-format size and no
	 * supervisor xstates:
	 */
	update_regset_xstate_info(fpu_user_cfg.max_size,
				  fpu_user_cfg.max_features);

	/*
	 * init_fpstate excludes dynamic states as they are large but init
	 * state is zero.
	 */
	init_fpstate.size		= fpu_kernel_cfg.default_size;
	init_fpstate.xfeatures		= fpu_kernel_cfg.default_features;

	/* The default state must fit into the init_fpstate buffer. */
	if (init_fpstate.size > sizeof(init_fpstate.regs)) {
		pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n",
			sizeof(init_fpstate.regs), init_fpstate.size);
		goto out_disable;
	}

	setup_init_fpu_buf();

	/*
	 * Paranoia check whether something in the setup modified the
	 * xfeatures mask.
	 */
	if (xfeatures != fpu_kernel_cfg.max_features) {
		pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
		       xfeatures, fpu_kernel_cfg.max_features);
		goto out_disable;
	}

	/*
	 * CPU capabilities initialization runs before FPU init. So
	 * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely
	 * functional, set the feature bit so depending code works.
	 */
	setup_force_cpu_cap(X86_FEATURE_OSXSAVE);

	print_xstate_offset_size();
	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
		fpu_kernel_cfg.max_features,
		fpu_kernel_cfg.max_size,
		boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard");
	return;

out_disable:
	/* something went wrong, try to boot without any XSAVE support */
	fpu__init_disable_system_xstate(legacy_size);
}
890 
891 /*
892  * Restore minimal FPU state after suspend:
893  */
fpu__resume_cpu(void)894 void fpu__resume_cpu(void)
895 {
896 	/*
897 	 * Restore XCR0 on xsave capable CPUs:
898 	 */
899 	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
900 		xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features);
901 
902 	/*
903 	 * Restore IA32_XSS. The same CPUID bit enumerates support
904 	 * of XSAVES and MSR_IA32_XSS.
905 	 */
906 	if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
907 		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
908 				     xfeatures_mask_independent());
909 	}
910 
911 	if (fpu_state_size_dynamic())
912 		wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd);
913 }
914 
915 /*
916  * Given an xstate feature nr, calculate where in the xsave
917  * buffer the state is.  Callers should ensure that the buffer
918  * is valid.
919  */
__raw_xsave_addr(struct xregs_state * xsave,int xfeature_nr)920 static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
921 {
922 	u64 xcomp_bv = xsave->header.xcomp_bv;
923 
924 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
925 		return NULL;
926 
927 	if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) {
928 		if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr))))
929 			return NULL;
930 	}
931 
932 	return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr);
933 }
934 
935 /*
936  * Given the xsave area and a state inside, this function returns the
937  * address of the state.
938  *
939  * This is the API that is called to get xstate address in either
940  * standard format or compacted format of xsave area.
941  *
942  * Note that if there is no data for the field in the xsave buffer
943  * this will return NULL.
944  *
945  * Inputs:
946  *	xstate: the thread's storage area for all FPU data
947  *	xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
948  *	XFEATURE_SSE, etc...)
949  * Output:
950  *	address of the state in the xsave area, or NULL if the
951  *	field is not present in the xsave buffer.
952  */
get_xsave_addr(struct xregs_state * xsave,int xfeature_nr)953 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
954 {
955 	/*
956 	 * Do we even *have* xsave state?
957 	 */
958 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
959 		return NULL;
960 
961 	/*
962 	 * We should not ever be requesting features that we
963 	 * have not enabled.
964 	 */
965 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
966 		return NULL;
967 
968 	/*
969 	 * This assumes the last 'xsave*' instruction to
970 	 * have requested that 'xfeature_nr' be saved.
971 	 * If it did not, we might be seeing and old value
972 	 * of the field in the buffer.
973 	 *
974 	 * This can happen because the last 'xsave' did not
975 	 * request that this feature be saved (unlikely)
976 	 * or because the "init optimization" caused it
977 	 * to not be saved.
978 	 */
979 	if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr)))
980 		return NULL;
981 
982 	return __raw_xsave_addr(xsave, xfeature_nr);
983 }
984 EXPORT_SYMBOL_GPL(get_xsave_addr);
985 
986 /*
987  * Given an xstate feature nr, calculate where in the xsave buffer the state is.
988  * The xsave buffer should be in standard format, not compacted (e.g. user mode
989  * signal frames).
990  */
get_xsave_addr_user(struct xregs_state __user * xsave,int xfeature_nr)991 void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
992 {
993 	if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
994 		return NULL;
995 
996 	return (void __user *)xsave + xstate_offsets[xfeature_nr];
997 }
998 
999 #ifdef CONFIG_ARCH_HAS_PKEYS
1000 
1001 /*
1002  * This will go out and modify PKRU register to set the access
1003  * rights for @pkey to @init_val.
1004  */
arch_set_user_pkey_access(struct task_struct * tsk,int pkey,unsigned long init_val)1005 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
1006 			      unsigned long init_val)
1007 {
1008 	u32 old_pkru, new_pkru_bits = 0;
1009 	int pkey_shift;
1010 
1011 	/*
1012 	 * This check implies XSAVE support.  OSPKE only gets
1013 	 * set if we enable XSAVE and we enable PKU in XCR0.
1014 	 */
1015 	if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
1016 		return -EINVAL;
1017 
1018 	/*
1019 	 * This code should only be called with valid 'pkey'
1020 	 * values originating from in-kernel users.  Complain
1021 	 * if a bad value is observed.
1022 	 */
1023 	if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
1024 		return -EINVAL;
1025 
1026 	/* Set the bits we need in PKRU:  */
1027 	if (init_val & PKEY_DISABLE_ACCESS)
1028 		new_pkru_bits |= PKRU_AD_BIT;
1029 	if (init_val & PKEY_DISABLE_WRITE)
1030 		new_pkru_bits |= PKRU_WD_BIT;
1031 
1032 	/* Shift the bits in to the correct place in PKRU for pkey: */
1033 	pkey_shift = pkey * PKRU_BITS_PER_PKEY;
1034 	new_pkru_bits <<= pkey_shift;
1035 
1036 	/* Get old PKRU and mask off any old bits in place: */
1037 	old_pkru = read_pkru();
1038 	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
1039 
1040 	/* Write old part along with new part: */
1041 	write_pkru(old_pkru | new_pkru_bits);
1042 
1043 	return 0;
1044 }
1045 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
1046 
/*
 * Write @size bytes to @to, taken from @xstate when @from_xstate is
 * true and from @init_xstate otherwise.
 */
static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
			 void *init_xstate, unsigned int size)
{
	void *src = init_xstate;

	if (from_xstate)
		src = xstate;

	membuf_write(to, src, size);
}
1052 
/**
 * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
 * @to:		membuf descriptor
 * @fpstate:	The fpstate buffer from which to copy
 * @xfeatures:	The mask of xfeatures to save (XSAVE mode only)
 * @pkru_val:	The PKRU value to store in the PKRU component
 * @copy_mode:	The requested copy mode
 *
 * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
 * format, i.e. from the kernel internal hardware dependent storage format
 * to the requested @copy_mode. UABI XSTATE is always uncompacted!
 *
 * The legacy area is always emitted. The software reserved bytes, the
 * xstate header and the extended components are only emitted for
 * XSTATE_COPY_XSAVE. Whatever space is left in @to afterwards is zeroed.
 *
 * It supports partial copy but @to.pos always starts from zero.
 */
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
			       u64 xfeatures, u32 pkru_val,
			       enum xstate_copy_mode copy_mode)
{
	const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
	struct xregs_state *xinit = &init_fpstate.regs.xsave;
	struct xregs_state *xsave = &fpstate->regs.xsave;
	struct xstate_header header;
	unsigned int zerofrom;
	u64 mask;
	int i;

	/* Build the header handed to user space; only xfeatures is filled in */
	memset(&header, 0, sizeof(header));
	header.xfeatures = xsave->header.xfeatures;

	/* Mask out the feature bits depending on copy mode */
	switch (copy_mode) {
	case XSTATE_COPY_FP:
		header.xfeatures &= XFEATURE_MASK_FP;
		break;

	case XSTATE_COPY_FX:
		header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
		break;

	case XSTATE_COPY_XSAVE:
		header.xfeatures &= fpstate->user_xfeatures & xfeatures;
		break;
	}

	/*
	 * The legacy area is written piecewise. Each piece comes from
	 * @fpstate when the owning feature is set in the header and from
	 * init_fpstate otherwise.
	 */

	/* Copy FP state up to MXCSR */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
		     &xinit->i387, off_mxcsr);

	/* Copy MXCSR when SSE or YMM are set in the feature mask */
	copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
		     &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
		     MXCSR_AND_FLAGS_SIZE);

	/* Copy the remaining FP state */
	copy_feature(header.xfeatures & XFEATURE_MASK_FP,
		     &to, &xsave->i387.st_space, &xinit->i387.st_space,
		     sizeof(xsave->i387.st_space));

	/* Copy the SSE state - shared with YMM, but independently managed */
	copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
		     &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
		     sizeof(xsave->i387.xmm_space));

	/* The FP and FX modes end after the legacy register state */
	if (copy_mode != XSTATE_COPY_XSAVE)
		goto out;

	/* Zero the padding area */
	membuf_zero(&to, sizeof(xsave->i387.padding));

	/* Copy xsave->i387.sw_reserved */
	membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));

	/* Copy the user space relevant state of @xsave->header */
	membuf_write(&to, &header, sizeof(header));

	/* Extended components start right behind the header */
	zerofrom = offsetof(struct xregs_state, extended_state_area);

	/*
	 * This 'mask' indicates which states to copy from fpstate.
	 * Those extended states that are not present in fpstate are
	 * either disabled or initialized:
	 *
	 * In non-compacted format, disabled features still occupy
	 * state space but there is no state to copy from in the
	 * compacted init_fpstate. The gap tracking will zero these
	 * states.
	 *
	 * The extended features have an all zeroes init state. Thus,
	 * remove them from 'mask' to zero those features in the user
	 * buffer instead of retrieving them from init_fpstate.
	 */
	mask = header.xfeatures;

	for_each_extended_xfeature(i, mask) {
		/*
		 * If there was a feature or alignment gap, zero the space
		 * in the destination buffer.
		 */
		if (zerofrom < xstate_offsets[i])
			membuf_zero(&to, xstate_offsets[i] - zerofrom);

		if (i == XFEATURE_PKRU) {
			struct pkru_state pkru = {0};
			/*
			 * PKRU is not necessarily up to date in the
			 * XSAVE buffer. Use the provided value.
			 */
			pkru.pkru = pkru_val;
			membuf_write(&to, &pkru, sizeof(pkru));
		} else {
			membuf_write(&to,
				     __raw_xsave_addr(xsave, i),
				     xstate_sizes[i]);
		}
		/*
		 * Keep track of the last copied state in the non-compacted
		 * target buffer for gap zeroing.
		 */
		zerofrom = xstate_offsets[i] + xstate_sizes[i];
	}

out:
	/* Zero whatever is left of the destination buffer */
	if (to.left)
		membuf_zero(&to, to.left);
}
1178 
1179 /**
1180  * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
1181  * @to:		membuf descriptor
1182  * @tsk:	The task from which to copy the saved xstate
1183  * @copy_mode:	The requested copy mode
1184  *
1185  * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
1186  * format, i.e. from the kernel internal hardware dependent storage format
1187  * to the requested @mode. UABI XSTATE is always uncompacted!
1188  *
1189  * It supports partial copy but @to.pos always starts from zero.
1190  */
copy_xstate_to_uabi_buf(struct membuf to,struct task_struct * tsk,enum xstate_copy_mode copy_mode)1191 void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
1192 			     enum xstate_copy_mode copy_mode)
1193 {
1194 	__copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate,
1195 				  tsk->thread.fpu.fpstate->user_xfeatures,
1196 				  tsk->thread.pkru, copy_mode);
1197 }
1198 
/*
 * Fetch @size bytes at @offset from the source buffer into @dst. The
 * kernel buffer @kbuf is used when it is non-NULL, otherwise the data
 * is read from the user space buffer @ubuf.
 *
 * Returns 0 on success, -EFAULT when the user space copy fails.
 */
static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
			    const void *kbuf, const void __user *ubuf)
{
	if (!kbuf)
		return copy_from_user(dst, ubuf + offset, size) ? -EFAULT : 0;

	memcpy(dst, kbuf + offset, size);
	return 0;
}
1210 
1211 
/**
 * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
 * @fpstate:	The fpstate buffer to copy to
 * @kbuf:	The UABI format buffer, if it comes from the kernel
 * @ubuf:	The UABI format buffer, if it comes from userspace
 * @pkru:	The location to write the PKRU value to
 *
 * Converts from the UABI format into the kernel internal hardware
 * dependent format.
 *
 * This function ultimately has three different callers with distinct PKRU
 * behavior.
 * 1.	When called from sigreturn the PKRU register will be restored from
 *	@fpstate via an XRSTOR. Correctly copying the UABI format buffer to
 *	@fpstate is sufficient to cover this case, but the caller will also
 *	pass a pointer to the thread_struct's pkru field in @pkru and updating
 *	it is harmless.
 * 2.	When called from ptrace the PKRU register will be restored from the
 *	thread_struct's pkru field. A pointer to that is passed in @pkru.
 *	The kernel will restore it manually, so the XRSTOR behavior that resets
 *	the PKRU register to the hardware init value (0) if the corresponding
 *	xfeatures bit is not set is emulated here.
 * 3.	When called from KVM the PKRU register will be restored from the vcpu's
 *	pkru field. A pointer to that is passed in @pkru. KVM hasn't used
 *	XRSTOR and hasn't had the PKRU resetting behavior described above. To
 *	preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
 *	bit is not set.
 *
 * Note that @pkru is dereferenced unconditionally when the PKRU bit is set
 * in the incoming header, so it must not be NULL in that case.
 *
 * Return: 0 on success, -EFAULT if reading the source buffer fails, -EINVAL
 * if the header or MXCSR validation fails.
 */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
			       const void __user *ubuf, u32 *pkru)
{
	struct xregs_state *xsave = &fpstate->regs.xsave;
	unsigned int offset, size;
	struct xstate_header hdr;
	u64 mask;
	int i;

	/* Fetch and validate the xstate header before touching @fpstate */
	offset = offsetof(struct xregs_state, header);
	if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
		return -EFAULT;

	if (validate_user_xstate_header(&hdr, fpstate))
		return -EINVAL;

	/* Validate MXCSR when any of the related features is in use */
	mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
	if (hdr.xfeatures & mask) {
		/* mxcsr[0] is MXCSR, mxcsr[1] is MXCSR_MASK */
		u32 mxcsr[2];

		offset = offsetof(struct fxregs_state, mxcsr);
		if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
			return -EFAULT;

		/* Reserved bits in MXCSR must be zero. */
		if (mxcsr[0] & ~mxcsr_feature_mask)
			return -EINVAL;

		/* SSE and YMM require MXCSR even when FP is not in use. */
		if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
			xsave->i387.mxcsr = mxcsr[0];
			xsave->i387.mxcsr_mask = mxcsr[1];
		}
	}

	/*
	 * Copy every component which is set in the incoming header from
	 * its offset in the source buffer (xstate_offsets[]) to its
	 * location in the kernel buffer.
	 */
	for (i = 0; i < XFEATURE_MAX; i++) {
		mask = BIT_ULL(i);

		if (hdr.xfeatures & mask) {
			void *dst = __raw_xsave_addr(xsave, i);

			offset = xstate_offsets[i];
			size = xstate_sizes[i];

			if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
				return -EFAULT;
		}
	}

	if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
		struct pkru_state *xpkru;

		/* Hand the just copied PKRU value back to the caller */
		xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
		*pkru = xpkru->pkru;
	} else {
		/*
		 * KVM may pass NULL here to indicate that it does not need
		 * PKRU updated.
		 */
		if (pkru)
			*pkru = 0;
	}

	/*
	 * The state that came in from userspace was user-state only.
	 * Mask all the user states out of 'xfeatures':
	 */
	xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;

	/*
	 * Add back in the features that came in from userspace:
	 */
	xsave->header.xfeatures |= hdr.xfeatures;

	return 0;
}
1317 
1318 /*
1319  * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
1320  * format and copy to the target thread. Used by ptrace and KVM.
1321  */
copy_uabi_from_kernel_to_xstate(struct fpstate * fpstate,const void * kbuf,u32 * pkru)1322 int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
1323 {
1324 	return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
1325 }
1326 
1327 /*
1328  * Convert from a sigreturn standard-format user-space buffer to kernel
1329  * XSAVE[S] format and copy to the target thread. This is called from the
1330  * sigreturn() and rt_sigreturn() system calls.
1331  */
copy_sigframe_from_user_to_xstate(struct task_struct * tsk,const void __user * ubuf)1332 int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
1333 				      const void __user *ubuf)
1334 {
1335 	return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
1336 }
1337 
validate_independent_components(u64 mask)1338 static bool validate_independent_components(u64 mask)
1339 {
1340 	u64 xchk;
1341 
1342 	if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
1343 		return false;
1344 
1345 	xchk = ~xfeatures_mask_independent();
1346 
1347 	if (WARN_ON_ONCE(!mask || mask & xchk))
1348 		return false;
1349 
1350 	return true;
1351 }
1352 
1353 /**
1354  * xsaves - Save selected components to a kernel xstate buffer
1355  * @xstate:	Pointer to the buffer
1356  * @mask:	Feature mask to select the components to save
1357  *
1358  * The @xstate buffer must be 64 byte aligned and correctly initialized as
1359  * XSAVES does not write the full xstate header. Before first use the
1360  * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
1361  * can #GP.
1362  *
1363  * The feature mask must be a subset of the independent features.
1364  */
xsaves(struct xregs_state * xstate,u64 mask)1365 void xsaves(struct xregs_state *xstate, u64 mask)
1366 {
1367 	int err;
1368 
1369 	if (!validate_independent_components(mask))
1370 		return;
1371 
1372 	XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
1373 	WARN_ON_ONCE(err);
1374 }
1375 
1376 /**
1377  * xrstors - Restore selected components from a kernel xstate buffer
1378  * @xstate:	Pointer to the buffer
1379  * @mask:	Feature mask to select the components to restore
1380  *
1381  * The @xstate buffer must be 64 byte aligned and correctly initialized
1382  * otherwise XRSTORS from that buffer can #GP.
1383  *
1384  * Proper usage is to restore the state which was saved with
1385  * xsaves() into @xstate.
1386  *
1387  * The feature mask must be a subset of the independent features.
1388  */
xrstors(struct xregs_state * xstate,u64 mask)1389 void xrstors(struct xregs_state *xstate, u64 mask)
1390 {
1391 	int err;
1392 
1393 	if (!validate_independent_components(mask))
1394 		return;
1395 
1396 	XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
1397 	WARN_ON_ONCE(err);
1398 }
1399 
1400 #if IS_ENABLED(CONFIG_KVM)
fpstate_clear_xstate_component(struct fpstate * fps,unsigned int xfeature)1401 void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature)
1402 {
1403 	void *addr = get_xsave_addr(&fps->regs.xsave, xfeature);
1404 
1405 	if (addr)
1406 		memset(addr, 0, xstate_sizes[xfeature]);
1407 }
1408 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component);
1409 #endif
1410 
1411 #ifdef CONFIG_X86_64
1412 
1413 #ifdef CONFIG_X86_DEBUG_FPU
/*
 * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask
 * can safely operate on the @fpstate buffer.
 *
 * @rstor tells whether the upcoming operation is a restore (true) or a
 * save (false). Returns true if the operation is considered valid.
 */
static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
{
	/* The XFD value tracked for this CPU */
	u64 xfd = __this_cpu_read(xfd_state);

	/* Tracked XFD state and buffer agree: nothing to worry about */
	if (fpstate->xfd == xfd)
		return true;

	 /*
	  * The XFD MSR does not match fpstate->xfd. That's invalid when
	  * the passed in fpstate is current's fpstate.
	  */
	if (fpstate->xfd == current->thread.fpu.fpstate->xfd)
		return false;

	/*
	 * XRSTOR(S) from init_fpstate are always correct as it will just
	 * bring all components into init state and not read from the
	 * buffer. XSAVE(S) raises #PF after init.
	 */
	if (fpstate == &init_fpstate)
		return rstor;

	/*
	 * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
	 * XRSTOR(S): fpu_swap_kvm_fpstate()
	 */

	/*
	 * No XSAVE/XRSTOR instructions (except XSAVE itself) touch
	 * the buffer area for XFD-disabled state components.
	 */
	mask &= ~xfd;

	/*
	 * Remove features which are valid in fpstate. They
	 * have space allocated in fpstate.
	 */
	mask &= ~fpstate->xfeatures;

	/*
	 * Any remaining state components in 'mask' might be written
	 * by XSAVE/XRSTOR. Fail validation if any are found.
	 */
	return !mask;
}
1463 
xfd_validate_state(struct fpstate * fpstate,u64 mask,bool rstor)1464 void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor)
1465 {
1466 	WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor));
1467 }
1468 #endif /* CONFIG_X86_DEBUG_FPU */
1469 
xfd_update_static_branch(void)1470 static int __init xfd_update_static_branch(void)
1471 {
1472 	/*
1473 	 * If init_fpstate.xfd has bits set then dynamic features are
1474 	 * available and the dynamic sizing must be enabled.
1475 	 */
1476 	if (init_fpstate.xfd)
1477 		static_branch_enable(&__fpu_state_size_dynamic);
1478 	return 0;
1479 }
arch_initcall(xfd_update_static_branch)1480 arch_initcall(xfd_update_static_branch)
1481 
1482 void fpstate_free(struct fpu *fpu)
1483 {
1484 	if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate)
1485 		vfree(fpu->fpstate);
1486 }
1487 
/**
 * fpstate_realloc - Reallocate struct fpstate for the requested new features
 *
 * @xfeatures:	A bitmap of xstate features which extend the enabled features
 *		of that task
 * @ksize:	The required size for the kernel buffer
 * @usize:	The required size for user space buffers
 * @guest_fpu:	Pointer to a guest FPU container. NULL for host allocations
 *
 * Allocates a new, zeroed fpstate, seeds its feature masks and XFD value
 * from the fpstate it replaces and installs it in current's FPU and/or
 * in @guest_fpu. The replaced fpstate is freed if it was vmalloc'ed.
 *
 * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer
 * terminates quickly, vfree()-induced IPIs may be a concern, but tasks
 * with large states are likely to live longer.
 *
 * Returns: 0 on success, -ENOMEM on allocation error.
 */
static int fpstate_realloc(u64 xfeatures, unsigned int ksize,
			   unsigned int usize, struct fpu_guest *guest_fpu)
{
	struct fpu *fpu = &current->thread.fpu;
	struct fpstate *curfps, *newfps = NULL;
	unsigned int fpsize;
	bool in_use;

	/* Register buffer size plus the 64 byte aligned struct fpstate head */
	fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64);

	newfps = vzalloc(fpsize);
	if (!newfps)
		return -ENOMEM;
	newfps->size = ksize;
	newfps->user_size = usize;
	newfps->is_valloc = true;

	/*
	 * When a guest FPU is supplied, use @guest_fpu->fpstate
	 * as reference independent of whether it is in use or not.
	 */
	curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate;

	/* Determine whether @curfps is the active fpstate */
	in_use = fpu->fpstate == curfps;

	if (guest_fpu) {
		/* Carry the guest specific properties over */
		newfps->is_guest = true;
		newfps->is_confidential = curfps->is_confidential;
		newfps->in_use = curfps->in_use;
		guest_fpu->xfeatures |= xfeatures;
		guest_fpu->uabi_size = usize;
	}

	fpregs_lock();
	/*
	 * If @curfps is in use, ensure that the current state is in the
	 * registers before swapping fpstate as that might invalidate it
	 * due to layout changes.
	 */
	if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpregs_restore_userregs();

	/* Extend the feature masks and clear the XFD bits of the new features */
	newfps->xfeatures = curfps->xfeatures | xfeatures;
	newfps->user_xfeatures = curfps->user_xfeatures | xfeatures;
	newfps->xfd = curfps->xfd & ~xfeatures;

	/* Do the final updates within the locked region */
	xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures);

	if (guest_fpu) {
		guest_fpu->fpstate = newfps;
		/* If curfps is active, update the FPU fpstate pointer */
		if (in_use)
			fpu->fpstate = newfps;
	} else {
		fpu->fpstate = newfps;
	}

	if (in_use)
		xfd_update_state(fpu->fpstate);
	fpregs_unlock();

	/*
	 * Only free valloc'ed state.
	 * NOTE(review): @curfps was already dereferenced above, so the
	 * NULL check looks purely defensive here.
	 */
	if (curfps && curfps->is_valloc)
		vfree(curfps);

	return 0;
}
1572 
validate_sigaltstack(unsigned int usize)1573 static int validate_sigaltstack(unsigned int usize)
1574 {
1575 	struct task_struct *thread, *leader = current->group_leader;
1576 	unsigned long framesize = get_sigframe_size();
1577 
1578 	lockdep_assert_held(&current->sighand->siglock);
1579 
1580 	/* get_sigframe_size() is based on fpu_user_cfg.max_size */
1581 	framesize -= fpu_user_cfg.max_size;
1582 	framesize += usize;
1583 	for_each_thread(leader, thread) {
1584 		if (thread->sas_ss_size && thread->sas_ss_size < framesize)
1585 			return -ENOSPC;
1586 	}
1587 	return 0;
1588 }
1589 
/*
 * Extend the permitted features of current's thread group by @requested
 * and record the resulting kernel and user state sizes in the group
 * leader's permission struct (guest_perm when @guest, perm otherwise).
 *
 * Returns 0 on success or when @requested is already fully permitted,
 * otherwise the error from validate_sigaltstack() (host only).
 */
static int __xstate_request_perm(u64 permitted, u64 requested, bool guest)
{
	struct fpu *leader_fpu = &current->group_leader->thread.fpu;
	struct fpu_state_perm *perm;
	unsigned int kernel_size, user_size;
	u64 features;
	int err;

	/* Nothing to do when everything requested is permitted already */
	if ((permitted & requested) == requested)
		return 0;

	/*
	 * Kernel state size: permitted plus requested features and, on
	 * the host, the supervisor states as well.
	 *
	 * This deliberately does not exclude !XSAVES as we still might
	 * decide to optionally context switch XCR0 or talk the silicon
	 * vendors into extending XFD for the pre AMX states, especially
	 * AVX512.
	 */
	features = permitted | requested;
	if (!guest)
		features |= xfeatures_mask_supervisor();
	kernel_size = xstate_calculate_size(features,
					    cpu_feature_enabled(X86_FEATURE_XCOMPACTED));

	/* User state size: user supported features, never compacted */
	features &= XFEATURE_MASK_USER_SUPPORTED;
	user_size = xstate_calculate_size(features, false);

	if (!guest) {
		err = validate_sigaltstack(user_size);
		if (err)
			return err;
	}

	perm = guest ? &leader_fpu->guest_perm : &leader_fpu->perm;
	/* Pairs with the READ_ONCE() in xstate_get_group_perm() */
	WRITE_ONCE(perm->__state_perm, features);
	/* Protected by sighand lock */
	perm->__state_size = kernel_size;
	perm->__user_state_size = user_size;
	return 0;
}
1634 
/*
 * Permissions array to map facilities with more than one component.
 *
 * Indexed by the facility number handed to xstate_request_perm(). The
 * value is the feature mask which gets requested for that facility.
 * Entries which are not listed are zero and are rejected by
 * xstate_request_perm() with -EOPNOTSUPP.
 */
static const u64 xstate_prctl_req[XFEATURE_MAX] = {
	[XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA,
};
1641 
/*
 * Request permission for the facility @idx on behalf of current's
 * thread group, either for the host (@guest == false) or for guests.
 *
 * Returns 0 on success or if the permission is already granted,
 * -EINVAL for an out of range @idx, -EOPNOTSUPP for an unknown or
 * unsupported facility, -EBUSY when the guest permissions are locked,
 * or the error returned by __xstate_request_perm().
 */
static int xstate_request_perm(unsigned long idx, bool guest)
{
	u64 granted, wanted;
	int ret;

	if (idx >= XFEATURE_MAX)
		return -EINVAL;

	/*
	 * Translate the facility number into its feature mask. A
	 * facility can require more than one xstate component.
	 */
	idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req));
	wanted = xstate_prctl_req[idx];

	/* Unknown facility or not fully supported by this system */
	if (!wanted || (fpu_user_cfg.max_features & wanted) != wanted)
		return -EOPNOTSUPP;

	/* Lockless quick check */
	if ((xstate_get_group_perm(guest) & wanted) == wanted)
		return 0;

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);
	granted = xstate_get_group_perm(guest);

	/* First vCPU allocation locks the permissions. */
	if (!guest || !(granted & FPU_GUEST_PERM_LOCKED))
		ret = __xstate_request_perm(granted, wanted, guest);
	else
		ret = -EBUSY;
	spin_unlock_irq(&current->sighand->siglock);

	return ret;
}
1679 
/*
 * Enable the dynamic features flagged in @xfd_err for current, or for
 * @guest_fpu when it is non-NULL, by reallocating the fpstate with the
 * sizes recorded in the thread group's permissions.
 *
 * Returns 0 on success or when @xfd_err contains no user dynamic
 * feature, -EPERM when the feature is not permitted and -EFAULT when
 * the reallocation fails.
 */
int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu)
{
	u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC;
	unsigned int ksize = 0, usize = 0;
	struct fpu_state_perm *perm;
	struct fpu *fpu;
	bool permitted;

	if (!xfd_event) {
		if (!guest_fpu)
			pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err);
		return 0;
	}

	/* Protect against concurrent modifications */
	spin_lock_irq(&current->sighand->siglock);

	permitted = (xstate_get_group_perm(!!guest_fpu) & xfd_event) == xfd_event;
	if (permitted) {
		/* Pick up the buffer sizes which go with the permissions */
		fpu = &current->group_leader->thread.fpu;
		perm = guest_fpu ? &fpu->guest_perm : &fpu->perm;
		ksize = perm->__state_size;
		usize = perm->__user_state_size;
	}

	/*
	 * Dropping the lock is safe here even if more features are added
	 * from another task. For a permitted feature the retrieved buffer
	 * sizes are sufficient for the currently requested feature(s).
	 */
	spin_unlock_irq(&current->sighand->siglock);

	/* If not permitted let it die */
	if (!permitted)
		return -EPERM;

	/*
	 * Try to allocate a new fpstate. If that fails there is no way
	 * out.
	 */
	return fpstate_realloc(xfd_event, ksize, usize, guest_fpu) ? -EFAULT : 0;
}
1723 
/* Host variant of __xfd_enable_feature(): no guest FPU is involved. */
int xfd_enable_feature(u64 xfd_err)
{
	struct fpu_guest *no_guest = NULL;

	return __xfd_enable_feature(xfd_err, no_guest);
}
1728 
1729 #else /* CONFIG_X86_64 */
/*
 * Variant for builds without CONFIG_X86_64: permission requests are
 * always rejected with -EPERM, regardless of @idx and @guest.
 */
static inline int xstate_request_perm(unsigned long idx, bool guest)
{
	return -EPERM;
}
1734 #endif  /* !CONFIG_X86_64 */
1735 
xstate_get_guest_group_perm(void)1736 u64 xstate_get_guest_group_perm(void)
1737 {
1738 	return xstate_get_group_perm(true);
1739 }
1740 EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm);
1741 
1742 /**
1743  * fpu_xstate_prctl - xstate permission operations
1744  * @option:	A subfunction of arch_prctl()
1745  * @arg2:	option argument
1746  * Return:	0 if successful; otherwise, an error code
1747  *
1748  * Option arguments:
1749  *
1750  * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info
1751  * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info
1752  * ARCH_REQ_XCOMP_PERM: Facility number requested
1753  *
1754  * For facilities which require more than one XSTATE component, the request
1755  * must be the highest state component number related to that facility,
1756  * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and
1757  * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18).
1758  */
fpu_xstate_prctl(int option,unsigned long arg2)1759 long fpu_xstate_prctl(int option, unsigned long arg2)
1760 {
1761 	u64 __user *uptr = (u64 __user *)arg2;
1762 	u64 permitted, supported;
1763 	unsigned long idx = arg2;
1764 	bool guest = false;
1765 
1766 	switch (option) {
1767 	case ARCH_GET_XCOMP_SUPP:
1768 		supported = fpu_user_cfg.max_features |	fpu_user_cfg.legacy_features;
1769 		return put_user(supported, uptr);
1770 
1771 	case ARCH_GET_XCOMP_PERM:
1772 		/*
1773 		 * Lockless snapshot as it can also change right after the
1774 		 * dropping the lock.
1775 		 */
1776 		permitted = xstate_get_host_group_perm();
1777 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1778 		return put_user(permitted, uptr);
1779 
1780 	case ARCH_GET_XCOMP_GUEST_PERM:
1781 		permitted = xstate_get_guest_group_perm();
1782 		permitted &= XFEATURE_MASK_USER_SUPPORTED;
1783 		return put_user(permitted, uptr);
1784 
1785 	case ARCH_REQ_XCOMP_GUEST_PERM:
1786 		guest = true;
1787 		fallthrough;
1788 
1789 	case ARCH_REQ_XCOMP_PERM:
1790 		if (!IS_ENABLED(CONFIG_X86_64))
1791 			return -EOPNOTSUPP;
1792 
1793 		return xstate_request_perm(idx, guest);
1794 
1795 	default:
1796 		return -EINVAL;
1797 	}
1798 }
1799 
1800 #ifdef CONFIG_PROC_PID_ARCH_STATUS
1801 /*
1802  * Report the amount of time elapsed in millisecond since last AVX512
1803  * use in the task.
1804  */
avx512_status(struct seq_file * m,struct task_struct * task)1805 static void avx512_status(struct seq_file *m, struct task_struct *task)
1806 {
1807 	unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp);
1808 	long delta;
1809 
1810 	if (!timestamp) {
1811 		/*
1812 		 * Report -1 if no AVX512 usage
1813 		 */
1814 		delta = -1;
1815 	} else {
1816 		delta = (long)(jiffies - timestamp);
1817 		/*
1818 		 * Cap to LONG_MAX if time difference > LONG_MAX
1819 		 */
1820 		if (delta < 0)
1821 			delta = LONG_MAX;
1822 		delta = jiffies_to_msecs(delta);
1823 	}
1824 
1825 	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta);
1826 	seq_putc(m, '\n');
1827 }
1828 
1829 /*
1830  * Report architecture specific information
1831  */
proc_pid_arch_status(struct seq_file * m,struct pid_namespace * ns,struct pid * pid,struct task_struct * task)1832 int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
1833 			struct pid *pid, struct task_struct *task)
1834 {
1835 	/*
1836 	 * Report AVX512 state if the processor and build option supported.
1837 	 */
1838 	if (cpu_feature_enabled(X86_FEATURE_AVX512F))
1839 		avx512_status(m, task);
1840 
1841 	return 0;
1842 }
1843 #endif /* CONFIG_PROC_PID_ARCH_STATUS */
1844 
1845 #ifdef CONFIG_COREDUMP
/* Name field of the NT_X86_XSAVE_LAYOUT note; its size includes the NUL. */
static const char owner_name[] = "LINUX";
1847 
1848 /*
1849  * Dump type, size, offset and flag values for every xfeature that is present.
1850  */
dump_xsave_layout_desc(struct coredump_params * cprm)1851 static int dump_xsave_layout_desc(struct coredump_params *cprm)
1852 {
1853 	int num_records = 0;
1854 	int i;
1855 
1856 	for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
1857 		struct x86_xfeat_component xc = {
1858 			.type   = i,
1859 			.size   = xstate_sizes[i],
1860 			.offset = xstate_offsets[i],
1861 			/* reserved for future use */
1862 			.flags  = 0,
1863 		};
1864 
1865 		if (!dump_emit(cprm, &xc, sizeof(xc)))
1866 			return 0;
1867 
1868 		num_records++;
1869 	}
1870 	return num_records;
1871 }
1872 
get_xsave_desc_size(void)1873 static u32 get_xsave_desc_size(void)
1874 {
1875 	u32 cnt = 0;
1876 	u32 i;
1877 
1878 	for_each_extended_xfeature(i, fpu_user_cfg.max_features)
1879 		cnt++;
1880 
1881 	return cnt * (sizeof(struct x86_xfeat_component));
1882 }
1883 
elf_coredump_extra_notes_write(struct coredump_params * cprm)1884 int elf_coredump_extra_notes_write(struct coredump_params *cprm)
1885 {
1886 	int num_records = 0;
1887 	struct elf_note en;
1888 
1889 	if (!fpu_user_cfg.max_features)
1890 		return 0;
1891 
1892 	en.n_namesz = sizeof(owner_name);
1893 	en.n_descsz = get_xsave_desc_size();
1894 	en.n_type = NT_X86_XSAVE_LAYOUT;
1895 
1896 	if (!dump_emit(cprm, &en, sizeof(en)))
1897 		return 1;
1898 	if (!dump_emit(cprm, owner_name, en.n_namesz))
1899 		return 1;
1900 	if (!dump_align(cprm, 4))
1901 		return 1;
1902 
1903 	num_records = dump_xsave_layout_desc(cprm);
1904 	if (!num_records)
1905 		return 1;
1906 
1907 	/* Total size should be equal to the number of records */
1908 	if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
1909 		return 1;
1910 
1911 	return 0;
1912 }
1913 
elf_coredump_extra_notes_size(void)1914 int elf_coredump_extra_notes_size(void)
1915 {
1916 	int size;
1917 
1918 	if (!fpu_user_cfg.max_features)
1919 		return 0;
1920 
1921 	/* .note header */
1922 	size  = sizeof(struct elf_note);
1923 	/*  Name plus alignment to 4 bytes */
1924 	size += roundup(sizeof(owner_name), 4);
1925 	size += get_xsave_desc_size();
1926 
1927 	return size;
1928 }
1929 #endif /* CONFIG_COREDUMP */
1930