// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright(c) 2023 Intel Corporation.
 *
 * Intel Trusted Domain Extensions (TDX) support
 */

#define pr_fmt(fmt)	"virt/tdx: " fmt

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/cpu.h>
#include <linux/spinlock.h>
#include <linux/percpu-defs.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/pfn.h>
#include <linux/align.h>
#include <linux/sort.h>
#include <linux/log2.h>
#include <linux/acpi.h>
#include <linux/suspend.h>
#include <asm/page.h>
#include <asm/special_insns.h>
#include <asm/msr-index.h>
#include <asm/msr.h>
#include <asm/cpufeature.h>
#include <asm/tdx.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/mce.h>
#include "tdx.h"

static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;

static DEFINE_PER_CPU(bool, tdx_lp_initialized);

static struct tdmr_info_list tdx_tdmr_list;

static enum tdx_module_status_t tdx_module_status;
static DEFINE_MUTEX(tdx_module_lock);

/* All TDX-usable memory regions. Protected by mem_hotplug_lock. */
static LIST_HEAD(tdx_memlist);

typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);

static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
{
	pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
}

static inline void seamcall_err_ret(u64 fn, u64 err,
				    struct tdx_module_args *args)
{
	seamcall_err(fn, err, args);
	pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
	       args->rcx, args->rdx, args->r8);
	pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
	       args->r9, args->r10, args->r11);
}

static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
				 u64 fn, struct tdx_module_args *args)
{
	u64 sret = sc_retry(func, fn, args);

	if (sret == TDX_SUCCESS)
		return 0;

	if (sret == TDX_SEAMCALL_VMFAILINVALID)
		return -ENODEV;

	if (sret == TDX_SEAMCALL_GP)
		return -EOPNOTSUPP;

	if (sret == TDX_SEAMCALL_UD)
		return -EACCES;

	err_func(fn, sret, args);
	return -EIO;
}

#define seamcall_prerr(__fn, __args) \
	sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))

#define seamcall_prerr_ret(__fn, __args) \
	sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
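
/*
 * Example use of the wrappers above (a sketch that mirrors
 * read_sys_metadata_field() further below): read one 64-bit global
 * metadata field via TDH.SYS.RD and pick the result out of R8.
 *
 *	struct tdx_module_args args = { .rdx = field_id };
 *
 *	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
 *	if (!ret)
 *		data = args.r8;
 */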

/*
 * Do the module global initialization once and return its result.
 * It can be done on any cpu.  It's always called with interrupts
 * disabled.
 */
static int try_init_module_global(void)
{
	struct tdx_module_args args = {};
	static DEFINE_RAW_SPINLOCK(sysinit_lock);
	static bool sysinit_done;
	static int sysinit_ret;

	lockdep_assert_irqs_disabled();

	raw_spin_lock(&sysinit_lock);

	if (sysinit_done)
		goto out;

	/* RCX is module attributes and all bits are reserved */
	args.rcx = 0;
	sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);

	/*
	 * The first SEAMCALL also detects the TDX module, thus
	 * it can fail if the TDX module is not loaded.  Dump a
	 * message to let the user know.
	 */
	if (sysinit_ret == -ENODEV)
		pr_err("module not loaded\n");

	sysinit_done = true;
out:
	raw_spin_unlock(&sysinit_lock);
	return sysinit_ret;
}

/**
 * tdx_cpu_enable - Enable TDX on local cpu
 *
 * Do one-time TDX module per-cpu initialization SEAMCALL (and TDX module
 * global initialization SEAMCALL if not done) on local cpu to make this
 * cpu ready to run any other SEAMCALLs.
 *
 * Always call this function via IPI function calls.
 *
 * Return 0 on success, otherwise errors.
 */
int tdx_cpu_enable(void)
{
	struct tdx_module_args args = {};
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_irqs_disabled();

	if (__this_cpu_read(tdx_lp_initialized))
		return 0;

	/*
	 * The TDX module global initialization is the very first step
	 * to enable TDX.  Need to do it first (if it hasn't been done)
	 * before the per-cpu initialization.
	 */
	ret = try_init_module_global();
	if (ret)
		return ret;

	ret = seamcall_prerr(TDH_SYS_LP_INIT, &args);
	if (ret)
		return ret;

	__this_cpu_write(tdx_lp_initialized, true);

	return 0;
}
EXPORT_SYMBOL_GPL(tdx_cpu_enable);
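
/*
 * Sketch of how a caller can honor the "via IPI function calls" rule
 * above and run tdx_cpu_enable() on every online CPU (the helper and
 * the error propagation are illustrative, not part of this file):
 *
 *	static void do_tdx_cpu_enable(void *failed)
 *	{
 *		if (tdx_cpu_enable())
 *			atomic_inc((atomic_t *)failed);
 *	}
 *
 *	atomic_t failed = ATOMIC_INIT(0);
 *	on_each_cpu(do_tdx_cpu_enable, &failed, 1);
 */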

/*
 * Add a memory region as a TDX memory block.  The caller must make sure
 * all memory regions are added in address ascending order and don't
 * overlap.
 */
static int add_tdx_memblock(struct list_head *tmb_list, unsigned long start_pfn,
			    unsigned long end_pfn, int nid)
{
	struct tdx_memblock *tmb;

	tmb = kmalloc(sizeof(*tmb), GFP_KERNEL);
	if (!tmb)
		return -ENOMEM;

	INIT_LIST_HEAD(&tmb->list);
	tmb->start_pfn = start_pfn;
	tmb->end_pfn = end_pfn;
	tmb->nid = nid;

	/* @tmb_list is protected by mem_hotplug_lock */
	list_add_tail(&tmb->list, tmb_list);
	return 0;
}

static void free_tdx_memlist(struct list_head *tmb_list)
{
	/* @tmb_list is protected by mem_hotplug_lock */
	while (!list_empty(tmb_list)) {
		struct tdx_memblock *tmb = list_first_entry(tmb_list,
				struct tdx_memblock, list);

		list_del(&tmb->list);
		kfree(tmb);
	}
}

/*
 * Ensure that all memblock memory regions are convertible to TDX
 * memory.  Once this has been established, stash the memblock
 * ranges off in a secondary structure because memblock is modified
 * in memory hotplug while TDX memory regions are fixed.
 */
static int build_tdx_memlist(struct list_head *tmb_list)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, ret;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		/*
		 * The first 1MB is not reported as TDX convertible memory.
		 * Although the first 1MB is always reserved and won't end up
		 * in the page allocator, it is still in memblock's memory
		 * regions.  Skip them manually to exclude them as TDX memory.
		 */
		start_pfn = max(start_pfn, PHYS_PFN(SZ_1M));
		if (start_pfn >= end_pfn)
			continue;

		/*
		 * Add the memory regions as TDX memory.  memblock already
		 * guarantees the regions are in address ascending order
		 * and don't overlap.
		 */
		ret = add_tdx_memblock(tmb_list, start_pfn, end_pfn, nid);
		if (ret)
			goto err;
	}

	return 0;
err:
	free_tdx_memlist(tmb_list);
	return ret;
}

static int read_sys_metadata_field(u64 field_id, u64 *data)
{
	struct tdx_module_args args = {};
	int ret;

	/*
	 * TDH.SYS.RD -- reads one global metadata field
	 *  - RDX (in): the field to read
	 *  - R8 (out): the field data
	 */
	args.rdx = field_id;
	ret = seamcall_prerr_ret(TDH_SYS_RD, &args);
	if (ret)
		return ret;

	*data = args.r8;

	return 0;
}

static int read_sys_metadata_field16(u64 field_id,
				     int offset,
				     struct tdx_tdmr_sysinfo *ts)
{
	u16 *ts_member = ((void *)ts) + offset;
	u64 tmp;
	int ret;

	if (WARN_ON_ONCE(MD_FIELD_ID_ELE_SIZE_CODE(field_id) !=
			MD_FIELD_ID_ELE_SIZE_16BIT))
		return -EINVAL;

	ret = read_sys_metadata_field(field_id, &tmp);
	if (ret)
		return ret;

	*ts_member = tmp;

	return 0;
}

struct field_mapping {
	u64 field_id;
	int offset;
};

#define TD_SYSINFO_MAP(_field_id, _offset) \
	{ .field_id = MD_FIELD_ID_##_field_id, \
	  .offset = offsetof(struct tdx_tdmr_sysinfo, _offset) }

/* Map TD_SYSINFO fields into 'struct tdx_tdmr_sysinfo': */
static const struct field_mapping fields[] = {
	TD_SYSINFO_MAP(MAX_TDMRS,	      max_tdmrs),
	TD_SYSINFO_MAP(MAX_RESERVED_PER_TDMR, max_reserved_per_tdmr),
	TD_SYSINFO_MAP(PAMT_4K_ENTRY_SIZE,    pamt_entry_size[TDX_PS_4K]),
	TD_SYSINFO_MAP(PAMT_2M_ENTRY_SIZE,    pamt_entry_size[TDX_PS_2M]),
	TD_SYSINFO_MAP(PAMT_1G_ENTRY_SIZE,    pamt_entry_size[TDX_PS_1G]),
};

static int get_tdx_tdmr_sysinfo(struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;
	int i;

	/* Populate 'tdmr_sysinfo' fields using the mapping structure above: */
	for (i = 0; i < ARRAY_SIZE(fields); i++) {
		ret = read_sys_metadata_field16(fields[i].field_id,
						fields[i].offset,
						tdmr_sysinfo);
		if (ret)
			return ret;
	}

	return 0;
}

/* Calculate the actual TDMR size */
static int tdmr_size_single(u16 max_reserved_per_tdmr)
{
	int tdmr_sz;

	/*
	 * The actual size of TDMR depends on the maximum
	 * number of reserved areas.
	 */
	tdmr_sz = sizeof(struct tdmr_info);
	tdmr_sz += sizeof(struct tdmr_reserved_area) * max_reserved_per_tdmr;

	return ALIGN(tdmr_sz, TDMR_INFO_ALIGNMENT);
}
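
/*
 * Worked example with illustrative sizes: a 64-byte 'struct tdmr_info'
 * plus 16 reserved areas of 16 bytes each gives 64 + 16 * 16 = 320
 * bytes, which ALIGN() then rounds up to the next TDMR_INFO_ALIGNMENT
 * boundary (512 bytes, assuming that alignment).
 */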

static int alloc_tdmr_list(struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	size_t tdmr_sz, tdmr_array_sz;
	void *tdmr_array;

	tdmr_sz = tdmr_size_single(tdmr_sysinfo->max_reserved_per_tdmr);
	tdmr_array_sz = tdmr_sz * tdmr_sysinfo->max_tdmrs;

	/*
	 * To keep things simple, allocate all TDMRs together.
	 * The buffer needs to be physically contiguous to make
	 * sure each TDMR is physically contiguous.
	 */
	tdmr_array = alloc_pages_exact(tdmr_array_sz,
			GFP_KERNEL | __GFP_ZERO);
	if (!tdmr_array)
		return -ENOMEM;

	tdmr_list->tdmrs = tdmr_array;

	/*
	 * Keep the size of TDMR to find the target TDMR
	 * at a given index in the TDMR list.
	 */
	tdmr_list->tdmr_sz = tdmr_sz;
	tdmr_list->max_tdmrs = tdmr_sysinfo->max_tdmrs;
	tdmr_list->nr_consumed_tdmrs = 0;

	return 0;
}

static void free_tdmr_list(struct tdmr_info_list *tdmr_list)
{
	free_pages_exact(tdmr_list->tdmrs,
			tdmr_list->max_tdmrs * tdmr_list->tdmr_sz);
}

/* Get the TDMR from the list at the given index. */
static struct tdmr_info *tdmr_entry(struct tdmr_info_list *tdmr_list,
				    int idx)
{
	int tdmr_info_offset = tdmr_list->tdmr_sz * idx;

	return (void *)tdmr_list->tdmrs + tdmr_info_offset;
}

#define TDMR_ALIGNMENT		SZ_1G
#define TDMR_ALIGN_DOWN(_addr)	ALIGN_DOWN((_addr), TDMR_ALIGNMENT)
#define TDMR_ALIGN_UP(_addr)	ALIGN((_addr), TDMR_ALIGNMENT)
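
/*
 * For example, a memory region [0x40200000, 0x80100000) expands to the
 * TDMR range [TDMR_ALIGN_DOWN(0x40200000), TDMR_ALIGN_UP(0x80100000))
 * == [0x40000000, 0xc0000000), i.e. 1GB-aligned at both ends.
 */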

static inline u64 tdmr_end(struct tdmr_info *tdmr)
{
	return tdmr->base + tdmr->size;
}

/*
 * Take the memory referenced in @tmb_list and populate the
 * preallocated @tdmr_list, following all the special alignment
 * and size rules for TDMR.
 */
static int fill_out_tdmrs(struct list_head *tmb_list,
			  struct tdmr_info_list *tdmr_list)
{
	struct tdx_memblock *tmb;
	int tdmr_idx = 0;

	/*
	 * Loop over TDX memory regions and fill out TDMRs to cover them.
	 * To keep it simple, always try to use one TDMR to cover one
	 * memory region.
	 *
	 * In practice TDX supports at least 64 TDMRs.  A 2-socket system
	 * typically consumes fewer than 10 of those.  This code is dumb
	 * and simple and may use more TDMRs than is strictly required.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		struct tdmr_info *tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		u64 start, end;

		start = TDMR_ALIGN_DOWN(PFN_PHYS(tmb->start_pfn));
		end   = TDMR_ALIGN_UP(PFN_PHYS(tmb->end_pfn));

		/*
		 * A valid size indicates the current TDMR has already
		 * been filled out to cover the previous memory region(s).
		 */
		if (tdmr->size) {
			/*
			 * Loop to the next if the current memory region
			 * has already been fully covered.
			 */
			if (end <= tdmr_end(tdmr))
				continue;

			/* Otherwise, skip the already covered part. */
			if (start < tdmr_end(tdmr))
				start = tdmr_end(tdmr);

			/*
			 * Create a new TDMR to cover the current memory
			 * region, or the remaining part of it.
			 */
			tdmr_idx++;
			if (tdmr_idx >= tdmr_list->max_tdmrs) {
				pr_warn("initialization failed: TDMRs exhausted.\n");
				return -ENOSPC;
			}

			tdmr = tdmr_entry(tdmr_list, tdmr_idx);
		}

		tdmr->base = start;
		tdmr->size = end - start;
	}

	/* @tdmr_idx is always the index of the last valid TDMR. */
	tdmr_list->nr_consumed_tdmrs = tdmr_idx + 1;

	/*
	 * Warn early that the kernel is about to run out of TDMRs.
	 *
	 * This is an indication that TDMR allocation has to be
	 * reworked to be smarter to not run into an issue.
	 */
	if (tdmr_list->max_tdmrs - tdmr_list->nr_consumed_tdmrs < TDMR_NR_WARN)
		pr_warn("consumed TDMRs reaching limit: %d used out of %d\n",
				tdmr_list->nr_consumed_tdmrs,
				tdmr_list->max_tdmrs);

	return 0;
}
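
/*
 * Example of the coverage rules above: blocks [1MB, 3GB) and [5GB, 9GB)
 * become two TDMRs, [0, 3GB) and [5GB, 9GB).  Had the second block
 * instead started at 2.5GB, the first TDMR [0, 3GB) would already cover
 * part of it, and a second TDMR starting at 3GB would be created for
 * the remainder.
 */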

/*
 * Calculate PAMT size given a TDMR and a page size.  The returned
 * PAMT size is always aligned up to 4K page boundary.
 */
static unsigned long tdmr_get_pamt_sz(struct tdmr_info *tdmr, int pgsz,
				      u16 pamt_entry_size)
{
	unsigned long pamt_sz, nr_pamt_entries;

	switch (pgsz) {
	case TDX_PS_4K:
		nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
		break;
	case TDX_PS_2M:
		nr_pamt_entries = tdmr->size >> PMD_SHIFT;
		break;
	case TDX_PS_1G:
		nr_pamt_entries = tdmr->size >> PUD_SHIFT;
		break;
	default:
		WARN_ON_ONCE(1);
		return 0;
	}

	pamt_sz = nr_pamt_entries * pamt_entry_size;
	/* TDX requires the PAMT size to be 4K aligned */
	pamt_sz = ALIGN(pamt_sz, PAGE_SIZE);

	return pamt_sz;
}
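
/*
 * Worked example for a 1GB TDMR, assuming (illustratively) 16-byte PAMT
 * entries for every page size:
 *
 *	4K: (1GB >> 12) entries * 16 bytes = 4MB
 *	2M: (1GB >> 21) entries * 16 bytes = 8KB
 *	1G: (1GB >> 30) entries * 16 bytes = 16 bytes, ALIGN()ed to 4KB
 *
 * i.e. roughly 1/256th of the TDMR size, dominated by the 4K PAMT.
 */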

/*
 * Locate a NUMA node which should hold the allocation of the @tdmr
 * PAMT.  This node will have some memory covered by the TDMR.  The
 * relative amount of memory covered is not considered.
 */
static int tdmr_get_nid(struct tdmr_info *tdmr, struct list_head *tmb_list)
{
	struct tdx_memblock *tmb;

	/*
	 * A TDMR must cover at least part of one TMB.  That TMB will end
	 * after the TDMR begins.  But, that TMB may have started before
	 * the TDMR.  Find the next 'tmb' that _ends_ after this TDMR
	 * begins.  Ignore 'tmb' start addresses.  They are irrelevant.
	 */
	list_for_each_entry(tmb, tmb_list, list) {
		if (tmb->end_pfn > PHYS_PFN(tdmr->base))
			return tmb->nid;
	}

	/*
	 * Fall back to allocating the TDMR's metadata from node 0 when
	 * no TDX memory block can be found.  This should never happen
	 * since TDMRs originate from TDX memory blocks.
	 */
	pr_warn("TDMR [0x%llx, 0x%llx): unable to find local NUMA node for PAMT allocation, fallback to use node 0.\n",
			tdmr->base, tdmr_end(tdmr));
	return 0;
}

/*
 * Allocate PAMTs from the local NUMA node of some memory in @tmb_list
 * within @tdmr, and set up PAMTs for @tdmr.
 */
static int tdmr_set_up_pamt(struct tdmr_info *tdmr,
			    struct list_head *tmb_list,
			    u16 pamt_entry_size[])
{
	unsigned long pamt_base[TDX_PS_NR];
	unsigned long pamt_size[TDX_PS_NR];
	unsigned long tdmr_pamt_base;
	unsigned long tdmr_pamt_size;
	struct page *pamt;
	int pgsz, nid;

	nid = tdmr_get_nid(tdmr, tmb_list);

	/*
	 * Calculate the PAMT size for each TDX supported page size
	 * and the total PAMT size.
	 */
	tdmr_pamt_size = 0;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_size[pgsz] = tdmr_get_pamt_sz(tdmr, pgsz,
					pamt_entry_size[pgsz]);
		tdmr_pamt_size += pamt_size[pgsz];
	}

	/*
	 * Allocate one chunk of physically contiguous memory for all
	 * PAMTs.  This helps minimize the PAMT's use of reserved areas
	 * in overlapped TDMRs.
	 */
	pamt = alloc_contig_pages(tdmr_pamt_size >> PAGE_SHIFT, GFP_KERNEL,
			nid, &node_online_map);
	if (!pamt)
		return -ENOMEM;

	/*
	 * Break the contiguous allocation back up into the
	 * individual PAMTs for each page size.
	 */
	tdmr_pamt_base = page_to_pfn(pamt) << PAGE_SHIFT;
	for (pgsz = TDX_PS_4K; pgsz < TDX_PS_NR; pgsz++) {
		pamt_base[pgsz] = tdmr_pamt_base;
		tdmr_pamt_base += pamt_size[pgsz];
	}

	tdmr->pamt_4k_base = pamt_base[TDX_PS_4K];
	tdmr->pamt_4k_size = pamt_size[TDX_PS_4K];
	tdmr->pamt_2m_base = pamt_base[TDX_PS_2M];
	tdmr->pamt_2m_size = pamt_size[TDX_PS_2M];
	tdmr->pamt_1g_base = pamt_base[TDX_PS_1G];
	tdmr->pamt_1g_size = pamt_size[TDX_PS_1G];

	return 0;
}
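
/*
 * The resulting layout within the single contiguous allocation is
 * (a sketch, not to scale):
 *
 *	pamt_4k_base               pamt_2m_base  pamt_1g_base
 *	|<----- pamt_4k_size ----->|<- 2m size ->|<- 1g size ->|
 */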

static void tdmr_get_pamt(struct tdmr_info *tdmr, unsigned long *pamt_base,
			  unsigned long *pamt_size)
{
	unsigned long pamt_bs, pamt_sz;

	/*
	 * The PAMT was allocated in one contiguous unit.  The 4K PAMT
	 * should always point to the beginning of that allocation.
	 */
	pamt_bs = tdmr->pamt_4k_base;
	pamt_sz = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;

	WARN_ON_ONCE((pamt_bs & ~PAGE_MASK) || (pamt_sz & ~PAGE_MASK));

	*pamt_base = pamt_bs;
	*pamt_size = pamt_sz;
}

static void tdmr_do_pamt_func(struct tdmr_info *tdmr,
		void (*pamt_func)(unsigned long base, unsigned long size))
{
	unsigned long pamt_base, pamt_size;

	tdmr_get_pamt(tdmr, &pamt_base, &pamt_size);

	/* Do nothing if PAMT hasn't been allocated for this TDMR */
	if (!pamt_size)
		return;

	if (WARN_ON_ONCE(!pamt_base))
		return;

	pamt_func(pamt_base, pamt_size);
}

static void free_pamt(unsigned long pamt_base, unsigned long pamt_size)
{
	free_contig_range(pamt_base >> PAGE_SHIFT, pamt_size >> PAGE_SHIFT);
}

static void tdmr_free_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, free_pamt);
}

static void tdmrs_free_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_free_pamt(tdmr_entry(tdmr_list, i));
}

/* Allocate and set up PAMTs for all TDMRs */
static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
				 struct list_head *tmb_list,
				 u16 pamt_entry_size[])
{
	int i, ret = 0;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		ret = tdmr_set_up_pamt(tdmr_entry(tdmr_list, i), tmb_list,
				pamt_entry_size);
		if (ret)
			goto err;
	}

	return 0;
err:
	tdmrs_free_pamt_all(tdmr_list);
	return ret;
}

/*
 * Convert TDX private pages back to normal by using MOVDIR64B to
 * clear these pages.  Note this function doesn't flush the cache for
 * these TDX private pages.  The caller must make sure of that.
 */
static void reset_tdx_pages(unsigned long base, unsigned long size)
{
	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
	unsigned long phys, end;

	end = base + size;
	for (phys = base; phys < end; phys += 64)
		movdir64b(__va(phys), zero_page);

	/*
	 * MOVDIR64B uses the WC protocol.  Use a memory barrier to
	 * make sure any later user of these pages sees the updated
	 * data.
	 */
	mb();
}

static void tdmr_reset_pamt(struct tdmr_info *tdmr)
{
	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
}

static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
}

static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
{
	unsigned long pamt_size = 0;
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);
		pamt_size += size;
	}

	return pamt_size / 1024;
}

static int tdmr_add_rsvd_area(struct tdmr_info *tdmr, int *p_idx, u64 addr,
			      u64 size, u16 max_reserved_per_tdmr)
{
	struct tdmr_reserved_area *rsvd_areas = tdmr->reserved_areas;
	int idx = *p_idx;

	/* Reserved area must be 4K aligned in offset and size */
	if (WARN_ON(addr & ~PAGE_MASK || size & ~PAGE_MASK))
		return -EINVAL;

	if (idx >= max_reserved_per_tdmr) {
		pr_warn("initialization failed: TDMR [0x%llx, 0x%llx): reserved areas exhausted.\n",
				tdmr->base, tdmr_end(tdmr));
		return -ENOSPC;
	}

	/*
	 * Consume one reserved area per call.  Make no effort to
	 * optimize or reduce the number of reserved areas which are
	 * consumed by contiguous reserved areas, for instance.
	 */
	rsvd_areas[idx].offset = addr - tdmr->base;
	rsvd_areas[idx].size = size;

	*p_idx = idx + 1;

	return 0;
}

/*
 * Go through @tmb_list to find holes between memory areas.  If any of
 * those holes fall within @tdmr, set up a TDMR reserved area to cover
 * the hole.
 */
static int tdmr_populate_rsvd_holes(struct list_head *tmb_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	struct tdx_memblock *tmb;
	u64 prev_end;
	int ret;

	/*
	 * Start looking for reserved blocks at the
	 * beginning of the TDMR.
	 */
	prev_end = tdmr->base;
	list_for_each_entry(tmb, tmb_list, list) {
		u64 start, end;

		start = PFN_PHYS(tmb->start_pfn);
		end   = PFN_PHYS(tmb->end_pfn);

		/* Break if this region is after the TDMR */
		if (start >= tdmr_end(tdmr))
			break;

		/* Exclude regions before this TDMR */
		if (end < tdmr->base)
			continue;

		/*
		 * Skip over memory areas that
		 * have already been dealt with.
		 */
		if (start <= prev_end) {
			prev_end = end;
			continue;
		}

		/* Add the hole before this region */
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				start - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;

		prev_end = end;
	}

	/* Add the hole after the last region if it exists. */
	if (prev_end < tdmr_end(tdmr)) {
		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, prev_end,
				tdmr_end(tdmr) - prev_end,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}
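
/*
 * Example: a TDMR [0, 4GB) covering TDX memory blocks [1MB, 2GB) and
 * [3GB, 4GB) gets two reserved areas from the function above: [0, 1MB)
 * (the hole before the first block) and [2GB, 3GB) (the hole between
 * the blocks).  There is no trailing hole to reserve.
 */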

/*
 * Go through @tdmr_list to find all PAMTs.  If any of those PAMTs
 * overlaps with @tdmr, set up a TDMR reserved area to cover the
 * overlapping part.
 */
static int tdmr_populate_rsvd_pamts(struct tdmr_info_list *tdmr_list,
				    struct tdmr_info *tdmr,
				    int *rsvd_idx,
				    u16 max_reserved_per_tdmr)
{
	int i, ret;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		struct tdmr_info *tmp = tdmr_entry(tdmr_list, i);
		unsigned long pamt_base, pamt_size, pamt_end;

		tdmr_get_pamt(tmp, &pamt_base, &pamt_size);
		/* Each TDMR must already have PAMT allocated */
		WARN_ON_ONCE(!pamt_size || !pamt_base);

		pamt_end = pamt_base + pamt_size;
		/* Skip PAMTs outside of the given TDMR */
		if ((pamt_end <= tdmr->base) ||
				(pamt_base >= tdmr_end(tdmr)))
			continue;

		/* Only mark the part within the TDMR as reserved */
		if (pamt_base < tdmr->base)
			pamt_base = tdmr->base;
		if (pamt_end > tdmr_end(tdmr))
			pamt_end = tdmr_end(tdmr);

		ret = tdmr_add_rsvd_area(tdmr, rsvd_idx, pamt_base,
				pamt_end - pamt_base,
				max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/* Compare function called by sort() for TDMR reserved areas */
static int rsvd_area_cmp_func(const void *a, const void *b)
{
	struct tdmr_reserved_area *r1 = (struct tdmr_reserved_area *)a;
	struct tdmr_reserved_area *r2 = (struct tdmr_reserved_area *)b;

	if (r1->offset + r1->size <= r2->offset)
		return -1;
	if (r1->offset >= r2->offset + r2->size)
		return 1;

	/* Reserved areas cannot overlap.  The caller must guarantee that. */
	WARN_ON_ONCE(1);
	return -1;
}

/*
 * Populate reserved areas for the given @tdmr, including memory holes
 * (via @tmb_list) and PAMTs (via @tdmr_list).
 */
static int tdmr_populate_rsvd_areas(struct tdmr_info *tdmr,
				    struct list_head *tmb_list,
				    struct tdmr_info_list *tdmr_list,
				    u16 max_reserved_per_tdmr)
{
	int ret, rsvd_idx = 0;

	ret = tdmr_populate_rsvd_holes(tmb_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	ret = tdmr_populate_rsvd_pamts(tdmr_list, tdmr, &rsvd_idx,
			max_reserved_per_tdmr);
	if (ret)
		return ret;

	/* TDX requires reserved areas listed in address ascending order */
	sort(tdmr->reserved_areas, rsvd_idx, sizeof(struct tdmr_reserved_area),
			rsvd_area_cmp_func, NULL);

	return 0;
}

/*
 * Populate reserved areas for all TDMRs in @tdmr_list, including memory
 * holes (via @tmb_list) and PAMTs.
 */
static int tdmrs_populate_rsvd_areas_all(struct tdmr_info_list *tdmr_list,
					 struct list_head *tmb_list,
					 u16 max_reserved_per_tdmr)
{
	int i;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = tdmr_populate_rsvd_areas(tdmr_entry(tdmr_list, i),
				tmb_list, tdmr_list, max_reserved_per_tdmr);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Construct a list of TDMRs on the preallocated space in @tdmr_list
 * to cover all TDX memory regions in @tmb_list based on the TDX module
 * TDMR global information in @tdmr_sysinfo.
 */
static int construct_tdmrs(struct list_head *tmb_list,
			   struct tdmr_info_list *tdmr_list,
			   struct tdx_tdmr_sysinfo *tdmr_sysinfo)
{
	int ret;

	ret = fill_out_tdmrs(tmb_list, tdmr_list);
	if (ret)
		return ret;

	ret = tdmrs_set_up_pamt_all(tdmr_list, tmb_list,
			tdmr_sysinfo->pamt_entry_size);
	if (ret)
		return ret;

	ret = tdmrs_populate_rsvd_areas_all(tdmr_list, tmb_list,
			tdmr_sysinfo->max_reserved_per_tdmr);
	if (ret)
		tdmrs_free_pamt_all(tdmr_list);

	/*
	 * The tdmr_info_list is read-only from here on out.
	 * Ensure that these writes are seen by other CPUs.
	 * Pairs with a smp_rmb() in is_pamt_page().
	 */
	smp_wmb();

	return ret;
}

static int config_tdx_module(struct tdmr_info_list *tdmr_list, u64 global_keyid)
{
	struct tdx_module_args args = {};
	u64 *tdmr_pa_array;
	size_t array_sz;
	int i, ret;

	/*
	 * TDMRs are passed to the TDX module via an array of physical
	 * addresses of each TDMR.  The array itself also has certain
	 * alignment requirement.
	 */
	array_sz = tdmr_list->nr_consumed_tdmrs * sizeof(u64);
	array_sz = roundup_pow_of_two(array_sz);
	if (array_sz < TDMR_INFO_PA_ARRAY_ALIGNMENT)
		array_sz = TDMR_INFO_PA_ARRAY_ALIGNMENT;

	tdmr_pa_array = kzalloc(array_sz, GFP_KERNEL);
	if (!tdmr_pa_array)
		return -ENOMEM;

	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
		tdmr_pa_array[i] = __pa(tdmr_entry(tdmr_list, i));

	args.rcx = __pa(tdmr_pa_array);
	args.rdx = tdmr_list->nr_consumed_tdmrs;
	args.r8 = global_keyid;
	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);

	/* Free the array as it is not required anymore. */
	kfree(tdmr_pa_array);

	return ret;
}
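
/*
 * Example of the sizing above, assuming TDMR_INFO_PA_ARRAY_ALIGNMENT is
 * 512: 10 consumed TDMRs need 80 bytes of physical addresses,
 * roundup_pow_of_two() makes that 128, and the floor raises it to 512.
 * A kzalloc() of a power-of-two size is naturally aligned to that size,
 * which satisfies the array's alignment requirement.
 */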

static int do_global_key_config(void *unused)
{
	struct tdx_module_args args = {};

	return seamcall_prerr(TDH_SYS_KEY_CONFIG, &args);
}

/*
 * Attempt to configure the global KeyID on all physical packages.
 *
 * This requires running code on at least one CPU in each package.
 * TDMR initialization will fail if any package in the system has no
 * online CPUs.
 *
 * This code takes no affirmative steps to online CPUs.  Callers (aka.
 * KVM) can ensure success by ensuring sufficient CPUs are online and
 * can run SEAMCALLs.
 */
static int config_global_keyid(void)
{
	cpumask_var_t packages;
	int cpu, ret = -EINVAL;

	if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
		return -ENOMEM;

	/*
	 * Hardware doesn't guarantee cache coherency across different
	 * KeyIDs.  The kernel needs to flush PAMT's dirty cachelines
	 * (associated with KeyID 0) before the TDX module can use the
	 * global KeyID to access the PAMT.  Given PAMTs are potentially
	 * large (~1/256th of system RAM), just use WBINVD.
	 */
	wbinvd_on_all_cpus();

	for_each_online_cpu(cpu) {
		/*
		 * The key configuration only needs to be done once per
		 * package and will return an error if configured more
		 * than once.  Avoid doing it multiple times per package.
		 */
		if (cpumask_test_and_set_cpu(topology_physical_package_id(cpu),
					packages))
			continue;

		/*
		 * TDH.SYS.KEY.CONFIG cannot run concurrently on
		 * different cpus.  Do it one by one.
		 */
		ret = smp_call_on_cpu(cpu, do_global_key_config, NULL, true);
		if (ret)
			break;
	}

	free_cpumask_var(packages);
	return ret;
}

static int init_tdmr(struct tdmr_info *tdmr)
{
	u64 next;

	/*
	 * Initializing a TDMR can be time consuming.  To avoid long
	 * SEAMCALLs, the TDX module may only initialize a part of the
	 * TDMR in each call.
	 */
	do {
		struct tdx_module_args args = {
			.rcx = tdmr->base,
		};
		int ret;

		ret = seamcall_prerr_ret(TDH_SYS_TDMR_INIT, &args);
		if (ret)
			return ret;
		/*
		 * RDX contains the 'next-to-initialize' address if
		 * TDH.SYS.TDMR.INIT did not fully complete and
		 * should be retried.
		 */
		next = args.rdx;
		cond_resched();
		/* Keep making SEAMCALLs until the TDMR is done */
	} while (next < tdmr->base + tdmr->size);

	return 0;
}

static int init_tdmrs(struct tdmr_info_list *tdmr_list)
{
	int i;

	/*
	 * This operation is costly.  It can be parallelized,
	 * but keep it simple for now.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		int ret;

		ret = init_tdmr(tdmr_entry(tdmr_list, i));
		if (ret)
			return ret;
	}

	return 0;
}

static int init_tdx_module(void)
{
	struct tdx_tdmr_sysinfo tdmr_sysinfo;
	int ret;

	/*
	 * To keep things simple, assume that all TDX-protected memory
	 * will come from the page allocator.  Make sure all pages in the
	 * page allocator are TDX-usable memory.
	 *
	 * Build the list of "TDX-usable" memory regions which cover all
	 * pages in the page allocator to guarantee that.  Do it while
	 * holding mem_hotplug_lock read-lock as the memory hotplug code
	 * path reads the @tdx_memlist to reject any new memory.
	 */
	get_online_mems();

	ret = build_tdx_memlist(&tdx_memlist);
	if (ret)
		goto out_put_tdxmem;

	ret = get_tdx_tdmr_sysinfo(&tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Allocate enough space for constructing TDMRs */
	ret = alloc_tdmr_list(&tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdxmem;

	/* Cover all TDX-usable memory regions in TDMRs */
	ret = construct_tdmrs(&tdx_memlist, &tdx_tdmr_list, &tdmr_sysinfo);
	if (ret)
		goto err_free_tdmrs;

	/* Pass the TDMRs and the global KeyID to the TDX module */
	ret = config_tdx_module(&tdx_tdmr_list, tdx_global_keyid);
	if (ret)
		goto err_free_pamts;

	/* Config the key of global KeyID on all packages */
	ret = config_global_keyid();
	if (ret)
		goto err_reset_pamts;

	/* Initialize TDMRs to complete the TDX module initialization */
	ret = init_tdmrs(&tdx_tdmr_list);
	if (ret)
		goto err_reset_pamts;

	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));

out_put_tdxmem:
	/*
	 * @tdx_memlist is written here and read at memory hotplug time.
	 * Lock out memory hotplug code while building it.
	 */
	put_online_mems();
	return ret;

err_reset_pamts:
	/*
	 * Part of PAMTs may already have been initialized by the
	 * TDX module.  Flush cache before returning PAMTs back
	 * to the kernel.
	 */
	wbinvd_on_all_cpus();
	/*
	 * According to the TDX hardware spec, if the platform
	 * doesn't have the "partial write machine check"
	 * erratum, any kernel read/write will never cause #MC
	 * in kernel space, thus it's OK to not convert PAMTs
	 * back to normal.  But do the conversion anyway here
	 * as suggested by the TDX spec.
	 */
	tdmrs_reset_pamt_all(&tdx_tdmr_list);
err_free_pamts:
	tdmrs_free_pamt_all(&tdx_tdmr_list);
err_free_tdmrs:
	free_tdmr_list(&tdx_tdmr_list);
err_free_tdxmem:
	free_tdx_memlist(&tdx_memlist);
	goto out_put_tdxmem;
}

static int __tdx_enable(void)
{
	int ret;

	ret = init_tdx_module();
	if (ret) {
		pr_err("module initialization failed (%d)\n", ret);
		tdx_module_status = TDX_MODULE_ERROR;
		return ret;
	}

	pr_info("module initialized\n");
	tdx_module_status = TDX_MODULE_INITIALIZED;

	return 0;
}

/**
 * tdx_enable - Enable TDX module to make it ready to run TDX guests
 *
 * This function assumes the caller has: 1) held read lock of CPU hotplug
 * lock to prevent any new cpu from becoming online; 2) done both VMXON
 * and tdx_cpu_enable() on all online cpus.
 *
 * This function requires there's at least one online cpu for each CPU
 * package to succeed.
 *
 * This function can be called in parallel by multiple callers.
 *
 * Return 0 if TDX is enabled successfully, otherwise error.
 */
int tdx_enable(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return -ENODEV;

	lockdep_assert_cpus_held();

	mutex_lock(&tdx_module_lock);

	switch (tdx_module_status) {
	case TDX_MODULE_UNINITIALIZED:
		ret = __tdx_enable();
		break;
	case TDX_MODULE_INITIALIZED:
		/* Already initialized, great, tell the caller. */
		ret = 0;
		break;
	default:
		/* Failed to initialize in the previous attempts */
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&tdx_module_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tdx_enable);
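
/*
 * Typical enabling sequence for a caller such as KVM (a sketch; the
 * VMXON helper is illustrative and not defined in this file):
 *
 *	cpus_read_lock();
 *	on_each_cpu(vmxon_and_tdx_cpu_enable, NULL, 1);
 *	ret = tdx_enable();
 *	cpus_read_unlock();
 */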

static bool is_pamt_page(unsigned long phys)
{
	struct tdmr_info_list *tdmr_list = &tdx_tdmr_list;
	int i;

	/* Ensure that all remote 'tdmr_list' writes are visible: */
	smp_rmb();

	/*
	 * The TDX module is no longer returning TDX_SYS_NOT_READY and
	 * is initialized.  The 'tdmr_list' was initialized long ago
	 * and is now read-only.
	 */
	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++) {
		unsigned long base, size;

		tdmr_get_pamt(tdmr_entry(tdmr_list, i), &base, &size);

		if (phys >= base && phys < (base + size))
			return true;
	}

	return false;
}

/*
 * Return whether the memory page at the given physical address is TDX
 * private memory or not.
 *
 * This can be imprecise for two known reasons:
 * 1. PAMTs are private memory and exist before the TDX module is
 *    ready and TDH_PHYMEM_PAGE_RDMD works.  This is a relatively
 *    short window that occurs once per boot.
 * 2. TDH_PHYMEM_PAGE_RDMD reflects the TDX module's knowledge of the
 *    page.  However, the page can still cause #MC until it has been
 *    fully converted to shared using 64-byte writes like MOVDIR64B.
 *    Buggy hosts might still leave #MC-causing memory in place which
 *    this function can not detect.
 */
static bool paddr_is_tdx_private(unsigned long phys)
{
	struct tdx_module_args args = {
		.rcx = phys & PAGE_MASK,
	};
	u64 sret;

	if (!boot_cpu_has(X86_FEATURE_TDX_HOST_PLATFORM))
		return false;

	/* Get page type from the TDX module */
	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);

	/*
	 * The SEAMCALL will not return success unless there is a
	 * working, "ready" TDX module.  Assume an absence of TDX
	 * private pages until SEAMCALL is working.
	 */
	if (sret)
		return false;

	/*
	 * SEAMCALL was successful -- read page type (via RCX):
	 *
	 *  - PT_NDA:	Page is not used by the TDX module
	 *  - PT_RSVD:	Reserved for Non-TDX use
	 *  - Others:	Page is used by the TDX module
	 *
	 * Note PAMT pages are marked as PT_RSVD but they are also TDX
	 * private memory.
	 */
	switch (args.rcx) {
	case PT_NDA:
		return false;
	case PT_RSVD:
		return is_pamt_page(phys);
	default:
		return true;
	}
}

/*
 * Some TDX-capable CPUs have an erratum.  A write to TDX private
 * memory poisons that memory, and a subsequent read of that memory
 * triggers #MC.
 *
 * Help distinguish erratum-triggered #MCs from normal hardware ones.
 * Just print an additional message noting that such an #MC may be a
 * result of the erratum.
 */
const char *tdx_dump_mce_info(struct mce *m)
{
	if (!m || !mce_is_memory_error(m) || !mce_usable_address(m))
		return NULL;

	if (!paddr_is_tdx_private(m->addr))
		return NULL;

	return "TDX private memory error. Possible kernel bug.";
}

static __init int record_keyid_partitioning(u32 *tdx_keyid_start,
					    u32 *nr_tdx_keyids)
{
	u32 _nr_mktme_keyids, _tdx_keyid_start, _nr_tdx_keyids;
	int ret;

	/*
	 * IA32_MKTME_KEYID_PARTITIONING:
	 *   Bit [31:0]:	Number of MKTME KeyIDs.
	 *   Bit [63:32]:	Number of TDX private KeyIDs.
	 */
	ret = rdmsr_safe(MSR_IA32_MKTME_KEYID_PARTITIONING, &_nr_mktme_keyids,
			&_nr_tdx_keyids);
	if (ret || !_nr_tdx_keyids)
		return -EINVAL;

	/* TDX KeyIDs start after the last MKTME KeyID. */
	_tdx_keyid_start = _nr_mktme_keyids + 1;

	*tdx_keyid_start = _tdx_keyid_start;
	*nr_tdx_keyids = _nr_tdx_keyids;

	return 0;
}
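
/*
 * Example with illustrative MSR contents: 31 MKTME KeyIDs and 32 TDX
 * KeyIDs yield tdx_keyid_start == 32, i.e. the TDX private KeyID range
 * [32, 64).  The "+ 1" accounts for KeyID 0, which is the TME KeyID
 * and is not counted in either MSR field.
 */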

static bool is_tdx_memory(unsigned long start_pfn, unsigned long end_pfn)
{
	struct tdx_memblock *tmb;

	/*
	 * This check assumes that the start_pfn<->end_pfn range does not
	 * cross multiple @tdx_memlist entries.  A single memory online
	 * event across multiple memblocks (from which @tdx_memlist
	 * entries are derived at the time of module initialization) is
	 * not possible.  This is because memory offline/online is done
	 * on granularity of 'struct memory_block', and the hotpluggable
	 * memory region (one memblock) must be a multiple of the
	 * memory_block size.
	 */
	list_for_each_entry(tmb, &tdx_memlist, list) {
		if (start_pfn >= tmb->start_pfn && end_pfn <= tmb->end_pfn)
			return true;
	}
	return false;
}

static int tdx_memory_notifier(struct notifier_block *nb, unsigned long action,
			       void *v)
{
	struct memory_notify *mn = v;

	if (action != MEM_GOING_ONLINE)
		return NOTIFY_OK;

	/*
	 * Empty list means TDX isn't enabled.  Allow any memory
	 * to go online.
	 */
	if (list_empty(&tdx_memlist))
		return NOTIFY_OK;

	/*
	 * The TDX memory configuration is static and can not be
	 * changed.  Reject onlining any memory which is outside of
	 * the static configuration whether it supports TDX or not.
	 */
	if (is_tdx_memory(mn->start_pfn, mn->start_pfn + mn->nr_pages))
		return NOTIFY_OK;

	return NOTIFY_BAD;
}

static struct notifier_block tdx_memory_nb = {
	.notifier_call = tdx_memory_notifier,
};

static void __init check_tdx_erratum(void)
{
	/*
	 * These CPUs have an erratum.  A partial write from non-TD
	 * software (e.g. via MOVNTI variants or UC/WC mapping) to TDX
	 * private memory poisons that memory, and a subsequent read of
	 * that memory triggers #MC.
	 */
	switch (boot_cpu_data.x86_model) {
	case INTEL_FAM6_SAPPHIRERAPIDS_X:
	case INTEL_FAM6_EMERALDRAPIDS_X:
		setup_force_cpu_bug(X86_BUG_TDX_PW_MCE);
	}
}

void __init tdx_init(void)
{
	u32 tdx_keyid_start, nr_tdx_keyids;
	int err;

	err = record_keyid_partitioning(&tdx_keyid_start, &nr_tdx_keyids);
	if (err)
		return;

	pr_info("BIOS enabled: private KeyID range [%u, %u)\n",
			tdx_keyid_start, tdx_keyid_start + nr_tdx_keyids);

	/*
	 * The TDX module itself requires one 'global KeyID' to protect
	 * its metadata.  If there's only one TDX KeyID, there won't be
	 * any left for TDX guests, thus there's no point in enabling
	 * TDX at all.
	 */
	if (nr_tdx_keyids < 2) {
		pr_err("initialization failed: too few private KeyIDs available.\n");
		return;
	}

	/*
	 * At this point, hibernation_available() indicates whether or
	 * not hibernation support has been permanently disabled.
	 */
	if (hibernation_available()) {
		pr_err("initialization failed: Hibernation support is enabled\n");
		return;
	}

	err = register_memory_notifier(&tdx_memory_nb);
	if (err) {
		pr_err("initialization failed: register_memory_notifier() failed (%d)\n",
				err);
		return;
	}

#if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND)
	pr_info("Disable ACPI S3. Turn off TDX in the BIOS to use ACPI S3.\n");
	acpi_suspend_lowlevel = NULL;
#endif

	/*
	 * Just use the first TDX KeyID as the 'global KeyID' and
	 * leave the rest for TDX guests.
	 */
	tdx_global_keyid = tdx_keyid_start;
	tdx_guest_keyid_start = tdx_keyid_start + 1;
	tdx_nr_guest_keyids = nr_tdx_keyids - 1;

	setup_force_cpu_cap(X86_FEATURE_TDX_HOST_PLATFORM);

	check_tdx_erratum();
}