1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/cleanup.h>
3 #include <linux/cpu.h>
4 #include <asm/cpufeature.h>
5 #include <asm/fpu/xcr.h>
6 #include <linux/misc_cgroup.h>
7 #include <linux/mmu_context.h>
8 #include <asm/tdx.h>
9 #include "capabilities.h"
10 #include "mmu.h"
11 #include "x86_ops.h"
12 #include "lapic.h"
13 #include "tdx.h"
14 #include "vmx.h"
15 #include "mmu/spte.h"
16 #include "common.h"
17 #include "posted_intr.h"
18 #include "irq.h"
19 #include <trace/events/kvm.h>
20 #include "trace.h"
21
22 #pragma GCC poison to_vmx
23
24 #undef pr_fmt
25 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
26
27 #define pr_tdx_error(__fn, __err) \
28 pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
29
30 #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
31 pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
32
33 #define pr_tdx_error_1(__fn, __err, __rcx) \
34 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
35
36 #define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
37 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
38
39 #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
40 __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
41
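/* Opt in to TDX support; read-only module parameter (typically "kvm_intel.tdx=1"). */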
42 bool enable_tdx __ro_after_init;
43 module_param_named(tdx, enable_tdx, bool, 0444);
44
45 #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
46 #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
47
48 static enum cpuhp_state tdx_cpuhp_state;
49
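/* Global metadata reported by the TDX module, cached when TDX support is initialized. */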
50 static const struct tdx_sys_info *tdx_sysinfo;
51
52 void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
53 {
54 KVM_BUG_ON(1, tdx->vcpu.kvm);
55 pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
56 }
57
58 void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
59 u64 val, u64 err)
60 {
61 KVM_BUG_ON(1, tdx->vcpu.kvm);
62 pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
63 }
64
65 #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
66
67 static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
68 {
69 return container_of(kvm, struct kvm_tdx, kvm);
70 }
71
72 static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
73 {
74 return container_of(vcpu, struct vcpu_tdx, vcpu);
75 }
76
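/*
 * Intersect KVM's supported TD attributes with the TDX module's constraints:
 * every bit in attributes_fixed1 must be supportable by KVM, and bits cleared
 * in attributes_fixed0 are masked off.  Returns 0 if the fixed1 requirement
 * cannot be met.  The XFAM helper below follows the same scheme.
 */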
77 static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
78 {
79 u64 val = KVM_SUPPORTED_TD_ATTRS;
80
81 if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
82 return 0;
83
84 val &= td_conf->attributes_fixed0;
85
86 return val;
87 }
88
89 static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
90 {
91 u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
92
93 if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
94 return 0;
95
96 val &= td_conf->xfam_fixed0;
97
98 return val;
99 }
100
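/*
 * CPUID.0x80000008:EAX[23:16] reports the guest physical address width;
 * these helpers extract and overwrite that field.
 */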
101 static int tdx_get_guest_phys_addr_bits(const u32 eax)
102 {
103 return (eax & GENMASK(23, 16)) >> 16;
104 }
105
106 static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
107 {
108 return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
109 }
110
111 #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
112
113 static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
114 {
115 return entry->function == 7 && entry->index == 0 &&
116 (entry->ebx & TDX_FEATURE_TSX);
117 }
118
119 static void clear_tsx(struct kvm_cpuid_entry2 *entry)
120 {
121 entry->ebx &= ~TDX_FEATURE_TSX;
122 }
123
124 static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
125 {
126 return entry->function == 7 && entry->index == 0 &&
127 (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
128 }
129
130 static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
131 {
132 entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
133 }
134
135 static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
136 {
137 if (has_tsx(entry))
138 clear_tsx(entry);
139
140 if (has_waitpkg(entry))
141 clear_waitpkg(entry);
142 }
143
144 static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
145 {
146 return has_tsx(entry) || has_waitpkg(entry);
147 }
148
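/* The TDX module uses (u32)-1 as the sub-leaf value for CPUID leaves that have no sub-leaf. */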
149 #define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
150
151 static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
152 {
153 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
154
155 entry->function = (u32)td_conf->cpuid_config_leaves[idx];
156 entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
157 entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
158 entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
159 entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
160 entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
161
162 if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
163 entry->index = 0;
164
165 /*
166 * The TDX module doesn't allow configuring the guest phys addr bits
167 * (EAX[23:16]). However, KVM uses it as an interface for userspace
168 * to configure the GPAW. Report these bits as configurable.
169 */
170 if (entry->function == 0x80000008)
171 entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
172
173 tdx_clear_unsupported_cpuid(entry);
174 }
175
176 #define TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT BIT(1)
177
178 static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
179 struct kvm_tdx_capabilities *caps)
180 {
181 int i;
182
183 caps->supported_attrs = tdx_get_supported_attrs(td_conf);
184 if (!caps->supported_attrs)
185 return -EIO;
186
187 caps->supported_xfam = tdx_get_supported_xfam(td_conf);
188 if (!caps->supported_xfam)
189 return -EIO;
190
191 caps->cpuid.nent = td_conf->num_cpuid_config;
192
193 caps->user_tdvmcallinfo_1_r11 =
194 TDVMCALLINFO_SETUP_EVENT_NOTIFY_INTERRUPT;
195
196 for (i = 0; i < td_conf->num_cpuid_config; i++)
197 td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
198
199 return 0;
200 }
201
202 /*
203 * Some SEAMCALLs acquire a TDX-module-wide lock and can fail with
204 * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
205 */
206 static DEFINE_MUTEX(tdx_lock);
207
208 static atomic_t nr_configured_hkid;
209
210 static bool tdx_operand_busy(u64 err)
211 {
212 return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
213 }
214
215
216 /*
217 * A per-CPU list of TD vCPUs associated with a given CPU.
218 * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
219 * list.
220 * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
221 * the old CPU during the IPI callback running on the old CPU, and then added
222 * to the per-CPU list of the new CPU.
223 * - When a TD is tearing down, all vCPUs are disassociated from their current
224 * running CPUs and removed from the per-CPU list during the IPI callback
225 * running on those CPUs.
226 * - When a CPU is brought down, traverse the per-CPU list to disassociate all
227 * associated TD vCPUs and remove them from the per-CPU list.
228 */
229 static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
230
231 static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
232 {
233 return to_tdx(vcpu)->vp_enter_args.r10;
234 }
235
236 static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
237 {
238 return to_tdx(vcpu)->vp_enter_args.r11;
239 }
240
241 static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
242 long val)
243 {
244 to_tdx(vcpu)->vp_enter_args.r10 = val;
245 }
246
247 static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
248 unsigned long val)
249 {
250 to_tdx(vcpu)->vp_enter_args.r11 = val;
251 }
252
253 static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
254 {
255 tdx_guest_keyid_free(kvm_tdx->hkid);
256 kvm_tdx->hkid = -1;
257 atomic_dec(&nr_configured_hkid);
258 misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
259 put_misc_cg(kvm_tdx->misc_cg);
260 kvm_tdx->misc_cg = NULL;
261 }
262
263 static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
264 {
265 return kvm_tdx->hkid > 0;
266 }
267
268 static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
269 {
270 lockdep_assert_irqs_disabled();
271
272 list_del(&to_tdx(vcpu)->cpu_list);
273
274 /*
275 * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
276 * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
277 * to its list before it's deleted from this CPU's list.
278 */
279 smp_wmb();
280
281 vcpu->cpu = -1;
282 }
283
284 static void tdx_clear_page(struct page *page)
285 {
286 const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
287 void *dest = page_to_virt(page);
288 unsigned long i;
289
290 /*
291 * The page could have been poisoned. MOVDIR64B also clears
292 * the poison bit so the kernel can safely use the page again.
293 */
294 for (i = 0; i < PAGE_SIZE; i += 64)
295 movdir64b(dest + i, zero_page);
296 /*
297 * MOVDIR64B stores use a WC buffer. Prevent subsequent memory reads
298 * from seeing potentially poisoned cache lines.
299 */
300 __mb();
301 }
302
303 static void tdx_no_vcpus_enter_start(struct kvm *kvm)
304 {
305 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
306
307 lockdep_assert_held_write(&kvm->mmu_lock);
308
309 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
310
311 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
312 }
313
314 static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
315 {
316 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
317
318 lockdep_assert_held_write(&kvm->mmu_lock);
319
320 WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
321 }
322
323 /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
324 static int __tdx_reclaim_page(struct page *page)
325 {
326 u64 err, rcx, rdx, r8;
327
328 err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
329
330 /*
331 * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
332 * before the HKID is released and control pages have also been
333 * released at this point, so there is no possibility of contention.
334 */
335 if (WARN_ON_ONCE(err)) {
336 pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
337 return -EIO;
338 }
339 return 0;
340 }
341
342 static int tdx_reclaim_page(struct page *page)
343 {
344 int r;
345
346 r = __tdx_reclaim_page(page);
347 if (!r)
348 tdx_clear_page(page);
349 return r;
350 }
351
352
353 /*
354 * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
355 * private KeyID. Assume the cache associated with the TDX private KeyID has
356 * been flushed.
357 */
358 static void tdx_reclaim_control_page(struct page *ctrl_page)
359 {
360 /*
361 * Leak the page if the kernel failed to reclaim the page.
362 * The kernel cannot use it safely anymore.
363 */
364 if (tdx_reclaim_page(ctrl_page))
365 return;
366
367 __free_page(ctrl_page);
368 }
369
370 struct tdx_flush_vp_arg {
371 struct kvm_vcpu *vcpu;
372 u64 err;
373 };
374
375 static void tdx_flush_vp(void *_arg)
376 {
377 struct tdx_flush_vp_arg *arg = _arg;
378 struct kvm_vcpu *vcpu = arg->vcpu;
379 u64 err;
380
381 arg->err = 0;
382 lockdep_assert_irqs_disabled();
383
384 /* Task migration can race with CPU offlining. */
385 if (unlikely(vcpu->cpu != raw_smp_processor_id()))
386 return;
387
388 /*
389 * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
390 * list tracking still needs to be updated so that it's correct if/when
391 * the vCPU does get initialized.
392 */
393 if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
394 /*
395 * No need to retry. The TDX resources needed for TDH.VP.FLUSH are:
396 * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
397 * VP flush function is called when destroying the vCPU/TD or during
398 * vCPU migration. No other thread uses TDVPR in those cases.
399 */
400 err = tdh_vp_flush(&to_tdx(vcpu)->vp);
401 if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
402 /*
403 * This function is called in IPI context. Do not use
404 * printk to avoid console semaphore.
405 * The caller prints out the error message, instead.
406 */
407 if (err)
408 arg->err = err;
409 }
410 }
411
412 tdx_disassociate_vp(vcpu);
413 }
414
415 static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
416 {
417 struct tdx_flush_vp_arg arg = {
418 .vcpu = vcpu,
419 };
420 int cpu = vcpu->cpu;
421
422 if (unlikely(cpu == -1))
423 return;
424
425 smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
426 if (KVM_BUG_ON(arg.err, vcpu->kvm))
427 pr_tdx_error(TDH_VP_FLUSH, arg.err);
428 }
429
430 void tdx_disable_virtualization_cpu(void)
431 {
432 int cpu = raw_smp_processor_id();
433 struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
434 struct tdx_flush_vp_arg arg;
435 struct vcpu_tdx *tdx, *tmp;
436 unsigned long flags;
437
438 local_irq_save(flags);
439 /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
440 list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
441 arg.vcpu = &tdx->vcpu;
442 tdx_flush_vp(&arg);
443 }
444 local_irq_restore(flags);
445 }
446
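/* Upper bound on TDH.PHYMEM.CACHE.WB resume iterations, to avoid looping forever. */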
447 #define TDX_SEAMCALL_RETRIES 10000
448
449 static void smp_func_do_phymem_cache_wb(void *unused)
450 {
451 u64 err = 0;
452 bool resume;
453 int i;
454
455 /*
456 * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
457 * KeyID on the package or core. The TDX module may not finish the
458 * cache flush but return TDX_INTERRUPTED_RESUMABLE instead. The
459 * kernel should retry until it returns success, without rescheduling.
460 */
461 for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
462 resume = !!err;
463 err = tdh_phymem_cache_wb(resume);
464 switch (err) {
465 case TDX_INTERRUPTED_RESUMABLE:
466 continue;
467 case TDX_NO_HKID_READY_TO_WBCACHE:
468 err = TDX_SUCCESS; /* Already done by other thread */
469 fallthrough;
470 default:
471 goto out;
472 }
473 }
474
475 out:
476 if (WARN_ON_ONCE(err))
477 pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
478 }
479
480 void tdx_mmu_release_hkid(struct kvm *kvm)
481 {
482 bool packages_allocated, targets_allocated;
483 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
484 cpumask_var_t packages, targets;
485 struct kvm_vcpu *vcpu;
486 unsigned long j;
487 int i;
488 u64 err;
489
490 if (!is_hkid_assigned(kvm_tdx))
491 return;
492
493 packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
494 targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
495 cpus_read_lock();
496
497 kvm_for_each_vcpu(j, vcpu, kvm)
498 tdx_flush_vp_on_cpu(vcpu);
499
500 /*
501 * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
502 * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
503 * Multiple TDX guests can be destroyed simultaneously. Take the
504 * mutex so that concurrent destruction doesn't hit that error.
505 */
506 mutex_lock(&tdx_lock);
507
508 /*
509 * Releasing HKID is in vm_destroy().
510 * After flushing the vCPUs above, there should be no more vCPU
511 * associations, as all vCPU fds have been released at this stage.
512 */
513 err = tdh_mng_vpflushdone(&kvm_tdx->td);
514 if (err == TDX_FLUSHVP_NOT_DONE)
515 goto out;
516 if (KVM_BUG_ON(err, kvm)) {
517 pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
518 pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
519 kvm_tdx->hkid);
520 goto out;
521 }
522
523 for_each_online_cpu(i) {
524 if (packages_allocated &&
525 cpumask_test_and_set_cpu(topology_physical_package_id(i),
526 packages))
527 continue;
528 if (targets_allocated)
529 cpumask_set_cpu(i, targets);
530 }
531 if (targets_allocated)
532 on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
533 else
534 on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
535 /*
536 * In the case of error in smp_func_do_phymem_cache_wb(), the following
537 * tdh_mng_key_freeid() will fail.
538 */
539 err = tdh_mng_key_freeid(&kvm_tdx->td);
540 if (KVM_BUG_ON(err, kvm)) {
541 pr_tdx_error(TDH_MNG_KEY_FREEID, err);
542 pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
543 kvm_tdx->hkid);
544 } else {
545 tdx_hkid_free(kvm_tdx);
546 }
547
548 out:
549 mutex_unlock(&tdx_lock);
550 cpus_read_unlock();
551 free_cpumask_var(targets);
552 free_cpumask_var(packages);
553 }
554
555 static void tdx_reclaim_td_control_pages(struct kvm *kvm)
556 {
557 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
558 u64 err;
559 int i;
560
561 /*
562 * tdx_mmu_release_hkid() failed to reclaim the HKID, i.e. something
563 * went badly wrong in the TDX module. Give up on freeing the TD pages.
564 * The function already warned, so don't warn again.
565 */
566 if (is_hkid_assigned(kvm_tdx))
567 return;
568
569 if (kvm_tdx->td.tdcs_pages) {
570 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
571 if (!kvm_tdx->td.tdcs_pages[i])
572 continue;
573
574 tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
575 }
576 kfree(kvm_tdx->td.tdcs_pages);
577 kvm_tdx->td.tdcs_pages = NULL;
578 }
579
580 if (!kvm_tdx->td.tdr_page)
581 return;
582
583 if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
584 return;
585
586 /*
587 * Use a SEAMCALL to ask the TDX module to flush the cache based on the
588 * KeyID. The TDX module may access the TDR while operating on the TD
589 * (especially when it is reclaiming the TDCS).
590 */
591 err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
592 if (KVM_BUG_ON(err, kvm)) {
593 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
594 return;
595 }
596 tdx_clear_page(kvm_tdx->td.tdr_page);
597
598 __free_page(kvm_tdx->td.tdr_page);
599 kvm_tdx->td.tdr_page = NULL;
600 }
601
602 void tdx_vm_destroy(struct kvm *kvm)
603 {
604 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
605
606 tdx_reclaim_td_control_pages(kvm);
607
608 kvm_tdx->state = TD_STATE_UNINITIALIZED;
609 }
610
611 static int tdx_do_tdh_mng_key_config(void *param)
612 {
613 struct kvm_tdx *kvm_tdx = param;
614 u64 err;
615
616 /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
617 err = tdh_mng_key_config(&kvm_tdx->td);
618
619 if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
620 pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
621 return -EIO;
622 }
623
624 return 0;
625 }
626
627 int tdx_vm_init(struct kvm *kvm)
628 {
629 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
630
631 kvm->arch.has_protected_state = true;
632 kvm->arch.has_private_mem = true;
633 kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
634
635 /*
636 * Because the guest TD is protected, the VMM can't parse instructions in
637 * the TD. Instead, the guest uses the MMIO hypercall. For unmodified
638 * device drivers, #VE needs to be injected for MMIO, and the #VE handler
639 * in the TD converts the MMIO instruction into the MMIO hypercall.
640 *
641 * The SPTE value for MMIO needs to be set up so that #VE is injected into
642 * the TD instead of triggering EPT MISCONFIG:
643 * - RWX=0 so that an EPT violation is triggered.
644 * - The suppress-#VE bit is cleared to inject #VE.
645 */
646 kvm_mmu_set_mmio_spte_value(kvm, 0);
647
648 /*
649 * TDX has its own limit on the maximum number of vCPUs it can support
650 * for all TDX guests, in addition to KVM_MAX_VCPUS. The TDX module
651 * reports this limit via the MAX_VCPU_PER_TD global metadata. In
652 * practice, it reflects the number of logical CPUs that ALL
653 * platforms supported by the TDX module can possibly have.
654 *
655 * Limit a TDX guest's maximum vCPUs to the number of logical CPUs
656 * the platform has. Simply forwarding MAX_VCPU_PER_TD to
657 * userspace would result in an unpredictable ABI.
658 */
659 kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
660
661 kvm_tdx->state = TD_STATE_UNINITIALIZED;
662
663 return 0;
664 }
665
666 int tdx_vcpu_create(struct kvm_vcpu *vcpu)
667 {
668 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
669 struct vcpu_tdx *tdx = to_tdx(vcpu);
670
671 if (kvm_tdx->state != TD_STATE_INITIALIZED)
672 return -EIO;
673
674 /*
675 * TDX module mandates APICv, which requires an in-kernel local APIC.
676 * Disallow an in-kernel I/O APIC, because level-triggered interrupts
677 * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
678 */
679 if (!irqchip_split(vcpu->kvm))
680 return -EINVAL;
681
682 fpstate_set_confidential(&vcpu->arch.guest_fpu);
683 vcpu->arch.apic->guest_apic_protected = true;
684 INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
685
686 vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
687
688 vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
689 vcpu->arch.cr0_guest_owned_bits = -1ul;
690 vcpu->arch.cr4_guest_owned_bits = -1ul;
691
692 /* KVM can't change TSC offset/multiplier as TDX module manages them. */
693 vcpu->arch.guest_tsc_protected = true;
694 vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
695 vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
696 vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
697 vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
698
699 vcpu->arch.guest_state_protected =
700 !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
701
702 if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
703 vcpu->arch.xfd_no_write_intercept = true;
704
705 tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
706 __pi_set_sn(&tdx->vt.pi_desc);
707
708 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
709
710 return 0;
711 }
712
713 void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
714 {
715 struct vcpu_tdx *tdx = to_tdx(vcpu);
716
717 vmx_vcpu_pi_load(vcpu, cpu);
718 if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
719 return;
720
721 tdx_flush_vp_on_cpu(vcpu);
722
723 KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
724 local_irq_disable();
725 /*
726 * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
727 * vcpu->cpu is read before tdx->cpu_list.
728 */
729 smp_rmb();
730
731 list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
732 local_irq_enable();
733 }
734
735 bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
736 {
737 /*
738 * KVM can't get the interrupt status of a TDX guest, so it assumes
739 * interrupts are always allowed unless the TDX guest calls TDVMCALL
740 * with HLT, which passes the interrupt-blocked flag.
741 */
742 return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
743 !to_tdx(vcpu)->vp_enter_args.r12;
744 }
745
746 static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
747 {
748 u64 vcpu_state_details;
749
750 if (pi_has_pending_interrupt(vcpu))
751 return true;
752
753 /*
754 * Only check RVI pending for HALTED case with IRQ enabled.
755 * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
756 * interrupt was pending before TD exit, then it _must_ be blocked,
757 * otherwise the interrupt would have been serviced at the instruction
758 * boundary.
759 */
760 if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
761 to_tdx(vcpu)->vp_enter_args.r12)
762 return false;
763
764 vcpu_state_details =
765 td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
766
767 return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
768 }
769
770 /*
771 * Compared to vmx_prepare_switch_to_guest(), there is not much to do
772 * as the SEAMCALL/SEAMRET calls take care of most of the save/restore.
773 */
774 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
775 {
776 struct vcpu_vt *vt = to_vt(vcpu);
777
778 if (vt->guest_state_loaded)
779 return;
780
781 if (likely(is_64bit_mm(current->mm)))
782 vt->msr_host_kernel_gs_base = current->thread.gsbase;
783 else
784 vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
785
786 vt->guest_state_loaded = true;
787 }
788
789 struct tdx_uret_msr {
790 u32 msr;
791 unsigned int slot;
792 u64 defval;
793 };
794
795 static struct tdx_uret_msr tdx_uret_msrs[] = {
796 {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
797 {.msr = MSR_STAR,},
798 {.msr = MSR_LSTAR,},
799 {.msr = MSR_TSC_AUX,},
800 };
801
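/*
 * The TDX module context-switches these user-return MSRs and is expected to
 * leave them holding the reset values above after TD exit, so only KVM's
 * cached values need to be refreshed; the real MSRs are restored lazily on
 * return to userspace.
 */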
802 static void tdx_user_return_msr_update_cache(void)
803 {
804 int i;
805
806 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
807 kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
808 tdx_uret_msrs[i].defval);
809 }
810
811 static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
812 {
813 struct vcpu_vt *vt = to_vt(vcpu);
814 struct vcpu_tdx *tdx = to_tdx(vcpu);
815
816 if (!vt->guest_state_loaded)
817 return;
818
819 ++vcpu->stat.host_state_reload;
820 wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
821
822 if (tdx->guest_entered) {
823 tdx_user_return_msr_update_cache();
824 tdx->guest_entered = false;
825 }
826
827 vt->guest_state_loaded = false;
828 }
829
830 void tdx_vcpu_put(struct kvm_vcpu *vcpu)
831 {
832 vmx_vcpu_pi_put(vcpu);
833 tdx_prepare_switch_to_host(vcpu);
834 }
835
836 void tdx_vcpu_free(struct kvm_vcpu *vcpu)
837 {
838 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
839 struct vcpu_tdx *tdx = to_tdx(vcpu);
840 int i;
841
842 /*
843 * It is not possible to reclaim pages while hkid is assigned. It might
844 * be assigned if:
845 * 1. the TD VM is being destroyed but freeing hkid failed, in which
846 * case the pages are leaked
847 * 2. TD vCPU creation failed and this is the error path, in which case
848 * there is nothing to do anyway
849 */
850 if (is_hkid_assigned(kvm_tdx))
851 return;
852
853 if (tdx->vp.tdcx_pages) {
854 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
855 if (tdx->vp.tdcx_pages[i])
856 tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
857 }
858 kfree(tdx->vp.tdcx_pages);
859 tdx->vp.tdcx_pages = NULL;
860 }
861 if (tdx->vp.tdvpr_page) {
862 tdx_reclaim_control_page(tdx->vp.tdvpr_page);
863 tdx->vp.tdvpr_page = NULL;
864 }
865
866 tdx->state = VCPU_TD_STATE_UNINITIALIZED;
867 }
868
869 int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
870 {
871 if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
872 to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
873 return -EINVAL;
874
875 return 1;
876 }
877
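/*
 * Map a TDVMCALL leaf to the VMX exit reason whose handler emulates the
 * equivalent operation; leaves without such a mapping are reported as a
 * generic TDCALL exit.
 */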
878 static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
879 {
880 switch (tdvmcall_leaf(vcpu)) {
881 case EXIT_REASON_CPUID:
882 case EXIT_REASON_HLT:
883 case EXIT_REASON_IO_INSTRUCTION:
884 case EXIT_REASON_MSR_READ:
885 case EXIT_REASON_MSR_WRITE:
886 return tdvmcall_leaf(vcpu);
887 case EXIT_REASON_EPT_VIOLATION:
888 return EXIT_REASON_EPT_MISCONFIG;
889 default:
890 break;
891 }
892
893 return EXIT_REASON_TDCALL;
894 }
895
896 static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
897 {
898 struct vcpu_tdx *tdx = to_tdx(vcpu);
899 u32 exit_reason;
900
901 switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
902 case TDX_SUCCESS:
903 case TDX_NON_RECOVERABLE_VCPU:
904 case TDX_NON_RECOVERABLE_TD:
905 case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
906 case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
907 break;
908 default:
909 return -1u;
910 }
911
912 exit_reason = tdx->vp_enter_ret;
913
914 switch (exit_reason) {
915 case EXIT_REASON_TDCALL:
916 if (tdvmcall_exit_type(vcpu))
917 return EXIT_REASON_VMCALL;
918
919 return tdcall_to_vmx_exit_reason(vcpu);
920 case EXIT_REASON_EPT_MISCONFIG:
921 /*
922 * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
923 * non-instrumentable code with interrupts disabled.
924 */
925 return -1u;
926 default:
927 break;
928 }
929
930 return exit_reason;
931 }
932
933 static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
934 {
935 struct vcpu_tdx *tdx = to_tdx(vcpu);
936 struct vcpu_vt *vt = to_vt(vcpu);
937
938 guest_state_enter_irqoff();
939
940 tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
941
942 vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
943
944 vt->exit_qualification = tdx->vp_enter_args.rcx;
945 tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
946 tdx->exit_gpa = tdx->vp_enter_args.r8;
947 vt->exit_intr_info = tdx->vp_enter_args.r9;
948
949 vmx_handle_nmi(vcpu);
950
951 guest_state_exit_irqoff();
952 }
953
954 static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
955 {
956 return vmx_get_exit_reason(vcpu).failed_vmentry &&
957 vmx_get_exit_reason(vcpu).full != -1u;
958 }
959
960 static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
961 {
962 u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
963
964 /*
965 * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
966 * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
967 *
968 * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
969 * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires the target
970 * vCPUs to leave the fastpath so that interrupts can be enabled to ensure
971 * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
972 * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
973 * requester may be blocked endlessly.
974 */
975 if (unlikely(tdx_operand_busy(vp_enter_ret)))
976 return EXIT_FASTPATH_EXIT_HANDLED;
977
978 return EXIT_FASTPATH_NONE;
979 }
980
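/*
 * Registers considered valid after TDH.VP.ENTER: the exit information
 * fields plus the GPRs exchanged with the TDX module via vp_enter_args.
 * Everything else remains protected guest state.
 */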
981 #define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
982 BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
983 BIT_ULL(VCPU_REGS_RAX) | \
984 BIT_ULL(VCPU_REGS_RBX) | \
985 BIT_ULL(VCPU_REGS_RCX) | \
986 BIT_ULL(VCPU_REGS_RDX) | \
987 BIT_ULL(VCPU_REGS_RBP) | \
988 BIT_ULL(VCPU_REGS_RSI) | \
989 BIT_ULL(VCPU_REGS_RDI) | \
990 BIT_ULL(VCPU_REGS_R8) | \
991 BIT_ULL(VCPU_REGS_R9) | \
992 BIT_ULL(VCPU_REGS_R10) | \
993 BIT_ULL(VCPU_REGS_R11) | \
994 BIT_ULL(VCPU_REGS_R12) | \
995 BIT_ULL(VCPU_REGS_R13) | \
996 BIT_ULL(VCPU_REGS_R14) | \
997 BIT_ULL(VCPU_REGS_R15))
998
999 static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
1000 {
1001 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
1002
1003 /*
1004 * All TDX hosts support PKRU; but even if they didn't,
1005 * vcpu->arch.host_pkru would be 0 and the wrpkru would be
1006 * skipped.
1007 */
1008 if (vcpu->arch.host_pkru != 0)
1009 wrpkru(vcpu->arch.host_pkru);
1010
1011 if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
1012 xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1013
1014 /*
1015 * Likewise, even if a TDX host didn't support XSS, both arms of
1016 * the comparison would be 0 and the wrmsrl would be skipped.
1017 */
1018 if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
1019 wrmsrl(MSR_IA32_XSS, kvm_host.xss);
1020 }
1021
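/*
 * DEBUGCTL bits that the TDX module preserves across TD entry/exit.  If the
 * host has any other bits set, DEBUGCTL must be restored after returning
 * from the TD.
 */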
1022 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
1023 DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
1024 DEBUGCTLMSR_FREEZE_IN_SMM)
1025
1026 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags)
1027 {
1028 struct vcpu_tdx *tdx = to_tdx(vcpu);
1029 struct vcpu_vt *vt = to_vt(vcpu);
1030
1031 /*
1032 * WARN if KVM wants to force an immediate exit, as the TDX module does
1033 * not guarantee entry into the guest, i.e. it's possible for KVM to
1034 * _think_ it completed entry to the guest and forced an immediate exit
1035 * without actually having done so. Luckily, KVM never needs to force
1036 * an immediate exit for TDX (KVM can't do direct event injection), so
1037 * just WARN and continue on.
1038 */
1039 WARN_ON_ONCE(run_flags);
1040
1041 /*
1042 * Wait until retry of SEPT-zap-related SEAMCALL completes before
1043 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
1044 * TDCALLs.
1045 */
1046 if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
1047 return EXIT_FASTPATH_EXIT_HANDLED;
1048
1049 trace_kvm_entry(vcpu, run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT);
1050
1051 if (pi_test_on(&vt->pi_desc)) {
1052 apic->send_IPI_self(POSTED_INTR_VECTOR);
1053
1054 if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
1055 APIC_VECTOR_MASK, &vt->pi_desc))
1056 kvm_wait_lapic_expire(vcpu);
1057 }
1058
1059 tdx_vcpu_enter_exit(vcpu);
1060
1061 if (vcpu->arch.host_debugctl & ~TDX_DEBUGCTL_PRESERVED)
1062 update_debugctlmsr(vcpu->arch.host_debugctl);
1063
1064 tdx_load_host_xsave_state(vcpu);
1065 tdx->guest_entered = true;
1066
1067 vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
1068
1069 if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
1070 return EXIT_FASTPATH_NONE;
1071
1072 if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
1073 return EXIT_FASTPATH_NONE;
1074
1075 if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
1076 kvm_machine_check();
1077
1078 trace_kvm_exit(vcpu, KVM_ISA_VMX);
1079
1080 if (unlikely(tdx_failed_vmentry(vcpu)))
1081 return EXIT_FASTPATH_NONE;
1082
1083 return tdx_exit_handlers_fastpath(vcpu);
1084 }
1085
1086 void tdx_inject_nmi(struct kvm_vcpu *vcpu)
1087 {
1088 ++vcpu->stat.nmi_injections;
1089 td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
1090 /*
1091 * From KVM's perspective, NMI injection is completed right after
1092 * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
1093 * the TDX module or not.
1094 */
1095 vcpu->arch.nmi_injected = false;
1096 /*
1097 * TDX doesn't allow KVM to request an NMI-window exit. If there is
1098 * still a pending vNMI, KVM is not able to inject it along with the
1099 * one pending in TDX module in a back-to-back way. Since the previous
1100 * vNMI is still pending in TDX module, i.e. it has not been delivered
1101 * to TDX guest yet, it's OK to collapse the pending vNMI into the
1102 * previous one. The guest is expected to handle all the NMI sources
1103 * when handling the first vNMI.
1104 */
1105 vcpu->arch.nmi_pending = 0;
1106 }
1107
1108 static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
1109 {
1110 u32 intr_info = vmx_get_intr_info(vcpu);
1111
1112 /*
1113 * Machine checks are handled by handle_exception_irqoff(), or by
1114 * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
1115 * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
1116 */
1117 if (is_nmi(intr_info) || is_machine_check(intr_info))
1118 return 1;
1119
1120 vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
1121 vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1122 vcpu->run->ex.error_code = 0;
1123
1124 return 0;
1125 }
1126
1127 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
1128 {
1129 tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
1130 return 1;
1131 }
1132
1133 static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
1134 {
1135 kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
1136 kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
1137 kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
1138 kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
1139 kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
1140
1141 return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
1142 }
1143
1144 /*
1145 * Split the request into chunks and check for pending interrupts between
1146 * chunks. This allows timely injection of interrupts to prevent issues
1147 * with guest lockup detection.
1148 */
1149 #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
1150 static void __tdx_map_gpa(struct vcpu_tdx *tdx);
1151
1152 static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
1153 {
1154 struct vcpu_tdx *tdx = to_tdx(vcpu);
1155
1156 if (vcpu->run->hypercall.ret) {
1157 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1158 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1159 return 1;
1160 }
1161
1162 tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
1163 if (tdx->map_gpa_next >= tdx->map_gpa_end)
1164 return 1;
1165
1166 /*
1167 * Stop processing the remaining part if there is a pending interrupt
1168 * that could be delivered. Skip checking pending RVI for
1169 * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
1170 */
1171 if (kvm_vcpu_has_events(vcpu)) {
1172 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
1173 tdx->vp_enter_args.r11 = tdx->map_gpa_next;
1174 return 1;
1175 }
1176
1177 __tdx_map_gpa(tdx);
1178 return 0;
1179 }
1180
1181 static void __tdx_map_gpa(struct vcpu_tdx *tdx)
1182 {
1183 u64 gpa = tdx->map_gpa_next;
1184 u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
1185
1186 if (size > TDX_MAP_GPA_MAX_LEN)
1187 size = TDX_MAP_GPA_MAX_LEN;
1188
1189 tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
1190 tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
1191 /*
1192 * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
1193 * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
1194 * it was always zero on KVM_EXIT_HYPERCALL. Since KVM now overwrites
1195 * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
1196 */
1197 tdx->vcpu.run->hypercall.ret = 0;
1198 tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1199 tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
1200 tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
1201 KVM_MAP_GPA_RANGE_ENCRYPTED :
1202 KVM_MAP_GPA_RANGE_DECRYPTED;
1203 tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
1204
1205 tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
1206 }
1207
1208 static int tdx_map_gpa(struct kvm_vcpu *vcpu)
1209 {
1210 struct vcpu_tdx *tdx = to_tdx(vcpu);
1211 u64 gpa = tdx->vp_enter_args.r12;
1212 u64 size = tdx->vp_enter_args.r13;
1213 u64 ret;
1214
1215 /*
1216 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
1217 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
1218 * bit set. This is a base call so it should always be supported, but
1219 * KVM has no way to ensure that userspace implements the GHCI correctly.
1220 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
1221 * to the guest.
1222 */
1223 if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
1224 ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1225 goto error;
1226 }
1227
1228 if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
1229 !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
1230 (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
1231 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
1232 ret = TDVMCALL_STATUS_INVALID_OPERAND;
1233 goto error;
1234 }
1235
1236 if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
1237 ret = TDVMCALL_STATUS_ALIGN_ERROR;
1238 goto error;
1239 }
1240
1241 tdx->map_gpa_end = gpa + size;
1242 tdx->map_gpa_next = gpa;
1243
1244 __tdx_map_gpa(tdx);
1245 return 0;
1246
1247 error:
1248 tdvmcall_set_return_code(vcpu, ret);
1249 tdx->vp_enter_args.r11 = gpa;
1250 return 1;
1251 }
1252
1253 static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
1254 {
1255 struct vcpu_tdx *tdx = to_tdx(vcpu);
1256 u64 *regs = vcpu->run->system_event.data;
1257 u64 *module_regs = &tdx->vp_enter_args.r8;
1258 int index = VCPU_REGS_RAX;
1259
1260 vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
1261 vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
1262 vcpu->run->system_event.ndata = 16;
1263
1264 /* Dump 16 general-purpose registers to userspace in ascending order. */
1265 regs[index++] = tdx->vp_enter_ret;
1266 regs[index++] = tdx->vp_enter_args.rcx;
1267 regs[index++] = tdx->vp_enter_args.rdx;
1268 regs[index++] = tdx->vp_enter_args.rbx;
1269 regs[index++] = 0;
1270 regs[index++] = 0;
1271 regs[index++] = tdx->vp_enter_args.rsi;
1272 regs[index] = tdx->vp_enter_args.rdi;
1273 for (index = 0; index < 8; index++)
1274 regs[VCPU_REGS_R8 + index] = module_regs[index];
1275
1276 return 0;
1277 }
1278
1279 static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
1280 {
1281 u32 eax, ebx, ecx, edx;
1282 struct vcpu_tdx *tdx = to_tdx(vcpu);
1283
1284 /* EAX and ECX for cpuid is stored in R12 and R13. */
1285 eax = tdx->vp_enter_args.r12;
1286 ecx = tdx->vp_enter_args.r13;
1287
1288 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
1289
1290 tdx->vp_enter_args.r12 = eax;
1291 tdx->vp_enter_args.r13 = ebx;
1292 tdx->vp_enter_args.r14 = ecx;
1293 tdx->vp_enter_args.r15 = edx;
1294
1295 return 1;
1296 }
1297
1298 static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
1299 {
1300 vcpu->arch.pio.count = 0;
1301 return 1;
1302 }
1303
1304 static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
1305 {
1306 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1307 unsigned long val = 0;
1308 int ret;
1309
1310 ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
1311 vcpu->arch.pio.port, &val, 1);
1312
1313 WARN_ON_ONCE(!ret);
1314
1315 tdvmcall_set_return_val(vcpu, val);
1316
1317 return 1;
1318 }
1319
1320 static int tdx_emulate_io(struct kvm_vcpu *vcpu)
1321 {
1322 struct vcpu_tdx *tdx = to_tdx(vcpu);
1323 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
1324 unsigned long val = 0;
1325 unsigned int port;
1326 u64 size, write;
1327 int ret;
1328
1329 ++vcpu->stat.io_exits;
1330
1331 size = tdx->vp_enter_args.r12;
1332 write = tdx->vp_enter_args.r13;
1333 port = tdx->vp_enter_args.r14;
1334
1335 if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
1336 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1337 return 1;
1338 }
1339
1340 if (write) {
1341 val = tdx->vp_enter_args.r15;
1342 ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
1343 } else {
1344 ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
1345 }
1346
1347 if (!ret)
1348 vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
1349 tdx_complete_pio_in;
1350 else if (!write)
1351 tdvmcall_set_return_val(vcpu, val);
1352
1353 return ret;
1354 }
1355
1356 static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
1357 {
1358 unsigned long val = 0;
1359 gpa_t gpa;
1360 int size;
1361
1362 gpa = vcpu->mmio_fragments[0].gpa;
1363 size = vcpu->mmio_fragments[0].len;
1364
1365 memcpy(&val, vcpu->run->mmio.data, size);
1366 tdvmcall_set_return_val(vcpu, val);
1367 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1368 return 1;
1369 }
1370
1371 static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
1372 unsigned long val)
1373 {
1374 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
1375 trace_kvm_fast_mmio(gpa);
1376 return 0;
1377 }
1378
1379 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
1380 if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1381 return -EOPNOTSUPP;
1382
1383 return 0;
1384 }
1385
1386 static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
1387 {
1388 unsigned long val;
1389
1390 if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
1391 return -EOPNOTSUPP;
1392
1393 tdvmcall_set_return_val(vcpu, val);
1394 trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
1395 return 0;
1396 }
1397
1398 static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
1399 {
1400 struct vcpu_tdx *tdx = to_tdx(vcpu);
1401 int size, write, r;
1402 unsigned long val;
1403 gpa_t gpa;
1404
1405 size = tdx->vp_enter_args.r12;
1406 write = tdx->vp_enter_args.r13;
1407 gpa = tdx->vp_enter_args.r14;
1408 val = write ? tdx->vp_enter_args.r15 : 0;
1409
1410 if (size != 1 && size != 2 && size != 4 && size != 8)
1411 goto error;
1412 if (write != 0 && write != 1)
1413 goto error;
1414
1415 /*
1416 * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
1417 * do MMIO emulation for a private GPA.
1418 */
1419 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
1420 vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
1421 goto error;
1422
1423 gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
1424
1425 if (write)
1426 r = tdx_mmio_write(vcpu, gpa, size, val);
1427 else
1428 r = tdx_mmio_read(vcpu, gpa, size);
1429 if (!r)
1430 /* Kernel completed device emulation. */
1431 return 1;
1432
1433 /* Request the device emulation to userspace device model. */
1434 vcpu->mmio_is_write = write;
1435 if (!write)
1436 vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
1437
1438 vcpu->run->mmio.phys_addr = gpa;
1439 vcpu->run->mmio.len = size;
1440 vcpu->run->mmio.is_write = write;
1441 vcpu->run->exit_reason = KVM_EXIT_MMIO;
1442
1443 if (write) {
1444 memcpy(vcpu->run->mmio.data, &val, size);
1445 } else {
1446 vcpu->mmio_fragments[0].gpa = gpa;
1447 vcpu->mmio_fragments[0].len = size;
1448 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
1449 }
1450 return 0;
1451
1452 error:
1453 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1454 return 1;
1455 }
1456
1457 static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1458 {
1459 struct vcpu_tdx *tdx = to_tdx(vcpu);
1460
1461 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
1462
1463 /*
1464 * For now, KVM does not directly support any TDVMCALL beyond the GHCI
1465 * base API without help from userspace, so just set the values
1466 * returned from userspace.
1467 */
1468 tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
1469 tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
1470 tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
1471 tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
1472
1473 return 1;
1474 }
1475
1476 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
1477 {
1478 struct vcpu_tdx *tdx = to_tdx(vcpu);
1479
1480 switch (tdx->vp_enter_args.r12) {
1481 case 0:
1482 tdx->vp_enter_args.r11 = 0;
1483 tdx->vp_enter_args.r12 = 0;
1484 tdx->vp_enter_args.r13 = 0;
1485 tdx->vp_enter_args.r14 = 0;
1486 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
1487 return 1;
1488 case 1:
1489 vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
1490 vcpu->run->exit_reason = KVM_EXIT_TDX;
1491 vcpu->run->tdx.flags = 0;
1492 vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
1493 vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
1494 vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
1495 vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
1496 vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
1497 vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
1498 vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
1499 return 0;
1500 default:
1501 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1502 return 1;
1503 }
1504 }
1505
1506 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
1507 {
1508 tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
1509 return 1;
1510 }
1511
1512 static int tdx_get_quote(struct kvm_vcpu *vcpu)
1513 {
1514 struct vcpu_tdx *tdx = to_tdx(vcpu);
1515 u64 gpa = tdx->vp_enter_args.r12;
1516 u64 size = tdx->vp_enter_args.r13;
1517
1518 /* The GPA of the buffer must have the shared bit set. */
1519 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1520 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1521 return 1;
1522 }
1523
1524 vcpu->run->exit_reason = KVM_EXIT_TDX;
1525 vcpu->run->tdx.flags = 0;
1526 vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
1527 vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1528 vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
1529 vcpu->run->tdx.get_quote.size = size;
1530
1531 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1532
1533 return 0;
1534 }
1535
1536 static int tdx_setup_event_notify_interrupt(struct kvm_vcpu *vcpu)
1537 {
1538 struct vcpu_tdx *tdx = to_tdx(vcpu);
1539 u64 vector = tdx->vp_enter_args.r12;
1540
1541 if (vector < 32 || vector > 255) {
1542 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
1543 return 1;
1544 }
1545
1546 vcpu->run->exit_reason = KVM_EXIT_TDX;
1547 vcpu->run->tdx.flags = 0;
1548 vcpu->run->tdx.nr = TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT;
1549 vcpu->run->tdx.setup_event_notify.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
1550 vcpu->run->tdx.setup_event_notify.vector = vector;
1551
1552 vcpu->arch.complete_userspace_io = tdx_complete_simple;
1553
1554 return 0;
1555 }
1556
1557 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
1558 {
1559 switch (tdvmcall_leaf(vcpu)) {
1560 case TDVMCALL_MAP_GPA:
1561 return tdx_map_gpa(vcpu);
1562 case TDVMCALL_REPORT_FATAL_ERROR:
1563 return tdx_report_fatal_error(vcpu);
1564 case TDVMCALL_GET_TD_VM_CALL_INFO:
1565 return tdx_get_td_vm_call_info(vcpu);
1566 case TDVMCALL_GET_QUOTE:
1567 return tdx_get_quote(vcpu);
1568 case TDVMCALL_SETUP_EVENT_NOTIFY_INTERRUPT:
1569 return tdx_setup_event_notify_interrupt(vcpu);
1570 default:
1571 break;
1572 }
1573
1574 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
1575 return 1;
1576 }
1577
1578 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
1579 {
1580 u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
1581 TDX_SHARED_BIT_PWL_4;
1582
1583 if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
1584 return;
1585
1586 td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
1587 }
1588
1589 static void tdx_unpin(struct kvm *kvm, struct page *page)
1590 {
1591 put_page(page);
1592 }
1593
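/*
 * Map a private page into a running TD with TDH.MEM.PAGE.AUG.  The page is
 * added in the PENDING state and must be accepted by the guest via
 * TDG.MEM.PAGE.ACCEPT before it can be used.
 */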
1594 static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
1595 enum pg_level level, struct page *page)
1596 {
1597 int tdx_level = pg_level_to_tdx_sept_level(level);
1598 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1599 gpa_t gpa = gfn_to_gpa(gfn);
1600 u64 entry, level_state;
1601 u64 err;
1602
1603 err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
1604 if (unlikely(tdx_operand_busy(err))) {
1605 tdx_unpin(kvm, page);
1606 return -EBUSY;
1607 }
1608
1609 if (KVM_BUG_ON(err, kvm)) {
1610 pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
1611 tdx_unpin(kvm, page);
1612 return -EIO;
1613 }
1614
1615 return 0;
1616 }
1617
1618 /*
1619 * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
1620 * callback tdx_gmem_post_populate() then maps pages into private memory
1621 * through the TDH.MEM.PAGE.ADD() SEAMCALL. The SEAMCALL also requires the
1622 * private EPT structures for the page to have been built before, which is
1623 * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
1624 * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
1625 * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
1626 * are no half-initialized shared EPT pages.
1627 */
1628 static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
1629 enum pg_level level, kvm_pfn_t pfn)
1630 {
1631 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1632
1633 if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
1634 return -EINVAL;
1635
1636 /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
1637 atomic64_inc(&kvm_tdx->nr_premapped);
1638 return 0;
1639 }
1640
1641 static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
1642 enum pg_level level, kvm_pfn_t pfn)
1643 {
1644 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1645 struct page *page = pfn_to_page(pfn);
1646
1647 /* TODO: handle large pages. */
1648 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1649 return -EINVAL;
1650
1651 /*
1652 * Because guest_memfd doesn't support page migration with
1653 * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
1654 * migration. Until guest_memfd supports page migration, prevent page
1655 * migration.
1656 * TODO: Once guest_memfd introduces callback on page migration,
1657 * implement it and remove get_page/put_page().
1658 */
1659 get_page(page);
1660
1661 /*
1662 * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
1663 * barrier in tdx_td_finalize().
1664 */
1665 smp_rmb();
1666 if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
1667 return tdx_mem_page_aug(kvm, gfn, level, page);
1668
1669 return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
1670 }
1671
1672 static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
1673 enum pg_level level, struct page *page)
1674 {
1675 int tdx_level = pg_level_to_tdx_sept_level(level);
1676 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1677 gpa_t gpa = gfn_to_gpa(gfn);
1678 u64 err, entry, level_state;
1679
1680 /* TODO: handle large pages. */
1681 if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
1682 return -EINVAL;
1683
1684 if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
1685 return -EINVAL;
1686
1687 /*
1688 * When zapping a private page, the write lock is held, so there is no
1689 * race with other vCPUs' S-EPT operations.
1690 * Races with TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs are possible.
1691 */
1692 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1693 &level_state);
1694
1695 if (unlikely(tdx_operand_busy(err))) {
1696 /*
1697 * The second retry is expected to succeed after kicking off all
1698 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
1699 */
1700 tdx_no_vcpus_enter_start(kvm);
1701 err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
1702 &level_state);
1703 tdx_no_vcpus_enter_stop(kvm);
1704 }
1705
1706 if (KVM_BUG_ON(err, kvm)) {
1707 pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
1708 return -EIO;
1709 }
1710
1711 err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
1712
1713 if (KVM_BUG_ON(err, kvm)) {
1714 pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
1715 return -EIO;
1716 }
1717 tdx_clear_page(page);
1718 tdx_unpin(kvm, page);
1719 return 0;
1720 }
1721
1722 static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
1723 enum pg_level level, void *private_spt)
1724 {
1725 int tdx_level = pg_level_to_tdx_sept_level(level);
1726 gpa_t gpa = gfn_to_gpa(gfn);
1727 struct page *page = virt_to_page(private_spt);
1728 u64 err, entry, level_state;
1729
1730 err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
1731 &level_state);
1732 if (unlikely(tdx_operand_busy(err)))
1733 return -EBUSY;
1734
1735 if (KVM_BUG_ON(err, kvm)) {
1736 pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
1737 return -EIO;
1738 }
1739
1740 return 0;
1741 }
1742
1743 /*
1744 * Check if the error returned from a SEPT zap SEAMCALL is due to a page
1745 * being mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() having
1746 * been called successfully.
1747 *
1748 * Since tdh_mem_sept_add() must have been invoked successfully before a
1749 * non-leaf entry is present in the mirrored page table, the SEPT zap related
1750 * SEAMCALLs should not encounter the error TDX_EPT_WALK_FAILED. They should
1751 * instead find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found
1752 * in the SEPT.
1753 *
1754 * Further check whether the entry returned from the SEPT walk has RWX
1755 * permissions, to filter out anything unexpected.
1756 *
1757 * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
1758 * level_state returned from a SEAMCALL error is the same as that passed into
1759 * the SEAMCALL.
1760 */
1761 static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
1762 u64 entry, int level)
1763 {
1764 if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
1765 return false;
1766
1767 if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
1768 return false;
1769
1770 if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
1771 return false;
1772
1773 return true;
1774 }
1775
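/*
 * Block the private SPTE for @gfn.  Returns a positive value when the range
 * was successfully blocked and the caller must finish the removal (TLB
 * tracking via tdx_track() followed by tdx_sept_drop_private_spte()), 0 when
 * the zap merely undid a premapped page and nothing further is needed, or a
 * negative errno on failure.
 */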
1776 static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
1777 enum pg_level level, struct page *page)
1778 {
1779 int tdx_level = pg_level_to_tdx_sept_level(level);
1780 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1781 gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
1782 u64 err, entry, level_state;
1783
1784 /* Large pages aren't supported yet. */
1785 WARN_ON_ONCE(level != PG_LEVEL_4K);
1786
1787 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1788
1789 if (unlikely(tdx_operand_busy(err))) {
1790 /* After no vCPUs enter, the second retry is expected to succeed */
1791 tdx_no_vcpus_enter_start(kvm);
1792 err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
1793 tdx_no_vcpus_enter_stop(kvm);
1794 }
1795 if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
1796 !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
1797 atomic64_dec(&kvm_tdx->nr_premapped);
1798 tdx_unpin(kvm, page);
1799 return 0;
1800 }
1801
1802 if (KVM_BUG_ON(err, kvm)) {
1803 pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
1804 return -EIO;
1805 }
1806 return 1;
1807 }
1808
1809 /*
1810 * Ensure that both shared and private EPT translations are flushed on all vCPUs.
1811 * tdh_mem_track() is the only caller that increases TD epoch. An increase in
1812 * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
1813 * running in guest mode with the value "N - 1".
1814 *
1815 * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
1816 * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
1817 * being increased to "N + 1".
1818 *
1819 * Kicking off all vCPUs after that further ensures that no vCPU can run in
1820 * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
1821 * (e.g. to increase the TD epoch to "N + 2").
1822 *
1823 * The TDX module will flush the EPT on the next TD enter and make vCPUs run
1824 * in guest mode with TD epoch value "N + 1".
1825 *
1826 * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
1827 * waiting for the empty IPI handler ack_kick().
1828 *
1829 * No action is required from the vCPUs being kicked off, since the kick
1830 * certainly occurs after the TD epoch increment and before the next
1831 * tdh_mem_track().
1832 */
1833 static void tdx_track(struct kvm *kvm)
1834 {
1835 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1836 u64 err;
1837
1838 /* If the TD isn't finalized, no vCPU has run yet, so there is nothing to track. */
1839 if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
1840 return;
1841
1842 lockdep_assert_held_write(&kvm->mmu_lock);
1843
1844 err = tdh_mem_track(&kvm_tdx->td);
1845 if (unlikely(tdx_operand_busy(err))) {
1846 /* After no vCPUs enter, the second retry is expected to succeed */
1847 tdx_no_vcpus_enter_start(kvm);
1848 err = tdh_mem_track(&kvm_tdx->td);
1849 tdx_no_vcpus_enter_stop(kvm);
1850 }
1851
1852 if (KVM_BUG_ON(err, kvm))
1853 pr_tdx_error(TDH_MEM_TRACK, err);
1854
1855 kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
1856 }
1857
1858 static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
1859 enum pg_level level, void *private_spt)
1860 {
1861 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
1862
1863 /*
1864 * free_external_spt() is only called after the hkid is freed, when the TD
1865 * is being torn down.
1866 * KVM doesn't (yet) zap page table pages in the mirror page table while the
1867 * TD is active, though guest pages mapped in the mirror page table could be
1868 * zapped while the TD is active, e.g. for shared <-> private conversion
1869 * and slot move/deletion.
1870 */
1871 if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
1872 return -EINVAL;
1873
1874 /*
1875 * The HKID assigned to this TD was already freed and the cache was
1876 * already flushed. We don't have to flush again.
1877 */
1878 return tdx_reclaim_page(virt_to_page(private_spt));
1879 }
1880
1881 static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
1882 enum pg_level level, kvm_pfn_t pfn)
1883 {
1884 struct page *page = pfn_to_page(pfn);
1885 int ret;
1886
1887 /*
1888 * HKID is released after all private pages have been removed, and set
1889 * before any might be populated. Warn if zapping is attempted when
1890 * there can't be anything populated in the private EPT.
1891 */
1892 if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1893 return -EINVAL;
1894
1895 ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
1896 if (ret <= 0)
1897 return ret;
1898
1899 /*
1900 * TDX requires TLB tracking before dropping a private page. Do
1901 * it here, although it is also done later.
1902 */
1903 tdx_track(kvm);
1904
1905 return tdx_sept_drop_private_spte(kvm, gfn, level, page);
1906 }
1907
1908 void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
1909 int trig_mode, int vector)
1910 {
1911 struct kvm_vcpu *vcpu = apic->vcpu;
1912 struct vcpu_tdx *tdx = to_tdx(vcpu);
1913
1914 /* TDX supports only posted interrupts. No LAPIC emulation. */
1915 __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
1916
1917 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
1918 }
1919
1920 static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
1921 {
1922 u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
1923 u64 eq = vmx_get_exit_qual(vcpu);
1924
1925 if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
1926 return false;
1927
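/*
 * A pending EPT violation with none of the permission bits set in the exit
 * qualification indicates the guest touched a private GPA it has not yet
 * accepted.
 */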
1928 return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
1929 }
1930
1931 static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
1932 {
1933 unsigned long exit_qual;
1934 gpa_t gpa = to_tdx(vcpu)->exit_gpa;
1935 bool local_retry = false;
1936 int ret;
1937
1938 if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
1939 if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
1940 pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
1941 gpa, vcpu->vcpu_id);
1942 kvm_vm_dead(vcpu->kvm);
1943 return -EIO;
1944 }
1945 /*
1946 * Always treat SEPT violations as write faults. Ignore the
1947 * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
1948 * TD private pages are always RWX in the SEPT tables,
1949 * i.e. they're always mapped writable. Just as importantly,
1950 * treating SEPT violations as write faults is necessary to
1951 * avoid COW allocations, which will cause TDAUGPAGE failures
1952 * due to aliasing a single HPA to multiple GPAs.
1953 */
1954 exit_qual = EPT_VIOLATION_ACC_WRITE;
1955
1956 /* Only private GPA triggers zero-step mitigation */
1957 local_retry = true;
1958 } else {
1959 exit_qual = vmx_get_exit_qual(vcpu);
1960 /*
1961 * EPT violation due to instruction fetch should never be
1962 * triggered from shared memory in TDX guest. If such EPT
1963 * violation occurs, treat it as broken hardware.
1964 */
1965 if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
1966 return -EIO;
1967 }
1968
1969 trace_kvm_page_fault(vcpu, gpa, exit_qual);
1970
1971 /*
1972 * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
1973 * mapping in TDX.
1974 *
1975 * KVM may return RET_PF_RETRY for private GPA due to
1976 * - contentions when atomically updating SPTEs of the mirror page table
1977 * - in-progress GFN invalidation or memslot removal.
1978 * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
1979 * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
1980 * or certain TDCALLs.
1981 *
1982 * If TDH.VP.ENTER is invoked more times than the threshold set by the
1983 * TDX module before KVM resolves the private GPA mapping, the TDX
1984 * module will activate zero-step mitigation during TDH.VP.ENTER. This
1985 * process acquires an SEPT tree lock in the TDX module, leading to
1986 * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
1987 * operations on other vCPUs.
1988 *
1989 * Breaking out of local retries for kvm_vcpu_has_events() is for
1990 * interrupt injection. kvm_vcpu_has_events() should not see pending
1991 * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
1992 * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
1993 * the guest even if the IRQ/NMI can't be delivered.
1994 *
1995 * Note: even without breaking out of local retries, zero-step
1996 * mitigation may still occur due to
1997 * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
1998 * - a single RIP causing EPT violations for more GFNs than the
1999 * threshold count.
2000 * This is safe, as triggering zero-step mitigation only introduces
2001 * contentions to page installation SEAMCALLs on other vCPUs, which will
2002 * handle retries locally in their EPT violation handlers.
2003 */
2004 while (1) {
2005 ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
2006
2007 if (ret != RET_PF_RETRY || !local_retry)
2008 break;
2009
2010 if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
2011 break;
2012
2013 if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
2014 ret = -EIO;
2015 break;
2016 }
2017
2018 cond_resched();
2019 }
2020 return ret;
2021 }
2022
2023 int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
2024 {
2025 if (err) {
2026 tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
2027 return 1;
2028 }
2029
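/*
 * For a successfully emulated RDMSR, forward the value that was read
 * (EDX:EAX) to the guest as the TDVMCALL return value.
 */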
2030 if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
2031 tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
2032
2033 return 1;
2034 }
2035
2036
2037 int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
2038 {
2039 struct vcpu_tdx *tdx = to_tdx(vcpu);
2040 u64 vp_enter_ret = tdx->vp_enter_ret;
2041 union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
2042
2043 if (fastpath != EXIT_FASTPATH_NONE)
2044 return 1;
2045
2046 if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
2047 KVM_BUG_ON(1, vcpu->kvm);
2048 return -EIO;
2049 }
2050
2051 /*
2052 * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
2053 * TDX_SEAMCALL_VMFAILINVALID.
2054 */
2055 if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
2056 KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
2057 goto unhandled_exit;
2058 }
2059
2060 if (unlikely(tdx_failed_vmentry(vcpu))) {
2061 /*
2062 * If the guest state is protected, that means off-TD debug is
2063 * not enabled, so TDX_NON_RECOVERABLE must be set.
2064 */
2065 WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
2066 !(vp_enter_ret & TDX_NON_RECOVERABLE));
2067 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
2068 vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
2069 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
2070 return 0;
2071 }
2072
2073 if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
2074 exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
2075 kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
2076 goto unhandled_exit;
2077 }
2078
2079 WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
2080 (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
2081
2082 switch (exit_reason.basic) {
2083 case EXIT_REASON_TRIPLE_FAULT:
2084 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2085 vcpu->mmio_needed = 0;
2086 return 0;
2087 case EXIT_REASON_EXCEPTION_NMI:
2088 return tdx_handle_exception_nmi(vcpu);
2089 case EXIT_REASON_EXTERNAL_INTERRUPT:
2090 ++vcpu->stat.irq_exits;
2091 return 1;
2092 case EXIT_REASON_CPUID:
2093 return tdx_emulate_cpuid(vcpu);
2094 case EXIT_REASON_HLT:
2095 return kvm_emulate_halt_noskip(vcpu);
2096 case EXIT_REASON_TDCALL:
2097 return handle_tdvmcall(vcpu);
2098 case EXIT_REASON_VMCALL:
2099 return tdx_emulate_vmcall(vcpu);
2100 case EXIT_REASON_IO_INSTRUCTION:
2101 return tdx_emulate_io(vcpu);
2102 case EXIT_REASON_MSR_READ:
2103 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2104 return kvm_emulate_rdmsr(vcpu);
2105 case EXIT_REASON_MSR_WRITE:
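/*
 * The TDVMCALL passes the MSR index in r12 and the 64-bit value in r13;
 * split the value into EDX:EAX as kvm_emulate_wrmsr() expects.
 */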
2106 kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
2107 kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
2108 kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
2109 return kvm_emulate_wrmsr(vcpu);
2110 case EXIT_REASON_EPT_MISCONFIG:
2111 return tdx_emulate_mmio(vcpu);
2112 case EXIT_REASON_EPT_VIOLATION:
2113 return tdx_handle_ept_violation(vcpu);
2114 case EXIT_REASON_OTHER_SMI:
2115 /*
2116 * Unlike VMX, SMI in SEAM non-root mode (i.e. when
2117 * TD guest vCPU is running) will cause VM exit to TDX module,
2118 * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered
2119 * and handled by kernel handler right away.
2120 *
2121 * The Other SMI exit can also be caused by the SEAM non-root
2122 * machine check delivered via Machine Check System Management
2123 * Interrupt (MSMI), but it has already been handled by the
2124 * kernel machine check handler, i.e., the memory page has been
2125 * marked as poisoned and it won't be freed to the free list
2126 * when the TDX guest is terminated (the TDX module marks the
2127 * guest as dead and prevents it from running further when a
2128 * machine check happens in SEAM non-root).
2129 *
2130 * - An MSMI will not reach here; it's handled as the non-recoverable
2131 * case above.
2132 * - If it's not an MSMI, no need to do anything here.
2133 */
2134 return 1;
2135 default:
2136 break;
2137 }
2138
2139 unhandled_exit:
2140 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2141 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
2142 vcpu->run->internal.ndata = 2;
2143 vcpu->run->internal.data[0] = vp_enter_ret;
2144 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
2145 return 0;
2146 }
2147
2148 void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
2149 u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
2150 {
2151 struct vcpu_tdx *tdx = to_tdx(vcpu);
2152
2153 *reason = tdx->vt.exit_reason.full;
2154 if (*reason != -1u) {
2155 *info1 = vmx_get_exit_qual(vcpu);
2156 *info2 = tdx->ext_exit_qualification;
2157 *intr_info = vmx_get_intr_info(vcpu);
2158 } else {
2159 *info1 = 0;
2160 *info2 = 0;
2161 *intr_info = 0;
2162 }
2163
2164 *error_code = 0;
2165 }
2166
2167 bool tdx_has_emulated_msr(u32 index)
2168 {
2169 switch (index) {
2170 case MSR_IA32_UCODE_REV:
2171 case MSR_IA32_ARCH_CAPABILITIES:
2172 case MSR_IA32_POWER_CTL:
2173 case MSR_IA32_CR_PAT:
2174 case MSR_MTRRcap:
2175 case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
2176 case MSR_MTRRdefType:
2177 case MSR_IA32_TSC_DEADLINE:
2178 case MSR_IA32_MISC_ENABLE:
2179 case MSR_PLATFORM_INFO:
2180 case MSR_MISC_FEATURES_ENABLES:
2181 case MSR_IA32_APICBASE:
2182 case MSR_EFER:
2183 case MSR_IA32_FEAT_CTL:
2184 case MSR_IA32_MCG_CAP:
2185 case MSR_IA32_MCG_STATUS:
2186 case MSR_IA32_MCG_CTL:
2187 case MSR_IA32_MCG_EXT_CTL:
2188 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2189 case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
2190 /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
2191 case MSR_KVM_POLL_CONTROL:
2192 return true;
2193 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
2194 /*
2195 * x2APIC registers that are virtualized by the CPU can't be
2196 * emulated, as KVM doesn't have access to the virtual APIC page.
2197 */
2198 switch (index) {
2199 case X2APIC_MSR(APIC_TASKPRI):
2200 case X2APIC_MSR(APIC_PROCPRI):
2201 case X2APIC_MSR(APIC_EOI):
2202 case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
2203 case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
2204 case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
2205 return false;
2206 default:
2207 return true;
2208 }
2209 default:
2210 return false;
2211 }
2212 }
2213
2214 static bool tdx_is_read_only_msr(u32 index)
2215 {
2216 return index == MSR_IA32_APICBASE || index == MSR_EFER ||
2217 index == MSR_IA32_FEAT_CTL;
2218 }
2219
2220 int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2221 {
2222 switch (msr->index) {
2223 case MSR_IA32_FEAT_CTL:
2224 /*
2225 * MCE and MCA are advertised via CPUID. The guest kernel can
2226 * check whether LMCE is enabled or not.
2227 */
2228 msr->data = FEAT_CTL_LOCKED;
2229 if (vcpu->arch.mcg_cap & MCG_LMCE_P)
2230 msr->data |= FEAT_CTL_LMCE_ENABLED;
2231 return 0;
2232 case MSR_IA32_MCG_EXT_CTL:
2233 if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
2234 return 1;
2235 msr->data = vcpu->arch.mcg_ext_ctl;
2236 return 0;
2237 default:
2238 if (!tdx_has_emulated_msr(msr->index))
2239 return 1;
2240
2241 return kvm_get_msr_common(vcpu, msr);
2242 }
2243 }
2244
2245 int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2246 {
2247 switch (msr->index) {
2248 case MSR_IA32_MCG_EXT_CTL:
2249 if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
2250 (msr->data & ~MCG_EXT_CTL_LMCE_EN))
2251 return 1;
2252 vcpu->arch.mcg_ext_ctl = msr->data;
2253 return 0;
2254 default:
2255 if (tdx_is_read_only_msr(msr->index))
2256 return 1;
2257
2258 if (!tdx_has_emulated_msr(msr->index))
2259 return 1;
2260
2261 return kvm_set_msr_common(vcpu, msr);
2262 }
2263 }
2264
2265 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
2266 {
2267 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2268 struct kvm_tdx_capabilities __user *user_caps;
2269 struct kvm_tdx_capabilities *caps = NULL;
2270 u32 nr_user_entries;
2271 int ret = 0;
2272
2273 /* flags is reserved for future use */
2274 if (cmd->flags)
2275 return -EINVAL;
2276
2277 caps = kzalloc(sizeof(*caps) +
2278 sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
2279 GFP_KERNEL);
2280 if (!caps)
2281 return -ENOMEM;
2282
2283 user_caps = u64_to_user_ptr(cmd->data);
2284 if (get_user(nr_user_entries, &user_caps->cpuid.nent)) {
2285 ret = -EFAULT;
2286 goto out;
2287 }
2288
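/* The userspace array must be able to hold all CPUID configs reported by the TDX module. */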
2289 if (nr_user_entries < td_conf->num_cpuid_config) {
2290 ret = -E2BIG;
2291 goto out;
2292 }
2293
2294 ret = init_kvm_tdx_caps(td_conf, caps);
2295 if (ret)
2296 goto out;
2297
2298 if (copy_to_user(user_caps, caps, sizeof(*caps))) {
2299 ret = -EFAULT;
2300 goto out;
2301 }
2302
2303 if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
2304 caps->cpuid.nent *
2305 sizeof(caps->cpuid.entries[0])))
2306 ret = -EFAULT;
2307
2308 out:
2309 /* kfree() accepts NULL. */
2310 kfree(caps);
2311 return ret;
2312 }
2313
2314 /*
2315 * KVM reports the guest physical address width in CPUID.0x80000008.EAX[23:16],
2316 * which is similar to TDX's GPAW. Use this field as the interface for userspace
2317 * to configure the GPAW and EPT level for TDs.
2318 *
2319 * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level 5;
2320 * value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
2321 * supported. Value 52 is only supported when the platform supports 5-level
2322 * EPT.
2323 */
2324 static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
2325 struct td_params *td_params)
2326 {
2327 const struct kvm_cpuid_entry2 *entry;
2328 int guest_pa;
2329
2330 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
2331 if (!entry)
2332 return -EINVAL;
2333
2334 guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
2335
2336 if (guest_pa != 48 && guest_pa != 52)
2337 return -EINVAL;
2338
2339 if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
2340 return -EINVAL;
2341
2342 td_params->eptp_controls = VMX_EPTP_MT_WB;
2343 if (guest_pa == 52) {
2344 td_params->eptp_controls |= VMX_EPTP_PWL_5;
2345 td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
2346 } else {
2347 td_params->eptp_controls |= VMX_EPTP_PWL_4;
2348 }
2349
2350 return 0;
2351 }
2352
2353 static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
2354 struct td_params *td_params)
2355 {
2356 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2357 const struct kvm_cpuid_entry2 *entry;
2358 struct tdx_cpuid_value *value;
2359 int i, copy_cnt = 0;
2360
2361 /*
2362 * td_params.cpuid_values: The number and the order of cpuid_value entries
2363 * must match those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
2364 * It's assumed that td_params was zeroed.
2365 */
2366 for (i = 0; i < td_conf->num_cpuid_config; i++) {
2367 struct kvm_cpuid_entry2 tmp;
2368
2369 td_init_cpuid_entry2(&tmp, i);
2370
2371 entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
2372 tmp.function, tmp.index);
2373 if (!entry)
2374 continue;
2375
2376 if (tdx_unsupported_cpuid(entry))
2377 return -EINVAL;
2378
2379 copy_cnt++;
2380
2381 value = &td_params->cpuid_values[i];
2382 value->eax = entry->eax;
2383 value->ebx = entry->ebx;
2384 value->ecx = entry->ecx;
2385 value->edx = entry->edx;
2386
2387 /*
2388 * TDX module does not accept nonzero bits 16..23 for the
2389 * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
2390 */
2391 if (tmp.function == 0x80000008)
2392 value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
2393 }
2394
2395 /*
2396 * Rely on the TDX module to reject invalid configuration, but it can't
2397 * check leaves that don't have a proper slot in td_params->cpuid_values
2398 * to land in. So fail if there were entries that didn't get copied to
2399 * td_params.
2400 */
2401 if (copy_cnt != cpuid->nent)
2402 return -EINVAL;
2403
2404 return 0;
2405 }
2406
2407 static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
2408 struct kvm_tdx_init_vm *init_vm)
2409 {
2410 const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
2411 struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
2412 int ret;
2413
2414 if (kvm->created_vcpus)
2415 return -EBUSY;
2416
2417 if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
2418 return -EINVAL;
2419
2420 if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
2421 return -EINVAL;
2422
2423 td_params->max_vcpus = kvm->max_vcpus;
2424 td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
2425 td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
2426
2427 td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
2428 td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
2429
2430 ret = setup_tdparams_eptp_controls(cpuid, td_params);
2431 if (ret)
2432 return ret;
2433
2434 ret = setup_tdparams_cpuids(cpuid, td_params);
2435 if (ret)
2436 return ret;
2437
2438 #define MEMCPY_SAME_SIZE(dst, src) \
2439 do { \
2440 BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
2441 memcpy((dst), (src), sizeof(dst)); \
2442 } while (0)
2443
2444 MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
2445 MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
2446 MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
2447
2448 return 0;
2449 }
2450
2451 static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
2452 u64 *seamcall_err)
2453 {
2454 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2455 cpumask_var_t packages;
2456 struct page **tdcs_pages = NULL;
2457 struct page *tdr_page;
2458 int ret, i;
2459 u64 err, rcx;
2460
2461 *seamcall_err = 0;
2462 ret = tdx_guest_keyid_alloc();
2463 if (ret < 0)
2464 return ret;
2465 kvm_tdx->hkid = ret;
2466 kvm_tdx->misc_cg = get_current_misc_cg();
2467 ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
2468 if (ret)
2469 goto free_hkid;
2470
2471 ret = -ENOMEM;
2472
2473 atomic_inc(&nr_configured_hkid);
2474
2475 tdr_page = alloc_page(GFP_KERNEL);
2476 if (!tdr_page)
2477 goto free_hkid;
2478
2479 kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
2480 /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
2481 kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
2482 tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
2483 GFP_KERNEL | __GFP_ZERO);
2484 if (!tdcs_pages)
2485 goto free_tdr;
2486
2487 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2488 tdcs_pages[i] = alloc_page(GFP_KERNEL);
2489 if (!tdcs_pages[i])
2490 goto free_tdcs;
2491 }
2492
2493 if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
2494 goto free_tdcs;
2495
2496 cpus_read_lock();
2497
2498 /*
2499 * At least one CPU of each package must be online in order to
2500 * program the host key ID on all packages. Check that here.
2501 */
2502 for_each_present_cpu(i)
2503 cpumask_set_cpu(topology_physical_package_id(i), packages);
2504 for_each_online_cpu(i)
2505 cpumask_clear_cpu(topology_physical_package_id(i), packages);
2506 if (!cpumask_empty(packages)) {
2507 ret = -EIO;
2508 /*
2509 * Because it's hard for a human operator to figure out the
2510 * reason, print a warning.
2511 */
2512 #define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
2513 pr_warn_ratelimited(MSG_ALLPKG);
2514 goto free_packages;
2515 }
2516
2517 /*
2518 * TDH.MNG.CREATE tries to grab the global TDX module lock and
2519 * fails with TDX_OPERAND_BUSY when it can't. Take the global
2520 * lock to prevent that failure.
2521 */
2522 mutex_lock(&tdx_lock);
2523 kvm_tdx->td.tdr_page = tdr_page;
2524 err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
2525 mutex_unlock(&tdx_lock);
2526
2527 if (err == TDX_RND_NO_ENTROPY) {
2528 ret = -EAGAIN;
2529 goto free_packages;
2530 }
2531
2532 if (WARN_ON_ONCE(err)) {
2533 pr_tdx_error(TDH_MNG_CREATE, err);
2534 ret = -EIO;
2535 goto free_packages;
2536 }
2537
2538 for_each_online_cpu(i) {
2539 int pkg = topology_physical_package_id(i);
2540
2541 if (cpumask_test_and_set_cpu(pkg, packages))
2542 continue;
2543
2544 /*
2545 * Program the memory controller in the package with an
2546 * encryption key associated with the TDX private host key ID
2547 * assigned to this TDR. Concurrent operations on the same memory
2548 * controller result in TDX_OPERAND_BUSY. No locking needed
2549 * beyond the cpus_read_lock() above as it serializes against
2550 * hotplug and the first online CPU of the package is always
2551 * used. We never have two CPUs in the same socket trying to
2552 * program the key.
2553 */
2554 ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
2555 kvm_tdx, true);
2556 if (ret)
2557 break;
2558 }
2559 cpus_read_unlock();
2560 free_cpumask_var(packages);
2561 if (ret) {
2562 i = 0;
2563 goto teardown;
2564 }
2565
2566 kvm_tdx->td.tdcs_pages = tdcs_pages;
2567 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2568 err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
2569 if (err == TDX_RND_NO_ENTROPY) {
2570 /* Here it's hard to allow userspace to retry. */
2571 ret = -EAGAIN;
2572 goto teardown;
2573 }
2574 if (WARN_ON_ONCE(err)) {
2575 pr_tdx_error(TDH_MNG_ADDCX, err);
2576 ret = -EIO;
2577 goto teardown;
2578 }
2579 }
2580
2581 err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
2582 if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
2583 /*
2584 * Because the user supplies the operands, don't warn.
2585 * Return a hint to the user because it's sometimes hard for the
2586 * user to figure out which operand is invalid. The SEAMCALL
2587 * status code indicates which operand caused the invalid-operand error.
2588 */
2589 *seamcall_err = err;
2590 ret = -EINVAL;
2591 goto teardown;
2592 } else if (WARN_ON_ONCE(err)) {
2593 pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
2594 ret = -EIO;
2595 goto teardown;
2596 }
2597
2598 return 0;
2599
2600 /*
2601 * The sequence for freeing resources from a partially initialized TD
2602 * varies based on where in the initialization flow failure occurred.
2603 * Simply use the full teardown and destroy, which naturally play nice
2604 * with partial initialization.
2605 */
2606 teardown:
2607 /* Only free pages not yet added, so start at 'i' */
2608 for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2609 if (tdcs_pages[i]) {
2610 __free_page(tdcs_pages[i]);
2611 tdcs_pages[i] = NULL;
2612 }
2613 }
2614 if (!kvm_tdx->td.tdcs_pages)
2615 kfree(tdcs_pages);
2616
2617 tdx_mmu_release_hkid(kvm);
2618 tdx_reclaim_td_control_pages(kvm);
2619
2620 return ret;
2621
2622 free_packages:
2623 cpus_read_unlock();
2624 free_cpumask_var(packages);
2625
2626 free_tdcs:
2627 for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
2628 if (tdcs_pages[i])
2629 __free_page(tdcs_pages[i]);
2630 }
2631 kfree(tdcs_pages);
2632 kvm_tdx->td.tdcs_pages = NULL;
2633
2634 free_tdr:
2635 if (tdr_page)
2636 __free_page(tdr_page);
2637 kvm_tdx->td.tdr_page = 0;
2638
2639 free_hkid:
2640 tdx_hkid_free(kvm_tdx);
2641
2642 return ret;
2643 }
2644
2645 static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
2646 u64 *data)
2647 {
2648 u64 err;
2649
2650 err = tdh_mng_rd(&tdx->td, field_id, data);
2651
2652 return err;
2653 }
2654
2655 #define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
2656 #define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
2657
2658 static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
2659 bool sub_leaf_set, int *entry_index,
2660 struct kvm_cpuid_entry2 *out)
2661 {
2662 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2663 u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
2664 u64 ebx_eax, edx_ecx;
2665 u64 err = 0;
2666
2667 if (sub_leaf > 0b1111111)
2668 return -EINVAL;
2669
2670 if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
2671 return -EINVAL;
2672
2673 if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
2674 sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
2675 return -EINVAL;
2676
2677 /*
2678 * bit 23:17, RESERVED: reserved, must be 0;
2679 * bit 16, LEAF_31: leaf number bit 31;
2680 * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
2681 * implicitly 0;
2682 * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
2683 * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
2684 * SUBLEAF_6_0 must be all-1s.
2685 * sub-leaf bits 31:7 are implicitly 0;
2686 * bit 0, ELEMENT_I: Element index within field;
2687 */
2688 field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
2689 field_id |= (leaf & 0x7f) << 9;
2690 if (sub_leaf_set)
2691 field_id |= (sub_leaf & 0x7f) << 1;
2692 else
2693 field_id |= 0x1fe;
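/*
 * For example, leaf 0x80000008 read without a sub-leaf encodes as
 * (1 << 16) | (0x8 << 9) | 0x1fe in the low bits of the field ID.
 */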
2694
2695 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
2696 if (err) //TODO check for specific errors
2697 goto err_out;
2698
2699 out->eax = (u32) ebx_eax;
2700 out->ebx = (u32) (ebx_eax >> 32);
2701
2702 field_id++;
2703 err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
2704 /*
2705 * It's weird that reading edx_ecx fails while reading ebx_eax
2706 * succeeded.
2707 */
2708 if (WARN_ON_ONCE(err))
2709 goto err_out;
2710
2711 out->ecx = (u32) edx_ecx;
2712 out->edx = (u32) (edx_ecx >> 32);
2713
2714 out->function = leaf;
2715 out->index = sub_leaf;
2716 out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
2717
2718 /*
2719 * Work around missing support on old TDX modules: fetch the
2720 * guest's maxpa from gfn_direct_bits.
2721 */
2722 if (leaf == 0x80000008) {
2723 gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
2724 unsigned int g_maxpa = __ffs(gpa_bits) + 1;
2725
2726 out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
2727 }
2728
2729 (*entry_index)++;
2730
2731 return 0;
2732
2733 err_out:
2734 out->eax = 0;
2735 out->ebx = 0;
2736 out->ecx = 0;
2737 out->edx = 0;
2738
2739 return -EIO;
2740 }
2741
2742 static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2743 {
2744 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2745 struct kvm_tdx_init_vm *init_vm;
2746 struct td_params *td_params = NULL;
2747 int ret;
2748
2749 BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
2750 BUILD_BUG_ON(sizeof(struct td_params) != 1024);
2751
2752 if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
2753 return -EINVAL;
2754
2755 if (cmd->flags)
2756 return -EINVAL;
2757
2758 init_vm = kmalloc(sizeof(*init_vm) +
2759 sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
2760 GFP_KERNEL);
2761 if (!init_vm)
2762 return -ENOMEM;
2763
2764 if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
2765 ret = -EFAULT;
2766 goto out;
2767 }
2768
2769 if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
2770 ret = -E2BIG;
2771 goto out;
2772 }
2773
2774 if (copy_from_user(init_vm->cpuid.entries,
2775 u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
2776 flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
2777 ret = -EFAULT;
2778 goto out;
2779 }
2780
2781 if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
2782 ret = -EINVAL;
2783 goto out;
2784 }
2785
2786 if (init_vm->cpuid.padding) {
2787 ret = -EINVAL;
2788 goto out;
2789 }
2790
2791 td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
2792 if (!td_params) {
2793 ret = -ENOMEM;
2794 goto out;
2795 }
2796
2797 ret = setup_tdparams(kvm, td_params, init_vm);
2798 if (ret)
2799 goto out;
2800
2801 ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
2802 if (ret)
2803 goto out;
2804
2805 kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
2806 kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
2807 kvm_tdx->attributes = td_params->attributes;
2808 kvm_tdx->xfam = td_params->xfam;
2809
2810 if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
2811 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
2812 else
2813 kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
2814
2815 kvm_tdx->state = TD_STATE_INITIALIZED;
2816 out:
2817 /* kfree() accepts NULL. */
2818 kfree(init_vm);
2819 kfree(td_params);
2820
2821 return ret;
2822 }
2823
2824 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
2825 {
2826 /*
2827 * flush_tlb_current() is invoked the first time the vCPU runs or when
2828 * the root of the shared EPT is invalidated.
2829 * KVM only needs to flush the shared EPT because the TDX module handles TLB
2830 * invalidation for the private EPT in tdh_vp_enter().
2831 *
2832 * A single context invalidation for shared EPT can be performed here.
2833 * However, this single context invalidation requires the private EPTP
2834 * rather than the shared EPTP to flush shared EPT, as shared EPT uses
2835 * private EPTP as its ASID for TLB invalidation.
2836 *
2837 * To avoid reading back private EPTP, perform a global invalidation for
2838 * shared EPT instead to keep this function simple.
2839 */
2840 ept_sync_global();
2841 }
2842
2843 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
2844 {
2845 /*
2846 * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
2847 * ensure that private EPT will be flushed on the next TD enter. No need
2848 * to call tdx_track() here again even when this callback is a result of
2849 * zapping private EPT.
2850 *
2851 * Due to the lack of the context to determine which EPT has been
2852 * affected by zapping, invoke invept() directly here for both shared
2853 * EPT and private EPT for simplicity, though it's not necessary for
2854 * private EPT.
2855 */
2856 ept_sync_global();
2857 }
2858
2859 static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
2860 {
2861 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
2862
2863 guard(mutex)(&kvm->slots_lock);
2864
2865 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
2866 return -EINVAL;
2867 /*
2868 * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
2869 * TDH.MEM.PAGE.ADD().
2870 */
2871 if (atomic64_read(&kvm_tdx->nr_premapped))
2872 return -EINVAL;
2873
2874 cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
2875 if (tdx_operand_busy(cmd->hw_error))
2876 return -EBUSY;
2877 if (KVM_BUG_ON(cmd->hw_error, kvm)) {
2878 pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
2879 return -EIO;
2880 }
2881
2882 kvm_tdx->state = TD_STATE_RUNNABLE;
2883 /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
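/* Pairs with the smp_rmb() in tdx_sept_set_private_spte(). */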
2884 smp_wmb();
2885 kvm->arch.pre_fault_allowed = true;
2886 return 0;
2887 }
2888
2889 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
2890 {
2891 struct kvm_tdx_cmd tdx_cmd;
2892 int r;
2893
2894 if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
2895 return -EFAULT;
2896
2897 /*
2898 * Userspace should never set hw_error. It is used by the kernel
2899 * to report hardware-defined errors.
2900 */
2901 if (tdx_cmd.hw_error)
2902 return -EINVAL;
2903
2904 mutex_lock(&kvm->lock);
2905
2906 switch (tdx_cmd.id) {
2907 case KVM_TDX_CAPABILITIES:
2908 r = tdx_get_capabilities(&tdx_cmd);
2909 break;
2910 case KVM_TDX_INIT_VM:
2911 r = tdx_td_init(kvm, &tdx_cmd);
2912 break;
2913 case KVM_TDX_FINALIZE_VM:
2914 r = tdx_td_finalize(kvm, &tdx_cmd);
2915 break;
2916 default:
2917 r = -EINVAL;
2918 goto out;
2919 }
2920
2921 if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
2922 r = -EFAULT;
2923
2924 out:
2925 mutex_unlock(&kvm->lock);
2926 return r;
2927 }
2928
2929 /* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX for the guest BIOS. */
2930 static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
2931 {
2932 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
2933 struct vcpu_tdx *tdx = to_tdx(vcpu);
2934 struct page *page;
2935 int ret, i;
2936 u64 err;
2937
2938 page = alloc_page(GFP_KERNEL);
2939 if (!page)
2940 return -ENOMEM;
2941 tdx->vp.tdvpr_page = page;
2942
2943 tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
2944 GFP_KERNEL);
2945 if (!tdx->vp.tdcx_pages) {
2946 ret = -ENOMEM;
2947 goto free_tdvpr;
2948 }
2949
2950 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2951 page = alloc_page(GFP_KERNEL);
2952 if (!page) {
2953 ret = -ENOMEM;
2954 goto free_tdcx;
2955 }
2956 tdx->vp.tdcx_pages[i] = page;
2957 }
2958
2959 err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
2960 if (KVM_BUG_ON(err, vcpu->kvm)) {
2961 ret = -EIO;
2962 pr_tdx_error(TDH_VP_CREATE, err);
2963 goto free_tdcx;
2964 }
2965
2966 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2967 err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
2968 if (KVM_BUG_ON(err, vcpu->kvm)) {
2969 pr_tdx_error(TDH_VP_ADDCX, err);
2970 /*
2971 * Pages already added are reclaimed by the vcpu_free
2972 * method, but the rest are freed here.
2973 */
2974 for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2975 __free_page(tdx->vp.tdcx_pages[i]);
2976 tdx->vp.tdcx_pages[i] = NULL;
2977 }
2978 return -EIO;
2979 }
2980 }
2981
2982 err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
2983 if (KVM_BUG_ON(err, vcpu->kvm)) {
2984 pr_tdx_error(TDH_VP_INIT, err);
2985 return -EIO;
2986 }
2987
2988 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2989
2990 return 0;
2991
2992 free_tdcx:
2993 for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
2994 if (tdx->vp.tdcx_pages[i])
2995 __free_page(tdx->vp.tdcx_pages[i]);
2996 tdx->vp.tdcx_pages[i] = NULL;
2997 }
2998 kfree(tdx->vp.tdcx_pages);
2999 tdx->vp.tdcx_pages = NULL;
3000
3001 free_tdvpr:
3002 if (tdx->vp.tdvpr_page)
3003 __free_page(tdx->vp.tdvpr_page);
3004 tdx->vp.tdvpr_page = 0;
3005
3006 return ret;
3007 }
3008
3009 /* Sometimes reads multiple sub-leaves. Return how many entries were written. */
3010 static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
3011 struct kvm_cpuid_entry2 *output_e)
3012 {
3013 int sub_leaf = 0;
3014 int ret;
3015
3016 /* First try without a subleaf */
3017 ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
3018
3019 /* If success, or invalid leaf, just give up */
3020 if (ret != -EIO)
3021 return ret;
3022
3023 /*
3024 * If the try without a subleaf failed, try reading subleafs until
3025 * failure. The TDX module only supports 6 bits of subleaf index.
3026 */
3027 while (1) {
3028 /* Keep reading subleafs until there is a failure. */
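/*
 * A failure on the very first sub-leaf (sub_leaf == 0) means the leaf
 * itself could not be read, so report failure; otherwise the failure
 * simply marks the end of the valid sub-leaves.
 */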
3029 if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
3030 return !sub_leaf;
3031
3032 sub_leaf++;
3033 output_e++;
3034 }
3035
3036 return 0;
3037 }
3038
3039 static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3040 {
3041 struct kvm_cpuid2 __user *output, *td_cpuid;
3042 int r = 0, i = 0, leaf;
3043 u32 level;
3044
3045 output = u64_to_user_ptr(cmd->data);
3046 td_cpuid = kzalloc(sizeof(*td_cpuid) +
3047 sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
3048 GFP_KERNEL);
3049 if (!td_cpuid)
3050 return -ENOMEM;
3051
3052 if (copy_from_user(td_cpuid, output, sizeof(*output))) {
3053 r = -EFAULT;
3054 goto out;
3055 }
3056
3057 /* Read max CPUID for normal range */
3058 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
3059 r = -EIO;
3060 goto out;
3061 }
3062 level = td_cpuid->entries[0].eax;
3063
3064 for (leaf = 1; leaf <= level; leaf++)
3065 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3066
3067 /* Read max CPUID for extended range */
3068 if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
3069 r = -EIO;
3070 goto out;
3071 }
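/* The maximum extended leaf was just written at index i - 1 by the call above. */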
3072 level = td_cpuid->entries[i - 1].eax;
3073
3074 for (leaf = 0x80000001; leaf <= level; leaf++)
3075 tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
3076
3077 if (td_cpuid->nent < i)
3078 r = -E2BIG;
3079 td_cpuid->nent = i;
3080
3081 if (copy_to_user(output, td_cpuid, sizeof(*output))) {
3082 r = -EFAULT;
3083 goto out;
3084 }
3085
3086 if (r == -E2BIG)
3087 goto out;
3088
3089 if (copy_to_user(output->entries, td_cpuid->entries,
3090 td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
3091 r = -EFAULT;
3092
3093 out:
3094 kfree(td_cpuid);
3095
3096 return r;
3097 }
3098
3099 static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3100 {
3101 u64 apic_base;
3102 struct vcpu_tdx *tdx = to_tdx(vcpu);
3103 int ret;
3104
3105 if (cmd->flags)
3106 return -EINVAL;
3107
3108 if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
3109 return -EINVAL;
3110
3111 /*
3112 * TDX requires X2APIC, userspace is responsible for configuring guest
3113 * CPUID accordingly.
3114 */
3115 apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
3116 (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
3117 if (kvm_apic_set_base(vcpu, apic_base, true))
3118 return -EINVAL;
3119
3120 ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
3121 if (ret)
3122 return ret;
3123
3124 td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
3125 td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
3126 td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
3127
3128 tdx->state = VCPU_TD_STATE_INITIALIZED;
3129
3130 return 0;
3131 }
3132
3133 void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3134 {
3135 /*
3136 * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
3137 * INIT events.
3138 *
3139 * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
3140 * userspace needs to define the vCPU model before KVM can initialize
3141 * vCPU state, e.g. to enable x2APIC.
3142 */
3143 WARN_ON_ONCE(init_event);
3144 }
3145
3146 struct tdx_gmem_post_populate_arg {
3147 struct kvm_vcpu *vcpu;
3148 __u32 flags;
3149 };
3150
3151 static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
3152 void __user *src, int order, void *_arg)
3153 {
3154 u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
3155 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3156 struct tdx_gmem_post_populate_arg *arg = _arg;
3157 struct kvm_vcpu *vcpu = arg->vcpu;
3158 gpa_t gpa = gfn_to_gpa(gfn);
3159 u8 level = PG_LEVEL_4K;
3160 struct page *src_page;
3161 int ret, i;
3162 u64 err, entry, level_state;
3163
3164 /*
3165 * Get the source page if it has been faulted in. Return failure if the
3166 * source page has been swapped out or unmapped in primary memory.
3167 */
3168 ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
3169 if (ret < 0)
3170 return ret;
3171 if (ret != 1)
3172 return -ENOMEM;
3173
3174 ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
3175 if (ret < 0)
3176 goto out;
3177
3178 /*
3179 * The private mem cannot be zapped after kvm_tdp_map_page()
3180 * because all paths are covered by slots_lock and the
3181 * filemap invalidate lock. Check that they are indeed enough.
3182 */
3183 if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
3184 scoped_guard(read_lock, &kvm->mmu_lock) {
3185 if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
3186 ret = -EIO;
3187 goto out;
3188 }
3189 }
3190 }
3191
3192 ret = 0;
3193 err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
3194 src_page, &entry, &level_state);
3195 if (err) {
3196 ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
3197 goto out;
3198 }
3199
3200 if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
3201 atomic64_dec(&kvm_tdx->nr_premapped);
3202
3203 if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
3204 for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
3205 err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
3206 &level_state);
3207 if (err) {
3208 ret = -EIO;
3209 break;
3210 }
3211 }
3212 }
3213
3214 out:
3215 put_page(src_page);
3216 return ret;
3217 }
3218
3219 static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
3220 {
3221 struct vcpu_tdx *tdx = to_tdx(vcpu);
3222 struct kvm *kvm = vcpu->kvm;
3223 struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
3224 struct kvm_tdx_init_mem_region region;
3225 struct tdx_gmem_post_populate_arg arg;
3226 long gmem_ret;
3227 int ret;
3228
3229 if (tdx->state != VCPU_TD_STATE_INITIALIZED)
3230 return -EINVAL;
3231
3232 guard(mutex)(&kvm->slots_lock);
3233
3234 /* Once TD is finalized, the initial guest memory is fixed. */
3235 if (kvm_tdx->state == TD_STATE_RUNNABLE)
3236 return -EINVAL;
3237
3238 if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
3239 return -EINVAL;
3240
3241 if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
3242 return -EFAULT;
3243
3244 if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
3245 !region.nr_pages ||
3246 region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
3247 !vt_is_tdx_private_gpa(kvm, region.gpa) ||
3248 !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
3249 return -EINVAL;
3250
3251 kvm_mmu_reload(vcpu);
3252 ret = 0;
3253 while (region.nr_pages) {
3254 if (signal_pending(current)) {
3255 ret = -EINTR;
3256 break;
3257 }
3258
3259 arg = (struct tdx_gmem_post_populate_arg) {
3260 .vcpu = vcpu,
3261 .flags = cmd->flags,
3262 };
3263 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
3264 u64_to_user_ptr(region.source_addr),
3265 1, tdx_gmem_post_populate, &arg);
3266 if (gmem_ret < 0) {
3267 ret = gmem_ret;
3268 break;
3269 }
3270
3271 if (gmem_ret != 1) {
3272 ret = -EIO;
3273 break;
3274 }
3275
3276 region.source_addr += PAGE_SIZE;
3277 region.gpa += PAGE_SIZE;
3278 region.nr_pages--;
3279
3280 cond_resched();
3281 }
3282
3283 if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
3284 ret = -EFAULT;
3285 return ret;
3286 }
3287
3288 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
3289 {
3290 struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
3291 struct kvm_tdx_cmd cmd;
3292 int ret;
3293
3294 if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
3295 return -EINVAL;
3296
3297 if (copy_from_user(&cmd, argp, sizeof(cmd)))
3298 return -EFAULT;
3299
3300 if (cmd.hw_error)
3301 return -EINVAL;
3302
3303 switch (cmd.id) {
3304 case KVM_TDX_INIT_VCPU:
3305 ret = tdx_vcpu_init(vcpu, &cmd);
3306 break;
3307 case KVM_TDX_INIT_MEM_REGION:
3308 ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
3309 break;
3310 case KVM_TDX_GET_CPUID:
3311 ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
3312 break;
3313 default:
3314 ret = -EINVAL;
3315 break;
3316 }
3317
3318 return ret;
3319 }
3320
3321 int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
3322 {
3323 return PG_LEVEL_4K;
3324 }
3325
3326 static int tdx_online_cpu(unsigned int cpu)
3327 {
3328 unsigned long flags;
3329 int r;
3330
3331 /* Sanity check CPU is already in post-VMXON */
3332 WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
3333
3334 local_irq_save(flags);
3335 r = tdx_cpu_enable();
3336 local_irq_restore(flags);
3337
3338 return r;
3339 }
3340
3341 static int tdx_offline_cpu(unsigned int cpu)
3342 {
3343 int i;
3344
3345 /* No TD is running. Allow any cpu to be offline. */
3346 if (!atomic_read(&nr_configured_hkid))
3347 return 0;
3348
3349 /*
3350 * In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
3351 * TDH.PHYMEM.PAGE.WBINVD needs to be called on all packages to program
3352 * all memory controllers with PCONFIG. If there are active TDX HKIDs,
3353 * refuse to offline the last online CPU of a package.
3354 */
3355 for_each_online_cpu(i) {
3356 /*
3357 * Another online CPU was found on the same package.
3358 * Allow this one to go offline.
3359 */
3360 if (i != cpu && topology_physical_package_id(i) ==
3361 topology_physical_package_id(cpu))
3362 return 0;
3363 }
3364
3365 /*
3366 * This is the last online CPU of this package. Don't offline it.
3367 *
3368 * Because it's hard for a human operator to understand the
3369 * reason, print a warning.
3370 */
3371 #define MSG_ALLPKG_ONLINE \
3372 "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
3373 pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
3374 return -EBUSY;
3375 }
3376
3377 static void __do_tdx_cleanup(void)
3378 {
3379 /*
3380 * Once the TDX module is initialized, it cannot be disabled and
3381 * re-initialized without a runtime update (which isn't
3382 * supported by the kernel). Only the cpuhp state needs to be
3383 * removed here. The TDX host core code tracks TDX status and
3384 * can handle the 'multiple enabling' scenario.
3385 */
3386 WARN_ON_ONCE(!tdx_cpuhp_state);
3387 cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
3388 tdx_cpuhp_state = 0;
3389 }
3390
3391 static void __tdx_cleanup(void)
3392 {
3393 cpus_read_lock();
3394 __do_tdx_cleanup();
3395 cpus_read_unlock();
3396 }
3397
3398 static int __init __do_tdx_bringup(void)
3399 {
3400 int r;
3401
3402 /*
3403 * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
3404 * online CPUs before calling tdx_enable(), and on any new
3405 * going-online CPU to make sure it is ready for TDX guest.
3406 */
3407 r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
3408 "kvm/cpu/tdx:online",
3409 tdx_online_cpu, tdx_offline_cpu);
3410 if (r < 0)
3411 return r;
3412
3413 tdx_cpuhp_state = r;
3414
3415 r = tdx_enable();
3416 if (r)
3417 __do_tdx_cleanup();
3418
3419 return r;
3420 }
3421
3422 static int __init __tdx_bringup(void)
3423 {
3424 const struct tdx_sys_info_td_conf *td_conf;
3425 int r, i;
3426
3427 for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
3428 /*
3429 * Check if MSRs (tdx_uret_msrs) can be saved/restored
3430 * before returning to user space.
3431 *
3432 * this_cpu_ptr(user_return_msrs)->registered isn't checked
3433 * because the registration is done at vcpu runtime by
3434 * tdx_user_return_msr_update_cache().
3435 */
3436 tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
3437 if (tdx_uret_msrs[i].slot == -1) {
3438 /* If any MSR isn't supported, it is a KVM bug */
3439 pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
3440 tdx_uret_msrs[i].msr);
3441 return -EIO;
3442 }
3443 }
3444
3445 /*
3446 * Enabling TDX requires enabling hardware virtualization first,
3447 * as making SEAMCALLs requires CPU being in post-VMXON state.
3448 */
3449 r = kvm_enable_virtualization();
3450 if (r)
3451 return r;
3452
3453 cpus_read_lock();
3454 r = __do_tdx_bringup();
3455 cpus_read_unlock();
3456
3457 if (r)
3458 goto tdx_bringup_err;
3459
3460 /* Get TDX global information for later use */
3461 tdx_sysinfo = tdx_get_sysinfo();
3462 if (WARN_ON_ONCE(!tdx_sysinfo)) {
3463 r = -EINVAL;
3464 goto get_sysinfo_err;
3465 }
3466
3467 /* Check TDX module and KVM capabilities */
3468 if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
3469 !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
3470 goto get_sysinfo_err;
3471
3472 if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
3473 goto get_sysinfo_err;
3474
3475 /*
3476 * TDX has its own limit of maximum vCPUs it can support for all
3477 * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
3478 * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
3479 * extension on per-VM basis.
3480 *
3481 * TDX module reports such limit via the MAX_VCPU_PER_TD global
3482 * metadata. Different modules may report different values.
3483 * Some old module may also not support this metadata (in which
3484 * case this limit is U16_MAX).
3485 *
3486 * In practice, the reported value reflects the maximum logical
3487 * CPUs that ALL the platforms that the module supports can
3488 * possibly have.
3489 *
3490 * Simply forwarding the MAX_VCPU_PER_TD to userspace could
3491 * result in an unpredictable ABI. KVM instead always advertises
3492 * the number of logical CPUs the platform has as the maximum
3493 * vCPUs for TDX guests.
3494 *
3495 * Make sure MAX_VCPU_PER_TD reported by TDX module is not
3496 * smaller than the number of logical CPUs, otherwise KVM will
3497 * report an unsupported value to userspace.
3498 *
3499 * Note, a platform with TDX enabled in the BIOS cannot support
3500 * physical CPU hotplug, and TDX requires the BIOS has marked
3501 * all logical CPUs in MADT table as enabled. Just use
3502 * num_present_cpus() for the number of logical CPUs.
3503 */
3504 td_conf = &tdx_sysinfo->td_conf;
3505 if (td_conf->max_vcpus_per_td < num_present_cpus()) {
3506 pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
3507 td_conf->max_vcpus_per_td, num_present_cpus());
3508 r = -EINVAL;
3509 goto get_sysinfo_err;
3510 }
3511
3512 if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
3513 r = -EINVAL;
3514 goto get_sysinfo_err;
3515 }
3516
3517 /*
3518 * Leave hardware virtualization enabled after TDX is enabled
3519 * successfully. TDX CPU hotplug depends on this.
3520 */
3521 return 0;
3522
3523 get_sysinfo_err:
3524 __tdx_cleanup();
3525 tdx_bringup_err:
3526 kvm_disable_virtualization();
3527 return r;
3528 }
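
/*
 * As the comment in __tdx_bringup() notes, the effective vCPU limit for
 * a TDX guest is surfaced to userspace through the per-VM
 * KVM_CAP_MAX_VCPUS check rather than by forwarding MAX_VCPU_PER_TD.
 * A minimal userspace sketch of that query, where vm_fd is assumed to
 * be an already-created VM file descriptor and nr_vcpus is the
 * hypothetical number of vCPUs the VMM wants to create:
 *
 *	int max_vcpus = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_MAX_VCPUS);
 *	if (max_vcpus > 0 && nr_vcpus > max_vcpus)
 *		errx(1, "TD supports at most %d vCPUs", max_vcpus);
 */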

void tdx_cleanup(void)
{
	if (enable_tdx) {
		misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
		__tdx_cleanup();
		kvm_disable_virtualization();
	}
}
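
/*
 * The misc cgroup capacity registered in __tdx_bringup() and withdrawn
 * in tdx_cleanup() above is what allows per-cgroup limits on TDX
 * private KeyID (i.e. TDX guest) consumption.  Assuming the resource is
 * surfaced under the name "tdx", userspace would see something like the
 * following, with the value matching tdx_get_nr_guest_keyids():
 *
 *	$ cat /sys/fs/cgroup/misc.capacity
 *	tdx <number of TDX guest KeyIDs>
 */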

int __init tdx_bringup(void)
{
	int r, i;

	/* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));

	if (!enable_tdx)
		return 0;

	if (!enable_ept) {
		pr_err("EPT is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
		pr_err("The TDP MMU, MMIO caching and EPT A/D bits are required for TDX\n");
		goto success_disable_tdx;
	}

	if (!enable_apicv) {
		pr_err("APICv is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
		pr_err("tdx: OSXSAVE is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
		pr_err("tdx: MOVDIR64B is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
		pr_err("Self-snoop is required for TDX\n");
		goto success_disable_tdx;
	}

	if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
		pr_err("tdx: no TDX private KeyIDs available\n");
		goto success_disable_tdx;
	}

	if (!enable_virt_at_load) {
		pr_err("tdx: TDX requires kvm.enable_virt_at_load=1\n");
		goto success_disable_tdx;
	}

	/*
	 * Ideally KVM should probe whether the TDX module has been loaded
	 * first and then try to bring it up.  But TDX needs a SEAMCALL to
	 * probe whether the module is loaded (there is no CPUID or MSR
	 * for that), and making a SEAMCALL requires enabling
	 * virtualization first, just like the rest of the steps of
	 * bringing up the TDX module.
	 *
	 * So, for simplicity do everything in __tdx_bringup(); the first
	 * SEAMCALL will return -ENODEV when the module is not loaded.  The
	 * only complication is having to make sure that initialization
	 * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
	 * cases.
	 */
	r = __tdx_bringup();
	if (r) {
		/*
		 * Disable TDX only, but don't fail loading the KVM module,
		 * if the TDX module could not be loaded.  There is no need
		 * to print a "module is not loaded" message because one was
		 * printed when the first SEAMCALL failed.  Don't bother
		 * unwinding the S-EPT hooks or vm_size, as kvm_x86_ops have
		 * already been finalized (and are intentionally not
		 * exported).  The S-EPT code is unreachable, and allocating
		 * a few more bytes per VM in a should-be-rare failure
		 * scenario is a non-issue.
		 */
		if (r == -ENODEV)
			goto success_disable_tdx;

		enable_tdx = 0;
	}

	return r;

success_disable_tdx:
	enable_tdx = 0;
	return 0;
}
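
/*
 * Putting the prerequisites checked above together: TDX support is only
 * attempted when the "tdx" module parameter (declared at the top of this
 * file) is set and KVM keeps virtualization enabled at load time.  An
 * illustrative set of command lines, purely as a sketch:
 *
 *	modprobe kvm enable_virt_at_load=1
 *	modprobe kvm_intel tdx=1
 */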

void __init tdx_hardware_setup(void)
{
	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);

	/*
	 * Note, if the TDX module can't be loaded, KVM TDX support will be
	 * disabled but KVM will continue loading (see tdx_bringup()).
	 */
	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));

	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
}