xref: /qemu/target/i386/kvm/tdx.c (revision 0ba06e46d09b84a2cb97a268da5576aaca3a24ca)
1 /*
2  * QEMU TDX support
3  *
4  * Copyright (c) 2025 Intel Corporation
5  *
6  * Author:
7  *      Xiaoyao Li <xiaoyao.li@intel.com>
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/error-report.h"
14 #include "qemu/base64.h"
15 #include "qemu/mmap-alloc.h"
16 #include "qapi/error.h"
17 #include "qom/object_interfaces.h"
18 #include "crypto/hash.h"
19 #include "system/kvm_int.h"
20 #include "system/runstate.h"
21 #include "system/system.h"
22 #include "system/ramblock.h"
23 
24 #include <linux/kvm_para.h>
25 
26 #include "hw/i386/e820_memory_layout.h"
27 #include "hw/i386/tdvf.h"
28 #include "hw/i386/x86.h"
29 #include "hw/i386/tdvf-hob.h"
30 #include "kvm_i386.h"
31 #include "tdx.h"
32 
33 #define TDX_MIN_TSC_FREQUENCY_KHZ   (100 * 1000)
34 #define TDX_MAX_TSC_FREQUENCY_KHZ   (10 * 1000 * 1000)
35 
36 #define TDX_TD_ATTRIBUTES_DEBUG             BIT_ULL(0)
37 #define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE   BIT_ULL(28)
38 #define TDX_TD_ATTRIBUTES_PKS               BIT_ULL(30)
39 #define TDX_TD_ATTRIBUTES_PERFMON           BIT_ULL(63)
40 
41 #define TDX_SUPPORTED_TD_ATTRS  (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\
42                                  TDX_TD_ATTRIBUTES_PKS | \
43                                  TDX_TD_ATTRIBUTES_PERFMON)
44 
45 static TdxGuest *tdx_guest;
46 
47 static struct kvm_tdx_capabilities *tdx_caps;
48 static struct kvm_cpuid2 *tdx_supported_cpuid;
49 
50 /* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */
51 bool is_tdx_vm(void)
52 {
53     return !!tdx_guest;
54 }
55 
/* Target of a KVM_MEMORY_ENCRYPT_OP command: the whole VM or one vCPU. */
enum tdx_ioctl_level {
    TDX_VM_IOCTL,
    TDX_VCPU_IOCTL,
};
60 
/*
 * Issue a TDX command to KVM via the KVM_MEMORY_ENCRYPT_OP ioctl.
 *
 * @level: whether the command targets the VM fd or a vCPU fd.
 * @state: the CPUState for TDX_VCPU_IOCTL; ignored for TDX_VM_IOCTL.
 * @cmd_id: a KVM_TDX_* command identifier.
 * @flags: command-specific flags, passed through unchanged.
 * @data: command-specific payload; KVM interprets it per @cmd_id.
 *
 * Returns the raw ioctl result. On failure (< 0), @errp is set and
 * carries the hardware error code KVM reported back in the command.
 */
static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state,
                              int cmd_id, __u32 flags, void *data,
                              Error **errp)
{
    struct kvm_tdx_cmd tdx_cmd = {};
    int r;

    /*
     * Command names for error reporting, indexed by KVM_TDX_* id.
     * NOTE(review): assumes callers only pass the ids listed here; an
     * out-of-range cmd_id would read past this table in the error path.
     */
    const char *tdx_ioctl_name[] = {
        [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES",
        [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM",
        [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU",
        [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION",
        [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM",
        [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID",
    };

    tdx_cmd.id = cmd_id;
    tdx_cmd.flags = flags;
    /* Double cast: pointer -> unsigned long -> __u64, as KVM uAPI expects. */
    tdx_cmd.data = (__u64)(unsigned long)data;

    switch (level) {
    case TDX_VM_IOCTL:
        r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    case TDX_VCPU_IOCTL:
        r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
        break;
    default:
        error_setg(errp, "Invalid tdx_ioctl_level %d", level);
        return -EINVAL;
    }

    if (r < 0) {
        error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx",
                         tdx_ioctl_name[cmd_id], tdx_cmd.hw_error);
    }
    return r;
}
99 
100 static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data,
101                                Error **errp)
102 {
103     return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp);
104 }
105 
106 static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags,
107                                  void *data, Error **errp)
108 {
109     return  tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp);
110 }
111 
/*
 * Query the TDX module capabilities from KVM and cache them in tdx_caps.
 *
 * The required number of CPUID config entries is not known up front, so
 * the buffer is reallocated with a doubled entry count whenever KVM
 * answers -E2BIG, bounded by KVM_MAX_CPUID_ENTRIES.
 *
 * Returns 0 on success; a negative errno with @errp set on failure.
 */
static int get_tdx_capabilities(Error **errp)
{
    struct kvm_tdx_capabilities *caps;
    /* 1st generation of TDX reports 6 cpuid configs */
    int nr_cpuid_configs = 6;
    size_t size;
    int r;

    do {
        Error *local_err = NULL;
        size = sizeof(struct kvm_tdx_capabilities) +
                      nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2);
        caps = g_malloc0(size);
        caps->cpuid.nent = nr_cpuid_configs;

        r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err);
        if (r == -E2BIG) {
            /* Buffer too small: retry with double the entries. */
            g_free(caps);
            nr_cpuid_configs *= 2;
            if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) {
                error_report("KVM TDX seems broken that number of CPUID entries"
                             " in kvm_tdx_capabilities exceeds limit: %d",
                             KVM_MAX_CPUID_ENTRIES);
                /* Report the original -E2BIG error to the caller. */
                error_propagate(errp, local_err);
                return r;
            }
            /* Retrying: discard the error from this attempt. */
            error_free(local_err);
        } else if (r < 0) {
            g_free(caps);
            error_propagate(errp, local_err);
            return r;
        }
    } while (r == -E2BIG);

    tdx_caps = caps;

    return 0;
}
150 
151 void tdx_set_tdvf_region(MemoryRegion *tdvf_mr)
152 {
153     assert(!tdx_guest->tdvf_mr);
154     tdx_guest->tdvf_mr = tdvf_mr;
155 }
156 
157 static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx)
158 {
159     TdxFirmwareEntry *entry;
160 
161     for_each_tdx_fw_entry(&tdx->tdvf, entry) {
162         if (entry->type == TDVF_SECTION_TYPE_TD_HOB) {
163             return entry;
164         }
165     }
166     error_report("TDVF metadata doesn't specify TD_HOB location.");
167     exit(1);
168 }
169 
170 static void tdx_add_ram_entry(uint64_t address, uint64_t length,
171                               enum TdxRamType type)
172 {
173     uint32_t nr_entries = tdx_guest->nr_ram_entries;
174     tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries,
175                                      nr_entries + 1);
176 
177     tdx_guest->ram_entries[nr_entries].address = address;
178     tdx_guest->ram_entries[nr_entries].length = length;
179     tdx_guest->ram_entries[nr_entries].type = type;
180     tdx_guest->nr_ram_entries++;
181 }
182 
/*
 * Mark the guest range [address, address + length) as accepted RAM.
 *
 * The range must be fully contained by exactly one existing RAM entry.
 * That entry is shrunk in place to the accepted range and re-typed to
 * TDX_RAM_ADDED; any remaining head/tail pieces of the original entry are
 * appended back to the table as TDX_RAM_UNACCEPTED.
 *
 * Returns 0 on success, -1 if no single entry fully covers the range.
 */
static int tdx_accept_ram_range(uint64_t address, uint64_t length)
{
    uint64_t head_start, tail_start, head_length, tail_length;
    uint64_t tmp_address, tmp_length;
    TdxRamEntry *e;
    int i = 0;

    /* Find the first entry overlapping the requested range. */
    do {
        if (i == tdx_guest->nr_ram_entries) {
            return -1;
        }

        e = &tdx_guest->ram_entries[i++];
    } while (address + length <= e->address || address >= e->address + e->length);

    /*
     * The to-be-accepted ram range must be fully contained by one
     * RAM entry.
     */
    if (e->address > address ||
        e->address + e->length < address + length) {
        return -1;
    }

    /* Already accepted: nothing to split. */
    if (e->type == TDX_RAM_ADDED) {
        return 0;
    }

    /* Remember the original extent before shrinking the entry in place. */
    tmp_address = e->address;
    tmp_length = e->length;

    e->address = address;
    e->length = length;
    e->type = TDX_RAM_ADDED;

    /* Re-append the unaccepted piece before the range, if any. */
    head_length = address - tmp_address;
    if (head_length > 0) {
        head_start = tmp_address;
        tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED);
    }

    /* Re-append the unaccepted piece after the range, if any. */
    tail_start = address + length;
    if (tail_start < tmp_address + tmp_length) {
        tail_length = tmp_address + tmp_length - tail_start;
        tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED);
    }

    return 0;
}
232 
233 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_)
234 {
235     const TdxRamEntry *lhs = lhs_;
236     const TdxRamEntry *rhs = rhs_;
237 
238     if (lhs->address == rhs->address) {
239         return 0;
240     }
241     if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) {
242         return 1;
243     }
244     return -1;
245 }
246 
247 static void tdx_init_ram_entries(void)
248 {
249     unsigned i, j, nr_e820_entries;
250 
251     nr_e820_entries = e820_get_table(NULL);
252     tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries);
253 
254     for (i = 0, j = 0; i < nr_e820_entries; i++) {
255         uint64_t addr, len;
256 
257         if (e820_get_entry(i, E820_RAM, &addr, &len)) {
258             tdx_guest->ram_entries[j].address = addr;
259             tdx_guest->ram_entries[j].length = len;
260             tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED;
261             j++;
262         }
263     }
264     tdx_guest->nr_ram_entries = j;
265 }
266 
267 static void tdx_post_init_vcpus(void)
268 {
269     TdxFirmwareEntry *hob;
270     CPUState *cpu;
271 
272     hob = tdx_get_hob_entry(tdx_guest);
273     CPU_FOREACH(cpu) {
274         tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address,
275                        &error_fatal);
276     }
277 }
278 
/*
 * Machine-init-done notifier: load TDVF into the TD and finalize its
 * measurement.
 *
 * Builds the RAM entry table, maps and accepts the TDVF sections, creates
 * the TD_HOB list, initializes every vCPU, copies each section into guest
 * private memory via KVM_TDX_INIT_MEM_REGION (optionally extending the
 * measurement), then issues KVM_TDX_FINALIZE_VM. Any failure is fatal.
 */
static void tdx_finalize_vm(Notifier *notifier, void *unused)
{
    TdxFirmware *tdvf = &tdx_guest->tdvf;
    TdxFirmwareEntry *entry;
    RAMBlock *ram_block;
    Error *local_err = NULL;
    int r;

    tdx_init_ram_entries();

    for_each_tdx_fw_entry(tdvf, entry) {
        switch (entry->type) {
        case TDVF_SECTION_TYPE_BFV:
        case TDVF_SECTION_TYPE_CFV:
            /* Firmware volumes are backed by the TDVF image itself. */
            entry->mem_ptr = tdvf->mem_ptr + entry->data_offset;
            break;
        case TDVF_SECTION_TYPE_TD_HOB:
        case TDVF_SECTION_TYPE_TEMP_MEM:
            /* HOB and temp-mem sections get fresh anonymous staging maps. */
            entry->mem_ptr = qemu_ram_mmap(-1, entry->size,
                                           qemu_real_host_page_size(), 0, 0);
            if (entry->mem_ptr == MAP_FAILED) {
                error_report("Failed to mmap memory for TDVF section %d",
                             entry->type);
                exit(1);
            }
            /* Record the covered guest range as accepted RAM. */
            if (tdx_accept_ram_range(entry->address, entry->size)) {
                error_report("Failed to accept memory for TDVF section %d",
                             entry->type);
                qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
                exit(1);
            }
            break;
        default:
            error_report("Unsupported TDVF section %d", entry->type);
            exit(1);
        }
    }

    /* The HOB writer expects the RAM table sorted by address. */
    qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries,
          sizeof(TdxRamEntry), &tdx_ram_entry_compare);

    tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));

    tdx_post_init_vcpus();

    for_each_tdx_fw_entry(tdvf, entry) {
        struct kvm_tdx_init_mem_region region;
        uint32_t flags;

        region = (struct kvm_tdx_init_mem_region) {
            .source_addr = (uint64_t)entry->mem_ptr,
            .gpa = entry->address,
            .nr_pages = entry->size >> 12, /* size in 4KiB pages */
        };

        flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
                KVM_TDX_MEASURE_MEMORY_REGION : 0;

        /* KVM may fail transiently; retry on -EAGAIN/-EINTR. */
        do {
            error_free(local_err);
            local_err = NULL;
            r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
                               &region, &local_err);
        } while (r == -EAGAIN || r == -EINTR);
        if (r < 0) {
            error_report_err(local_err);
            exit(1);
        }

        if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
            entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
            /* Content now lives in private memory; drop the staging copy. */
            qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
            entry->mem_ptr = NULL;
        }
    }

    /*
     * The TDVF image has been copied into private memory via
     * KVM_TDX_INIT_MEM_REGION above; the shared flash copy is no longer
     * needed, so discard its backing pages.
     */
    ram_block = tdx_guest->tdvf_mr->ram_block;
    ram_block_discard_range(ram_block, 0, ram_block->max_length);

    tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal);
    CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true;
}
365 
/* Registered in tdx_kvm_init(); runs tdx_finalize_vm() after machine init. */
static Notifier tdx_machine_done_notify = {
    .notify = tdx_finalize_vm,
};
369 
370 /*
371  * Some CPUID bits change from fixed1 to configurable bits when TDX module
372  * supports TDX_FEATURES0.VE_REDUCTION. e.g., MCA/MCE/MTRR/CORE_CAPABILITY.
373  *
374  * To make QEMU work with all the versions of TDX module, keep the fixed1 bits
375  * here if they are ever fixed1 bits in any of the version though not fixed1 in
376  * the latest version. Otherwise, with the older version of TDX module, QEMU may
377  * treat the fixed1 bit as unsupported.
378  *
379  * For newer TDX module, it does no harm to keep them in tdx_fixed1_bits even
380  * though they changed to configurable bits. Because tdx_fixed1_bits is used to
381  * setup the supported bits.
382  */
KvmCpuidInfo tdx_fixed1_bits = {
    .cpuid.nent = 8,
    /* Leaf 0x1: basic feature flags. */
    .entries[0] = {
        .function = 0x1,
        .index = 0,
        .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 |
               CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 |
               CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
               CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
               CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE |
               CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR,
        .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
               CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
               CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
               CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR |
               CPUID_SSE | CPUID_SSE2,
    },
    /* Leaf 0x6: thermal/power management. */
    .entries[1] = {
        .function = 0x6,
        .index = 0,
        .eax = CPUID_6_EAX_ARAT,
    },
    /* Leaf 0x7 subleaf 0: structured extended features.
     * (KVM_CPUID_FLAG_SIGNIFCANT_INDEX spelling comes from the kernel uAPI.) */
    .entries[2] = {
        .function = 0x7,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY |
               CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID |
               CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED |
               CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT |
               CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI,
        .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI |
               CPUID_7_0_ECX_MOVDIR64B,
        .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL |
               CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D |
               CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY |
               CPUID_7_0_EDX_SPEC_CTRL_SSBD,
    },
    /* Leaf 0x7 subleaf 2: speculation controls. */
    .entries[3] = {
        .function = 0x7,
        .index = 2,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL |
               CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL,
    },
    /* Leaf 0xD subleaf 0: XSAVE state components (x87/SSE always present). */
    .entries[4] = {
        .function = 0xD,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK,
    },
    /* Leaf 0xD subleaf 1: XSAVE instruction variants. */
    .entries[5] = {
        .function = 0xD,
        .index = 1,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC|
               CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES,
    },
    /* Leaf 0x80000001: extended features. */
    .entries[6] = {
        .function = 0x80000001,
        .index = 0,
        .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
        /*
         * Strictly speaking, SYSCALL is not fixed1 bit since it depends on
         * the CPU to be in 64-bit mode. But here fixed1 is used to serve the
         * purpose of supported bits for TDX. In this sense, SYACALL is always
         * supported.
         */
        .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
               CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
    },
    /* Leaf 0x80000007: invariant TSC. */
    .entries[7] = {
        .function = 0x80000007,
        .index = 0,
        .edx = CPUID_APM_INVTSC,
    },
};
460 
461 static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function,
462                                                         uint32_t index)
463 {
464     struct kvm_cpuid_entry2 *e;
465 
466     e = cpuid_find_entry(tdx_supported_cpuid, function, index);
467     if (!e) {
468         if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) {
469             error_report("tdx_supported_cpuid requries more space than %d entries",
470                           KVM_MAX_CPUID_ENTRIES);
471             exit(1);
472         }
473         e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++];
474         e->function = function;
475         e->index = index;
476     }
477 
478     return e;
479 }
480 
481 static void tdx_add_supported_cpuid_by_fixed1_bits(void)
482 {
483     struct kvm_cpuid_entry2 *e, *e1;
484     int i;
485 
486     for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) {
487         e = &tdx_fixed1_bits.entries[i];
488 
489         e1 = find_in_supported_entry(e->function, e->index);
490         e1->eax |= e->eax;
491         e1->ebx |= e->ebx;
492         e1->ecx |= e->ecx;
493         e1->edx |= e->edx;
494     }
495 }
496 
497 static void tdx_setup_supported_cpuid(void)
498 {
499     if (tdx_supported_cpuid) {
500         return;
501     }
502 
503     tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) +
504                     KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));
505 
506     memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries,
507            tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2));
508     tdx_supported_cpuid->nent = tdx_caps->cpuid.nent;
509 
510     tdx_add_supported_cpuid_by_fixed1_bits();
511 }
512 
/*
 * ConfidentialGuestSupport.kvm_init hook for TDX.
 *
 * Validates (or auto-configures) machine settings TDX requires — no SMM,
 * no PIC, split kernel irqchip — fetches the TDX module capabilities,
 * builds the supported-CPUID table, enables the KVM_HC_MAP_GPA_RANGE
 * hypercall, and registers the machine-init-done notifier that finalizes
 * the TD. Returns 0 on success, negative errno with @errp set on failure.
 */
static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    X86MachineState *x86ms = X86_MACHINE(ms);
    TdxGuest *tdx = TDX_GUEST(cgs);
    int r = 0;

    kvm_mark_guest_state_protected();

    /* SMM is incompatible with TDX: auto -> off, explicit on -> error. */
    if (x86ms->smm == ON_OFF_AUTO_AUTO) {
        x86ms->smm = ON_OFF_AUTO_OFF;
    } else if (x86ms->smm == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support SMM");
        return -EINVAL;
    }

    /* Likewise for the i8259 PIC. */
    if (x86ms->pic == ON_OFF_AUTO_AUTO) {
        x86ms->pic = ON_OFF_AUTO_OFF;
    } else if (x86ms->pic == ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM doesn't support PIC");
        return -EINVAL;
    }

    /* TDX requires the split irqchip model. */
    if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON;
    } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) {
        error_setg(errp, "TDX VM requires kernel_irqchip to be split");
        return -EINVAL;
    }

    if (!tdx_caps) {
        r = get_tdx_capabilities(errp);
        if (r) {
            return r;
        }
    }

    tdx_setup_supported_cpuid();

    /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */
    if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) {
        return -EOPNOTSUPP;
    }

    /*
     * Set kvm_readonly_mem_allowed to false, because TDX only supports readonly
     * memory for shared memory but not for private memory. Besides, whether a
     * memslot is private or shared is not determined by QEMU.
     *
     * Thus, just mark readonly memory not supported for simplicity.
     */
    kvm_readonly_mem_allowed = false;

    qemu_add_machine_init_done_notifier(&tdx_machine_done_notify);

    tdx_guest = tdx;
    return 0;
}
571 
572 static int tdx_kvm_type(X86ConfidentialGuest *cg)
573 {
574     /* Do the object check */
575     TDX_GUEST(cg);
576 
577     return KVM_X86_TDX_VM;
578 }
579 
580 static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu)
581 {
582     X86CPU *x86cpu = X86_CPU(cpu);
583 
584     object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);
585 
586     x86cpu->enable_cpuid_0x1f = true;
587 }
588 
589 static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg,
590                                           uint32_t feature, uint32_t index,
591                                           int reg, uint32_t value)
592 {
593     struct kvm_cpuid_entry2 *e;
594 
595     e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index);
596     if (e) {
597         value |= cpuid_entry_get_reg(e, reg);
598     }
599 
600     if (is_feature_word_cpuid(feature, index, reg)) {
601         e = cpuid_find_entry(tdx_supported_cpuid, feature, index);
602         if (e) {
603             value &= cpuid_entry_get_reg(e, reg);
604         }
605     }
606 
607     return value;
608 }
609 
610 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp)
611 {
612     if ((tdx->attributes & ~tdx_caps->supported_attrs)) {
613         error_setg(errp, "Invalid attributes 0x%lx for TDX VM "
614                    "(KVM supported: 0x%llx)", tdx->attributes,
615                    tdx_caps->supported_attrs);
616         return -1;
617     }
618 
619     if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) {
620         error_setg(errp, "Some QEMU unsupported TD attribute bits being "
621                     "requested: 0x%lx (QEMU supported: 0x%llx)",
622                     tdx->attributes, TDX_SUPPORTED_TD_ATTRS);
623         return -1;
624     }
625 
626     return 0;
627 }
628 
629 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp)
630 {
631     CPUX86State *env = &x86cpu->env;
632 
633     tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ?
634                              TDX_TD_ATTRIBUTES_PKS : 0;
635     tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0;
636 
637     return tdx_validate_attributes(tdx_guest, errp);
638 }
639 
640 static int setup_td_xfam(X86CPU *x86cpu, Error **errp)
641 {
642     CPUX86State *env = &x86cpu->env;
643     uint64_t xfam;
644 
645     xfam = env->features[FEAT_XSAVE_XCR0_LO] |
646            env->features[FEAT_XSAVE_XCR0_HI] |
647            env->features[FEAT_XSAVE_XSS_LO] |
648            env->features[FEAT_XSAVE_XSS_HI];
649 
650     if (xfam & ~tdx_caps->supported_xfam) {
651         error_setg(errp, "Invalid XFAM 0x%lx for TDX VM (supported: 0x%llx))",
652                    xfam, tdx_caps->supported_xfam);
653         return -1;
654     }
655 
656     tdx_guest->xfam = xfam;
657     return 0;
658 }
659 
660 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids)
661 {
662     int i, dest_cnt = 0;
663     struct kvm_cpuid_entry2 *src, *dest, *conf;
664 
665     for (i = 0; i < cpuids->nent; i++) {
666         src = cpuids->entries + i;
667         conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index);
668         if (!conf) {
669             continue;
670         }
671         dest = cpuids->entries + dest_cnt;
672 
673         dest->function = src->function;
674         dest->index = src->index;
675         dest->flags = src->flags;
676         dest->eax = src->eax & conf->eax;
677         dest->ebx = src->ebx & conf->ebx;
678         dest->ecx = src->ecx & conf->ecx;
679         dest->edx = src->edx & conf->edx;
680 
681         dest_cnt++;
682     }
683     cpuids->nent = dest_cnt++;
684 }
685 
686 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
687 {
688     X86CPU *x86cpu = X86_CPU(cpu);
689     CPUX86State *env = &x86cpu->env;
690     g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
691     Error *local_err = NULL;
692     size_t data_len;
693     int retry = 10000;
694     int r = 0;
695 
696     QEMU_LOCK_GUARD(&tdx_guest->lock);
697     if (tdx_guest->initialized) {
698         return r;
699     }
700 
701     init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
702                         sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
703 
704     if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) {
705         error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS");
706         return -EOPNOTSUPP;
707     }
708 
709     r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
710                           0, TDX_APIC_BUS_CYCLES_NS);
711     if (r < 0) {
712         error_setg_errno(errp, -r,
713                          "Unable to set core crystal clock frequency to 25MHz");
714         return r;
715     }
716 
717     if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ ||
718                          env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) {
719         error_setg(errp, "Invalid TSC %ld KHz, must specify cpu_frequency "
720                          "between [%d, %d] kHz", env->tsc_khz,
721                          TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ);
722        return -EINVAL;
723     }
724 
725     if (env->tsc_khz % (25 * 1000)) {
726         error_setg(errp, "Invalid TSC %ld KHz, it must be multiple of 25MHz",
727                    env->tsc_khz);
728         return -EINVAL;
729     }
730 
731     /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */
732     r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz);
733     if (r < 0) {
734         error_setg_errno(errp, -r, "Unable to set TSC frequency to %ld kHz",
735                          env->tsc_khz);
736         return r;
737     }
738 
739     if (tdx_guest->mrconfigid) {
740         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
741                               strlen(tdx_guest->mrconfigid), &data_len, errp);
742         if (!data) {
743             return -1;
744         }
745         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
746             error_setg(errp, "TDX: failed to decode mrconfigid");
747             return -1;
748         }
749         memcpy(init_vm->mrconfigid, data, data_len);
750     }
751 
752     if (tdx_guest->mrowner) {
753         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
754                               strlen(tdx_guest->mrowner), &data_len, errp);
755         if (!data) {
756             return -1;
757         }
758         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
759             error_setg(errp, "TDX: failed to decode mrowner");
760             return -1;
761         }
762         memcpy(init_vm->mrowner, data, data_len);
763     }
764 
765     if (tdx_guest->mrownerconfig) {
766         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig,
767                             strlen(tdx_guest->mrownerconfig), &data_len, errp);
768         if (!data) {
769             return -1;
770         }
771         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
772             error_setg(errp, "TDX: failed to decode mrownerconfig");
773             return -1;
774         }
775         memcpy(init_vm->mrownerconfig, data, data_len);
776     }
777 
778     r = setup_td_guest_attributes(x86cpu, errp);
779     if (r) {
780         return r;
781     }
782 
783     r = setup_td_xfam(x86cpu, errp);
784     if (r) {
785         return r;
786     }
787 
788     init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0);
789     tdx_filter_cpuid(&init_vm->cpuid);
790 
791     init_vm->attributes = tdx_guest->attributes;
792     init_vm->xfam = tdx_guest->xfam;
793 
794     /*
795      * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE)
796      * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or
797      * RDSEED) is busy.
798      *
799      * Retry for the case.
800      */
801     do {
802         error_free(local_err);
803         local_err = NULL;
804         r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err);
805     } while (r == -EAGAIN && --retry);
806 
807     if (r < 0) {
808         if (!retry) {
809             error_append_hint(&local_err, "Hardware RNG (Random Number "
810             "Generator) is busy occupied by someone (via RDRAND/RDSEED) "
811             "maliciously, which leads to KVM_TDX_INIT_VM keeping failure "
812             "due to lack of entropy.\n");
813         }
814         error_propagate(errp, local_err);
815         return r;
816     }
817 
818     tdx_guest->initialized = true;
819 
820     return 0;
821 }
822 
/*
 * Parse the TDVF metadata out of the flash image into tdx_guest->tdvf.
 * Thin wrapper over tdvf_parse_metadata(); returns its result.
 */
int tdx_parse_tdvf(void *flash_ptr, int size)
{
    return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
}
827 
828 static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
829                                         char *message, uint64_t gpa)
830 {
831     GuestPanicInformation *panic_info;
832 
833     panic_info = g_new0(GuestPanicInformation, 1);
834     panic_info->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
835     panic_info->u.tdx.error_code = (uint32_t) error_code;
836     panic_info->u.tdx.message = message;
837     panic_info->u.tdx.gpa = gpa;
838 
839     qemu_system_guest_panicked(panic_info);
840 }
841 
842 /*
843  * Only 8 registers can contain valid ASCII byte stream to form the fatal
844  * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX
845  */
846 #define TDX_FATAL_MESSAGE_MAX        64
847 
848 #define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)
849 
850 int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
851 {
852     uint64_t error_code = run->system_event.data[R_R12];
853     uint64_t reg_mask = run->system_event.data[R_ECX];
854     char *message = NULL;
855     uint64_t *tmp;
856     uint64_t gpa = -1ull;
857 
858     if (error_code & 0xffff) {
859         error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%lx",
860                      error_code);
861         return -1;
862     }
863 
864     if (reg_mask) {
865         message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1);
866         tmp = (uint64_t *)message;
867 
868 #define COPY_REG(REG)                               \
869     do {                                            \
870         if (reg_mask & BIT_ULL(REG)) {              \
871             *(tmp++) = run->system_event.data[REG]; \
872         }                                           \
873     } while (0)
874 
875         COPY_REG(R_R14);
876         COPY_REG(R_R15);
877         COPY_REG(R_EBX);
878         COPY_REG(R_EDI);
879         COPY_REG(R_ESI);
880         COPY_REG(R_R8);
881         COPY_REG(R_R9);
882         COPY_REG(R_EDX);
883         *((char *)tmp) = '\0';
884     }
885 #undef COPY_REG
886 
887     if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
888         gpa = run->system_event.data[R_R13];
889     }
890 
891     tdx_panicked_on_fatal_error(cpu, error_code, message, gpa);
892 
893     return -1;
894 }
895 
896 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp)
897 {
898     TdxGuest *tdx = TDX_GUEST(obj);
899 
900     return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE);
901 }
902 
903 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp)
904 {
905     TdxGuest *tdx = TDX_GUEST(obj);
906 
907     if (value) {
908         tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
909     } else {
910         tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
911     }
912 }
913 
914 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp)
915 {
916     TdxGuest *tdx = TDX_GUEST(obj);
917 
918     return g_strdup(tdx->mrconfigid);
919 }
920 
921 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp)
922 {
923     TdxGuest *tdx = TDX_GUEST(obj);
924 
925     g_free(tdx->mrconfigid);
926     tdx->mrconfigid = g_strdup(value);
927 }
928 
929 static char *tdx_guest_get_mrowner(Object *obj, Error **errp)
930 {
931     TdxGuest *tdx = TDX_GUEST(obj);
932 
933     return g_strdup(tdx->mrowner);
934 }
935 
936 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp)
937 {
938     TdxGuest *tdx = TDX_GUEST(obj);
939 
940     g_free(tdx->mrowner);
941     tdx->mrowner = g_strdup(value);
942 }
943 
944 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp)
945 {
946     TdxGuest *tdx = TDX_GUEST(obj);
947 
948     return g_strdup(tdx->mrownerconfig);
949 }
950 
951 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp)
952 {
953     TdxGuest *tdx = TDX_GUEST(obj);
954 
955     g_free(tdx->mrownerconfig);
956     tdx->mrownerconfig = g_strdup(value);
957 }
958 
/* tdx guest: user-creatable QOM type implementing X86ConfidentialGuest. */
OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
                                   tdx_guest,
                                   TDX_GUEST,
                                   X86_CONFIDENTIAL_GUEST,
                                   { TYPE_USER_CREATABLE },
                                   { NULL })
966 
/*
 * QOM instance init: set TDX defaults and register the user-visible
 * properties (attributes, sept-ve-disable, measurement registers).
 */
static void tdx_guest_init(Object *obj)
{
    ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
    TdxGuest *tdx = TDX_GUEST(obj);

    qemu_mutex_init(&tdx->lock);

    /* TDX private memory must be backed by guest_memfd. */
    cgs->require_guest_memfd = true;
    /* Default attribute: disable #VE on pending private pages. */
    tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;

    object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes,
                                   OBJ_PROP_FLAG_READWRITE);
    object_property_add_bool(obj, "sept-ve-disable",
                             tdx_guest_get_sept_ve_disable,
                             tdx_guest_set_sept_ve_disable);
    object_property_add_str(obj, "mrconfigid",
                            tdx_guest_get_mrconfigid,
                            tdx_guest_set_mrconfigid);
    object_property_add_str(obj, "mrowner",
                            tdx_guest_get_mrowner, tdx_guest_set_mrowner);
    object_property_add_str(obj, "mrownerconfig",
                            tdx_guest_get_mrownerconfig,
                            tdx_guest_set_mrownerconfig);
}
991 
992 static void tdx_guest_finalize(Object *obj)
993 {
994 }
995 
996 static void tdx_guest_class_init(ObjectClass *oc, const void *data)
997 {
998     ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
999     X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc);
1000 
1001     klass->kvm_init = tdx_kvm_init;
1002     x86_klass->kvm_type = tdx_kvm_type;
1003     x86_klass->cpu_instance_init = tdx_cpu_instance_init;
1004     x86_klass->adjust_cpuid_features = tdx_adjust_cpuid_features;
1005 }
1006