xref: /qemu/target/i386/kvm/tdx.c (revision 43ba160cb4bbb193560eb0d2d7decc4b5fc599fe)
1 /*
2  * QEMU TDX support
3  *
4  * Copyright (c) 2025 Intel Corporation
5  *
6  * Author:
7  *      Xiaoyao Li <xiaoyao.li@intel.com>
8  *
9  * SPDX-License-Identifier: GPL-2.0-or-later
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/error-report.h"
14 #include "qemu/base64.h"
15 #include "qemu/mmap-alloc.h"
16 #include "qapi/error.h"
17 #include "qapi/qapi-visit-sockets.h"
18 #include "qom/object_interfaces.h"
19 #include "crypto/hash.h"
20 #include "system/kvm_int.h"
21 #include "system/runstate.h"
22 #include "system/system.h"
23 #include "system/ramblock.h"
24 #include "system/address-spaces.h"
25 
26 #include <linux/kvm_para.h>
27 
28 #include "cpu.h"
29 #include "cpu-internal.h"
30 #include "host-cpu.h"
31 #include "hw/i386/e820_memory_layout.h"
32 #include "hw/i386/tdvf.h"
33 #include "hw/i386/x86.h"
34 #include "hw/i386/tdvf-hob.h"
35 #include "kvm_i386.h"
36 #include "tdx.h"
37 #include "tdx-quote-generator.h"
38 
39 #include "standard-headers/asm-x86/kvm_para.h"
40 
/* TDX constrains the guest TSC frequency to [100 MHz, 10 GHz]. */
#define TDX_MIN_TSC_FREQUENCY_KHZ   (100 * 1000)
#define TDX_MAX_TSC_FREQUENCY_KHZ   (10 * 1000 * 1000)

/* TD attribute bits (bit positions in the 64-bit TD attributes field). */
#define TDX_TD_ATTRIBUTES_DEBUG             BIT_ULL(0)
#define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE   BIT_ULL(28)
#define TDX_TD_ATTRIBUTES_PKS               BIT_ULL(30)
#define TDX_TD_ATTRIBUTES_PERFMON           BIT_ULL(63)

/*
 * TD attribute bits QEMU is willing to set; anything else is rejected
 * by tdx_validate_attributes().
 */
#define TDX_SUPPORTED_TD_ATTRS  (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\
                                 TDX_TD_ATTRIBUTES_PKS | \
                                 TDX_TD_ATTRIBUTES_PERFMON)

/*
 * KVM paravirt features (CPUID 0x40000001.EAX) advertised to TD guests
 * via tdx_add_supported_kvm_features().
 */
#define TDX_SUPPORTED_KVM_FEATURES  ((1U << KVM_FEATURE_NOP_IO_DELAY) | \
                                     (1U << KVM_FEATURE_PV_UNHALT) | \
                                     (1U << KVM_FEATURE_PV_TLB_FLUSH) | \
                                     (1U << KVM_FEATURE_PV_SEND_IPI) | \
                                     (1U << KVM_FEATURE_POLL_CONTROL) | \
                                     (1U << KVM_FEATURE_PV_SCHED_YIELD) | \
                                     (1U << KVM_FEATURE_MSI_EXT_DEST_ID))
60 
/* Singleton TD guest object; non-NULL only after tdx_kvm_init() succeeds. */
static TdxGuest *tdx_guest;

/* Capabilities from KVM_TDX_CAPABILITIES, cached by get_tdx_capabilities(). */
static struct kvm_tdx_capabilities *tdx_caps;
/* CPUID bits QEMU treats as TDX-supported; built by tdx_setup_supported_cpuid(). */
static struct kvm_cpuid2 *tdx_supported_cpuid;

/* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */
bool is_tdx_vm(void)
{
    return !!tdx_guest;
}

/* Whether a KVM_MEMORY_ENCRYPT_OP targets the whole VM or a single vCPU. */
enum tdx_ioctl_level {
    TDX_VM_IOCTL,
    TDX_VCPU_IOCTL,
};
76 
tdx_ioctl_internal(enum tdx_ioctl_level level,void * state,int cmd_id,__u32 flags,void * data,Error ** errp)77 static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state,
78                               int cmd_id, __u32 flags, void *data,
79                               Error **errp)
80 {
81     struct kvm_tdx_cmd tdx_cmd = {};
82     int r;
83 
84     const char *tdx_ioctl_name[] = {
85         [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES",
86         [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM",
87         [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU",
88         [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION",
89         [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM",
90         [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID",
91     };
92 
93     tdx_cmd.id = cmd_id;
94     tdx_cmd.flags = flags;
95     tdx_cmd.data = (__u64)(unsigned long)data;
96 
97     switch (level) {
98     case TDX_VM_IOCTL:
99         r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
100         break;
101     case TDX_VCPU_IOCTL:
102         r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
103         break;
104     default:
105         error_setg(errp, "Invalid tdx_ioctl_level %d", level);
106         return -EINVAL;
107     }
108 
109     if (r < 0) {
110         error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx",
111                          tdx_ioctl_name[cmd_id], tdx_cmd.hw_error);
112     }
113     return r;
114 }
115 
tdx_vm_ioctl(int cmd_id,__u32 flags,void * data,Error ** errp)116 static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data,
117                                Error **errp)
118 {
119     return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp);
120 }
121 
tdx_vcpu_ioctl(CPUState * cpu,int cmd_id,__u32 flags,void * data,Error ** errp)122 static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags,
123                                  void *data, Error **errp)
124 {
125     return  tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp);
126 }
127 
get_tdx_capabilities(Error ** errp)128 static int get_tdx_capabilities(Error **errp)
129 {
130     struct kvm_tdx_capabilities *caps;
131     /* 1st generation of TDX reports 6 cpuid configs */
132     int nr_cpuid_configs = 6;
133     size_t size;
134     int r;
135 
136     do {
137         Error *local_err = NULL;
138         size = sizeof(struct kvm_tdx_capabilities) +
139                       nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2);
140         caps = g_malloc0(size);
141         caps->cpuid.nent = nr_cpuid_configs;
142 
143         r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err);
144         if (r == -E2BIG) {
145             g_free(caps);
146             nr_cpuid_configs *= 2;
147             if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) {
148                 error_report("KVM TDX seems broken that number of CPUID entries"
149                              " in kvm_tdx_capabilities exceeds limit: %d",
150                              KVM_MAX_CPUID_ENTRIES);
151                 error_propagate(errp, local_err);
152                 return r;
153             }
154             error_free(local_err);
155         } else if (r < 0) {
156             g_free(caps);
157             error_propagate(errp, local_err);
158             return r;
159         }
160     } while (r == -E2BIG);
161 
162     tdx_caps = caps;
163 
164     return 0;
165 }
166 
tdx_set_tdvf_region(MemoryRegion * tdvf_mr)167 void tdx_set_tdvf_region(MemoryRegion *tdvf_mr)
168 {
169     assert(!tdx_guest->tdvf_mr);
170     tdx_guest->tdvf_mr = tdvf_mr;
171 }
172 
tdx_get_hob_entry(TdxGuest * tdx)173 static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx)
174 {
175     TdxFirmwareEntry *entry;
176 
177     for_each_tdx_fw_entry(&tdx->tdvf, entry) {
178         if (entry->type == TDVF_SECTION_TYPE_TD_HOB) {
179             return entry;
180         }
181     }
182     error_report("TDVF metadata doesn't specify TD_HOB location.");
183     exit(1);
184 }
185 
tdx_add_ram_entry(uint64_t address,uint64_t length,enum TdxRamType type)186 static void tdx_add_ram_entry(uint64_t address, uint64_t length,
187                               enum TdxRamType type)
188 {
189     uint32_t nr_entries = tdx_guest->nr_ram_entries;
190     tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries,
191                                      nr_entries + 1);
192 
193     tdx_guest->ram_entries[nr_entries].address = address;
194     tdx_guest->ram_entries[nr_entries].length = length;
195     tdx_guest->ram_entries[nr_entries].type = type;
196     tdx_guest->nr_ram_entries++;
197 }
198 
tdx_accept_ram_range(uint64_t address,uint64_t length)199 static int tdx_accept_ram_range(uint64_t address, uint64_t length)
200 {
201     uint64_t head_start, tail_start, head_length, tail_length;
202     uint64_t tmp_address, tmp_length;
203     TdxRamEntry *e;
204     int i = 0;
205 
206     do {
207         if (i == tdx_guest->nr_ram_entries) {
208             return -1;
209         }
210 
211         e = &tdx_guest->ram_entries[i++];
212     } while (address + length <= e->address || address >= e->address + e->length);
213 
214     /*
215      * The to-be-accepted ram range must be fully contained by one
216      * RAM entry.
217      */
218     if (e->address > address ||
219         e->address + e->length < address + length) {
220         return -1;
221     }
222 
223     if (e->type == TDX_RAM_ADDED) {
224         return 0;
225     }
226 
227     tmp_address = e->address;
228     tmp_length = e->length;
229 
230     e->address = address;
231     e->length = length;
232     e->type = TDX_RAM_ADDED;
233 
234     head_length = address - tmp_address;
235     if (head_length > 0) {
236         head_start = tmp_address;
237         tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED);
238     }
239 
240     tail_start = address + length;
241     if (tail_start < tmp_address + tmp_length) {
242         tail_length = tmp_address + tmp_length - tail_start;
243         tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED);
244     }
245 
246     return 0;
247 }
248 
tdx_ram_entry_compare(const void * lhs_,const void * rhs_)249 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_)
250 {
251     const TdxRamEntry *lhs = lhs_;
252     const TdxRamEntry *rhs = rhs_;
253 
254     if (lhs->address == rhs->address) {
255         return 0;
256     }
257     if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) {
258         return 1;
259     }
260     return -1;
261 }
262 
tdx_init_ram_entries(void)263 static void tdx_init_ram_entries(void)
264 {
265     unsigned i, j, nr_e820_entries;
266 
267     nr_e820_entries = e820_get_table(NULL);
268     tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries);
269 
270     for (i = 0, j = 0; i < nr_e820_entries; i++) {
271         uint64_t addr, len;
272 
273         if (e820_get_entry(i, E820_RAM, &addr, &len)) {
274             tdx_guest->ram_entries[j].address = addr;
275             tdx_guest->ram_entries[j].length = len;
276             tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED;
277             j++;
278         }
279     }
280     tdx_guest->nr_ram_entries = j;
281 }
282 
tdx_post_init_vcpus(void)283 static void tdx_post_init_vcpus(void)
284 {
285     TdxFirmwareEntry *hob;
286     CPUState *cpu;
287 
288     hob = tdx_get_hob_entry(tdx_guest);
289     CPU_FOREACH(cpu) {
290         tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)(uintptr_t)hob->address,
291                        &error_fatal);
292     }
293 }
294 
tdx_finalize_vm(Notifier * notifier,void * unused)295 static void tdx_finalize_vm(Notifier *notifier, void *unused)
296 {
297     TdxFirmware *tdvf = &tdx_guest->tdvf;
298     TdxFirmwareEntry *entry;
299     RAMBlock *ram_block;
300     Error *local_err = NULL;
301     int r;
302 
303     tdx_init_ram_entries();
304 
305     for_each_tdx_fw_entry(tdvf, entry) {
306         switch (entry->type) {
307         case TDVF_SECTION_TYPE_BFV:
308         case TDVF_SECTION_TYPE_CFV:
309             entry->mem_ptr = tdvf->mem_ptr + entry->data_offset;
310             break;
311         case TDVF_SECTION_TYPE_TD_HOB:
312         case TDVF_SECTION_TYPE_TEMP_MEM:
313             entry->mem_ptr = qemu_ram_mmap(-1, entry->size,
314                                            qemu_real_host_page_size(), 0, 0);
315             if (entry->mem_ptr == MAP_FAILED) {
316                 error_report("Failed to mmap memory for TDVF section %d",
317                              entry->type);
318                 exit(1);
319             }
320             if (tdx_accept_ram_range(entry->address, entry->size)) {
321                 error_report("Failed to accept memory for TDVF section %d",
322                              entry->type);
323                 qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
324                 exit(1);
325             }
326             break;
327         default:
328             error_report("Unsupported TDVF section %d", entry->type);
329             exit(1);
330         }
331     }
332 
333     qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries,
334           sizeof(TdxRamEntry), &tdx_ram_entry_compare);
335 
336     tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest));
337 
338     tdx_post_init_vcpus();
339 
340     for_each_tdx_fw_entry(tdvf, entry) {
341         struct kvm_tdx_init_mem_region region;
342         uint32_t flags;
343 
344         region = (struct kvm_tdx_init_mem_region) {
345             .source_addr = (uintptr_t)entry->mem_ptr,
346             .gpa = entry->address,
347             .nr_pages = entry->size >> 12,
348         };
349 
350         flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ?
351                 KVM_TDX_MEASURE_MEMORY_REGION : 0;
352 
353         do {
354             error_free(local_err);
355             local_err = NULL;
356             r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags,
357                                &region, &local_err);
358         } while (r == -EAGAIN || r == -EINTR);
359         if (r < 0) {
360             error_report_err(local_err);
361             exit(1);
362         }
363 
364         if (entry->type == TDVF_SECTION_TYPE_TD_HOB ||
365             entry->type == TDVF_SECTION_TYPE_TEMP_MEM) {
366             qemu_ram_munmap(-1, entry->mem_ptr, entry->size);
367             entry->mem_ptr = NULL;
368         }
369     }
370 
371     /*
372      * TDVF image has been copied into private region above via
373      * KVM_MEMORY_MAPPING. It becomes useless.
374      */
375     ram_block = tdx_guest->tdvf_mr->ram_block;
376     ram_block_discard_range(ram_block, 0, ram_block->max_length);
377 
378     tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal);
379     CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true;
380 }
381 
/* Runs tdx_finalize_vm() once machine initialization is complete. */
static Notifier tdx_machine_done_notify = {
    .notify = tdx_finalize_vm,
};
385 
/*
 * Some CPUID bits change from fixed1 to configurable bits when TDX module
 * supports TDX_FEATURES0.VE_REDUCTION. e.g., MCA/MCE/MTRR/CORE_CAPABILITY.
 *
 * To make QEMU work with all the versions of TDX module, keep the fixed1 bits
 * here if they are ever fixed1 bits in any of the version though not fixed1 in
 * the latest version. Otherwise, with the older version of TDX module, QEMU may
 * treat the fixed1 bit as unsupported.
 *
 * For newer TDX module, it does no harm to keep them in tdx_fixed1_bits even
 * though they changed to configurable bits. Because tdx_fixed1_bits is used to
 * setup the supported bits.
 */
KvmCpuidInfo tdx_fixed1_bits = {
    .cpuid.nent = 8,
    /* CPUID.1: base feature flags. */
    .entries[0] = {
        .function = 0x1,
        .index = 0,
        .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 |
               CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 |
               CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
               CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
               CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE |
               CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR,
        .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
               CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
               CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
               CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR |
               CPUID_SSE | CPUID_SSE2,
    },
    /* CPUID.6: thermal/power management. */
    .entries[1] = {
        .function = 0x6,
        .index = 0,
        .eax = CPUID_6_EAX_ARAT,
    },
    /* CPUID.7.0: structured extended features. */
    .entries[2] = {
        .function = 0x7,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY |
               CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID |
               CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED |
               CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT |
               CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI,
        .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI |
               CPUID_7_0_ECX_MOVDIR64B,
        .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL |
               CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D |
               CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY |
               CPUID_7_0_EDX_SPEC_CTRL_SSBD,
    },
    /* CPUID.7.2: speculation controls. */
    .entries[3] = {
        .function = 0x7,
        .index = 2,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL |
               CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL,
    },
    /* CPUID.D.0: XSAVE state components (x87/SSE always present). */
    .entries[4] = {
        .function = 0xD,
        .index = 0,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK,
    },
    /* CPUID.D.1: XSAVE extended features. */
    .entries[5] = {
        .function = 0xD,
        .index = 1,
        .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX,
        .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC|
               CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES,
    },
    /* CPUID.80000001: extended feature flags. */
    .entries[6] = {
        .function = 0x80000001,
        .index = 0,
        .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
        /*
         * Strictly speaking, SYSCALL is not fixed1 bit since it depends on
         * the CPU to be in 64-bit mode. But here fixed1 is used to serve the
         * purpose of supported bits for TDX. In this sense, SYSCALL is always
         * supported.
         */
        .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
               CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
    },
    /* CPUID.80000007: invariant TSC. */
    .entries[7] = {
        .function = 0x80000007,
        .index = 0,
        .edx = CPUID_APM_INVTSC,
    },
};
476 
/*
 * Maps a TD attribute bit to the CPUID feature bit that becomes
 * available to the guest when the attribute is supported.
 */
typedef struct TdxAttrsMap {
    uint32_t attr_index;    /* bit position in the TD attributes field */
    uint32_t cpuid_leaf;
    uint32_t cpuid_subleaf;
    int cpuid_reg;
    uint32_t feat_mask;
} TdxAttrsMap;

static TdxAttrsMap tdx_attrs_maps[] = {
    /* attribute bit 27 -> CPUID.7.1:EAX LASS */
    {.attr_index = 27,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 1,
     .cpuid_reg = R_EAX,
     .feat_mask = CPUID_7_1_EAX_LASS,},

    /* attribute bit 30 -> CPUID.7.0:ECX PKS */
    {.attr_index = 30,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_PKS,},

    /* attribute bit 31 -> CPUID.7.0:ECX KeyLocker */
    {.attr_index = 31,
     .cpuid_leaf = 7,
     .cpuid_subleaf = 0,
     .cpuid_reg = R_ECX,
     .feat_mask = CPUID_7_0_ECX_KeyLocker,},
};
504 
/* Maps an XFAM bit to a CPUID feature that depends on that XSAVE state. */
typedef struct TdxXFAMDep {
    int xfam_bit;
    FeatureMask feat_mask;
} TdxXFAMDep;

/*
 * Note, only the CPUID bits whose virtualization type are "XFAM & Native" are
 * defined here.
 *
 * For those whose virtualization type are "XFAM & Configured & Native", they
 * are reported as configurable bits. And they are not supported if not in the
 * configurable bits list from KVM even if the corresponding XFAM bit is
 * supported.
 */
TdxXFAMDep tdx_xfam_deps[] = {
    { XSTATE_YMM_BIT,       { FEAT_1_ECX, CPUID_EXT_FMA }},
    { XSTATE_YMM_BIT,       { FEAT_7_0_EBX, CPUID_7_0_EBX_AVX2 }},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_ECX, CPUID_7_0_ECX_AVX512_VBMI}},
    { XSTATE_OPMASK_BIT,    { FEAT_7_0_EDX, CPUID_7_0_EDX_AVX512_FP16}},
    { XSTATE_PT_BIT,        { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT}},
    { XSTATE_PKRU_BIT,      { FEAT_7_0_ECX, CPUID_7_0_ECX_PKU}},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_BF16 }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE }},
    { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_INT8 }},
};
530 
find_in_supported_entry(uint32_t function,uint32_t index)531 static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function,
532                                                         uint32_t index)
533 {
534     struct kvm_cpuid_entry2 *e;
535 
536     e = cpuid_find_entry(tdx_supported_cpuid, function, index);
537     if (!e) {
538         if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) {
539             error_report("tdx_supported_cpuid requries more space than %d entries",
540                           KVM_MAX_CPUID_ENTRIES);
541             exit(1);
542         }
543         e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++];
544         e->function = function;
545         e->index = index;
546     }
547 
548     return e;
549 }
550 
tdx_add_supported_cpuid_by_fixed1_bits(void)551 static void tdx_add_supported_cpuid_by_fixed1_bits(void)
552 {
553     struct kvm_cpuid_entry2 *e, *e1;
554     int i;
555 
556     for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) {
557         e = &tdx_fixed1_bits.entries[i];
558 
559         e1 = find_in_supported_entry(e->function, e->index);
560         e1->eax |= e->eax;
561         e1->ebx |= e->ebx;
562         e1->ecx |= e->ecx;
563         e1->edx |= e->edx;
564     }
565 }
566 
tdx_add_supported_cpuid_by_attrs(void)567 static void tdx_add_supported_cpuid_by_attrs(void)
568 {
569     struct kvm_cpuid_entry2 *e;
570     TdxAttrsMap *map;
571     int i;
572 
573     for (i = 0; i < ARRAY_SIZE(tdx_attrs_maps); i++) {
574         map = &tdx_attrs_maps[i];
575         if (!((1ULL << map->attr_index) & tdx_caps->supported_attrs)) {
576             continue;
577         }
578 
579         e = find_in_supported_entry(map->cpuid_leaf, map->cpuid_subleaf);
580 
581         switch(map->cpuid_reg) {
582         case R_EAX:
583             e->eax |= map->feat_mask;
584             break;
585         case R_EBX:
586             e->ebx |= map->feat_mask;
587             break;
588         case R_ECX:
589             e->ecx |= map->feat_mask;
590             break;
591         case R_EDX:
592             e->edx |= map->feat_mask;
593             break;
594         }
595     }
596 }
597 
tdx_add_supported_cpuid_by_xfam(void)598 static void tdx_add_supported_cpuid_by_xfam(void)
599 {
600     struct kvm_cpuid_entry2 *e;
601     int i;
602 
603     const TdxXFAMDep *xfam_dep;
604     const FeatureWordInfo *f;
605     for (i = 0; i < ARRAY_SIZE(tdx_xfam_deps); i++) {
606         xfam_dep = &tdx_xfam_deps[i];
607         if (!((1ULL << xfam_dep->xfam_bit) & tdx_caps->supported_xfam)) {
608             continue;
609         }
610 
611         f = &feature_word_info[xfam_dep->feat_mask.index];
612         if (f->type != CPUID_FEATURE_WORD) {
613             continue;
614         }
615 
616         e = find_in_supported_entry(f->cpuid.eax, f->cpuid.ecx);
617         switch(f->cpuid.reg) {
618         case R_EAX:
619             e->eax |= xfam_dep->feat_mask.mask;
620             break;
621         case R_EBX:
622             e->ebx |= xfam_dep->feat_mask.mask;
623             break;
624         case R_ECX:
625             e->ecx |= xfam_dep->feat_mask.mask;
626             break;
627         case R_EDX:
628             e->edx |= xfam_dep->feat_mask.mask;
629             break;
630         }
631     }
632 
633     e = find_in_supported_entry(0xd, 0);
634     e->eax |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK);
635     e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK) >> 32;
636 
637     e = find_in_supported_entry(0xd, 1);
638     /*
639      * Mark XFD always support for TDX, it will be cleared finally in
640      * tdx_adjust_cpuid_features() if XFD is unavailable on the hardware
641      * because in this case the original data has it as 0.
642      */
643     e->eax |= CPUID_XSAVE_XFD;
644     e->ecx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK);
645     e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK) >> 32;
646 }
647 
tdx_add_supported_kvm_features(void)648 static void tdx_add_supported_kvm_features(void)
649 {
650     struct kvm_cpuid_entry2 *e;
651 
652     e = find_in_supported_entry(0x40000001, 0);
653     e->eax = TDX_SUPPORTED_KVM_FEATURES;
654 }
655 
tdx_setup_supported_cpuid(void)656 static void tdx_setup_supported_cpuid(void)
657 {
658     if (tdx_supported_cpuid) {
659         return;
660     }
661 
662     tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) +
663                     KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));
664 
665     memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries,
666            tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2));
667     tdx_supported_cpuid->nent = tdx_caps->cpuid.nent;
668 
669     tdx_add_supported_cpuid_by_fixed1_bits();
670     tdx_add_supported_cpuid_by_attrs();
671     tdx_add_supported_cpuid_by_xfam();
672 
673     tdx_add_supported_kvm_features();
674 }
675 
tdx_kvm_init(ConfidentialGuestSupport * cgs,Error ** errp)676 static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
677 {
678     MachineState *ms = MACHINE(qdev_get_machine());
679     X86MachineState *x86ms = X86_MACHINE(ms);
680     TdxGuest *tdx = TDX_GUEST(cgs);
681     int r = 0;
682 
683     kvm_mark_guest_state_protected();
684 
685     if (x86ms->smm == ON_OFF_AUTO_AUTO) {
686         x86ms->smm = ON_OFF_AUTO_OFF;
687     } else if (x86ms->smm == ON_OFF_AUTO_ON) {
688         error_setg(errp, "TDX VM doesn't support SMM");
689         return -EINVAL;
690     }
691 
692     if (x86ms->pic == ON_OFF_AUTO_AUTO) {
693         x86ms->pic = ON_OFF_AUTO_OFF;
694     } else if (x86ms->pic == ON_OFF_AUTO_ON) {
695         error_setg(errp, "TDX VM doesn't support PIC");
696         return -EINVAL;
697     }
698 
699     if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
700         kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON;
701     } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) {
702         error_setg(errp, "TDX VM requires kernel_irqchip to be split");
703         return -EINVAL;
704     }
705 
706     if (!tdx_caps) {
707         r = get_tdx_capabilities(errp);
708         if (r) {
709             return r;
710         }
711     }
712 
713     tdx_setup_supported_cpuid();
714 
715     /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */
716     if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) {
717         return -EOPNOTSUPP;
718     }
719 
720     /*
721      * Set kvm_readonly_mem_allowed to false, because TDX only supports readonly
722      * memory for shared memory but not for private memory. Besides, whether a
723      * memslot is private or shared is not determined by QEMU.
724      *
725      * Thus, just mark readonly memory not supported for simplicity.
726      */
727     kvm_readonly_mem_allowed = false;
728 
729     qemu_add_machine_init_done_notifier(&tdx_machine_done_notify);
730 
731     tdx_guest = tdx;
732     return 0;
733 }
734 
tdx_kvm_type(X86ConfidentialGuest * cg)735 static int tdx_kvm_type(X86ConfidentialGuest *cg)
736 {
737     /* Do the object check */
738     TDX_GUEST(cg);
739 
740     return KVM_X86_TDX_VM;
741 }
742 
tdx_cpu_instance_init(X86ConfidentialGuest * cg,CPUState * cpu)743 static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu)
744 {
745     X86CPUClass *xcc = X86_CPU_GET_CLASS(cpu);
746     X86CPU *x86cpu = X86_CPU(cpu);
747 
748     if (xcc->model) {
749         error_report("Named cpu model is not supported for TDX yet!");
750         exit(1);
751     }
752 
753     object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort);
754 
755     /* invtsc is fixed1 for TD guest */
756     object_property_set_bool(OBJECT(cpu), "invtsc", true, &error_abort);
757 
758     x86cpu->force_cpuid_0x1f = true;
759 }
760 
tdx_adjust_cpuid_features(X86ConfidentialGuest * cg,uint32_t feature,uint32_t index,int reg,uint32_t value)761 static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg,
762                                           uint32_t feature, uint32_t index,
763                                           int reg, uint32_t value)
764 {
765     struct kvm_cpuid_entry2 *e;
766 
767     e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index);
768     if (e) {
769         value |= cpuid_entry_get_reg(e, reg);
770     }
771 
772     if (is_feature_word_cpuid(feature, index, reg)) {
773         e = cpuid_find_entry(tdx_supported_cpuid, feature, index);
774         if (e) {
775             value &= cpuid_entry_get_reg(e, reg);
776         }
777     }
778 
779     return value;
780 }
781 
tdx_fetch_cpuid(CPUState * cpu,int * ret)782 static struct kvm_cpuid2 *tdx_fetch_cpuid(CPUState *cpu, int *ret)
783 {
784     struct kvm_cpuid2 *fetch_cpuid;
785     int size = KVM_MAX_CPUID_ENTRIES;
786     Error *local_err = NULL;
787     int r;
788 
789     do {
790         error_free(local_err);
791         local_err = NULL;
792 
793         fetch_cpuid = g_malloc0(sizeof(*fetch_cpuid) +
794                                 sizeof(struct kvm_cpuid_entry2) * size);
795         fetch_cpuid->nent = size;
796         r = tdx_vcpu_ioctl(cpu, KVM_TDX_GET_CPUID, 0, fetch_cpuid, &local_err);
797         if (r == -E2BIG) {
798             g_free(fetch_cpuid);
799             size = fetch_cpuid->nent;
800         }
801     } while (r == -E2BIG);
802 
803     if (r < 0) {
804         error_report_err(local_err);
805         *ret = r;
806         return NULL;
807     }
808 
809     return fetch_cpuid;
810 }
811 
tdx_check_features(X86ConfidentialGuest * cg,CPUState * cs)812 static int tdx_check_features(X86ConfidentialGuest *cg, CPUState *cs)
813 {
814     uint64_t actual, requested, unavailable, forced_on;
815     g_autofree struct kvm_cpuid2 *fetch_cpuid;
816     const char *forced_on_prefix = NULL;
817     const char *unav_prefix = NULL;
818     struct kvm_cpuid_entry2 *entry;
819     X86CPU *cpu = X86_CPU(cs);
820     CPUX86State *env = &cpu->env;
821     FeatureWordInfo *wi;
822     FeatureWord w;
823     bool mismatch = false;
824     int r;
825 
826     fetch_cpuid = tdx_fetch_cpuid(cs, &r);
827     if (!fetch_cpuid) {
828         return r;
829     }
830 
831     if (cpu->check_cpuid || cpu->enforce_cpuid) {
832         unav_prefix = "TDX doesn't support requested feature";
833         forced_on_prefix = "TDX forcibly sets the feature";
834     }
835 
836     for (w = 0; w < FEATURE_WORDS; w++) {
837         wi = &feature_word_info[w];
838         actual = 0;
839 
840         switch (wi->type) {
841         case CPUID_FEATURE_WORD:
842             entry = cpuid_find_entry(fetch_cpuid, wi->cpuid.eax, wi->cpuid.ecx);
843             if (!entry) {
844                 /*
845                  * If KVM doesn't report it means it's totally configurable
846                  * by QEMU
847                  */
848                 continue;
849             }
850 
851             actual = cpuid_entry_get_reg(entry, wi->cpuid.reg);
852             break;
853         case MSR_FEATURE_WORD:
854             /*
855              * TODO:
856              * validate MSR features when KVM has interface report them.
857              */
858             continue;
859         }
860 
861         /* Fixup for special cases */
862         switch (w) {
863         case FEAT_8000_0001_EDX:
864             /*
865              * Intel enumerates SYSCALL bit as 1 only when processor in 64-bit
866              * mode and before vcpu running it's not in 64-bit mode.
867              */
868             actual |= CPUID_EXT2_SYSCALL;
869             break;
870         default:
871             break;
872         }
873 
874         requested = env->features[w];
875         unavailable = requested & ~actual;
876         mark_unavailable_features(cpu, w, unavailable, unav_prefix);
877         if (unavailable) {
878             mismatch = true;
879         }
880 
881         forced_on = actual & ~requested;
882         mark_forced_on_features(cpu, w, forced_on, forced_on_prefix);
883         if (forced_on) {
884             mismatch = true;
885         }
886     }
887 
888     if (cpu->enforce_cpuid && mismatch) {
889         return -EINVAL;
890     }
891 
892     if (cpu->phys_bits != host_cpu_phys_bits()) {
893         error_report("TDX requires guest CPU physical bits (%u) "
894                      "to match host CPU physical bits (%u)",
895                      cpu->phys_bits, host_cpu_phys_bits());
896         return -EINVAL;
897     }
898 
899     return 0;
900 }
901 
tdx_validate_attributes(TdxGuest * tdx,Error ** errp)902 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp)
903 {
904     if ((tdx->attributes & ~tdx_caps->supported_attrs)) {
905         error_setg(errp, "Invalid attributes 0x%"PRIx64" for TDX VM "
906                    "(KVM supported: 0x%"PRIx64")", tdx->attributes,
907                    (uint64_t)tdx_caps->supported_attrs);
908         return -1;
909     }
910 
911     if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) {
912         error_setg(errp, "Some QEMU unsupported TD attribute bits being "
913                     "requested: 0x%"PRIx64" (QEMU supported: 0x%"PRIx64")",
914                     tdx->attributes, (uint64_t)TDX_SUPPORTED_TD_ATTRS);
915         return -1;
916     }
917 
918     return 0;
919 }
920 
setup_td_guest_attributes(X86CPU * x86cpu,Error ** errp)921 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp)
922 {
923     CPUX86State *env = &x86cpu->env;
924 
925     tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ?
926                              TDX_TD_ATTRIBUTES_PKS : 0;
927     tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0;
928 
929     return tdx_validate_attributes(tdx_guest, errp);
930 }
931 
setup_td_xfam(X86CPU * x86cpu,Error ** errp)932 static int setup_td_xfam(X86CPU *x86cpu, Error **errp)
933 {
934     CPUX86State *env = &x86cpu->env;
935     uint64_t xfam;
936 
937     xfam = env->features[FEAT_XSAVE_XCR0_LO] |
938            env->features[FEAT_XSAVE_XCR0_HI] |
939            env->features[FEAT_XSAVE_XSS_LO] |
940            env->features[FEAT_XSAVE_XSS_HI];
941 
942     if (xfam & ~tdx_caps->supported_xfam) {
943         error_setg(errp, "Invalid XFAM 0x%"PRIx64" for TDX VM (supported: 0x%"PRIx64"))",
944                    xfam, (uint64_t)tdx_caps->supported_xfam);
945         return -1;
946     }
947 
948     tdx_guest->xfam = xfam;
949     return 0;
950 }
951 
tdx_filter_cpuid(struct kvm_cpuid2 * cpuids)952 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids)
953 {
954     int i, dest_cnt = 0;
955     struct kvm_cpuid_entry2 *src, *dest, *conf;
956 
957     for (i = 0; i < cpuids->nent; i++) {
958         src = cpuids->entries + i;
959         conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index);
960         if (!conf) {
961             continue;
962         }
963         dest = cpuids->entries + dest_cnt;
964 
965         dest->function = src->function;
966         dest->index = src->index;
967         dest->flags = src->flags;
968         dest->eax = src->eax & conf->eax;
969         dest->ebx = src->ebx & conf->ebx;
970         dest->ecx = src->ecx & conf->ecx;
971         dest->edx = src->edx & conf->edx;
972 
973         dest_cnt++;
974     }
975     cpuids->nent = dest_cnt++;
976 }
977 
tdx_pre_create_vcpu(CPUState * cpu,Error ** errp)978 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp)
979 {
980     X86CPU *x86cpu = X86_CPU(cpu);
981     CPUX86State *env = &x86cpu->env;
982     g_autofree struct kvm_tdx_init_vm *init_vm = NULL;
983     Error *local_err = NULL;
984     size_t data_len;
985     int retry = 10000;
986     int r = 0;
987 
988     QEMU_LOCK_GUARD(&tdx_guest->lock);
989     if (tdx_guest->initialized) {
990         return r;
991     }
992 
993     init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) +
994                         sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
995 
996     if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) {
997         error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS");
998         return -EOPNOTSUPP;
999     }
1000 
1001     r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS,
1002                           0, TDX_APIC_BUS_CYCLES_NS);
1003     if (r < 0) {
1004         error_setg_errno(errp, -r,
1005                          "Unable to set core crystal clock frequency to 25MHz");
1006         return r;
1007     }
1008 
1009     if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ ||
1010                          env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) {
1011         error_setg(errp, "Invalid TSC %"PRId64" KHz, must specify cpu_frequency "
1012                          "between [%d, %d] kHz", env->tsc_khz,
1013                          TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ);
1014        return -EINVAL;
1015     }
1016 
1017     if (env->tsc_khz % (25 * 1000)) {
1018         error_setg(errp, "Invalid TSC %"PRId64" KHz, it must be multiple of 25MHz",
1019                    env->tsc_khz);
1020         return -EINVAL;
1021     }
1022 
1023     /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */
1024     r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz);
1025     if (r < 0) {
1026         error_setg_errno(errp, -r, "Unable to set TSC frequency to %"PRId64" kHz",
1027                          env->tsc_khz);
1028         return r;
1029     }
1030 
1031     if (tdx_guest->mrconfigid) {
1032         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid,
1033                               strlen(tdx_guest->mrconfigid), &data_len, errp);
1034         if (!data) {
1035             return -1;
1036         }
1037         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
1038             error_setg(errp, "TDX 'mrconfigid' sha384 digest was %ld bytes, "
1039                              "expected %d bytes", data_len,
1040                              QCRYPTO_HASH_DIGEST_LEN_SHA384);
1041             return -1;
1042         }
1043         memcpy(init_vm->mrconfigid, data, data_len);
1044     }
1045 
1046     if (tdx_guest->mrowner) {
1047         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner,
1048                               strlen(tdx_guest->mrowner), &data_len, errp);
1049         if (!data) {
1050             return -1;
1051         }
1052         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
1053             error_setg(errp, "TDX 'mrowner' sha384 digest was %ld bytes, "
1054                              "expected %d bytes", data_len,
1055                              QCRYPTO_HASH_DIGEST_LEN_SHA384);
1056             return -1;
1057         }
1058         memcpy(init_vm->mrowner, data, data_len);
1059     }
1060 
1061     if (tdx_guest->mrownerconfig) {
1062         g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig,
1063                             strlen(tdx_guest->mrownerconfig), &data_len, errp);
1064         if (!data) {
1065             return -1;
1066         }
1067         if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) {
1068             error_setg(errp, "TDX 'mrownerconfig' sha384 digest was %ld bytes, "
1069                              "expected %d bytes", data_len,
1070                              QCRYPTO_HASH_DIGEST_LEN_SHA384);
1071             return -1;
1072         }
1073         memcpy(init_vm->mrownerconfig, data, data_len);
1074     }
1075 
1076     r = setup_td_guest_attributes(x86cpu, errp);
1077     if (r) {
1078         return r;
1079     }
1080 
1081     r = setup_td_xfam(x86cpu, errp);
1082     if (r) {
1083         return r;
1084     }
1085 
1086     init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0);
1087     tdx_filter_cpuid(&init_vm->cpuid);
1088 
1089     init_vm->attributes = tdx_guest->attributes;
1090     init_vm->xfam = tdx_guest->xfam;
1091 
1092     /*
1093      * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE)
1094      * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or
1095      * RDSEED) is busy.
1096      *
1097      * Retry for the case.
1098      */
1099     do {
1100         error_free(local_err);
1101         local_err = NULL;
1102         r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err);
1103     } while (r == -EAGAIN && --retry);
1104 
1105     if (r < 0) {
1106         if (!retry) {
1107             error_append_hint(&local_err, "Hardware RNG (Random Number "
1108             "Generator) is busy occupied by someone (via RDRAND/RDSEED) "
1109             "maliciously, which leads to KVM_TDX_INIT_VM keeping failure "
1110             "due to lack of entropy.\n");
1111         }
1112         error_propagate(errp, local_err);
1113         return r;
1114     }
1115 
1116     tdx_guest->initialized = true;
1117 
1118     return 0;
1119 }
1120 
tdx_parse_tdvf(void * flash_ptr,int size)1121 int tdx_parse_tdvf(void *flash_ptr, int size)
1122 {
1123     return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size);
1124 }
1125 
/*
 * Completion callback for an asynchronous GetQuote request: copy the quote
 * back into guest memory, publish the updated GetQuote header, and release
 * the per-request resources taken in tdx_handle_get_quote().
 */
static void tdx_get_quote_completion(TdxGenerateQuoteTask *task)
{
    TdxGuest *tdx = task->opaque;
    MemTxResult res;

    /* Maintain the number of in-flight requests. */
    qemu_mutex_lock(&tdx->lock);
    tdx->num--;
    qemu_mutex_unlock(&tdx->lock);

    if (task->status_code == TDX_VP_GET_QUOTE_SUCCESS) {
        res = address_space_write(&address_space_memory, task->payload_gpa,
                                  MEMTXATTRS_UNSPECIFIED, task->receive_buf,
                                  task->receive_buf_received);
        if (res == MEMTX_OK) {
            task->hdr.out_len = cpu_to_le64(task->receive_buf_received);
        } else {
            error_report("TDX: get-quote: failed to write quote data.");
        }
    }
    task->hdr.error_code = cpu_to_le64(task->status_code);

    /* Publish the response contents before marking this request completed. */
    smp_wmb();
    res = address_space_write(&address_space_memory, task->buf_gpa,
                              MEMTXATTRS_UNSPECIFIED, &task->hdr,
                              TDX_GET_QUOTE_HDR_SIZE);
    if (res != MEMTX_OK) {
        error_report("TDX: get-quote: failed to update GetQuote header.");
    }

    g_free(task->send_data);
    g_free(task->receive_buf);
    g_free(task);
    object_unref(tdx);
}
1162 
/*
 * Handle the TDG.VP.VMCALL<GetQuote> hypercall.
 *
 * The shared guest buffer at buf_gpa starts with a tdx_get_quote_header
 * followed by the report payload.  After validating alignment, version and
 * sizes, the request is handed to the Quote Generation Service
 * asynchronously; the result is written back by tdx_get_quote_completion().
 * The TDVMCALL status for the guest is left in run->tdx.get_quote.ret.
 */
void tdx_handle_get_quote(X86CPU *cpu, struct kvm_run *run)
{
    TdxGenerateQuoteTask *task;
    struct tdx_get_quote_header hdr;
    hwaddr buf_gpa = run->tdx.get_quote.gpa;
    uint64_t buf_len = run->tdx.get_quote.size;

    QEMU_BUILD_BUG_ON(sizeof(struct tdx_get_quote_header) != TDX_GET_QUOTE_HDR_SIZE);

    run->tdx.get_quote.ret = TDG_VP_VMCALL_INVALID_OPERAND;

    if (buf_len == 0) {
        return;
    }

    if (!QEMU_IS_ALIGNED(buf_gpa, 4096) || !QEMU_IS_ALIGNED(buf_len, 4096)) {
        run->tdx.get_quote.ret = TDG_VP_VMCALL_ALIGN_ERROR;
        return;
    }

    if (address_space_read(&address_space_memory, buf_gpa, MEMTXATTRS_UNSPECIFIED,
                           &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        error_report("TDX: get-quote: failed to read GetQuote header.");
        return;
    }

    if (le64_to_cpu(hdr.structure_version) != TDX_GET_QUOTE_STRUCTURE_VERSION) {
        return;
    }

    /* Only safe-guard check to avoid too large buffer size. */
    if (buf_len > TDX_GET_QUOTE_MAX_BUF_LEN ||
        le32_to_cpu(hdr.in_len) > buf_len - TDX_GET_QUOTE_HDR_SIZE) {
        return;
    }

    /* No QGS configured: report QGS_UNAVAILABLE directly in the header. */
    if (!tdx_guest->qg_sock_addr) {
        hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_QGS_UNAVAILABLE);
        if (address_space_write(&address_space_memory, buf_gpa,
                                MEMTXATTRS_UNSPECIFIED,
                                &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
            error_report("TDX: failed to update GetQuote header.");
            return;
        }
        run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
        return;
    }

    /* Cap the number of concurrent requests; counted under tdx_guest->lock. */
    qemu_mutex_lock(&tdx_guest->lock);
    if (tdx_guest->num >= TDX_MAX_GET_QUOTE_REQUEST) {
        qemu_mutex_unlock(&tdx_guest->lock);
        run->tdx.get_quote.ret = TDG_VP_VMCALL_RETRY;
        return;
    }
    tdx_guest->num++;
    qemu_mutex_unlock(&tdx_guest->lock);

    task = g_new(TdxGenerateQuoteTask, 1);
    task->buf_gpa = buf_gpa;
    task->payload_gpa = buf_gpa + TDX_GET_QUOTE_HDR_SIZE;
    task->payload_len = buf_len - TDX_GET_QUOTE_HDR_SIZE;
    task->hdr = hdr;
    task->completion = tdx_get_quote_completion;

    task->send_data_size = le32_to_cpu(hdr.in_len);
    task->send_data = g_malloc(task->send_data_size);
    task->send_data_sent = 0;

    if (address_space_read(&address_space_memory, task->payload_gpa,
                           MEMTXATTRS_UNSPECIFIED, task->send_data,
                           task->send_data_size) != MEMTX_OK) {
        goto out_free;
    }

    /* Mark the buffer in-flight. */
    hdr.error_code = cpu_to_le64(TDX_VP_GET_QUOTE_IN_FLIGHT);
    if (address_space_write(&address_space_memory, buf_gpa,
                            MEMTXATTRS_UNSPECIFIED,
                            &hdr, TDX_GET_QUOTE_HDR_SIZE) != MEMTX_OK) {
        goto out_free;
    }

    task->receive_buf = g_malloc0(task->payload_len);
    task->receive_buf_received = 0;
    task->opaque = tdx_guest;

    object_ref(tdx_guest);
    tdx_generate_quote(task, tdx_guest->qg_sock_addr);
    run->tdx.get_quote.ret = TDG_VP_VMCALL_SUCCESS;
    return;

out_free:
    /*
     * Fix: roll back the in-flight counter incremented above.  The
     * completion callback, which normally decrements it, never runs on this
     * error path; without the rollback repeated failures would permanently
     * exhaust TDX_MAX_GET_QUOTE_REQUEST and wedge GetQuote in RETRY.
     */
    qemu_mutex_lock(&tdx_guest->lock);
    tdx_guest->num--;
    qemu_mutex_unlock(&tdx_guest->lock);
    g_free(task->send_data);
    g_free(task);
}
1258 
/*
 * Handle TDG.VP.VMCALL<GetTdVmCallInfo>: report which optional TDVMCALL
 * sub-functions this VMM implements.  Only leaf 1 is serviced; r11..r14
 * form the support bitmap, and QEMU advertises GetQuote only.
 */
void tdx_handle_get_tdvmcall_info(X86CPU *cpu, struct kvm_run *run)
{
    if (run->tdx.get_tdvmcall_info.leaf != 1) {
        /* Was tab-indented; QEMU coding style mandates 4-space indent. */
        return;
    }

    run->tdx.get_tdvmcall_info.r11 = TDG_VP_VMCALL_SUBFUNC_GET_QUOTE;
    run->tdx.get_tdvmcall_info.r12 = 0;
    run->tdx.get_tdvmcall_info.r13 = 0;
    run->tdx.get_tdvmcall_info.r14 = 0;
}
1270 
/*
 * Raise a guest-panicked event carrying TDX fatal-error details.
 * NOTE(review): @message is stored into the panic info rather than copied,
 * so ownership presumably passes to the panic-info consumer — confirm it is
 * freed there and not by the caller.
 */
static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code,
                                        char *message, uint64_t gpa)
{
    GuestPanicInformation *pi = g_new0(GuestPanicInformation, 1);

    pi->type = GUEST_PANIC_INFORMATION_TYPE_TDX;
    pi->u.tdx.gpa = gpa;
    pi->u.tdx.message = message;
    /* Only the low 32 bits of the error code are carried in the event. */
    pi->u.tdx.error_code = (uint32_t)error_code;

    qemu_system_guest_panicked(pi);
}
1284 
1285 /*
1286  * Only 8 registers can contain valid ASCII byte stream to form the fatal
1287  * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX
1288  */
1289 #define TDX_FATAL_MESSAGE_MAX        64
1290 
1291 #define TDX_REPORT_FATAL_ERROR_GPA_VALID    BIT_ULL(63)
1292 
/*
 * Handle TDG.VP.VMCALL<ReportFatalError>: decode the error code, the
 * optional GPA, and the optional ASCII message packed into guest registers,
 * then raise a guest-panicked event.  Always returns -1 so the caller
 * treats the vCPU exit as fatal.
 */
int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run)
{
    uint64_t error_code = run->system_event.data[R_R12];
    uint64_t reg_mask = run->system_event.data[R_ECX];
    char *message = NULL;
    uint64_t *tmp;
    uint64_t gpa = -1ull;

    /* Low 16 bits must be zero per the check below; reject otherwise. */
    if (error_code & 0xffff) {
        error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%"PRIx64,
                     error_code);
        return -1;
    }

    if (reg_mask) {
        /*
         * Up to 8 registers x 8 bytes of message, plus one byte so the
         * terminating NUL written after the last copied register fits.
         * g_malloc0 pre-zeroes, so skipped registers read as NULs.
         */
        message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1);
        tmp = (uint64_t *)message;

/* Append one register's 8 bytes to the message iff its reg_mask bit is set. */
#define COPY_REG(REG)                               \
    do {                                            \
        if (reg_mask & BIT_ULL(REG)) {              \
            *(tmp++) = run->system_event.data[REG]; \
        }                                           \
    } while (0)

        /* The register order below is fixed by the GHCI message layout
         * (see the comment above TDX_FATAL_MESSAGE_MAX); do not reorder. */
        COPY_REG(R_R14);
        COPY_REG(R_R15);
        COPY_REG(R_EBX);
        COPY_REG(R_EDI);
        COPY_REG(R_ESI);
        COPY_REG(R_R8);
        COPY_REG(R_R9);
        COPY_REG(R_EDX);
        *((char *)tmp) = '\0';
    }
#undef COPY_REG

    /* Bit 63 of the error code flags that R13 carries a valid GPA. */
    if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) {
        gpa = run->system_event.data[R_R13];
    }

    tdx_panicked_on_fatal_error(cpu, error_code, message, gpa);

    return -1;
}
1338 
tdx_guest_get_sept_ve_disable(Object * obj,Error ** errp)1339 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp)
1340 {
1341     TdxGuest *tdx = TDX_GUEST(obj);
1342 
1343     return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE);
1344 }
1345 
tdx_guest_set_sept_ve_disable(Object * obj,bool value,Error ** errp)1346 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp)
1347 {
1348     TdxGuest *tdx = TDX_GUEST(obj);
1349 
1350     if (value) {
1351         tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
1352     } else {
1353         tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;
1354     }
1355 }
1356 
tdx_guest_get_mrconfigid(Object * obj,Error ** errp)1357 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp)
1358 {
1359     TdxGuest *tdx = TDX_GUEST(obj);
1360 
1361     return g_strdup(tdx->mrconfigid);
1362 }
1363 
tdx_guest_set_mrconfigid(Object * obj,const char * value,Error ** errp)1364 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp)
1365 {
1366     TdxGuest *tdx = TDX_GUEST(obj);
1367 
1368     g_free(tdx->mrconfigid);
1369     tdx->mrconfigid = g_strdup(value);
1370 }
1371 
tdx_guest_get_mrowner(Object * obj,Error ** errp)1372 static char *tdx_guest_get_mrowner(Object *obj, Error **errp)
1373 {
1374     TdxGuest *tdx = TDX_GUEST(obj);
1375 
1376     return g_strdup(tdx->mrowner);
1377 }
1378 
tdx_guest_set_mrowner(Object * obj,const char * value,Error ** errp)1379 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp)
1380 {
1381     TdxGuest *tdx = TDX_GUEST(obj);
1382 
1383     g_free(tdx->mrowner);
1384     tdx->mrowner = g_strdup(value);
1385 }
1386 
tdx_guest_get_mrownerconfig(Object * obj,Error ** errp)1387 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp)
1388 {
1389     TdxGuest *tdx = TDX_GUEST(obj);
1390 
1391     return g_strdup(tdx->mrownerconfig);
1392 }
1393 
tdx_guest_set_mrownerconfig(Object * obj,const char * value,Error ** errp)1394 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp)
1395 {
1396     TdxGuest *tdx = TDX_GUEST(obj);
1397 
1398     g_free(tdx->mrownerconfig);
1399     tdx->mrownerconfig = g_strdup(value);
1400 }
1401 
/*
 * QOM getter for "quote-generation-socket": visit the stored SocketAddress,
 * or fail if none has been configured.
 */
static void tdx_guest_get_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);

    if (tdx->qg_sock_addr) {
        visit_type_SocketAddress(v, name, &tdx->qg_sock_addr, errp);
    } else {
        error_setg(errp, "quote-generation-socket is not set");
    }
}
1414 
/*
 * QOM setter for "quote-generation-socket": parse a SocketAddress from the
 * visitor and swap it in, releasing any previously configured address.
 */
static void tdx_guest_set_qgs(Object *obj, Visitor *v,
                              const char *name, void *opaque,
                              Error **errp)
{
    TdxGuest *tdx = TDX_GUEST(obj);
    SocketAddress *sock = NULL;

    if (!visit_type_SocketAddress(v, name, &sock, errp)) {
        return;
    }

    /* qapi_free_SocketAddress() tolerates NULL, so no guard is needed. */
    qapi_free_SocketAddress(tdx->qg_sock_addr);
    tdx->qg_sock_addr = sock;
}
1432 
/*
 * tdx-guest QOM type: a user-creatable X86ConfidentialGuest implementation.
 * By QOM naming convention this macro wires in the tdx_guest_init()/
 * tdx_guest_finalize()/tdx_guest_class_init() functions defined below.
 */
OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest,
                                   tdx_guest,
                                   TDX_GUEST,
                                   X86_CONFIDENTIAL_GUEST,
                                   { TYPE_USER_CREATABLE },
                                   { NULL })
1440 
/*
 * Instance init for the tdx-guest object: set defaults (SEPT #VE disabled
 * for the guest, guest_memfd required) and register the QOM properties used
 * to configure the TD.
 *
 * Fix: the function previously called qemu_mutex_init(&tdx->lock) a second
 * time at the end; re-initializing an already-initialized mutex is
 * undefined behavior under pthreads, so the duplicate call was removed.
 */
static void tdx_guest_init(Object *obj)
{
    ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
    TdxGuest *tdx = TDX_GUEST(obj);

    qemu_mutex_init(&tdx->lock);

    cgs->require_guest_memfd = true;
    tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE;

    object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes,
                                   OBJ_PROP_FLAG_READWRITE);
    object_property_add_bool(obj, "sept-ve-disable",
                             tdx_guest_get_sept_ve_disable,
                             tdx_guest_set_sept_ve_disable);
    object_property_add_str(obj, "mrconfigid",
                            tdx_guest_get_mrconfigid,
                            tdx_guest_set_mrconfigid);
    object_property_add_str(obj, "mrowner",
                            tdx_guest_get_mrowner, tdx_guest_set_mrowner);
    object_property_add_str(obj, "mrownerconfig",
                            tdx_guest_get_mrownerconfig,
                            tdx_guest_set_mrownerconfig);

    object_property_add(obj, "quote-generation-socket", "SocketAddress",
                            tdx_guest_get_qgs,
                            tdx_guest_set_qgs,
                            NULL, NULL);
}
1472 
/*
 * Instance finalizer.  Intentionally empty: none of the TdxGuest resources
 * (qg_sock_addr, measurement strings, the mutex) are torn down here.
 * NOTE(review): presumably acceptable because the tdx-guest object lives
 * for the whole VM lifetime — confirm if that ever changes.
 */
static void tdx_guest_finalize(Object *obj)
{
}
1476 
/* Class init: plug the TDX callbacks into the confidential-guest vtables. */
static void tdx_guest_class_init(ObjectClass *oc, const void *data)
{
    ConfidentialGuestSupportClass *cgs_class =
        CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc);
    X86ConfidentialGuestClass *x86_class = X86_CONFIDENTIAL_GUEST_CLASS(oc);

    cgs_class->kvm_init = tdx_kvm_init;

    x86_class->kvm_type = tdx_kvm_type;
    x86_class->cpu_instance_init = tdx_cpu_instance_init;
    x86_class->adjust_cpuid_features = tdx_adjust_cpuid_features;
    x86_class->check_features = tdx_check_features;
}
1488