1 /* 2 * QEMU TDX support 3 * 4 * Copyright (c) 2025 Intel Corporation 5 * 6 * Author: 7 * Xiaoyao Li <xiaoyao.li@intel.com> 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/error-report.h" 14 #include "qemu/base64.h" 15 #include "qemu/mmap-alloc.h" 16 #include "qapi/error.h" 17 #include "qom/object_interfaces.h" 18 #include "crypto/hash.h" 19 #include "system/kvm_int.h" 20 #include "system/runstate.h" 21 #include "system/system.h" 22 #include "system/ramblock.h" 23 24 #include <linux/kvm_para.h> 25 26 #include "hw/i386/e820_memory_layout.h" 27 #include "hw/i386/tdvf.h" 28 #include "hw/i386/x86.h" 29 #include "hw/i386/tdvf-hob.h" 30 #include "kvm_i386.h" 31 #include "tdx.h" 32 33 #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) 34 #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) 35 36 #define TDX_TD_ATTRIBUTES_DEBUG BIT_ULL(0) 37 #define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE BIT_ULL(28) 38 #define TDX_TD_ATTRIBUTES_PKS BIT_ULL(30) 39 #define TDX_TD_ATTRIBUTES_PERFMON BIT_ULL(63) 40 41 #define TDX_SUPPORTED_TD_ATTRS (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\ 42 TDX_TD_ATTRIBUTES_PKS | \ 43 TDX_TD_ATTRIBUTES_PERFMON) 44 45 static TdxGuest *tdx_guest; 46 47 static struct kvm_tdx_capabilities *tdx_caps; 48 static struct kvm_cpuid2 *tdx_supported_cpuid; 49 50 /* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */ 51 bool is_tdx_vm(void) 52 { 53 return !!tdx_guest; 54 } 55 56 enum tdx_ioctl_level { 57 TDX_VM_IOCTL, 58 TDX_VCPU_IOCTL, 59 }; 60 61 static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state, 62 int cmd_id, __u32 flags, void *data, 63 Error **errp) 64 { 65 struct kvm_tdx_cmd tdx_cmd = {}; 66 int r; 67 68 const char *tdx_ioctl_name[] = { 69 [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES", 70 [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM", 71 [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU", 72 [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION", 73 [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM", 74 [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID", 75 }; 76 77 tdx_cmd.id = cmd_id; 78 tdx_cmd.flags = flags; 79 tdx_cmd.data = (__u64)(unsigned long)data; 80 81 switch (level) { 82 case TDX_VM_IOCTL: 83 r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 84 break; 85 case TDX_VCPU_IOCTL: 86 r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 87 break; 88 default: 89 error_setg(errp, "Invalid tdx_ioctl_level %d", level); 90 return -EINVAL; 91 } 92 93 if (r < 0) { 94 error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx", 95 tdx_ioctl_name[cmd_id], tdx_cmd.hw_error); 96 } 97 return r; 98 } 99 100 static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data, 101 Error **errp) 102 { 103 return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp); 104 } 105 106 static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags, 107 void *data, Error **errp) 108 { 109 return tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp); 110 } 111 112 static int get_tdx_capabilities(Error **errp) 113 { 114 struct kvm_tdx_capabilities *caps; 115 /* 1st generation of TDX reports 6 cpuid configs */ 116 int nr_cpuid_configs = 6; 117 size_t size; 118 int r; 119 120 do { 121 Error *local_err = NULL; 122 size = sizeof(struct kvm_tdx_capabilities) + 123 nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2); 124 caps = g_malloc0(size); 125 caps->cpuid.nent = nr_cpuid_configs; 126 127 r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err); 128 if (r == -E2BIG) { 129 g_free(caps); 130 nr_cpuid_configs *= 2; 131 if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) { 132 error_report("KVM TDX seems broken that number of CPUID entries" 133 " in kvm_tdx_capabilities exceeds limit: %d", 134 KVM_MAX_CPUID_ENTRIES); 135 error_propagate(errp, local_err); 136 return r; 137 } 138 error_free(local_err); 139 } else if (r < 0) { 140 g_free(caps); 141 error_propagate(errp, local_err); 142 return r; 143 } 144 } while (r == -E2BIG); 145 146 tdx_caps = caps; 147 148 return 0; 149 } 150 151 void tdx_set_tdvf_region(MemoryRegion *tdvf_mr) 152 { 153 assert(!tdx_guest->tdvf_mr); 154 tdx_guest->tdvf_mr = tdvf_mr; 155 } 156 157 static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx) 158 { 159 TdxFirmwareEntry *entry; 160 161 for_each_tdx_fw_entry(&tdx->tdvf, entry) { 162 if (entry->type == TDVF_SECTION_TYPE_TD_HOB) { 163 return entry; 164 } 165 } 166 error_report("TDVF metadata doesn't specify TD_HOB location."); 167 exit(1); 168 } 169 170 static void tdx_add_ram_entry(uint64_t address, uint64_t length, 171 enum TdxRamType type) 172 { 173 uint32_t nr_entries = tdx_guest->nr_ram_entries; 174 tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries, 175 nr_entries + 1); 176 177 tdx_guest->ram_entries[nr_entries].address = address; 178 tdx_guest->ram_entries[nr_entries].length = length; 179 tdx_guest->ram_entries[nr_entries].type = type; 180 tdx_guest->nr_ram_entries++; 181 } 182 183 static int tdx_accept_ram_range(uint64_t address, uint64_t length) 184 { 185 uint64_t head_start, tail_start, head_length, tail_length; 186 uint64_t tmp_address, tmp_length; 187 TdxRamEntry *e; 188 int i = 0; 189 190 do { 191 if (i == tdx_guest->nr_ram_entries) { 192 return -1; 193 } 194 195 e = &tdx_guest->ram_entries[i++]; 196 } while (address + length <= e->address || address >= e->address + e->length); 197 198 /* 199 * The to-be-accepted ram range must be fully contained by one 200 * RAM entry. 201 */ 202 if (e->address > address || 203 e->address + e->length < address + length) { 204 return -1; 205 } 206 207 if (e->type == TDX_RAM_ADDED) { 208 return 0; 209 } 210 211 tmp_address = e->address; 212 tmp_length = e->length; 213 214 e->address = address; 215 e->length = length; 216 e->type = TDX_RAM_ADDED; 217 218 head_length = address - tmp_address; 219 if (head_length > 0) { 220 head_start = tmp_address; 221 tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED); 222 } 223 224 tail_start = address + length; 225 if (tail_start < tmp_address + tmp_length) { 226 tail_length = tmp_address + tmp_length - tail_start; 227 tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED); 228 } 229 230 return 0; 231 } 232 233 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_) 234 { 235 const TdxRamEntry *lhs = lhs_; 236 const TdxRamEntry *rhs = rhs_; 237 238 if (lhs->address == rhs->address) { 239 return 0; 240 } 241 if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) { 242 return 1; 243 } 244 return -1; 245 } 246 247 static void tdx_init_ram_entries(void) 248 { 249 unsigned i, j, nr_e820_entries; 250 251 nr_e820_entries = e820_get_table(NULL); 252 tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries); 253 254 for (i = 0, j = 0; i < nr_e820_entries; i++) { 255 uint64_t addr, len; 256 257 if (e820_get_entry(i, E820_RAM, &addr, &len)) { 258 tdx_guest->ram_entries[j].address = addr; 259 tdx_guest->ram_entries[j].length = len; 260 tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED; 261 j++; 262 } 263 } 264 tdx_guest->nr_ram_entries = j; 265 } 266 267 static void tdx_post_init_vcpus(void) 268 { 269 TdxFirmwareEntry *hob; 270 CPUState *cpu; 271 272 hob = tdx_get_hob_entry(tdx_guest); 273 CPU_FOREACH(cpu) { 274 tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address, 275 &error_fatal); 276 } 277 } 278 279 static void tdx_finalize_vm(Notifier *notifier, void *unused) 280 { 281 TdxFirmware *tdvf = &tdx_guest->tdvf; 282 TdxFirmwareEntry *entry; 283 RAMBlock *ram_block; 284 Error *local_err = NULL; 285 int r; 286 287 tdx_init_ram_entries(); 288 289 for_each_tdx_fw_entry(tdvf, entry) { 290 switch (entry->type) { 291 case TDVF_SECTION_TYPE_BFV: 292 case TDVF_SECTION_TYPE_CFV: 293 entry->mem_ptr = tdvf->mem_ptr + entry->data_offset; 294 break; 295 case TDVF_SECTION_TYPE_TD_HOB: 296 case TDVF_SECTION_TYPE_TEMP_MEM: 297 entry->mem_ptr = qemu_ram_mmap(-1, entry->size, 298 qemu_real_host_page_size(), 0, 0); 299 if (entry->mem_ptr == MAP_FAILED) { 300 error_report("Failed to mmap memory for TDVF section %d", 301 entry->type); 302 exit(1); 303 } 304 if (tdx_accept_ram_range(entry->address, entry->size)) { 305 error_report("Failed to accept memory for TDVF section %d", 306 entry->type); 307 qemu_ram_munmap(-1, entry->mem_ptr, entry->size); 308 exit(1); 309 } 310 break; 311 default: 312 error_report("Unsupported TDVF section %d", entry->type); 313 exit(1); 314 } 315 } 316 317 qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries, 318 sizeof(TdxRamEntry), &tdx_ram_entry_compare); 319 320 tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest)); 321 322 tdx_post_init_vcpus(); 323 324 for_each_tdx_fw_entry(tdvf, entry) { 325 struct kvm_tdx_init_mem_region region; 326 uint32_t flags; 327 328 region = (struct kvm_tdx_init_mem_region) { 329 .source_addr = (uint64_t)entry->mem_ptr, 330 .gpa = entry->address, 331 .nr_pages = entry->size >> 12, 332 }; 333 334 flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ? 335 KVM_TDX_MEASURE_MEMORY_REGION : 0; 336 337 do { 338 error_free(local_err); 339 local_err = NULL; 340 r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags, 341 ®ion, &local_err); 342 } while (r == -EAGAIN || r == -EINTR); 343 if (r < 0) { 344 error_report_err(local_err); 345 exit(1); 346 } 347 348 if (entry->type == TDVF_SECTION_TYPE_TD_HOB || 349 entry->type == TDVF_SECTION_TYPE_TEMP_MEM) { 350 qemu_ram_munmap(-1, entry->mem_ptr, entry->size); 351 entry->mem_ptr = NULL; 352 } 353 } 354 355 /* 356 * TDVF image has been copied into private region above via 357 * KVM_MEMORY_MAPPING. It becomes useless. 358 */ 359 ram_block = tdx_guest->tdvf_mr->ram_block; 360 ram_block_discard_range(ram_block, 0, ram_block->max_length); 361 362 tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal); 363 CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true; 364 } 365 366 static Notifier tdx_machine_done_notify = { 367 .notify = tdx_finalize_vm, 368 }; 369 370 /* 371 * Some CPUID bits change from fixed1 to configurable bits when TDX module 372 * supports TDX_FEATURES0.VE_REDUCTION. e.g., MCA/MCE/MTRR/CORE_CAPABILITY. 373 * 374 * To make QEMU work with all the versions of TDX module, keep the fixed1 bits 375 * here if they are ever fixed1 bits in any of the version though not fixed1 in 376 * the latest version. Otherwise, with the older version of TDX module, QEMU may 377 * treat the fixed1 bit as unsupported. 378 * 379 * For newer TDX module, it does no harm to keep them in tdx_fixed1_bits even 380 * though they changed to configurable bits. Because tdx_fixed1_bits is used to 381 * setup the supported bits. 382 */ 383 KvmCpuidInfo tdx_fixed1_bits = { 384 .cpuid.nent = 8, 385 .entries[0] = { 386 .function = 0x1, 387 .index = 0, 388 .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 | 389 CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 | 390 CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 | 391 CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | 392 CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE | 393 CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR, 394 .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | 395 CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | 396 CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | 397 CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR | 398 CPUID_SSE | CPUID_SSE2, 399 }, 400 .entries[1] = { 401 .function = 0x6, 402 .index = 0, 403 .eax = CPUID_6_EAX_ARAT, 404 }, 405 .entries[2] = { 406 .function = 0x7, 407 .index = 0, 408 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 409 .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY | 410 CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID | 411 CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED | 412 CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | 413 CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI, 414 .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI | 415 CPUID_7_0_ECX_MOVDIR64B, 416 .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL | 417 CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D | 418 CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY | 419 CPUID_7_0_EDX_SPEC_CTRL_SSBD, 420 }, 421 .entries[3] = { 422 .function = 0x7, 423 .index = 2, 424 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 425 .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL | 426 CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL, 427 }, 428 .entries[4] = { 429 .function = 0xD, 430 .index = 0, 431 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 432 .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK, 433 }, 434 .entries[5] = { 435 .function = 0xD, 436 .index = 1, 437 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 438 .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC| 439 CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, 440 }, 441 .entries[6] = { 442 .function = 0x80000001, 443 .index = 0, 444 .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, 445 /* 446 * Strictly speaking, SYSCALL is not fixed1 bit since it depends on 447 * the CPU to be in 64-bit mode. But here fixed1 is used to serve the 448 * purpose of supported bits for TDX. In this sense, SYACALL is always 449 * supported. 450 */ 451 .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | 452 CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, 453 }, 454 .entries[7] = { 455 .function = 0x80000007, 456 .index = 0, 457 .edx = CPUID_APM_INVTSC, 458 }, 459 }; 460 461 static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function, 462 uint32_t index) 463 { 464 struct kvm_cpuid_entry2 *e; 465 466 e = cpuid_find_entry(tdx_supported_cpuid, function, index); 467 if (!e) { 468 if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) { 469 error_report("tdx_supported_cpuid requries more space than %d entries", 470 KVM_MAX_CPUID_ENTRIES); 471 exit(1); 472 } 473 e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++]; 474 e->function = function; 475 e->index = index; 476 } 477 478 return e; 479 } 480 481 static void tdx_add_supported_cpuid_by_fixed1_bits(void) 482 { 483 struct kvm_cpuid_entry2 *e, *e1; 484 int i; 485 486 for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) { 487 e = &tdx_fixed1_bits.entries[i]; 488 489 e1 = find_in_supported_entry(e->function, e->index); 490 e1->eax |= e->eax; 491 e1->ebx |= e->ebx; 492 e1->ecx |= e->ecx; 493 e1->edx |= e->edx; 494 } 495 } 496 497 static void tdx_setup_supported_cpuid(void) 498 { 499 if (tdx_supported_cpuid) { 500 return; 501 } 502 503 tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) + 504 KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2)); 505 506 memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries, 507 tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2)); 508 tdx_supported_cpuid->nent = tdx_caps->cpuid.nent; 509 510 tdx_add_supported_cpuid_by_fixed1_bits(); 511 } 512 513 static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) 514 { 515 MachineState *ms = MACHINE(qdev_get_machine()); 516 X86MachineState *x86ms = X86_MACHINE(ms); 517 TdxGuest *tdx = TDX_GUEST(cgs); 518 int r = 0; 519 520 kvm_mark_guest_state_protected(); 521 522 if (x86ms->smm == ON_OFF_AUTO_AUTO) { 523 x86ms->smm = ON_OFF_AUTO_OFF; 524 } else if (x86ms->smm == ON_OFF_AUTO_ON) { 525 error_setg(errp, "TDX VM doesn't support SMM"); 526 return -EINVAL; 527 } 528 529 if (x86ms->pic == ON_OFF_AUTO_AUTO) { 530 x86ms->pic = ON_OFF_AUTO_OFF; 531 } else if (x86ms->pic == ON_OFF_AUTO_ON) { 532 error_setg(errp, "TDX VM doesn't support PIC"); 533 return -EINVAL; 534 } 535 536 if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { 537 kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON; 538 } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) { 539 error_setg(errp, "TDX VM requires kernel_irqchip to be split"); 540 return -EINVAL; 541 } 542 543 if (!tdx_caps) { 544 r = get_tdx_capabilities(errp); 545 if (r) { 546 return r; 547 } 548 } 549 550 tdx_setup_supported_cpuid(); 551 552 /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */ 553 if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) { 554 return -EOPNOTSUPP; 555 } 556 557 /* 558 * Set kvm_readonly_mem_allowed to false, because TDX only supports readonly 559 * memory for shared memory but not for private memory. Besides, whether a 560 * memslot is private or shared is not determined by QEMU. 561 * 562 * Thus, just mark readonly memory not supported for simplicity. 563 */ 564 kvm_readonly_mem_allowed = false; 565 566 qemu_add_machine_init_done_notifier(&tdx_machine_done_notify); 567 568 tdx_guest = tdx; 569 return 0; 570 } 571 572 static int tdx_kvm_type(X86ConfidentialGuest *cg) 573 { 574 /* Do the object check */ 575 TDX_GUEST(cg); 576 577 return KVM_X86_TDX_VM; 578 } 579 580 static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu) 581 { 582 X86CPU *x86cpu = X86_CPU(cpu); 583 584 object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort); 585 586 x86cpu->enable_cpuid_0x1f = true; 587 } 588 589 static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg, 590 uint32_t feature, uint32_t index, 591 int reg, uint32_t value) 592 { 593 struct kvm_cpuid_entry2 *e; 594 595 e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index); 596 if (e) { 597 value |= cpuid_entry_get_reg(e, reg); 598 } 599 600 if (is_feature_word_cpuid(feature, index, reg)) { 601 e = cpuid_find_entry(tdx_supported_cpuid, feature, index); 602 if (e) { 603 value &= cpuid_entry_get_reg(e, reg); 604 } 605 } 606 607 return value; 608 } 609 610 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp) 611 { 612 if ((tdx->attributes & ~tdx_caps->supported_attrs)) { 613 error_setg(errp, "Invalid attributes 0x%lx for TDX VM " 614 "(KVM supported: 0x%llx)", tdx->attributes, 615 tdx_caps->supported_attrs); 616 return -1; 617 } 618 619 if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) { 620 error_setg(errp, "Some QEMU unsupported TD attribute bits being " 621 "requested: 0x%lx (QEMU supported: 0x%llx)", 622 tdx->attributes, TDX_SUPPORTED_TD_ATTRS); 623 return -1; 624 } 625 626 return 0; 627 } 628 629 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp) 630 { 631 CPUX86State *env = &x86cpu->env; 632 633 tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ? 634 TDX_TD_ATTRIBUTES_PKS : 0; 635 tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0; 636 637 return tdx_validate_attributes(tdx_guest, errp); 638 } 639 640 static int setup_td_xfam(X86CPU *x86cpu, Error **errp) 641 { 642 CPUX86State *env = &x86cpu->env; 643 uint64_t xfam; 644 645 xfam = env->features[FEAT_XSAVE_XCR0_LO] | 646 env->features[FEAT_XSAVE_XCR0_HI] | 647 env->features[FEAT_XSAVE_XSS_LO] | 648 env->features[FEAT_XSAVE_XSS_HI]; 649 650 if (xfam & ~tdx_caps->supported_xfam) { 651 error_setg(errp, "Invalid XFAM 0x%lx for TDX VM (supported: 0x%llx))", 652 xfam, tdx_caps->supported_xfam); 653 return -1; 654 } 655 656 tdx_guest->xfam = xfam; 657 return 0; 658 } 659 660 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids) 661 { 662 int i, dest_cnt = 0; 663 struct kvm_cpuid_entry2 *src, *dest, *conf; 664 665 for (i = 0; i < cpuids->nent; i++) { 666 src = cpuids->entries + i; 667 conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index); 668 if (!conf) { 669 continue; 670 } 671 dest = cpuids->entries + dest_cnt; 672 673 dest->function = src->function; 674 dest->index = src->index; 675 dest->flags = src->flags; 676 dest->eax = src->eax & conf->eax; 677 dest->ebx = src->ebx & conf->ebx; 678 dest->ecx = src->ecx & conf->ecx; 679 dest->edx = src->edx & conf->edx; 680 681 dest_cnt++; 682 } 683 cpuids->nent = dest_cnt++; 684 } 685 686 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp) 687 { 688 X86CPU *x86cpu = X86_CPU(cpu); 689 CPUX86State *env = &x86cpu->env; 690 g_autofree struct kvm_tdx_init_vm *init_vm = NULL; 691 Error *local_err = NULL; 692 size_t data_len; 693 int retry = 10000; 694 int r = 0; 695 696 QEMU_LOCK_GUARD(&tdx_guest->lock); 697 if (tdx_guest->initialized) { 698 return r; 699 } 700 701 init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) + 702 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 703 704 if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) { 705 error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS"); 706 return -EOPNOTSUPP; 707 } 708 709 r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS, 710 0, TDX_APIC_BUS_CYCLES_NS); 711 if (r < 0) { 712 error_setg_errno(errp, -r, 713 "Unable to set core crystal clock frequency to 25MHz"); 714 return r; 715 } 716 717 if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ || 718 env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) { 719 error_setg(errp, "Invalid TSC %ld KHz, must specify cpu_frequency " 720 "between [%d, %d] kHz", env->tsc_khz, 721 TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ); 722 return -EINVAL; 723 } 724 725 if (env->tsc_khz % (25 * 1000)) { 726 error_setg(errp, "Invalid TSC %ld KHz, it must be multiple of 25MHz", 727 env->tsc_khz); 728 return -EINVAL; 729 } 730 731 /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */ 732 r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz); 733 if (r < 0) { 734 error_setg_errno(errp, -r, "Unable to set TSC frequency to %ld kHz", 735 env->tsc_khz); 736 return r; 737 } 738 739 if (tdx_guest->mrconfigid) { 740 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid, 741 strlen(tdx_guest->mrconfigid), &data_len, errp); 742 if (!data) { 743 return -1; 744 } 745 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 746 error_setg(errp, "TDX: failed to decode mrconfigid"); 747 return -1; 748 } 749 memcpy(init_vm->mrconfigid, data, data_len); 750 } 751 752 if (tdx_guest->mrowner) { 753 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner, 754 strlen(tdx_guest->mrowner), &data_len, errp); 755 if (!data) { 756 return -1; 757 } 758 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 759 error_setg(errp, "TDX: failed to decode mrowner"); 760 return -1; 761 } 762 memcpy(init_vm->mrowner, data, data_len); 763 } 764 765 if (tdx_guest->mrownerconfig) { 766 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig, 767 strlen(tdx_guest->mrownerconfig), &data_len, errp); 768 if (!data) { 769 return -1; 770 } 771 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 772 error_setg(errp, "TDX: failed to decode mrownerconfig"); 773 return -1; 774 } 775 memcpy(init_vm->mrownerconfig, data, data_len); 776 } 777 778 r = setup_td_guest_attributes(x86cpu, errp); 779 if (r) { 780 return r; 781 } 782 783 r = setup_td_xfam(x86cpu, errp); 784 if (r) { 785 return r; 786 } 787 788 init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0); 789 tdx_filter_cpuid(&init_vm->cpuid); 790 791 init_vm->attributes = tdx_guest->attributes; 792 init_vm->xfam = tdx_guest->xfam; 793 794 /* 795 * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE) 796 * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or 797 * RDSEED) is busy. 798 * 799 * Retry for the case. 800 */ 801 do { 802 error_free(local_err); 803 local_err = NULL; 804 r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err); 805 } while (r == -EAGAIN && --retry); 806 807 if (r < 0) { 808 if (!retry) { 809 error_append_hint(&local_err, "Hardware RNG (Random Number " 810 "Generator) is busy occupied by someone (via RDRAND/RDSEED) " 811 "maliciously, which leads to KVM_TDX_INIT_VM keeping failure " 812 "due to lack of entropy.\n"); 813 } 814 error_propagate(errp, local_err); 815 return r; 816 } 817 818 tdx_guest->initialized = true; 819 820 return 0; 821 } 822 823 int tdx_parse_tdvf(void *flash_ptr, int size) 824 { 825 return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size); 826 } 827 828 static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code, 829 char *message, uint64_t gpa) 830 { 831 GuestPanicInformation *panic_info; 832 833 panic_info = g_new0(GuestPanicInformation, 1); 834 panic_info->type = GUEST_PANIC_INFORMATION_TYPE_TDX; 835 panic_info->u.tdx.error_code = (uint32_t) error_code; 836 panic_info->u.tdx.message = message; 837 panic_info->u.tdx.gpa = gpa; 838 839 qemu_system_guest_panicked(panic_info); 840 } 841 842 /* 843 * Only 8 registers can contain valid ASCII byte stream to form the fatal 844 * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX 845 */ 846 #define TDX_FATAL_MESSAGE_MAX 64 847 848 #define TDX_REPORT_FATAL_ERROR_GPA_VALID BIT_ULL(63) 849 850 int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run) 851 { 852 uint64_t error_code = run->system_event.data[R_R12]; 853 uint64_t reg_mask = run->system_event.data[R_ECX]; 854 char *message = NULL; 855 uint64_t *tmp; 856 uint64_t gpa = -1ull; 857 858 if (error_code & 0xffff) { 859 error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%lx", 860 error_code); 861 return -1; 862 } 863 864 if (reg_mask) { 865 message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1); 866 tmp = (uint64_t *)message; 867 868 #define COPY_REG(REG) \ 869 do { \ 870 if (reg_mask & BIT_ULL(REG)) { \ 871 *(tmp++) = run->system_event.data[REG]; \ 872 } \ 873 } while (0) 874 875 COPY_REG(R_R14); 876 COPY_REG(R_R15); 877 COPY_REG(R_EBX); 878 COPY_REG(R_EDI); 879 COPY_REG(R_ESI); 880 COPY_REG(R_R8); 881 COPY_REG(R_R9); 882 COPY_REG(R_EDX); 883 *((char *)tmp) = '\0'; 884 } 885 #undef COPY_REG 886 887 if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) { 888 gpa = run->system_event.data[R_R13]; 889 } 890 891 tdx_panicked_on_fatal_error(cpu, error_code, message, gpa); 892 893 return -1; 894 } 895 896 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp) 897 { 898 TdxGuest *tdx = TDX_GUEST(obj); 899 900 return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE); 901 } 902 903 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp) 904 { 905 TdxGuest *tdx = TDX_GUEST(obj); 906 907 if (value) { 908 tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 909 } else { 910 tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 911 } 912 } 913 914 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp) 915 { 916 TdxGuest *tdx = TDX_GUEST(obj); 917 918 return g_strdup(tdx->mrconfigid); 919 } 920 921 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp) 922 { 923 TdxGuest *tdx = TDX_GUEST(obj); 924 925 g_free(tdx->mrconfigid); 926 tdx->mrconfigid = g_strdup(value); 927 } 928 929 static char *tdx_guest_get_mrowner(Object *obj, Error **errp) 930 { 931 TdxGuest *tdx = TDX_GUEST(obj); 932 933 return g_strdup(tdx->mrowner); 934 } 935 936 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp) 937 { 938 TdxGuest *tdx = TDX_GUEST(obj); 939 940 g_free(tdx->mrowner); 941 tdx->mrowner = g_strdup(value); 942 } 943 944 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp) 945 { 946 TdxGuest *tdx = TDX_GUEST(obj); 947 948 return g_strdup(tdx->mrownerconfig); 949 } 950 951 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp) 952 { 953 TdxGuest *tdx = TDX_GUEST(obj); 954 955 g_free(tdx->mrownerconfig); 956 tdx->mrownerconfig = g_strdup(value); 957 } 958 959 /* tdx guest */ 960 OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest, 961 tdx_guest, 962 TDX_GUEST, 963 X86_CONFIDENTIAL_GUEST, 964 { TYPE_USER_CREATABLE }, 965 { NULL }) 966 967 static void tdx_guest_init(Object *obj) 968 { 969 ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); 970 TdxGuest *tdx = TDX_GUEST(obj); 971 972 qemu_mutex_init(&tdx->lock); 973 974 cgs->require_guest_memfd = true; 975 tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 976 977 object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes, 978 OBJ_PROP_FLAG_READWRITE); 979 object_property_add_bool(obj, "sept-ve-disable", 980 tdx_guest_get_sept_ve_disable, 981 tdx_guest_set_sept_ve_disable); 982 object_property_add_str(obj, "mrconfigid", 983 tdx_guest_get_mrconfigid, 984 tdx_guest_set_mrconfigid); 985 object_property_add_str(obj, "mrowner", 986 tdx_guest_get_mrowner, tdx_guest_set_mrowner); 987 object_property_add_str(obj, "mrownerconfig", 988 tdx_guest_get_mrownerconfig, 989 tdx_guest_set_mrownerconfig); 990 } 991 992 static void tdx_guest_finalize(Object *obj) 993 { 994 } 995 996 static void tdx_guest_class_init(ObjectClass *oc, const void *data) 997 { 998 ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc); 999 X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc); 1000 1001 klass->kvm_init = tdx_kvm_init; 1002 x86_klass->kvm_type = tdx_kvm_type; 1003 x86_klass->cpu_instance_init = tdx_cpu_instance_init; 1004 x86_klass->adjust_cpuid_features = tdx_adjust_cpuid_features; 1005 } 1006