1 /* 2 * QEMU TDX support 3 * 4 * Copyright (c) 2025 Intel Corporation 5 * 6 * Author: 7 * Xiaoyao Li <xiaoyao.li@intel.com> 8 * 9 * SPDX-License-Identifier: GPL-2.0-or-later 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/error-report.h" 14 #include "qemu/base64.h" 15 #include "qemu/mmap-alloc.h" 16 #include "qapi/error.h" 17 #include "qom/object_interfaces.h" 18 #include "crypto/hash.h" 19 #include "system/kvm_int.h" 20 #include "system/runstate.h" 21 #include "system/system.h" 22 #include "system/ramblock.h" 23 24 #include <linux/kvm_para.h> 25 26 #include "cpu.h" 27 #include "cpu-internal.h" 28 #include "hw/i386/e820_memory_layout.h" 29 #include "hw/i386/tdvf.h" 30 #include "hw/i386/x86.h" 31 #include "hw/i386/tdvf-hob.h" 32 #include "kvm_i386.h" 33 #include "tdx.h" 34 35 #include "standard-headers/asm-x86/kvm_para.h" 36 37 #define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000) 38 #define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000) 39 40 #define TDX_TD_ATTRIBUTES_DEBUG BIT_ULL(0) 41 #define TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE BIT_ULL(28) 42 #define TDX_TD_ATTRIBUTES_PKS BIT_ULL(30) 43 #define TDX_TD_ATTRIBUTES_PERFMON BIT_ULL(63) 44 45 #define TDX_SUPPORTED_TD_ATTRS (TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE |\ 46 TDX_TD_ATTRIBUTES_PKS | \ 47 TDX_TD_ATTRIBUTES_PERFMON) 48 49 #define TDX_SUPPORTED_KVM_FEATURES ((1U << KVM_FEATURE_NOP_IO_DELAY) | \ 50 (1U << KVM_FEATURE_PV_UNHALT) | \ 51 (1U << KVM_FEATURE_PV_TLB_FLUSH) | \ 52 (1U << KVM_FEATURE_PV_SEND_IPI) | \ 53 (1U << KVM_FEATURE_POLL_CONTROL) | \ 54 (1U << KVM_FEATURE_PV_SCHED_YIELD) | \ 55 (1U << KVM_FEATURE_MSI_EXT_DEST_ID)) 56 57 static TdxGuest *tdx_guest; 58 59 static struct kvm_tdx_capabilities *tdx_caps; 60 static struct kvm_cpuid2 *tdx_supported_cpuid; 61 62 /* Valid after kvm_arch_init()->confidential_guest_kvm_init()->tdx_kvm_init() */ 63 bool is_tdx_vm(void) 64 { 65 return !!tdx_guest; 66 } 67 68 enum tdx_ioctl_level { 69 TDX_VM_IOCTL, 70 TDX_VCPU_IOCTL, 71 }; 72 73 static int tdx_ioctl_internal(enum tdx_ioctl_level level, void *state, 74 int cmd_id, __u32 flags, void *data, 75 Error **errp) 76 { 77 struct kvm_tdx_cmd tdx_cmd = {}; 78 int r; 79 80 const char *tdx_ioctl_name[] = { 81 [KVM_TDX_CAPABILITIES] = "KVM_TDX_CAPABILITIES", 82 [KVM_TDX_INIT_VM] = "KVM_TDX_INIT_VM", 83 [KVM_TDX_INIT_VCPU] = "KVM_TDX_INIT_VCPU", 84 [KVM_TDX_INIT_MEM_REGION] = "KVM_TDX_INIT_MEM_REGION", 85 [KVM_TDX_FINALIZE_VM] = "KVM_TDX_FINALIZE_VM", 86 [KVM_TDX_GET_CPUID] = "KVM_TDX_GET_CPUID", 87 }; 88 89 tdx_cmd.id = cmd_id; 90 tdx_cmd.flags = flags; 91 tdx_cmd.data = (__u64)(unsigned long)data; 92 93 switch (level) { 94 case TDX_VM_IOCTL: 95 r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 96 break; 97 case TDX_VCPU_IOCTL: 98 r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd); 99 break; 100 default: 101 error_setg(errp, "Invalid tdx_ioctl_level %d", level); 102 return -EINVAL; 103 } 104 105 if (r < 0) { 106 error_setg_errno(errp, -r, "TDX ioctl %s failed, hw_errors: 0x%llx", 107 tdx_ioctl_name[cmd_id], tdx_cmd.hw_error); 108 } 109 return r; 110 } 111 112 static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data, 113 Error **errp) 114 { 115 return tdx_ioctl_internal(TDX_VM_IOCTL, NULL, cmd_id, flags, data, errp); 116 } 117 118 static inline int tdx_vcpu_ioctl(CPUState *cpu, int cmd_id, __u32 flags, 119 void *data, Error **errp) 120 { 121 return tdx_ioctl_internal(TDX_VCPU_IOCTL, cpu, cmd_id, flags, data, errp); 122 } 123 124 static int get_tdx_capabilities(Error **errp) 125 { 126 struct kvm_tdx_capabilities *caps; 127 /* 1st generation of TDX reports 6 cpuid configs */ 128 int nr_cpuid_configs = 6; 129 size_t size; 130 int r; 131 132 do { 133 Error *local_err = NULL; 134 size = sizeof(struct kvm_tdx_capabilities) + 135 nr_cpuid_configs * sizeof(struct kvm_cpuid_entry2); 136 caps = g_malloc0(size); 137 caps->cpuid.nent = nr_cpuid_configs; 138 139 r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps, &local_err); 140 if (r == -E2BIG) { 141 g_free(caps); 142 nr_cpuid_configs *= 2; 143 if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) { 144 error_report("KVM TDX seems broken that number of CPUID entries" 145 " in kvm_tdx_capabilities exceeds limit: %d", 146 KVM_MAX_CPUID_ENTRIES); 147 error_propagate(errp, local_err); 148 return r; 149 } 150 error_free(local_err); 151 } else if (r < 0) { 152 g_free(caps); 153 error_propagate(errp, local_err); 154 return r; 155 } 156 } while (r == -E2BIG); 157 158 tdx_caps = caps; 159 160 return 0; 161 } 162 163 void tdx_set_tdvf_region(MemoryRegion *tdvf_mr) 164 { 165 assert(!tdx_guest->tdvf_mr); 166 tdx_guest->tdvf_mr = tdvf_mr; 167 } 168 169 static TdxFirmwareEntry *tdx_get_hob_entry(TdxGuest *tdx) 170 { 171 TdxFirmwareEntry *entry; 172 173 for_each_tdx_fw_entry(&tdx->tdvf, entry) { 174 if (entry->type == TDVF_SECTION_TYPE_TD_HOB) { 175 return entry; 176 } 177 } 178 error_report("TDVF metadata doesn't specify TD_HOB location."); 179 exit(1); 180 } 181 182 static void tdx_add_ram_entry(uint64_t address, uint64_t length, 183 enum TdxRamType type) 184 { 185 uint32_t nr_entries = tdx_guest->nr_ram_entries; 186 tdx_guest->ram_entries = g_renew(TdxRamEntry, tdx_guest->ram_entries, 187 nr_entries + 1); 188 189 tdx_guest->ram_entries[nr_entries].address = address; 190 tdx_guest->ram_entries[nr_entries].length = length; 191 tdx_guest->ram_entries[nr_entries].type = type; 192 tdx_guest->nr_ram_entries++; 193 } 194 195 static int tdx_accept_ram_range(uint64_t address, uint64_t length) 196 { 197 uint64_t head_start, tail_start, head_length, tail_length; 198 uint64_t tmp_address, tmp_length; 199 TdxRamEntry *e; 200 int i = 0; 201 202 do { 203 if (i == tdx_guest->nr_ram_entries) { 204 return -1; 205 } 206 207 e = &tdx_guest->ram_entries[i++]; 208 } while (address + length <= e->address || address >= e->address + e->length); 209 210 /* 211 * The to-be-accepted ram range must be fully contained by one 212 * RAM entry. 213 */ 214 if (e->address > address || 215 e->address + e->length < address + length) { 216 return -1; 217 } 218 219 if (e->type == TDX_RAM_ADDED) { 220 return 0; 221 } 222 223 tmp_address = e->address; 224 tmp_length = e->length; 225 226 e->address = address; 227 e->length = length; 228 e->type = TDX_RAM_ADDED; 229 230 head_length = address - tmp_address; 231 if (head_length > 0) { 232 head_start = tmp_address; 233 tdx_add_ram_entry(head_start, head_length, TDX_RAM_UNACCEPTED); 234 } 235 236 tail_start = address + length; 237 if (tail_start < tmp_address + tmp_length) { 238 tail_length = tmp_address + tmp_length - tail_start; 239 tdx_add_ram_entry(tail_start, tail_length, TDX_RAM_UNACCEPTED); 240 } 241 242 return 0; 243 } 244 245 static int tdx_ram_entry_compare(const void *lhs_, const void* rhs_) 246 { 247 const TdxRamEntry *lhs = lhs_; 248 const TdxRamEntry *rhs = rhs_; 249 250 if (lhs->address == rhs->address) { 251 return 0; 252 } 253 if (le64_to_cpu(lhs->address) > le64_to_cpu(rhs->address)) { 254 return 1; 255 } 256 return -1; 257 } 258 259 static void tdx_init_ram_entries(void) 260 { 261 unsigned i, j, nr_e820_entries; 262 263 nr_e820_entries = e820_get_table(NULL); 264 tdx_guest->ram_entries = g_new(TdxRamEntry, nr_e820_entries); 265 266 for (i = 0, j = 0; i < nr_e820_entries; i++) { 267 uint64_t addr, len; 268 269 if (e820_get_entry(i, E820_RAM, &addr, &len)) { 270 tdx_guest->ram_entries[j].address = addr; 271 tdx_guest->ram_entries[j].length = len; 272 tdx_guest->ram_entries[j].type = TDX_RAM_UNACCEPTED; 273 j++; 274 } 275 } 276 tdx_guest->nr_ram_entries = j; 277 } 278 279 static void tdx_post_init_vcpus(void) 280 { 281 TdxFirmwareEntry *hob; 282 CPUState *cpu; 283 284 hob = tdx_get_hob_entry(tdx_guest); 285 CPU_FOREACH(cpu) { 286 tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address, 287 &error_fatal); 288 } 289 } 290 291 static void tdx_finalize_vm(Notifier *notifier, void *unused) 292 { 293 TdxFirmware *tdvf = &tdx_guest->tdvf; 294 TdxFirmwareEntry *entry; 295 RAMBlock *ram_block; 296 Error *local_err = NULL; 297 int r; 298 299 tdx_init_ram_entries(); 300 301 for_each_tdx_fw_entry(tdvf, entry) { 302 switch (entry->type) { 303 case TDVF_SECTION_TYPE_BFV: 304 case TDVF_SECTION_TYPE_CFV: 305 entry->mem_ptr = tdvf->mem_ptr + entry->data_offset; 306 break; 307 case TDVF_SECTION_TYPE_TD_HOB: 308 case TDVF_SECTION_TYPE_TEMP_MEM: 309 entry->mem_ptr = qemu_ram_mmap(-1, entry->size, 310 qemu_real_host_page_size(), 0, 0); 311 if (entry->mem_ptr == MAP_FAILED) { 312 error_report("Failed to mmap memory for TDVF section %d", 313 entry->type); 314 exit(1); 315 } 316 if (tdx_accept_ram_range(entry->address, entry->size)) { 317 error_report("Failed to accept memory for TDVF section %d", 318 entry->type); 319 qemu_ram_munmap(-1, entry->mem_ptr, entry->size); 320 exit(1); 321 } 322 break; 323 default: 324 error_report("Unsupported TDVF section %d", entry->type); 325 exit(1); 326 } 327 } 328 329 qsort(tdx_guest->ram_entries, tdx_guest->nr_ram_entries, 330 sizeof(TdxRamEntry), &tdx_ram_entry_compare); 331 332 tdvf_hob_create(tdx_guest, tdx_get_hob_entry(tdx_guest)); 333 334 tdx_post_init_vcpus(); 335 336 for_each_tdx_fw_entry(tdvf, entry) { 337 struct kvm_tdx_init_mem_region region; 338 uint32_t flags; 339 340 region = (struct kvm_tdx_init_mem_region) { 341 .source_addr = (uint64_t)entry->mem_ptr, 342 .gpa = entry->address, 343 .nr_pages = entry->size >> 12, 344 }; 345 346 flags = entry->attributes & TDVF_SECTION_ATTRIBUTES_MR_EXTEND ? 347 KVM_TDX_MEASURE_MEMORY_REGION : 0; 348 349 do { 350 error_free(local_err); 351 local_err = NULL; 352 r = tdx_vcpu_ioctl(first_cpu, KVM_TDX_INIT_MEM_REGION, flags, 353 ®ion, &local_err); 354 } while (r == -EAGAIN || r == -EINTR); 355 if (r < 0) { 356 error_report_err(local_err); 357 exit(1); 358 } 359 360 if (entry->type == TDVF_SECTION_TYPE_TD_HOB || 361 entry->type == TDVF_SECTION_TYPE_TEMP_MEM) { 362 qemu_ram_munmap(-1, entry->mem_ptr, entry->size); 363 entry->mem_ptr = NULL; 364 } 365 } 366 367 /* 368 * TDVF image has been copied into private region above via 369 * KVM_MEMORY_MAPPING. It becomes useless. 370 */ 371 ram_block = tdx_guest->tdvf_mr->ram_block; 372 ram_block_discard_range(ram_block, 0, ram_block->max_length); 373 374 tdx_vm_ioctl(KVM_TDX_FINALIZE_VM, 0, NULL, &error_fatal); 375 CONFIDENTIAL_GUEST_SUPPORT(tdx_guest)->ready = true; 376 } 377 378 static Notifier tdx_machine_done_notify = { 379 .notify = tdx_finalize_vm, 380 }; 381 382 /* 383 * Some CPUID bits change from fixed1 to configurable bits when TDX module 384 * supports TDX_FEATURES0.VE_REDUCTION. e.g., MCA/MCE/MTRR/CORE_CAPABILITY. 385 * 386 * To make QEMU work with all the versions of TDX module, keep the fixed1 bits 387 * here if they are ever fixed1 bits in any of the version though not fixed1 in 388 * the latest version. Otherwise, with the older version of TDX module, QEMU may 389 * treat the fixed1 bit as unsupported. 390 * 391 * For newer TDX module, it does no harm to keep them in tdx_fixed1_bits even 392 * though they changed to configurable bits. Because tdx_fixed1_bits is used to 393 * setup the supported bits. 394 */ 395 KvmCpuidInfo tdx_fixed1_bits = { 396 .cpuid.nent = 8, 397 .entries[0] = { 398 .function = 0x1, 399 .index = 0, 400 .ecx = CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_DTES64 | 401 CPUID_EXT_DSCPL | CPUID_EXT_SSSE3 | CPUID_EXT_CX16 | 402 CPUID_EXT_PDCM | CPUID_EXT_PCID | CPUID_EXT_SSE41 | 403 CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE | 404 CPUID_EXT_POPCNT | CPUID_EXT_AES | CPUID_EXT_XSAVE | 405 CPUID_EXT_RDRAND | CPUID_EXT_HYPERVISOR, 406 .edx = CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC | 407 CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | 408 CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV | 409 CPUID_PAT | CPUID_CLFLUSH | CPUID_DTS | CPUID_MMX | CPUID_FXSR | 410 CPUID_SSE | CPUID_SSE2, 411 }, 412 .entries[1] = { 413 .function = 0x6, 414 .index = 0, 415 .eax = CPUID_6_EAX_ARAT, 416 }, 417 .entries[2] = { 418 .function = 0x7, 419 .index = 0, 420 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 421 .ebx = CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_FDP_EXCPTN_ONLY | 422 CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_INVPCID | 423 CPUID_7_0_EBX_ZERO_FCS_FDS | CPUID_7_0_EBX_RDSEED | 424 CPUID_7_0_EBX_SMAP | CPUID_7_0_EBX_CLFLUSHOPT | 425 CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_SHA_NI, 426 .ecx = CPUID_7_0_ECX_BUS_LOCK_DETECT | CPUID_7_0_ECX_MOVDIRI | 427 CPUID_7_0_ECX_MOVDIR64B, 428 .edx = CPUID_7_0_EDX_MD_CLEAR | CPUID_7_0_EDX_SPEC_CTRL | 429 CPUID_7_0_EDX_STIBP | CPUID_7_0_EDX_FLUSH_L1D | 430 CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_CORE_CAPABILITY | 431 CPUID_7_0_EDX_SPEC_CTRL_SSBD, 432 }, 433 .entries[3] = { 434 .function = 0x7, 435 .index = 2, 436 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 437 .edx = CPUID_7_2_EDX_PSFD | CPUID_7_2_EDX_IPRED_CTRL | 438 CPUID_7_2_EDX_RRSBA_CTRL | CPUID_7_2_EDX_BHI_CTRL, 439 }, 440 .entries[4] = { 441 .function = 0xD, 442 .index = 0, 443 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 444 .eax = XSTATE_FP_MASK | XSTATE_SSE_MASK, 445 }, 446 .entries[5] = { 447 .function = 0xD, 448 .index = 1, 449 .flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX, 450 .eax = CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XSAVEC| 451 CPUID_XSAVE_XGETBV1 | CPUID_XSAVE_XSAVES, 452 }, 453 .entries[6] = { 454 .function = 0x80000001, 455 .index = 0, 456 .ecx = CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH, 457 /* 458 * Strictly speaking, SYSCALL is not fixed1 bit since it depends on 459 * the CPU to be in 64-bit mode. But here fixed1 is used to serve the 460 * purpose of supported bits for TDX. In this sense, SYACALL is always 461 * supported. 462 */ 463 .edx = CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB | 464 CPUID_EXT2_RDTSCP | CPUID_EXT2_LM, 465 }, 466 .entries[7] = { 467 .function = 0x80000007, 468 .index = 0, 469 .edx = CPUID_APM_INVTSC, 470 }, 471 }; 472 473 typedef struct TdxAttrsMap { 474 uint32_t attr_index; 475 uint32_t cpuid_leaf; 476 uint32_t cpuid_subleaf; 477 int cpuid_reg; 478 uint32_t feat_mask; 479 } TdxAttrsMap; 480 481 static TdxAttrsMap tdx_attrs_maps[] = { 482 {.attr_index = 27, 483 .cpuid_leaf = 7, 484 .cpuid_subleaf = 1, 485 .cpuid_reg = R_EAX, 486 .feat_mask = CPUID_7_1_EAX_LASS,}, 487 488 {.attr_index = 30, 489 .cpuid_leaf = 7, 490 .cpuid_subleaf = 0, 491 .cpuid_reg = R_ECX, 492 .feat_mask = CPUID_7_0_ECX_PKS,}, 493 494 {.attr_index = 31, 495 .cpuid_leaf = 7, 496 .cpuid_subleaf = 0, 497 .cpuid_reg = R_ECX, 498 .feat_mask = CPUID_7_0_ECX_KeyLocker,}, 499 }; 500 501 typedef struct TdxXFAMDep { 502 int xfam_bit; 503 FeatureMask feat_mask; 504 } TdxXFAMDep; 505 506 /* 507 * Note, only the CPUID bits whose virtualization type are "XFAM & Native" are 508 * defiend here. 509 * 510 * For those whose virtualization type are "XFAM & Configured & Native", they 511 * are reported as configurable bits. And they are not supported if not in the 512 * configureable bits list from KVM even if the corresponding XFAM bit is 513 * supported. 514 */ 515 TdxXFAMDep tdx_xfam_deps[] = { 516 { XSTATE_YMM_BIT, { FEAT_1_ECX, CPUID_EXT_FMA }}, 517 { XSTATE_YMM_BIT, { FEAT_7_0_EBX, CPUID_7_0_EBX_AVX2 }}, 518 { XSTATE_OPMASK_BIT, { FEAT_7_0_ECX, CPUID_7_0_ECX_AVX512_VBMI}}, 519 { XSTATE_OPMASK_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AVX512_FP16}}, 520 { XSTATE_PT_BIT, { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT}}, 521 { XSTATE_PKRU_BIT, { FEAT_7_0_ECX, CPUID_7_0_ECX_PKU}}, 522 { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_BF16 }}, 523 { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE }}, 524 { XSTATE_XTILE_CFG_BIT, { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_INT8 }}, 525 }; 526 527 static struct kvm_cpuid_entry2 *find_in_supported_entry(uint32_t function, 528 uint32_t index) 529 { 530 struct kvm_cpuid_entry2 *e; 531 532 e = cpuid_find_entry(tdx_supported_cpuid, function, index); 533 if (!e) { 534 if (tdx_supported_cpuid->nent >= KVM_MAX_CPUID_ENTRIES) { 535 error_report("tdx_supported_cpuid requries more space than %d entries", 536 KVM_MAX_CPUID_ENTRIES); 537 exit(1); 538 } 539 e = &tdx_supported_cpuid->entries[tdx_supported_cpuid->nent++]; 540 e->function = function; 541 e->index = index; 542 } 543 544 return e; 545 } 546 547 static void tdx_add_supported_cpuid_by_fixed1_bits(void) 548 { 549 struct kvm_cpuid_entry2 *e, *e1; 550 int i; 551 552 for (i = 0; i < tdx_fixed1_bits.cpuid.nent; i++) { 553 e = &tdx_fixed1_bits.entries[i]; 554 555 e1 = find_in_supported_entry(e->function, e->index); 556 e1->eax |= e->eax; 557 e1->ebx |= e->ebx; 558 e1->ecx |= e->ecx; 559 e1->edx |= e->edx; 560 } 561 } 562 563 static void tdx_add_supported_cpuid_by_attrs(void) 564 { 565 struct kvm_cpuid_entry2 *e; 566 TdxAttrsMap *map; 567 int i; 568 569 for (i = 0; i < ARRAY_SIZE(tdx_attrs_maps); i++) { 570 map = &tdx_attrs_maps[i]; 571 if (!((1ULL << map->attr_index) & tdx_caps->supported_attrs)) { 572 continue; 573 } 574 575 e = find_in_supported_entry(map->cpuid_leaf, map->cpuid_subleaf); 576 577 switch(map->cpuid_reg) { 578 case R_EAX: 579 e->eax |= map->feat_mask; 580 break; 581 case R_EBX: 582 e->ebx |= map->feat_mask; 583 break; 584 case R_ECX: 585 e->ecx |= map->feat_mask; 586 break; 587 case R_EDX: 588 e->edx |= map->feat_mask; 589 break; 590 } 591 } 592 } 593 594 static void tdx_add_supported_cpuid_by_xfam(void) 595 { 596 struct kvm_cpuid_entry2 *e; 597 int i; 598 599 const TdxXFAMDep *xfam_dep; 600 const FeatureWordInfo *f; 601 for (i = 0; i < ARRAY_SIZE(tdx_xfam_deps); i++) { 602 xfam_dep = &tdx_xfam_deps[i]; 603 if (!((1ULL << xfam_dep->xfam_bit) & tdx_caps->supported_xfam)) { 604 continue; 605 } 606 607 f = &feature_word_info[xfam_dep->feat_mask.index]; 608 if (f->type != CPUID_FEATURE_WORD) { 609 continue; 610 } 611 612 e = find_in_supported_entry(f->cpuid.eax, f->cpuid.ecx); 613 switch(f->cpuid.reg) { 614 case R_EAX: 615 e->eax |= xfam_dep->feat_mask.mask; 616 break; 617 case R_EBX: 618 e->ebx |= xfam_dep->feat_mask.mask; 619 break; 620 case R_ECX: 621 e->ecx |= xfam_dep->feat_mask.mask; 622 break; 623 case R_EDX: 624 e->edx |= xfam_dep->feat_mask.mask; 625 break; 626 } 627 } 628 629 e = find_in_supported_entry(0xd, 0); 630 e->eax |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK); 631 e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XCR0_MASK) >> 32; 632 633 e = find_in_supported_entry(0xd, 1); 634 /* 635 * Mark XFD always support for TDX, it will be cleared finally in 636 * tdx_adjust_cpuid_features() if XFD is unavailable on the hardware 637 * because in this case the original data has it as 0. 638 */ 639 e->eax |= CPUID_XSAVE_XFD; 640 e->ecx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK); 641 e->edx |= (tdx_caps->supported_xfam & CPUID_XSTATE_XSS_MASK) >> 32; 642 } 643 644 static void tdx_add_supported_kvm_features(void) 645 { 646 struct kvm_cpuid_entry2 *e; 647 648 e = find_in_supported_entry(0x40000001, 0); 649 e->eax = TDX_SUPPORTED_KVM_FEATURES; 650 } 651 652 static void tdx_setup_supported_cpuid(void) 653 { 654 if (tdx_supported_cpuid) { 655 return; 656 } 657 658 tdx_supported_cpuid = g_malloc0(sizeof(*tdx_supported_cpuid) + 659 KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2)); 660 661 memcpy(tdx_supported_cpuid->entries, tdx_caps->cpuid.entries, 662 tdx_caps->cpuid.nent * sizeof(struct kvm_cpuid_entry2)); 663 tdx_supported_cpuid->nent = tdx_caps->cpuid.nent; 664 665 tdx_add_supported_cpuid_by_fixed1_bits(); 666 tdx_add_supported_cpuid_by_attrs(); 667 tdx_add_supported_cpuid_by_xfam(); 668 669 tdx_add_supported_kvm_features(); 670 } 671 672 static int tdx_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) 673 { 674 MachineState *ms = MACHINE(qdev_get_machine()); 675 X86MachineState *x86ms = X86_MACHINE(ms); 676 TdxGuest *tdx = TDX_GUEST(cgs); 677 int r = 0; 678 679 kvm_mark_guest_state_protected(); 680 681 if (x86ms->smm == ON_OFF_AUTO_AUTO) { 682 x86ms->smm = ON_OFF_AUTO_OFF; 683 } else if (x86ms->smm == ON_OFF_AUTO_ON) { 684 error_setg(errp, "TDX VM doesn't support SMM"); 685 return -EINVAL; 686 } 687 688 if (x86ms->pic == ON_OFF_AUTO_AUTO) { 689 x86ms->pic = ON_OFF_AUTO_OFF; 690 } else if (x86ms->pic == ON_OFF_AUTO_ON) { 691 error_setg(errp, "TDX VM doesn't support PIC"); 692 return -EINVAL; 693 } 694 695 if (kvm_state->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { 696 kvm_state->kernel_irqchip_split = ON_OFF_AUTO_ON; 697 } else if (kvm_state->kernel_irqchip_split != ON_OFF_AUTO_ON) { 698 error_setg(errp, "TDX VM requires kernel_irqchip to be split"); 699 return -EINVAL; 700 } 701 702 if (!tdx_caps) { 703 r = get_tdx_capabilities(errp); 704 if (r) { 705 return r; 706 } 707 } 708 709 tdx_setup_supported_cpuid(); 710 711 /* TDX relies on KVM_HC_MAP_GPA_RANGE to handle TDG.VP.VMCALL<MapGPA> */ 712 if (!kvm_enable_hypercall(BIT_ULL(KVM_HC_MAP_GPA_RANGE))) { 713 return -EOPNOTSUPP; 714 } 715 716 /* 717 * Set kvm_readonly_mem_allowed to false, because TDX only supports readonly 718 * memory for shared memory but not for private memory. Besides, whether a 719 * memslot is private or shared is not determined by QEMU. 720 * 721 * Thus, just mark readonly memory not supported for simplicity. 722 */ 723 kvm_readonly_mem_allowed = false; 724 725 qemu_add_machine_init_done_notifier(&tdx_machine_done_notify); 726 727 tdx_guest = tdx; 728 return 0; 729 } 730 731 static int tdx_kvm_type(X86ConfidentialGuest *cg) 732 { 733 /* Do the object check */ 734 TDX_GUEST(cg); 735 736 return KVM_X86_TDX_VM; 737 } 738 739 static void tdx_cpu_instance_init(X86ConfidentialGuest *cg, CPUState *cpu) 740 { 741 X86CPU *x86cpu = X86_CPU(cpu); 742 743 object_property_set_bool(OBJECT(cpu), "pmu", false, &error_abort); 744 745 x86cpu->enable_cpuid_0x1f = true; 746 } 747 748 static uint32_t tdx_adjust_cpuid_features(X86ConfidentialGuest *cg, 749 uint32_t feature, uint32_t index, 750 int reg, uint32_t value) 751 { 752 struct kvm_cpuid_entry2 *e; 753 754 e = cpuid_find_entry(&tdx_fixed1_bits.cpuid, feature, index); 755 if (e) { 756 value |= cpuid_entry_get_reg(e, reg); 757 } 758 759 if (is_feature_word_cpuid(feature, index, reg)) { 760 e = cpuid_find_entry(tdx_supported_cpuid, feature, index); 761 if (e) { 762 value &= cpuid_entry_get_reg(e, reg); 763 } 764 } 765 766 return value; 767 } 768 769 static int tdx_validate_attributes(TdxGuest *tdx, Error **errp) 770 { 771 if ((tdx->attributes & ~tdx_caps->supported_attrs)) { 772 error_setg(errp, "Invalid attributes 0x%lx for TDX VM " 773 "(KVM supported: 0x%llx)", tdx->attributes, 774 tdx_caps->supported_attrs); 775 return -1; 776 } 777 778 if (tdx->attributes & ~TDX_SUPPORTED_TD_ATTRS) { 779 error_setg(errp, "Some QEMU unsupported TD attribute bits being " 780 "requested: 0x%lx (QEMU supported: 0x%llx)", 781 tdx->attributes, TDX_SUPPORTED_TD_ATTRS); 782 return -1; 783 } 784 785 return 0; 786 } 787 788 static int setup_td_guest_attributes(X86CPU *x86cpu, Error **errp) 789 { 790 CPUX86State *env = &x86cpu->env; 791 792 tdx_guest->attributes |= (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_PKS) ? 793 TDX_TD_ATTRIBUTES_PKS : 0; 794 tdx_guest->attributes |= x86cpu->enable_pmu ? TDX_TD_ATTRIBUTES_PERFMON : 0; 795 796 return tdx_validate_attributes(tdx_guest, errp); 797 } 798 799 static int setup_td_xfam(X86CPU *x86cpu, Error **errp) 800 { 801 CPUX86State *env = &x86cpu->env; 802 uint64_t xfam; 803 804 xfam = env->features[FEAT_XSAVE_XCR0_LO] | 805 env->features[FEAT_XSAVE_XCR0_HI] | 806 env->features[FEAT_XSAVE_XSS_LO] | 807 env->features[FEAT_XSAVE_XSS_HI]; 808 809 if (xfam & ~tdx_caps->supported_xfam) { 810 error_setg(errp, "Invalid XFAM 0x%lx for TDX VM (supported: 0x%llx))", 811 xfam, tdx_caps->supported_xfam); 812 return -1; 813 } 814 815 tdx_guest->xfam = xfam; 816 return 0; 817 } 818 819 static void tdx_filter_cpuid(struct kvm_cpuid2 *cpuids) 820 { 821 int i, dest_cnt = 0; 822 struct kvm_cpuid_entry2 *src, *dest, *conf; 823 824 for (i = 0; i < cpuids->nent; i++) { 825 src = cpuids->entries + i; 826 conf = cpuid_find_entry(&tdx_caps->cpuid, src->function, src->index); 827 if (!conf) { 828 continue; 829 } 830 dest = cpuids->entries + dest_cnt; 831 832 dest->function = src->function; 833 dest->index = src->index; 834 dest->flags = src->flags; 835 dest->eax = src->eax & conf->eax; 836 dest->ebx = src->ebx & conf->ebx; 837 dest->ecx = src->ecx & conf->ecx; 838 dest->edx = src->edx & conf->edx; 839 840 dest_cnt++; 841 } 842 cpuids->nent = dest_cnt++; 843 } 844 845 int tdx_pre_create_vcpu(CPUState *cpu, Error **errp) 846 { 847 X86CPU *x86cpu = X86_CPU(cpu); 848 CPUX86State *env = &x86cpu->env; 849 g_autofree struct kvm_tdx_init_vm *init_vm = NULL; 850 Error *local_err = NULL; 851 size_t data_len; 852 int retry = 10000; 853 int r = 0; 854 855 QEMU_LOCK_GUARD(&tdx_guest->lock); 856 if (tdx_guest->initialized) { 857 return r; 858 } 859 860 init_vm = g_malloc0(sizeof(struct kvm_tdx_init_vm) + 861 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 862 863 if (!kvm_check_extension(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS)) { 864 error_setg(errp, "KVM doesn't support KVM_CAP_X86_APIC_BUS_CYCLES_NS"); 865 return -EOPNOTSUPP; 866 } 867 868 r = kvm_vm_enable_cap(kvm_state, KVM_CAP_X86_APIC_BUS_CYCLES_NS, 869 0, TDX_APIC_BUS_CYCLES_NS); 870 if (r < 0) { 871 error_setg_errno(errp, -r, 872 "Unable to set core crystal clock frequency to 25MHz"); 873 return r; 874 } 875 876 if (env->tsc_khz && (env->tsc_khz < TDX_MIN_TSC_FREQUENCY_KHZ || 877 env->tsc_khz > TDX_MAX_TSC_FREQUENCY_KHZ)) { 878 error_setg(errp, "Invalid TSC %ld KHz, must specify cpu_frequency " 879 "between [%d, %d] kHz", env->tsc_khz, 880 TDX_MIN_TSC_FREQUENCY_KHZ, TDX_MAX_TSC_FREQUENCY_KHZ); 881 return -EINVAL; 882 } 883 884 if (env->tsc_khz % (25 * 1000)) { 885 error_setg(errp, "Invalid TSC %ld KHz, it must be multiple of 25MHz", 886 env->tsc_khz); 887 return -EINVAL; 888 } 889 890 /* it's safe even env->tsc_khz is 0. KVM uses host's tsc_khz in this case */ 891 r = kvm_vm_ioctl(kvm_state, KVM_SET_TSC_KHZ, env->tsc_khz); 892 if (r < 0) { 893 error_setg_errno(errp, -r, "Unable to set TSC frequency to %ld kHz", 894 env->tsc_khz); 895 return r; 896 } 897 898 if (tdx_guest->mrconfigid) { 899 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrconfigid, 900 strlen(tdx_guest->mrconfigid), &data_len, errp); 901 if (!data) { 902 return -1; 903 } 904 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 905 error_setg(errp, "TDX: failed to decode mrconfigid"); 906 return -1; 907 } 908 memcpy(init_vm->mrconfigid, data, data_len); 909 } 910 911 if (tdx_guest->mrowner) { 912 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrowner, 913 strlen(tdx_guest->mrowner), &data_len, errp); 914 if (!data) { 915 return -1; 916 } 917 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 918 error_setg(errp, "TDX: failed to decode mrowner"); 919 return -1; 920 } 921 memcpy(init_vm->mrowner, data, data_len); 922 } 923 924 if (tdx_guest->mrownerconfig) { 925 g_autofree uint8_t *data = qbase64_decode(tdx_guest->mrownerconfig, 926 strlen(tdx_guest->mrownerconfig), &data_len, errp); 927 if (!data) { 928 return -1; 929 } 930 if (data_len != QCRYPTO_HASH_DIGEST_LEN_SHA384) { 931 error_setg(errp, "TDX: failed to decode mrownerconfig"); 932 return -1; 933 } 934 memcpy(init_vm->mrownerconfig, data, data_len); 935 } 936 937 r = setup_td_guest_attributes(x86cpu, errp); 938 if (r) { 939 return r; 940 } 941 942 r = setup_td_xfam(x86cpu, errp); 943 if (r) { 944 return r; 945 } 946 947 init_vm->cpuid.nent = kvm_x86_build_cpuid(env, init_vm->cpuid.entries, 0); 948 tdx_filter_cpuid(&init_vm->cpuid); 949 950 init_vm->attributes = tdx_guest->attributes; 951 init_vm->xfam = tdx_guest->xfam; 952 953 /* 954 * KVM_TDX_INIT_VM gets -EAGAIN when KVM side SEAMCALL(TDH_MNG_CREATE) 955 * gets TDX_RND_NO_ENTROPY due to Random number generation (e.g., RDRAND or 956 * RDSEED) is busy. 957 * 958 * Retry for the case. 959 */ 960 do { 961 error_free(local_err); 962 local_err = NULL; 963 r = tdx_vm_ioctl(KVM_TDX_INIT_VM, 0, init_vm, &local_err); 964 } while (r == -EAGAIN && --retry); 965 966 if (r < 0) { 967 if (!retry) { 968 error_append_hint(&local_err, "Hardware RNG (Random Number " 969 "Generator) is busy occupied by someone (via RDRAND/RDSEED) " 970 "maliciously, which leads to KVM_TDX_INIT_VM keeping failure " 971 "due to lack of entropy.\n"); 972 } 973 error_propagate(errp, local_err); 974 return r; 975 } 976 977 tdx_guest->initialized = true; 978 979 return 0; 980 } 981 982 int tdx_parse_tdvf(void *flash_ptr, int size) 983 { 984 return tdvf_parse_metadata(&tdx_guest->tdvf, flash_ptr, size); 985 } 986 987 static void tdx_panicked_on_fatal_error(X86CPU *cpu, uint64_t error_code, 988 char *message, uint64_t gpa) 989 { 990 GuestPanicInformation *panic_info; 991 992 panic_info = g_new0(GuestPanicInformation, 1); 993 panic_info->type = GUEST_PANIC_INFORMATION_TYPE_TDX; 994 panic_info->u.tdx.error_code = (uint32_t) error_code; 995 panic_info->u.tdx.message = message; 996 panic_info->u.tdx.gpa = gpa; 997 998 qemu_system_guest_panicked(panic_info); 999 } 1000 1001 /* 1002 * Only 8 registers can contain valid ASCII byte stream to form the fatal 1003 * message, and their sequence is: R14, R15, RBX, RDI, RSI, R8, R9, RDX 1004 */ 1005 #define TDX_FATAL_MESSAGE_MAX 64 1006 1007 #define TDX_REPORT_FATAL_ERROR_GPA_VALID BIT_ULL(63) 1008 1009 int tdx_handle_report_fatal_error(X86CPU *cpu, struct kvm_run *run) 1010 { 1011 uint64_t error_code = run->system_event.data[R_R12]; 1012 uint64_t reg_mask = run->system_event.data[R_ECX]; 1013 char *message = NULL; 1014 uint64_t *tmp; 1015 uint64_t gpa = -1ull; 1016 1017 if (error_code & 0xffff) { 1018 error_report("TDX: REPORT_FATAL_ERROR: invalid error code: 0x%lx", 1019 error_code); 1020 return -1; 1021 } 1022 1023 if (reg_mask) { 1024 message = g_malloc0(TDX_FATAL_MESSAGE_MAX + 1); 1025 tmp = (uint64_t *)message; 1026 1027 #define COPY_REG(REG) \ 1028 do { \ 1029 if (reg_mask & BIT_ULL(REG)) { \ 1030 *(tmp++) = run->system_event.data[REG]; \ 1031 } \ 1032 } while (0) 1033 1034 COPY_REG(R_R14); 1035 COPY_REG(R_R15); 1036 COPY_REG(R_EBX); 1037 COPY_REG(R_EDI); 1038 COPY_REG(R_ESI); 1039 COPY_REG(R_R8); 1040 COPY_REG(R_R9); 1041 COPY_REG(R_EDX); 1042 *((char *)tmp) = '\0'; 1043 } 1044 #undef COPY_REG 1045 1046 if (error_code & TDX_REPORT_FATAL_ERROR_GPA_VALID) { 1047 gpa = run->system_event.data[R_R13]; 1048 } 1049 1050 tdx_panicked_on_fatal_error(cpu, error_code, message, gpa); 1051 1052 return -1; 1053 } 1054 1055 static bool tdx_guest_get_sept_ve_disable(Object *obj, Error **errp) 1056 { 1057 TdxGuest *tdx = TDX_GUEST(obj); 1058 1059 return !!(tdx->attributes & TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE); 1060 } 1061 1062 static void tdx_guest_set_sept_ve_disable(Object *obj, bool value, Error **errp) 1063 { 1064 TdxGuest *tdx = TDX_GUEST(obj); 1065 1066 if (value) { 1067 tdx->attributes |= TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 1068 } else { 1069 tdx->attributes &= ~TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 1070 } 1071 } 1072 1073 static char *tdx_guest_get_mrconfigid(Object *obj, Error **errp) 1074 { 1075 TdxGuest *tdx = TDX_GUEST(obj); 1076 1077 return g_strdup(tdx->mrconfigid); 1078 } 1079 1080 static void tdx_guest_set_mrconfigid(Object *obj, const char *value, Error **errp) 1081 { 1082 TdxGuest *tdx = TDX_GUEST(obj); 1083 1084 g_free(tdx->mrconfigid); 1085 tdx->mrconfigid = g_strdup(value); 1086 } 1087 1088 static char *tdx_guest_get_mrowner(Object *obj, Error **errp) 1089 { 1090 TdxGuest *tdx = TDX_GUEST(obj); 1091 1092 return g_strdup(tdx->mrowner); 1093 } 1094 1095 static void tdx_guest_set_mrowner(Object *obj, const char *value, Error **errp) 1096 { 1097 TdxGuest *tdx = TDX_GUEST(obj); 1098 1099 g_free(tdx->mrowner); 1100 tdx->mrowner = g_strdup(value); 1101 } 1102 1103 static char *tdx_guest_get_mrownerconfig(Object *obj, Error **errp) 1104 { 1105 TdxGuest *tdx = TDX_GUEST(obj); 1106 1107 return g_strdup(tdx->mrownerconfig); 1108 } 1109 1110 static void tdx_guest_set_mrownerconfig(Object *obj, const char *value, Error **errp) 1111 { 1112 TdxGuest *tdx = TDX_GUEST(obj); 1113 1114 g_free(tdx->mrownerconfig); 1115 tdx->mrownerconfig = g_strdup(value); 1116 } 1117 1118 /* tdx guest */ 1119 OBJECT_DEFINE_TYPE_WITH_INTERFACES(TdxGuest, 1120 tdx_guest, 1121 TDX_GUEST, 1122 X86_CONFIDENTIAL_GUEST, 1123 { TYPE_USER_CREATABLE }, 1124 { NULL }) 1125 1126 static void tdx_guest_init(Object *obj) 1127 { 1128 ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); 1129 TdxGuest *tdx = TDX_GUEST(obj); 1130 1131 qemu_mutex_init(&tdx->lock); 1132 1133 cgs->require_guest_memfd = true; 1134 tdx->attributes = TDX_TD_ATTRIBUTES_SEPT_VE_DISABLE; 1135 1136 object_property_add_uint64_ptr(obj, "attributes", &tdx->attributes, 1137 OBJ_PROP_FLAG_READWRITE); 1138 object_property_add_bool(obj, "sept-ve-disable", 1139 tdx_guest_get_sept_ve_disable, 1140 tdx_guest_set_sept_ve_disable); 1141 object_property_add_str(obj, "mrconfigid", 1142 tdx_guest_get_mrconfigid, 1143 tdx_guest_set_mrconfigid); 1144 object_property_add_str(obj, "mrowner", 1145 tdx_guest_get_mrowner, tdx_guest_set_mrowner); 1146 object_property_add_str(obj, "mrownerconfig", 1147 tdx_guest_get_mrownerconfig, 1148 tdx_guest_set_mrownerconfig); 1149 } 1150 1151 static void tdx_guest_finalize(Object *obj) 1152 { 1153 } 1154 1155 static void tdx_guest_class_init(ObjectClass *oc, const void *data) 1156 { 1157 ConfidentialGuestSupportClass *klass = CONFIDENTIAL_GUEST_SUPPORT_CLASS(oc); 1158 X86ConfidentialGuestClass *x86_klass = X86_CONFIDENTIAL_GUEST_CLASS(oc); 1159 1160 klass->kvm_init = tdx_kvm_init; 1161 x86_klass->kvm_type = tdx_kvm_type; 1162 x86_klass->cpu_instance_init = tdx_cpu_instance_init; 1163 x86_klass->adjust_cpuid_features = tdx_adjust_cpuid_features; 1164 } 1165