/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include <math.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/time.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include "standard-headers/asm-x86/kvm_para.h"
#include "hw/xen/interface/arch-x86/cpuid.h"

#include "cpu.h"
#include "host-cpu.h"
#include "vmsr_energy.h"
#include "system/system.h"
#include "system/hw_accel.h"
#include "system/kvm_int.h"
#include "system/runstate.h"
#include "kvm_i386.h"
#include "../confidential-guest.h"
#include "sev.h"
#include "xen-emu.h"
#include "hyperv.h"
#include "hyperv-proto.h"

#include "gdbstub/enums.h"
#include "qemu/host-utils.h"
#include "qemu/main-loop.h"
#include "qemu/ratelimit.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/memalign.h"
#include "hw/i386/x86.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
#include "hw/i386/apic_internal.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/intel_iommu.h"
#include "hw/i386/topology.h"
#include "hw/i386/x86-iommu.h"
#include "hw/i386/e820_memory_layout.h"

#include "hw/xen/xen.h"

#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "migration/blocker.h"
#include "exec/memattrs.h"
#include "exec/target_page.h"
#include "trace.h"

#include CONFIG_DEVICES

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
 * In order to use vm86 mode, an EPT identity map and a TSS are needed.
 * Since these must be part of guest physical memory, we need to allocate
 * them, both by setting their start addresses in the kernel and by
 * creating a corresponding e820 entry. We need 4 pages before the BIOS,
 * so this value allows up to 16M BIOSes.
92 */ 93 #define KVM_IDENTITY_BASE 0xfeffc000 94 95 /* From arch/x86/kvm/lapic.h */ 96 #define KVM_APIC_BUS_CYCLE_NS 1 97 #define KVM_APIC_BUS_FREQUENCY (1000000000ULL / KVM_APIC_BUS_CYCLE_NS) 98 99 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus 100 * 255 kvm_msr_entry structs */ 101 #define MSR_BUF_SIZE 4096 102 103 typedef bool QEMURDMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t *val); 104 typedef bool QEMUWRMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t val); 105 typedef struct { 106 uint32_t msr; 107 QEMURDMSRHandler *rdmsr; 108 QEMUWRMSRHandler *wrmsr; 109 } KVMMSRHandlers; 110 111 static void kvm_init_msrs(X86CPU *cpu); 112 static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, 113 QEMUWRMSRHandler *wrmsr); 114 115 const KVMCapabilityInfo kvm_arch_required_capabilities[] = { 116 KVM_CAP_INFO(SET_TSS_ADDR), 117 KVM_CAP_INFO(EXT_CPUID), 118 KVM_CAP_INFO(MP_STATE), 119 KVM_CAP_INFO(SIGNAL_MSI), 120 KVM_CAP_INFO(IRQ_ROUTING), 121 KVM_CAP_INFO(DEBUGREGS), 122 KVM_CAP_INFO(XSAVE), 123 KVM_CAP_INFO(VCPU_EVENTS), 124 KVM_CAP_INFO(X86_ROBUST_SINGLESTEP), 125 KVM_CAP_INFO(MCE), 126 KVM_CAP_INFO(ADJUST_CLOCK), 127 KVM_CAP_INFO(SET_IDENTITY_MAP_ADDR), 128 KVM_CAP_LAST_INFO 129 }; 130 131 static bool has_msr_star; 132 static bool has_msr_hsave_pa; 133 static bool has_msr_tsc_aux; 134 static bool has_msr_tsc_adjust; 135 static bool has_msr_tsc_deadline; 136 static bool has_msr_feature_control; 137 static bool has_msr_misc_enable; 138 static bool has_msr_smbase; 139 static bool has_msr_bndcfgs; 140 static int lm_capable_kernel; 141 static bool has_msr_hv_hypercall; 142 static bool has_msr_hv_crash; 143 static bool has_msr_hv_reset; 144 static bool has_msr_hv_vpindex; 145 static bool hv_vpindex_settable; 146 static bool has_msr_hv_runtime; 147 static bool has_msr_hv_synic; 148 static bool has_msr_hv_stimer; 149 static bool has_msr_hv_frequencies; 150 static bool has_msr_hv_reenlightenment; 151 static bool has_msr_hv_syndbg_options; 152 static bool has_msr_xss; 153 static bool has_msr_umwait; 154 static bool has_msr_spec_ctrl; 155 static bool has_tsc_scale_msr; 156 static bool has_msr_tsx_ctrl; 157 static bool has_msr_virt_ssbd; 158 static bool has_msr_smi_count; 159 static bool has_msr_arch_capabs; 160 static bool has_msr_core_capabs; 161 static bool has_msr_vmx_vmfunc; 162 static bool has_msr_ucode_rev; 163 static bool has_msr_vmx_procbased_ctls2; 164 static bool has_msr_perf_capabs; 165 static bool has_msr_pkrs; 166 static bool has_msr_hwcr; 167 168 static uint32_t has_architectural_pmu_version; 169 static uint32_t num_architectural_pmu_gp_counters; 170 static uint32_t num_architectural_pmu_fixed_counters; 171 172 static int has_xsave2; 173 static int has_xcrs; 174 static int has_sregs2; 175 static int has_exception_payload; 176 static int has_triple_fault_event; 177 178 static bool has_msr_mcg_ext_ctl; 179 180 static struct kvm_cpuid2 *cpuid_cache; 181 static struct kvm_cpuid2 *hv_cpuid_cache; 182 static struct kvm_msr_list *kvm_feature_msrs; 183 184 static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES]; 185 186 #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */ 187 static RateLimit bus_lock_ratelimit_ctrl; 188 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value); 189 190 static const char *vm_type_name[] = { 191 [KVM_X86_DEFAULT_VM] = "default", 192 [KVM_X86_SEV_VM] = "SEV", 193 [KVM_X86_SEV_ES_VM] = "SEV-ES", 194 [KVM_X86_SNP_VM] = "SEV-SNP", 195 [KVM_X86_TDX_VM] = "TDX", 196 }; 197 198 bool kvm_is_vm_type_supported(int type) 199 { 
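    /*
     * Descriptive note: KVM_CAP_VM_TYPES returns a bitmask of the VM types
     * this kernel can create, which is what the BIT(type) test below relies
     * on; e.g. a host with SEV and SEV-ES support would report
     * BIT(KVM_X86_SEV_VM) | BIT(KVM_X86_SEV_ES_VM) on top of
     * BIT(KVM_X86_DEFAULT_VM).
     */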
200 uint32_t machine_types; 201 202 /* 203 * old KVM doesn't support KVM_CAP_VM_TYPES but KVM_X86_DEFAULT_VM 204 * is always supported 205 */ 206 if (type == KVM_X86_DEFAULT_VM) { 207 return true; 208 } 209 210 machine_types = kvm_check_extension(KVM_STATE(current_machine->accelerator), 211 KVM_CAP_VM_TYPES); 212 return !!(machine_types & BIT(type)); 213 } 214 215 int kvm_get_vm_type(MachineState *ms) 216 { 217 int kvm_type = KVM_X86_DEFAULT_VM; 218 219 if (ms->cgs) { 220 if (!object_dynamic_cast(OBJECT(ms->cgs), TYPE_X86_CONFIDENTIAL_GUEST)) { 221 error_report("configuration type %s not supported for x86 guests", 222 object_get_typename(OBJECT(ms->cgs))); 223 exit(1); 224 } 225 kvm_type = x86_confidential_guest_kvm_type( 226 X86_CONFIDENTIAL_GUEST(ms->cgs)); 227 } 228 229 if (!kvm_is_vm_type_supported(kvm_type)) { 230 error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]); 231 exit(1); 232 } 233 234 return kvm_type; 235 } 236 237 bool kvm_enable_hypercall(uint64_t enable_mask) 238 { 239 KVMState *s = KVM_STATE(current_accel()); 240 241 return !kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, enable_mask); 242 } 243 244 bool kvm_has_smm(void) 245 { 246 return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM); 247 } 248 249 bool kvm_has_adjust_clock_stable(void) 250 { 251 int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK); 252 253 return (ret & KVM_CLOCK_TSC_STABLE); 254 } 255 256 bool kvm_has_exception_payload(void) 257 { 258 return has_exception_payload; 259 } 260 261 static bool kvm_x2apic_api_set_flags(uint64_t flags) 262 { 263 KVMState *s = KVM_STATE(current_accel()); 264 265 return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags); 266 } 267 268 #define MEMORIZE(fn, _result) \ 269 ({ \ 270 static bool _memorized; \ 271 \ 272 if (_memorized) { \ 273 return _result; \ 274 } \ 275 _memorized = true; \ 276 _result = fn; \ 277 }) 278 279 static bool has_x2apic_api; 280 281 bool kvm_has_x2apic_api(void) 282 { 283 return has_x2apic_api; 284 } 285 286 bool kvm_enable_x2apic(void) 287 { 288 return MEMORIZE( 289 kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS | 290 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK), 291 has_x2apic_api); 292 } 293 294 bool kvm_hv_vpindex_settable(void) 295 { 296 return hv_vpindex_settable; 297 } 298 299 static int kvm_get_tsc(CPUState *cs) 300 { 301 X86CPU *cpu = X86_CPU(cs); 302 CPUX86State *env = &cpu->env; 303 uint64_t value; 304 int ret; 305 306 if (env->tsc_valid) { 307 return 0; 308 } 309 310 env->tsc_valid = !runstate_is_running(); 311 312 ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value); 313 if (ret < 0) { 314 return ret; 315 } 316 317 env->tsc = value; 318 return 0; 319 } 320 321 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg) 322 { 323 kvm_get_tsc(cpu); 324 } 325 326 void kvm_synchronize_all_tsc(void) 327 { 328 CPUState *cpu; 329 330 if (kvm_enabled()) { 331 CPU_FOREACH(cpu) { 332 run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL); 333 } 334 } 335 } 336 337 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max) 338 { 339 struct kvm_cpuid2 *cpuid; 340 int r, size; 341 342 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries); 343 cpuid = g_malloc0(size); 344 cpuid->nent = max; 345 r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid); 346 if (r == 0 && cpuid->nent >= max) { 347 r = -E2BIG; 348 } 349 if (r < 0) { 350 if (r == -E2BIG) { 351 g_free(cpuid); 352 return NULL; 353 } else { 354 fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n", 355 strerror(-r)); 356 exit(1); 357 } 358 } 359 
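    /*
     * On success KVM has filled in cpuid->nent entries.  When we return NULL
     * for -E2BIG, the caller (get_supported_cpuid() below) retries with a
     * larger 'max'.
     */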
    return cpuid;
}

/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;

    if (cpuid_cache != NULL) {
        return cpuid_cache;
    }
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    cpuid_cache = cpuid;
    return cpuid;
}

static bool host_tsx_broken(void)
{
    int family, model, stepping;
    char vendor[CPUID_VENDOR_SZ + 1];

    host_cpu_vendor_fms(vendor, &family, &model, &stepping);

    /* Check if we are running on a Haswell host known to have broken TSX */
    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
           (family == 6) &&
           ((model == 63 && stepping < 4) ||
            model == 60 || model == 69 || model == 70);
}

/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx, unused;
    uint64_t bitmask;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
        /* KVM never reports CPUID_HT but QEMU can support it when vcpus > 1 */
        ret |= CPUID_HT;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
463 */ 464 if (kvm_irqchip_in_kernel() && 465 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) { 466 ret |= CPUID_EXT_TSC_DEADLINE_TIMER; 467 } 468 469 /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled 470 * without the in-kernel irqchip 471 */ 472 if (!kvm_irqchip_in_kernel()) { 473 ret &= ~CPUID_EXT_X2APIC; 474 } 475 476 if (enable_cpu_pm) { 477 int disable_exits = kvm_check_extension(s, 478 KVM_CAP_X86_DISABLE_EXITS); 479 480 if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) { 481 ret |= CPUID_EXT_MONITOR; 482 } 483 } 484 } else if (function == 6 && reg == R_EAX) { 485 ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */ 486 } else if (function == 7 && index == 0 && reg == R_EBX) { 487 /* Not new instructions, just an optimization. */ 488 uint32_t ebx; 489 host_cpuid(7, 0, &unused, &ebx, &unused, &unused); 490 ret |= ebx & CPUID_7_0_EBX_ERMS; 491 492 if (host_tsx_broken()) { 493 ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE); 494 } 495 } else if (function == 7 && index == 0 && reg == R_EDX) { 496 /* Not new instructions, just an optimization. */ 497 uint32_t edx; 498 host_cpuid(7, 0, &unused, &unused, &unused, &edx); 499 ret |= edx & CPUID_7_0_EDX_FSRM; 500 501 /* 502 * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts. 503 * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is 504 * returned by KVM_GET_MSR_INDEX_LIST. 505 */ 506 if (!has_msr_arch_capabs) { 507 ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; 508 } 509 } else if (function == 7 && index == 1 && reg == R_EAX) { 510 /* Not new instructions, just an optimization. */ 511 uint32_t eax; 512 host_cpuid(7, 1, &eax, &unused, &unused, &unused); 513 ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC); 514 } else if (function == 7 && index == 2 && reg == R_EDX) { 515 uint32_t edx; 516 host_cpuid(7, 2, &unused, &unused, &unused, &edx); 517 ret |= edx & CPUID_7_2_EDX_MCDT_NO; 518 } else if (function == 0xd && index == 0 && 519 (reg == R_EAX || reg == R_EDX)) { 520 /* 521 * The value returned by KVM_GET_SUPPORTED_CPUID does not include 522 * features that still have to be enabled with the arch_prctl 523 * system call. QEMU needs the full value, which is retrieved 524 * with KVM_GET_DEVICE_ATTR. 525 */ 526 struct kvm_device_attr attr = { 527 .group = 0, 528 .attr = KVM_X86_XCOMP_GUEST_SUPP, 529 .addr = (unsigned long) &bitmask 530 }; 531 532 bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES); 533 if (!sys_attr) { 534 return ret; 535 } 536 537 int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr); 538 if (rc < 0) { 539 if (rc != -ENXIO) { 540 warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) " 541 "error: %d", rc); 542 } 543 return ret; 544 } 545 ret = (reg == R_EAX) ? bitmask : bitmask >> 32; 546 } else if (function == 0x80000001 && reg == R_ECX) { 547 /* 548 * It's safe to enable TOPOEXT even if it's not returned by 549 * GET_SUPPORTED_CPUID. Unconditionally enabling TOPOEXT here allows 550 * us to keep CPU models including TOPOEXT runnable on older kernels. 
551 */ 552 ret |= CPUID_EXT3_TOPOEXT; 553 } else if (function == 0x80000001 && reg == R_EDX) { 554 /* On Intel, kvm returns cpuid according to the Intel spec, 555 * so add missing bits according to the AMD spec: 556 */ 557 cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX); 558 ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES; 559 } else if (function == 0x80000007 && reg == R_EBX) { 560 ret |= CPUID_8000_0007_EBX_OVERFLOW_RECOV | CPUID_8000_0007_EBX_SUCCOR; 561 } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) { 562 /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't 563 * be enabled without the in-kernel irqchip 564 */ 565 if (!kvm_irqchip_in_kernel()) { 566 ret &= ~CPUID_KVM_PV_UNHALT; 567 } 568 if (kvm_irqchip_is_split()) { 569 ret |= CPUID_KVM_MSI_EXT_DEST_ID; 570 } 571 } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { 572 ret |= CPUID_KVM_HINTS_REALTIME; 573 } 574 575 if (current_machine->cgs) { 576 ret = x86_confidential_guest_mask_cpuid_features( 577 X86_CONFIDENTIAL_GUEST(current_machine->cgs), 578 function, index, reg, ret); 579 } 580 return ret; 581 } 582 583 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index) 584 { 585 struct { 586 struct kvm_msrs info; 587 struct kvm_msr_entry entries[1]; 588 } msr_data = {}; 589 uint64_t value; 590 uint32_t ret, can_be_one, must_be_one; 591 592 if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */ 593 return 0; 594 } 595 596 /* Check if requested MSR is supported feature MSR */ 597 int i; 598 for (i = 0; i < kvm_feature_msrs->nmsrs; i++) 599 if (kvm_feature_msrs->indices[i] == index) { 600 break; 601 } 602 if (i == kvm_feature_msrs->nmsrs) { 603 return 0; /* if the feature MSR is not supported, simply return 0 */ 604 } 605 606 msr_data.info.nmsrs = 1; 607 msr_data.entries[0].index = index; 608 609 ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data); 610 if (ret != 1) { 611 error_report("KVM get MSR (index=0x%x) feature failed, %s", 612 index, strerror(-ret)); 613 exit(1); 614 } 615 616 value = msr_data.entries[0].data; 617 switch (index) { 618 case MSR_IA32_VMX_PROCBASED_CTLS2: 619 if (!has_msr_vmx_procbased_ctls2) { 620 /* KVM forgot to add these bits for some time, do this ourselves. */ 621 if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & 622 CPUID_XSAVE_XSAVES) { 623 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; 624 } 625 if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & 626 CPUID_EXT_RDRAND) { 627 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; 628 } 629 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 630 CPUID_7_0_EBX_INVPCID) { 631 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; 632 } 633 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 634 CPUID_7_0_EBX_RDSEED) { 635 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; 636 } 637 if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & 638 CPUID_EXT2_RDTSCP) { 639 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; 640 } 641 } 642 /* fall through */ 643 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 644 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 645 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 646 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 647 /* 648 * Return true for bits that can be one, but do not have to be one. 649 * The SDM tells us which bits could have a "must be one" setting, 650 * so we can do the opposite transformation in make_vmx_msr_value. 
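         *
         * A worked example with made-up numbers: if KVM reports
         * value = 0x0000001f00000016, then must_be_one = 0x00000016 and
         * can_be_one = 0x0000001f, so this function returns 0x00000009 --
         * exactly the control bits the guest may set or clear at will.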
651 */ 652 must_be_one = (uint32_t)value; 653 can_be_one = (uint32_t)(value >> 32); 654 return can_be_one & ~must_be_one; 655 656 default: 657 return value; 658 } 659 } 660 661 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap, 662 int *max_banks) 663 { 664 *max_banks = kvm_check_extension(s, KVM_CAP_MCE); 665 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap); 666 } 667 668 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code) 669 { 670 CPUState *cs = CPU(cpu); 671 CPUX86State *env = &cpu->env; 672 uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_MISCV | 673 MCI_STATUS_ADDRV; 674 uint64_t mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV; 675 int flags = 0; 676 677 if (!IS_AMD_CPU(env)) { 678 status |= MCI_STATUS_S | MCI_STATUS_UC; 679 if (code == BUS_MCEERR_AR) { 680 status |= MCI_STATUS_AR | 0x134; 681 mcg_status |= MCG_STATUS_EIPV; 682 } else { 683 status |= 0xc0; 684 } 685 } else { 686 if (code == BUS_MCEERR_AR) { 687 status |= MCI_STATUS_UC | MCI_STATUS_POISON; 688 mcg_status |= MCG_STATUS_EIPV; 689 } else { 690 /* Setting the POISON bit for deferred errors indicates to the 691 * guest kernel that the address provided by the MCE is valid 692 * and usable which will ensure that the guest kernel will send 693 * a SIGBUS_AO signal to the guest process. This allows for 694 * more desirable behavior in the case that the guest process 695 * with poisoned memory has set the MCE_KILL_EARLY prctl flag 696 * which indicates that the process would prefer to handle or 697 * shutdown due to the poisoned memory condition before the 698 * memory has been accessed. 699 * 700 * While the POISON bit would not be set in a deferred error 701 * sent from hardware, the bit is not meaningful for deferred 702 * errors and can be reused in this scenario. 703 */ 704 status |= MCI_STATUS_DEFERRED | MCI_STATUS_POISON; 705 } 706 } 707 708 flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0; 709 /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the 710 * guest kernel back into env->mcg_ext_ctl. 711 */ 712 cpu_synchronize_state(cs); 713 if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) { 714 mcg_status |= MCG_STATUS_LMCE; 715 flags = 0; 716 } 717 718 cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr, 719 (MCM_ADDR_PHYS << 6) | 0xc, flags); 720 } 721 722 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar) 723 { 724 MemoryFailureFlags mff = {.action_required = ar, .recursive = false}; 725 726 qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action, 727 &mff); 728 } 729 730 static void hardware_memory_error(void *host_addr) 731 { 732 emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true); 733 error_report("QEMU got Hardware memory error at addr %p", host_addr); 734 exit(1); 735 } 736 737 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) 738 { 739 X86CPU *cpu = X86_CPU(c); 740 CPUX86State *env = &cpu->env; 741 ram_addr_t ram_addr; 742 hwaddr paddr; 743 744 /* If we get an action required MCE, it has been injected by KVM 745 * while the VM was running. An action optional MCE instead should 746 * be coming from the main thread, which qemu_init_sigbus identifies 747 * as the "early kill" thread. 
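     * (BUS_MCEERR_AR means the poisoned page was synchronously consumed and
     * action is required; BUS_MCEERR_AO means the error was detected
     * asynchronously and handling is optional.)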
     */
    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);

    if ((env->mcg_cap & MCG_SER_P) && addr) {
        ram_addr = qemu_ram_addr_from_host(addr);
        if (ram_addr != RAM_ADDR_INVALID &&
            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
            kvm_hwpoison_page_add(ram_addr);
            kvm_mce_inject(cpu, paddr, code);

            /*
             * Use different logging severity based on error type.
             * If there is additional MCE reporting on the hypervisor, QEMU VA
             * could be another source to identify the PA and MCE details.
             */
            if (code == BUS_MCEERR_AR) {
                error_report("Guest MCE Memory Error at QEMU addr %p and "
                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                    addr, paddr, "BUS_MCEERR_AR");
            } else {
                warn_report("Guest MCE Memory Error at QEMU addr %p and "
                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                    addr, paddr, "BUS_MCEERR_AO");
            }

            return;
        }

        if (code == BUS_MCEERR_AO) {
            warn_report("Hardware memory error at addr %p of type %s "
                "for memory used by QEMU itself instead of guest system!",
                 addr, "BUS_MCEERR_AO");
        }
    }

    if (code == BUS_MCEERR_AR) {
        hardware_memory_error(addr);
    }

    /* Hope we are lucky for AO MCE, just notify an event */
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
}

static void kvm_queue_exception(CPUX86State *env,
                                int32_t exception_nr,
                                uint8_t exception_has_payload,
                                uint64_t exception_payload)
{
    assert(env->exception_nr == -1);
    assert(!env->exception_pending);
    assert(!env->exception_injected);
    assert(!env->exception_has_payload);

    env->exception_nr = exception_nr;

    if (has_exception_payload) {
        env->exception_pending = 1;

        env->exception_has_payload = exception_has_payload;
        env->exception_payload = exception_payload;
    } else {
        env->exception_injected = 1;

        if (exception_nr == EXCP01_DB) {
            assert(exception_has_payload);
            env->dr[6] = exception_payload;
        } else if (exception_nr == EXCP0E_PAGE) {
            assert(exception_has_payload);
            env->cr[2] = exception_payload;
        } else {
            assert(!exception_has_payload);
        }
    }
}

static void cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

unsigned long kvm_arch_vcpu_id(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    return cpu->apic_id;
}

#ifndef KVM_CPUID_SIGNATURE_NEXT
#define KVM_CPUID_SIGNATURE_NEXT       0x40000100
#endif

static bool hyperv_enabled(X86CPU *cpu)
{
    return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
        ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
         cpu->hyperv_features || cpu->hyperv_passthrough);
}

/*
 * Check whether target_freq is within conservative
 * ntp correctable bounds (250ppm) of freq
 */
static inline bool freq_within_bounds(int freq, int target_freq)
{
    int max_freq = freq + (freq * 250 / 1000000);
    int min_freq = freq - (freq * 250 / 1000000);

    if (target_freq >= min_freq && target_freq <= max_freq) {
        return true;
    }

    return false;
}

static int kvm_arch_set_tsc_khz(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int r, cur_freq;
    bool set_ioctl = false;

    if (!env->tsc_khz) {
        return 0;
    }

    cur_freq =
kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 877 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP; 878 879 /* 880 * If TSC scaling is supported, attempt to set TSC frequency. 881 */ 882 if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) { 883 set_ioctl = true; 884 } 885 886 /* 887 * If desired TSC frequency is within bounds of NTP correction, 888 * attempt to set TSC frequency. 889 */ 890 if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) { 891 set_ioctl = true; 892 } 893 894 r = set_ioctl ? 895 kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : 896 -ENOTSUP; 897 898 if (r < 0) { 899 /* When KVM_SET_TSC_KHZ fails, it's an error only if the current 900 * TSC frequency doesn't match the one we want. 901 */ 902 cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 903 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : 904 -ENOTSUP; 905 if (cur_freq <= 0 || cur_freq != env->tsc_khz) { 906 warn_report("TSC frequency mismatch between " 907 "VM (%" PRId64 " kHz) and host (%d kHz), " 908 "and TSC scaling unavailable", 909 env->tsc_khz, cur_freq); 910 return r; 911 } 912 } 913 914 return 0; 915 } 916 917 static bool tsc_is_stable_and_known(CPUX86State *env) 918 { 919 if (!env->tsc_khz) { 920 return false; 921 } 922 return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) 923 || env->user_tsc_khz; 924 } 925 926 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1) 927 928 static struct { 929 const char *desc; 930 struct { 931 uint32_t func; 932 int reg; 933 uint32_t bits; 934 } flags[2]; 935 uint64_t dependencies; 936 bool skip_passthrough; 937 } kvm_hyperv_properties[] = { 938 [HYPERV_FEAT_RELAXED] = { 939 .desc = "relaxed timing (hv-relaxed)", 940 .flags = { 941 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX, 942 .bits = HV_RELAXED_TIMING_RECOMMENDED} 943 } 944 }, 945 [HYPERV_FEAT_VAPIC] = { 946 .desc = "virtual APIC (hv-vapic)", 947 .flags = { 948 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 949 .bits = HV_APIC_ACCESS_AVAILABLE} 950 } 951 }, 952 [HYPERV_FEAT_TIME] = { 953 .desc = "clocksources (hv-time)", 954 .flags = { 955 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 956 .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE} 957 } 958 }, 959 [HYPERV_FEAT_CRASH] = { 960 .desc = "crash MSRs (hv-crash)", 961 .flags = { 962 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 963 .bits = HV_GUEST_CRASH_MSR_AVAILABLE} 964 } 965 }, 966 [HYPERV_FEAT_RESET] = { 967 .desc = "reset MSR (hv-reset)", 968 .flags = { 969 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 970 .bits = HV_RESET_AVAILABLE} 971 } 972 }, 973 [HYPERV_FEAT_VPINDEX] = { 974 .desc = "VP_INDEX MSR (hv-vpindex)", 975 .flags = { 976 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 977 .bits = HV_VP_INDEX_AVAILABLE} 978 } 979 }, 980 [HYPERV_FEAT_RUNTIME] = { 981 .desc = "VP_RUNTIME MSR (hv-runtime)", 982 .flags = { 983 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 984 .bits = HV_VP_RUNTIME_AVAILABLE} 985 } 986 }, 987 [HYPERV_FEAT_SYNIC] = { 988 .desc = "synthetic interrupt controller (hv-synic)", 989 .flags = { 990 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 991 .bits = HV_SYNIC_AVAILABLE} 992 } 993 }, 994 [HYPERV_FEAT_STIMER] = { 995 .desc = "synthetic timers (hv-stimer)", 996 .flags = { 997 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 998 .bits = HV_SYNTIMERS_AVAILABLE} 999 }, 1000 .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME) 1001 }, 1002 [HYPERV_FEAT_FREQUENCIES] = { 1003 .desc = "frequency MSRs (hv-frequencies)", 1004 .flags = { 1005 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 1006 .bits = 
HV_ACCESS_FREQUENCY_MSRS}, 1007 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 1008 .bits = HV_FREQUENCY_MSRS_AVAILABLE} 1009 } 1010 }, 1011 [HYPERV_FEAT_REENLIGHTENMENT] = { 1012 .desc = "reenlightenment MSRs (hv-reenlightenment)", 1013 .flags = { 1014 {.func = HV_CPUID_FEATURES, .reg = R_EAX, 1015 .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL} 1016 } 1017 }, 1018 [HYPERV_FEAT_TLBFLUSH] = { 1019 .desc = "paravirtualized TLB flush (hv-tlbflush)", 1020 .flags = { 1021 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX, 1022 .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED | 1023 HV_EX_PROCESSOR_MASKS_RECOMMENDED} 1024 }, 1025 .dependencies = BIT(HYPERV_FEAT_VPINDEX) 1026 }, 1027 [HYPERV_FEAT_EVMCS] = { 1028 .desc = "enlightened VMCS (hv-evmcs)", 1029 .flags = { 1030 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX, 1031 .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED} 1032 }, 1033 .dependencies = BIT(HYPERV_FEAT_VAPIC) 1034 }, 1035 [HYPERV_FEAT_IPI] = { 1036 .desc = "paravirtualized IPI (hv-ipi)", 1037 .flags = { 1038 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX, 1039 .bits = HV_CLUSTER_IPI_RECOMMENDED | 1040 HV_EX_PROCESSOR_MASKS_RECOMMENDED} 1041 }, 1042 .dependencies = BIT(HYPERV_FEAT_VPINDEX) 1043 }, 1044 [HYPERV_FEAT_STIMER_DIRECT] = { 1045 .desc = "direct mode synthetic timers (hv-stimer-direct)", 1046 .flags = { 1047 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 1048 .bits = HV_STIMER_DIRECT_MODE_AVAILABLE} 1049 }, 1050 .dependencies = BIT(HYPERV_FEAT_STIMER) 1051 }, 1052 [HYPERV_FEAT_AVIC] = { 1053 .desc = "AVIC/APICv support (hv-avic/hv-apicv)", 1054 .flags = { 1055 {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX, 1056 .bits = HV_DEPRECATING_AEOI_RECOMMENDED} 1057 } 1058 }, 1059 [HYPERV_FEAT_SYNDBG] = { 1060 .desc = "Enable synthetic kernel debugger channel (hv-syndbg)", 1061 .flags = { 1062 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 1063 .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE} 1064 }, 1065 .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED), 1066 .skip_passthrough = true, 1067 }, 1068 [HYPERV_FEAT_MSR_BITMAP] = { 1069 .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)", 1070 .flags = { 1071 {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX, 1072 .bits = HV_NESTED_MSR_BITMAP} 1073 } 1074 }, 1075 [HYPERV_FEAT_XMM_INPUT] = { 1076 .desc = "XMM fast hypercall input (hv-xmm-input)", 1077 .flags = { 1078 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 1079 .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE} 1080 } 1081 }, 1082 [HYPERV_FEAT_TLBFLUSH_EXT] = { 1083 .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)", 1084 .flags = { 1085 {.func = HV_CPUID_FEATURES, .reg = R_EDX, 1086 .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE} 1087 }, 1088 .dependencies = BIT(HYPERV_FEAT_TLBFLUSH) 1089 }, 1090 [HYPERV_FEAT_TLBFLUSH_DIRECT] = { 1091 .desc = "direct TLB flush (hv-tlbflush-direct)", 1092 .flags = { 1093 {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX, 1094 .bits = HV_NESTED_DIRECT_FLUSH} 1095 }, 1096 .dependencies = BIT(HYPERV_FEAT_VAPIC) 1097 }, 1098 }; 1099 1100 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max, 1101 bool do_sys_ioctl) 1102 { 1103 struct kvm_cpuid2 *cpuid; 1104 int r, size; 1105 1106 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries); 1107 cpuid = g_malloc0(size); 1108 cpuid->nent = max; 1109 1110 if (do_sys_ioctl) { 1111 r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1112 } else { 1113 r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 1114 } 1115 if (r == 0 && cpuid->nent >= max) { 1116 r = -E2BIG; 1117 } 1118 if (r < 0) { 1119 if (r == -E2BIG) 
{ 1120 g_free(cpuid); 1121 return NULL; 1122 } else { 1123 fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n", 1124 strerror(-r)); 1125 exit(1); 1126 } 1127 } 1128 return cpuid; 1129 } 1130 1131 /* 1132 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough 1133 * for all entries. 1134 */ 1135 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs) 1136 { 1137 struct kvm_cpuid2 *cpuid; 1138 /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */ 1139 int max = 11; 1140 int i; 1141 bool do_sys_ioctl; 1142 1143 do_sys_ioctl = 1144 kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0; 1145 1146 /* 1147 * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is 1148 * unsupported, kvm_hyperv_expand_features() checks for that. 1149 */ 1150 assert(do_sys_ioctl || cs->kvm_state); 1151 1152 /* 1153 * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with 1154 * -E2BIG, however, it doesn't report back the right size. Keep increasing 1155 * it and re-trying until we succeed. 1156 */ 1157 while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) { 1158 max++; 1159 } 1160 1161 /* 1162 * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before 1163 * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the 1164 * information early, just check for the capability and set the bit 1165 * manually. 1166 */ 1167 if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state, 1168 KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) { 1169 for (i = 0; i < cpuid->nent; i++) { 1170 if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) { 1171 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED; 1172 } 1173 } 1174 } 1175 1176 return cpuid; 1177 } 1178 1179 /* 1180 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature 1181 * leaves from KVM_CAP_HYPERV* and present MSRs data. 
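 * For instance, if the host advertises the Hyper-V crash MSRs
 * (has_msr_hv_crash), that is translated into HV_GUEST_CRASH_MSR_AVAILABLE
 * below.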
1182 */ 1183 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs) 1184 { 1185 X86CPU *cpu = X86_CPU(cs); 1186 struct kvm_cpuid2 *cpuid; 1187 struct kvm_cpuid_entry2 *entry_feat, *entry_recomm; 1188 1189 /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */ 1190 cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries)); 1191 cpuid->nent = 2; 1192 1193 /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */ 1194 entry_feat = &cpuid->entries[0]; 1195 entry_feat->function = HV_CPUID_FEATURES; 1196 1197 entry_recomm = &cpuid->entries[1]; 1198 entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO; 1199 entry_recomm->ebx = cpu->hyperv_spinlock_attempts; 1200 1201 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) { 1202 entry_feat->eax |= HV_HYPERCALL_AVAILABLE; 1203 entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE; 1204 entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE; 1205 entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED; 1206 entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED; 1207 } 1208 1209 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) { 1210 entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE; 1211 entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE; 1212 } 1213 1214 if (has_msr_hv_frequencies) { 1215 entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS; 1216 entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE; 1217 } 1218 1219 if (has_msr_hv_crash) { 1220 entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE; 1221 } 1222 1223 if (has_msr_hv_reenlightenment) { 1224 entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL; 1225 } 1226 1227 if (has_msr_hv_reset) { 1228 entry_feat->eax |= HV_RESET_AVAILABLE; 1229 } 1230 1231 if (has_msr_hv_vpindex) { 1232 entry_feat->eax |= HV_VP_INDEX_AVAILABLE; 1233 } 1234 1235 if (has_msr_hv_runtime) { 1236 entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE; 1237 } 1238 1239 if (has_msr_hv_synic) { 1240 unsigned int cap = cpu->hyperv_synic_kvm_only ? 
1241 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2; 1242 1243 if (kvm_check_extension(cs->kvm_state, cap) > 0) { 1244 entry_feat->eax |= HV_SYNIC_AVAILABLE; 1245 } 1246 } 1247 1248 if (has_msr_hv_stimer) { 1249 entry_feat->eax |= HV_SYNTIMERS_AVAILABLE; 1250 } 1251 1252 if (has_msr_hv_syndbg_options) { 1253 entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE; 1254 entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; 1255 entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED; 1256 } 1257 1258 if (kvm_check_extension(cs->kvm_state, 1259 KVM_CAP_HYPERV_TLBFLUSH) > 0) { 1260 entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED; 1261 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED; 1262 } 1263 1264 if (kvm_check_extension(cs->kvm_state, 1265 KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) { 1266 entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED; 1267 } 1268 1269 if (kvm_check_extension(cs->kvm_state, 1270 KVM_CAP_HYPERV_SEND_IPI) > 0) { 1271 entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED; 1272 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED; 1273 } 1274 1275 return cpuid; 1276 } 1277 1278 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg) 1279 { 1280 struct kvm_cpuid_entry2 *entry; 1281 struct kvm_cpuid2 *cpuid; 1282 1283 if (hv_cpuid_cache) { 1284 cpuid = hv_cpuid_cache; 1285 } else { 1286 if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) { 1287 cpuid = get_supported_hv_cpuid(cs); 1288 } else { 1289 /* 1290 * 'cs->kvm_state' may be NULL when Hyper-V features are expanded 1291 * before KVM context is created but this is only done when 1292 * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies 1293 * KVM_CAP_HYPERV_CPUID. 1294 */ 1295 assert(cs->kvm_state); 1296 1297 cpuid = get_supported_hv_cpuid_legacy(cs); 1298 } 1299 hv_cpuid_cache = cpuid; 1300 } 1301 1302 if (!cpuid) { 1303 return 0; 1304 } 1305 1306 entry = cpuid_find_entry(cpuid, func, 0); 1307 if (!entry) { 1308 return 0; 1309 } 1310 1311 return cpuid_entry_get_reg(entry, reg); 1312 } 1313 1314 static bool hyperv_feature_supported(CPUState *cs, int feature) 1315 { 1316 uint32_t func, bits; 1317 int i, reg; 1318 1319 /* 1320 * kvm_hyperv_properties needs to define at least one CPUID flag which 1321 * must be used to detect the feature, it's hard to say whether it is 1322 * supported or not otherwise. 
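     * (hv-relaxed, for example, is declared above with
     * HV_RELAXED_TIMING_RECOMMENDED in HV_CPUID_ENLIGHTMENT_INFO.EAX, and
     * that is the bit checked here.)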
     */
    assert(kvm_hyperv_properties[feature].flags[0].func);

    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {

        func = kvm_hyperv_properties[feature].flags[i].func;
        reg = kvm_hyperv_properties[feature].flags[i].reg;
        bits = kvm_hyperv_properties[feature].flags[i].bits;

        if (!func) {
            continue;
        }

        if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
            return false;
        }
    }

    return true;
}

/* Checks that all feature dependencies are enabled */
static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
{
    uint64_t deps;
    int dep_feat;

    deps = kvm_hyperv_properties[feature].dependencies;
    while (deps) {
        dep_feat = ctz64(deps);
        if (!(hyperv_feat_enabled(cpu, dep_feat))) {
            error_setg(errp, "Hyper-V %s requires Hyper-V %s",
                       kvm_hyperv_properties[feature].desc,
                       kvm_hyperv_properties[dep_feat].desc);
            return false;
        }
        deps &= ~(1ull << dep_feat);
    }

    return true;
}

static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
{
    X86CPU *cpu = X86_CPU(cs);
    uint32_t r = 0;
    int i, j;

    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
        if (!hyperv_feat_enabled(cpu, i)) {
            continue;
        }

        for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
            if (kvm_hyperv_properties[i].flags[j].func != func) {
                continue;
            }
            if (kvm_hyperv_properties[i].flags[j].reg != reg) {
                continue;
            }

            r |= kvm_hyperv_properties[i].flags[j].bits;
        }
    }

    /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
    if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
            r |= DEFAULT_EVMCS_VERSION;
        }
    }

    return r;
}

/*
 * Expand Hyper-V CPU features. In particular, check that all the requested
 * features are supported by the host and that the configuration is sane
 * (i.e. all the required dependencies are included). Also, this takes care
 * of 'hv_passthrough' mode and fills the environment with all supported
 * Hyper-V features.
 */
bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
{
    CPUState *cs = CPU(cpu);
    Error *local_err = NULL;
    int feat;

    if (!hyperv_enabled(cpu))
        return true;

    /*
     * When kvm_hyperv_expand_features is called at CPU feature expansion
     * time per-CPU kvm_state is not available yet so we can only proceed
     * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1418 */ 1419 if (!cs->kvm_state && 1420 !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID)) 1421 return true; 1422 1423 if (cpu->hyperv_passthrough) { 1424 cpu->hyperv_vendor_id[0] = 1425 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX); 1426 cpu->hyperv_vendor_id[1] = 1427 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX); 1428 cpu->hyperv_vendor_id[2] = 1429 hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX); 1430 cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor, 1431 sizeof(cpu->hyperv_vendor_id) + 1); 1432 memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id, 1433 sizeof(cpu->hyperv_vendor_id)); 1434 cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0; 1435 1436 cpu->hyperv_interface_id[0] = 1437 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX); 1438 cpu->hyperv_interface_id[1] = 1439 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX); 1440 cpu->hyperv_interface_id[2] = 1441 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX); 1442 cpu->hyperv_interface_id[3] = 1443 hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX); 1444 1445 cpu->hyperv_ver_id_build = 1446 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX); 1447 cpu->hyperv_ver_id_major = 1448 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16; 1449 cpu->hyperv_ver_id_minor = 1450 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff; 1451 cpu->hyperv_ver_id_sp = 1452 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX); 1453 cpu->hyperv_ver_id_sb = 1454 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24; 1455 cpu->hyperv_ver_id_sn = 1456 hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff; 1457 1458 cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, 1459 R_EAX); 1460 cpu->hyperv_limits[0] = 1461 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX); 1462 cpu->hyperv_limits[1] = 1463 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX); 1464 cpu->hyperv_limits[2] = 1465 hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX); 1466 1467 cpu->hyperv_spinlock_attempts = 1468 hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX); 1469 1470 /* 1471 * Mark feature as enabled in 'cpu->hyperv_features' as 1472 * hv_build_cpuid_leaf() uses this info to build guest CPUIDs. 1473 */ 1474 for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) { 1475 if (hyperv_feature_supported(cs, feat) && 1476 !kvm_hyperv_properties[feat].skip_passthrough) { 1477 cpu->hyperv_features |= BIT(feat); 1478 } 1479 } 1480 } else { 1481 /* Check features availability and dependencies */ 1482 for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) { 1483 /* If the feature was not requested skip it. 
*/ 1484 if (!hyperv_feat_enabled(cpu, feat)) { 1485 continue; 1486 } 1487 1488 /* Check if the feature is supported by KVM */ 1489 if (!hyperv_feature_supported(cs, feat)) { 1490 error_setg(errp, "Hyper-V %s is not supported by kernel", 1491 kvm_hyperv_properties[feat].desc); 1492 return false; 1493 } 1494 1495 /* Check dependencies */ 1496 if (!hv_feature_check_deps(cpu, feat, &local_err)) { 1497 error_propagate(errp, local_err); 1498 return false; 1499 } 1500 } 1501 } 1502 1503 /* Additional dependencies not covered by kvm_hyperv_properties[] */ 1504 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) && 1505 !cpu->hyperv_synic_kvm_only && 1506 !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) { 1507 error_setg(errp, "Hyper-V %s requires Hyper-V %s", 1508 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc, 1509 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc); 1510 return false; 1511 } 1512 1513 return true; 1514 } 1515 1516 /* 1517 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent. 1518 */ 1519 static int hyperv_fill_cpuids(CPUState *cs, 1520 struct kvm_cpuid_entry2 *cpuid_ent) 1521 { 1522 X86CPU *cpu = X86_CPU(cs); 1523 struct kvm_cpuid_entry2 *c; 1524 uint32_t signature[3]; 1525 uint32_t cpuid_i = 0, max_cpuid_leaf = 0; 1526 uint32_t nested_eax = 1527 hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX); 1528 1529 max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES : 1530 HV_CPUID_IMPLEMENT_LIMITS; 1531 1532 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) { 1533 max_cpuid_leaf = 1534 MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES); 1535 } 1536 1537 c = &cpuid_ent[cpuid_i++]; 1538 c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS; 1539 c->eax = max_cpuid_leaf; 1540 c->ebx = cpu->hyperv_vendor_id[0]; 1541 c->ecx = cpu->hyperv_vendor_id[1]; 1542 c->edx = cpu->hyperv_vendor_id[2]; 1543 1544 c = &cpuid_ent[cpuid_i++]; 1545 c->function = HV_CPUID_INTERFACE; 1546 c->eax = cpu->hyperv_interface_id[0]; 1547 c->ebx = cpu->hyperv_interface_id[1]; 1548 c->ecx = cpu->hyperv_interface_id[2]; 1549 c->edx = cpu->hyperv_interface_id[3]; 1550 1551 c = &cpuid_ent[cpuid_i++]; 1552 c->function = HV_CPUID_VERSION; 1553 c->eax = cpu->hyperv_ver_id_build; 1554 c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 | 1555 cpu->hyperv_ver_id_minor; 1556 c->ecx = cpu->hyperv_ver_id_sp; 1557 c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 | 1558 (cpu->hyperv_ver_id_sn & 0xffffff); 1559 1560 c = &cpuid_ent[cpuid_i++]; 1561 c->function = HV_CPUID_FEATURES; 1562 c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX); 1563 c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX); 1564 c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX); 1565 1566 /* Unconditionally required with any Hyper-V enlightenment */ 1567 c->eax |= HV_HYPERCALL_AVAILABLE; 1568 1569 /* SynIC and Vmbus devices require messages/signals hypercalls */ 1570 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) && 1571 !cpu->hyperv_synic_kvm_only) { 1572 c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS; 1573 } 1574 1575 1576 /* Not exposed by KVM but needed to make CPU hotplug in Windows work */ 1577 c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE; 1578 1579 c = &cpuid_ent[cpuid_i++]; 1580 c->function = HV_CPUID_ENLIGHTMENT_INFO; 1581 c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX); 1582 c->ebx = cpu->hyperv_spinlock_attempts; 1583 1584 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) && 1585 !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) { 1586 c->eax |= HV_APIC_ACCESS_RECOMMENDED; 1587 } 1588 1589 if 
(cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) { 1590 c->eax |= HV_NO_NONARCH_CORESHARING; 1591 } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) { 1592 c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) & 1593 HV_NO_NONARCH_CORESHARING; 1594 } 1595 1596 c = &cpuid_ent[cpuid_i++]; 1597 c->function = HV_CPUID_IMPLEMENT_LIMITS; 1598 c->eax = cpu->hv_max_vps; 1599 c->ebx = cpu->hyperv_limits[0]; 1600 c->ecx = cpu->hyperv_limits[1]; 1601 c->edx = cpu->hyperv_limits[2]; 1602 1603 if (nested_eax) { 1604 uint32_t function; 1605 1606 /* Create zeroed 0x40000006..0x40000009 leaves */ 1607 for (function = HV_CPUID_IMPLEMENT_LIMITS + 1; 1608 function < HV_CPUID_NESTED_FEATURES; function++) { 1609 c = &cpuid_ent[cpuid_i++]; 1610 c->function = function; 1611 } 1612 1613 c = &cpuid_ent[cpuid_i++]; 1614 c->function = HV_CPUID_NESTED_FEATURES; 1615 c->eax = nested_eax; 1616 } 1617 1618 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) { 1619 c = &cpuid_ent[cpuid_i++]; 1620 c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS; 1621 c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ? 1622 HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS; 1623 memcpy(signature, "Microsoft VS", 12); 1624 c->eax = 0; 1625 c->ebx = signature[0]; 1626 c->ecx = signature[1]; 1627 c->edx = signature[2]; 1628 1629 c = &cpuid_ent[cpuid_i++]; 1630 c->function = HV_CPUID_SYNDBG_INTERFACE; 1631 memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); 1632 c->eax = signature[0]; 1633 c->ebx = 0; 1634 c->ecx = 0; 1635 c->edx = 0; 1636 1637 c = &cpuid_ent[cpuid_i++]; 1638 c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; 1639 c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; 1640 c->ebx = 0; 1641 c->ecx = 0; 1642 c->edx = 0; 1643 } 1644 1645 return cpuid_i; 1646 } 1647 1648 static Error *hv_passthrough_mig_blocker; 1649 static Error *hv_no_nonarch_cs_mig_blocker; 1650 1651 /* Checks that the exposed eVMCS version range is supported by KVM */ 1652 static bool evmcs_version_supported(uint16_t evmcs_version, 1653 uint16_t supported_evmcs_version) 1654 { 1655 uint8_t min_version = evmcs_version & 0xff; 1656 uint8_t max_version = evmcs_version >> 8; 1657 uint8_t min_supported_version = supported_evmcs_version & 0xff; 1658 uint8_t max_supported_version = supported_evmcs_version >> 8; 1659 1660 return (min_version >= min_supported_version) && 1661 (max_version <= max_supported_version); 1662 } 1663 1664 static int hyperv_init_vcpu(X86CPU *cpu) 1665 { 1666 CPUState *cs = CPU(cpu); 1667 Error *local_err = NULL; 1668 int ret; 1669 1670 if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) { 1671 error_setg(&hv_passthrough_mig_blocker, 1672 "'hv-passthrough' CPU flag prevents migration, use explicit" 1673 " set of hv-* flags instead"); 1674 ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err); 1675 if (ret < 0) { 1676 error_report_err(local_err); 1677 return ret; 1678 } 1679 } 1680 1681 if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO && 1682 hv_no_nonarch_cs_mig_blocker == NULL) { 1683 error_setg(&hv_no_nonarch_cs_mig_blocker, 1684 "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration" 1685 " use explicit 'hv-no-nonarch-coresharing=on' instead (but" 1686 " make sure SMT is disabled and/or that vCPUs are properly" 1687 " pinned)"); 1688 ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err); 1689 if (ret < 0) { 1690 error_report_err(local_err); 1691 return ret; 1692 } 1693 } 1694 1695 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) { 1696 /* 1697 * the 
kernel doesn't support setting vp_index; assert that its value 1698 * is in sync 1699 */ 1700 uint64_t value; 1701 1702 ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value); 1703 if (ret < 0) { 1704 return ret; 1705 } 1706 1707 if (value != hyperv_vp_index(CPU(cpu))) { 1708 error_report("kernel's vp_index != QEMU's vp_index"); 1709 return -ENXIO; 1710 } 1711 } 1712 1713 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 1714 uint32_t synic_cap = cpu->hyperv_synic_kvm_only ? 1715 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2; 1716 ret = kvm_vcpu_enable_cap(cs, synic_cap, 0); 1717 if (ret < 0) { 1718 error_report("failed to turn on HyperV SynIC in KVM: %s", 1719 strerror(-ret)); 1720 return ret; 1721 } 1722 1723 if (!cpu->hyperv_synic_kvm_only) { 1724 ret = hyperv_x86_synic_add(cpu); 1725 if (ret < 0) { 1726 error_report("failed to create HyperV SynIC: %s", 1727 strerror(-ret)); 1728 return ret; 1729 } 1730 } 1731 } 1732 1733 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) { 1734 uint16_t evmcs_version = DEFAULT_EVMCS_VERSION; 1735 uint16_t supported_evmcs_version; 1736 1737 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0, 1738 (uintptr_t)&supported_evmcs_version); 1739 1740 /* 1741 * KVM is required to support EVMCS ver.1. as that's what 'hv-evmcs' 1742 * option sets. Note: we hardcode the maximum supported eVMCS version 1743 * to '1' as well so 'hv-evmcs' feature is migratable even when (and if) 1744 * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have 1745 * to be added. 1746 */ 1747 if (ret < 0) { 1748 error_report("Hyper-V %s is not supported by kernel", 1749 kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc); 1750 return ret; 1751 } 1752 1753 if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) { 1754 error_report("eVMCS version range [%d..%d] is not supported by " 1755 "kernel (supported: [%d..%d])", evmcs_version & 0xff, 1756 evmcs_version >> 8, supported_evmcs_version & 0xff, 1757 supported_evmcs_version >> 8); 1758 return -ENOTSUP; 1759 } 1760 } 1761 1762 if (cpu->hyperv_enforce_cpuid) { 1763 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1); 1764 if (ret < 0) { 1765 error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s", 1766 strerror(-ret)); 1767 return ret; 1768 } 1769 } 1770 1771 /* Skip SynIC and VP_INDEX since they are hard deps already */ 1772 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_STIMER) && 1773 hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) && 1774 hyperv_feat_enabled(cpu, HYPERV_FEAT_RUNTIME)) { 1775 hyperv_x86_set_vmbus_recommended_features_enabled(); 1776 } 1777 1778 return 0; 1779 } 1780 1781 static Error *invtsc_mig_blocker; 1782 1783 static void kvm_init_xsave(CPUX86State *env) 1784 { 1785 if (has_xsave2) { 1786 env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096); 1787 } else { 1788 env->xsave_buf_len = sizeof(struct kvm_xsave); 1789 } 1790 1791 env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len); 1792 memset(env->xsave_buf, 0, env->xsave_buf_len); 1793 /* 1794 * The allocated storage must be large enough for all of the 1795 * possible XSAVE state components. 
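     * (CPUID leaf 0xD, sub-leaf 0, ECX reports the maximum XSAVE area size
     * needed for every state component the host supports; the assert below
     * checks that the buffer allocated above is at least that large.)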
1796 */ 1797 assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <= 1798 env->xsave_buf_len); 1799 } 1800 1801 static void kvm_init_nested_state(CPUX86State *env) 1802 { 1803 struct kvm_vmx_nested_state_hdr *vmx_hdr; 1804 uint32_t size; 1805 1806 if (!env->nested_state) { 1807 return; 1808 } 1809 1810 size = env->nested_state->size; 1811 1812 memset(env->nested_state, 0, size); 1813 env->nested_state->size = size; 1814 1815 if (cpu_has_vmx(env)) { 1816 env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; 1817 vmx_hdr = &env->nested_state->hdr.vmx; 1818 vmx_hdr->vmxon_pa = -1ull; 1819 vmx_hdr->vmcs12_pa = -1ull; 1820 } else if (cpu_has_svm(env)) { 1821 env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; 1822 } 1823 } 1824 1825 static uint32_t kvm_x86_build_cpuid(CPUX86State *env, 1826 struct kvm_cpuid_entry2 *entries, 1827 uint32_t cpuid_i) 1828 { 1829 uint32_t limit, i, j; 1830 uint32_t unused; 1831 struct kvm_cpuid_entry2 *c; 1832 1833 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused); 1834 1835 for (i = 0; i <= limit; i++) { 1836 j = 0; 1837 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1838 goto full; 1839 } 1840 c = &entries[cpuid_i++]; 1841 switch (i) { 1842 case 2: { 1843 /* Keep reading function 2 till all the input is received */ 1844 int times; 1845 1846 c->function = i; 1847 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1848 times = c->eax & 0xff; 1849 if (times > 1) { 1850 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | 1851 KVM_CPUID_FLAG_STATE_READ_NEXT; 1852 } 1853 1854 for (j = 1; j < times; ++j) { 1855 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1856 goto full; 1857 } 1858 c = &entries[cpuid_i++]; 1859 c->function = i; 1860 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC; 1861 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1862 } 1863 break; 1864 } 1865 case 0x1f: 1866 if (!x86_has_extended_topo(env->avail_cpu_topo)) { 1867 cpuid_i--; 1868 break; 1869 } 1870 /* fallthrough */ 1871 case 4: 1872 case 0xb: 1873 case 0xd: 1874 for (j = 0; ; j++) { 1875 c->function = i; 1876 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1877 c->index = j; 1878 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); 1879 1880 if (i == 4 && c->eax == 0) { 1881 break; 1882 } 1883 if (i == 0xb && !(c->ecx & 0xff00)) { 1884 break; 1885 } 1886 if (i == 0x1f && !(c->ecx & 0xff00)) { 1887 break; 1888 } 1889 if (i == 0xd && c->eax == 0) { 1890 if (j < 63) { 1891 continue; 1892 } else { 1893 cpuid_i--; 1894 break; 1895 } 1896 } 1897 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1898 goto full; 1899 } 1900 c = &entries[cpuid_i++]; 1901 } 1902 break; 1903 case 0x12: 1904 for (j = 0; ; j++) { 1905 c->function = i; 1906 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1907 c->index = j; 1908 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); 1909 1910 if (j > 1 && (c->eax & 0xf) != 1) { 1911 break; 1912 } 1913 1914 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1915 goto full; 1916 } 1917 c = &entries[cpuid_i++]; 1918 } 1919 break; 1920 case 0x7: 1921 case 0x14: 1922 case 0x1d: 1923 case 0x1e: 1924 case 0x24: { 1925 uint32_t times; 1926 1927 c->function = i; 1928 c->index = 0; 1929 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1930 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1931 times = c->eax; 1932 1933 for (j = 1; j <= times; ++j) { 1934 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1935 goto full; 1936 } 1937 c = &entries[cpuid_i++]; 1938 c->function = i; 1939 c->index = j; 1940 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1941 cpu_x86_cpuid(env, i, j, &c->eax, 
&c->ebx, &c->ecx, &c->edx); 1942 } 1943 break; 1944 } 1945 default: 1946 c->function = i; 1947 c->flags = 0; 1948 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1949 if (!c->eax && !c->ebx && !c->ecx && !c->edx) { 1950 /* 1951 * KVM already returns all zeroes if a CPUID entry is missing, 1952 * so we can omit it and avoid hitting KVM's 80-entry limit. 1953 */ 1954 cpuid_i--; 1955 } 1956 break; 1957 } 1958 } 1959 1960 if (limit >= 0x0a) { 1961 uint32_t eax, edx; 1962 1963 cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx); 1964 1965 has_architectural_pmu_version = eax & 0xff; 1966 if (has_architectural_pmu_version > 0) { 1967 num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8; 1968 1969 /* Shouldn't be more than 32, since that's the number of bits 1970 * available in EBX to tell us _which_ counters are available. 1971 * Play it safe. 1972 */ 1973 if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) { 1974 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS; 1975 } 1976 1977 if (has_architectural_pmu_version > 1) { 1978 num_architectural_pmu_fixed_counters = edx & 0x1f; 1979 1980 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) { 1981 num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS; 1982 } 1983 } 1984 } 1985 } 1986 1987 cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused); 1988 1989 for (i = 0x80000000; i <= limit; i++) { 1990 j = 0; 1991 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1992 goto full; 1993 } 1994 c = &entries[cpuid_i++]; 1995 1996 switch (i) { 1997 case 0x8000001d: 1998 /* Query for all AMD cache information leaves */ 1999 for (j = 0; ; j++) { 2000 c->function = i; 2001 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2002 c->index = j; 2003 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); 2004 2005 if (c->eax == 0) { 2006 break; 2007 } 2008 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 2009 goto full; 2010 } 2011 c = &entries[cpuid_i++]; 2012 } 2013 break; 2014 default: 2015 c->function = i; 2016 c->flags = 0; 2017 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 2018 if (!c->eax && !c->ebx && !c->ecx && !c->edx) { 2019 /* 2020 * KVM already returns all zeroes if a CPUID entry is missing, 2021 * so we can omit it and avoid hitting KVM's 80-entry limit. 2022 */ 2023 cpuid_i--; 2024 } 2025 break; 2026 } 2027 } 2028 2029 /* Call Centaur's CPUID instructions they are supported. */ 2030 if (env->cpuid_xlevel2 > 0) { 2031 cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused); 2032 2033 for (i = 0xC0000000; i <= limit; i++) { 2034 j = 0; 2035 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 2036 goto full; 2037 } 2038 c = &entries[cpuid_i++]; 2039 2040 c->function = i; 2041 c->flags = 0; 2042 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 2043 } 2044 } 2045 2046 return cpuid_i; 2047 2048 full: 2049 fprintf(stderr, "cpuid_data is full, no space for " 2050 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j); 2051 abort(); 2052 } 2053 2054 int kvm_arch_pre_create_vcpu(CPUState *cpu, Error **errp) 2055 { 2056 return 0; 2057 } 2058 2059 int kvm_arch_init_vcpu(CPUState *cs) 2060 { 2061 struct { 2062 struct kvm_cpuid2 cpuid; 2063 struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES]; 2064 } cpuid_data; 2065 /* 2066 * The kernel defines these structs with padding fields so there 2067 * should be no extra padding in our cpuid_data struct. 
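     * The QEMU_BUILD_BUG_ON below turns any unexpected padding into a
     * compile-time failure instead of silently handing a mis-sized
     * buffer to KVM_SET_CPUID2.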
2068 */ 2069 QEMU_BUILD_BUG_ON(sizeof(cpuid_data) != 2070 sizeof(struct kvm_cpuid2) + 2071 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 2072 2073 X86CPU *cpu = X86_CPU(cs); 2074 CPUX86State *env = &cpu->env; 2075 uint32_t cpuid_i; 2076 struct kvm_cpuid_entry2 *c; 2077 uint32_t signature[3]; 2078 int kvm_base = KVM_CPUID_SIGNATURE; 2079 int max_nested_state_len; 2080 int r; 2081 Error *local_err = NULL; 2082 2083 memset(&cpuid_data, 0, sizeof(cpuid_data)); 2084 2085 cpuid_i = 0; 2086 2087 has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2); 2088 2089 r = kvm_arch_set_tsc_khz(cs); 2090 if (r < 0) { 2091 return r; 2092 } 2093 2094 /* vcpu's TSC frequency is either specified by user, or following 2095 * the value used by KVM if the former is not present. In the 2096 * latter case, we query it from KVM and record in env->tsc_khz, 2097 * so that vcpu's TSC frequency can be migrated later via this field. 2098 */ 2099 if (!env->tsc_khz) { 2100 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 2101 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : 2102 -ENOTSUP; 2103 if (r > 0) { 2104 env->tsc_khz = r; 2105 } 2106 } 2107 2108 env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY; 2109 2110 /* 2111 * kvm_hyperv_expand_features() is called here for the second time in case 2112 * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle 2113 * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to 2114 * check which Hyper-V enlightenments are supported and which are not, we 2115 * can still proceed and check/expand Hyper-V enlightenments here so legacy 2116 * behavior is preserved. 2117 */ 2118 if (!kvm_hyperv_expand_features(cpu, &local_err)) { 2119 error_report_err(local_err); 2120 return -ENOSYS; 2121 } 2122 2123 if (hyperv_enabled(cpu)) { 2124 r = hyperv_init_vcpu(cpu); 2125 if (r) { 2126 return r; 2127 } 2128 2129 cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries); 2130 kvm_base = KVM_CPUID_SIGNATURE_NEXT; 2131 has_msr_hv_hypercall = true; 2132 } 2133 2134 if (cs->kvm_state->xen_version) { 2135 #ifdef CONFIG_XEN_EMU 2136 struct kvm_cpuid_entry2 *xen_max_leaf; 2137 2138 memcpy(signature, "XenVMMXenVMM", 12); 2139 2140 xen_max_leaf = c = &cpuid_data.entries[cpuid_i++]; 2141 c->function = kvm_base + XEN_CPUID_SIGNATURE; 2142 c->eax = kvm_base + XEN_CPUID_TIME; 2143 c->ebx = signature[0]; 2144 c->ecx = signature[1]; 2145 c->edx = signature[2]; 2146 2147 c = &cpuid_data.entries[cpuid_i++]; 2148 c->function = kvm_base + XEN_CPUID_VENDOR; 2149 c->eax = cs->kvm_state->xen_version; 2150 c->ebx = 0; 2151 c->ecx = 0; 2152 c->edx = 0; 2153 2154 c = &cpuid_data.entries[cpuid_i++]; 2155 c->function = kvm_base + XEN_CPUID_HVM_MSR; 2156 /* Number of hypercall-transfer pages */ 2157 c->eax = 1; 2158 /* Hypercall MSR base address */ 2159 if (hyperv_enabled(cpu)) { 2160 c->ebx = XEN_HYPERCALL_MSR_HYPERV; 2161 kvm_xen_init(cs->kvm_state, c->ebx); 2162 } else { 2163 c->ebx = XEN_HYPERCALL_MSR; 2164 } 2165 c->ecx = 0; 2166 c->edx = 0; 2167 2168 c = &cpuid_data.entries[cpuid_i++]; 2169 c->function = kvm_base + XEN_CPUID_TIME; 2170 c->eax = ((!!tsc_is_stable_and_known(env) << 1) | 2171 (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2)); 2172 /* default=0 (emulate if necessary) */ 2173 c->ebx = 0; 2174 /* guest tsc frequency */ 2175 c->ecx = env->user_tsc_khz; 2176 /* guest tsc incarnation (migration count) */ 2177 c->edx = 0; 2178 2179 c = &cpuid_data.entries[cpuid_i++]; 2180 c->function = kvm_base + XEN_CPUID_HVM; 2181 xen_max_leaf->eax = kvm_base + 
XEN_CPUID_HVM; 2182 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) { 2183 c->function = kvm_base + XEN_CPUID_HVM; 2184 2185 if (cpu->xen_vapic) { 2186 c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT; 2187 c->eax |= XEN_HVM_CPUID_X2APIC_VIRT; 2188 } 2189 2190 c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS; 2191 2192 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) { 2193 c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT; 2194 c->ebx = cs->cpu_index; 2195 } 2196 2197 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) { 2198 c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR; 2199 } 2200 } 2201 2202 r = kvm_xen_init_vcpu(cs); 2203 if (r) { 2204 return r; 2205 } 2206 2207 kvm_base += 0x100; 2208 #else /* CONFIG_XEN_EMU */ 2209 /* This should never happen as kvm_arch_init() would have died first. */ 2210 fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n"); 2211 abort(); 2212 #endif 2213 } else if (cpu->expose_kvm) { 2214 memcpy(signature, "KVMKVMKVM\0\0\0", 12); 2215 c = &cpuid_data.entries[cpuid_i++]; 2216 c->function = KVM_CPUID_SIGNATURE | kvm_base; 2217 c->eax = KVM_CPUID_FEATURES | kvm_base; 2218 c->ebx = signature[0]; 2219 c->ecx = signature[1]; 2220 c->edx = signature[2]; 2221 2222 c = &cpuid_data.entries[cpuid_i++]; 2223 c->function = KVM_CPUID_FEATURES | kvm_base; 2224 c->eax = env->features[FEAT_KVM]; 2225 c->edx = env->features[FEAT_KVM_HINTS]; 2226 } 2227 2228 if (cpu->kvm_pv_enforce_cpuid) { 2229 r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1); 2230 if (r < 0) { 2231 fprintf(stderr, 2232 "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s", 2233 strerror(-r)); 2234 abort(); 2235 } 2236 } 2237 2238 cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i); 2239 cpuid_data.cpuid.nent = cpuid_i; 2240 2241 if (((env->cpuid_version >> 8)&0xF) >= 6 2242 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) == 2243 (CPUID_MCE | CPUID_MCA)) { 2244 uint64_t mcg_cap, unsupported_caps; 2245 int banks; 2246 int ret; 2247 2248 ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks); 2249 if (ret < 0) { 2250 fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret)); 2251 return ret; 2252 } 2253 2254 if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) { 2255 error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)", 2256 (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks); 2257 return -ENOTSUP; 2258 } 2259 2260 unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK); 2261 if (unsupported_caps) { 2262 if (unsupported_caps & MCG_LMCE_P) { 2263 error_report("kvm: LMCE not supported"); 2264 return -ENOTSUP; 2265 } 2266 warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64, 2267 unsupported_caps); 2268 } 2269 2270 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK; 2271 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap); 2272 if (ret < 0) { 2273 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); 2274 return ret; 2275 } 2276 } 2277 2278 cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env); 2279 2280 c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0); 2281 if (c) { 2282 has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) || 2283 !!(c->ecx & CPUID_EXT_SMX); 2284 } 2285 2286 c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0); 2287 if (c && (c->ebx & CPUID_7_0_EBX_SGX)) { 2288 has_msr_feature_control = true; 2289 } 2290 2291 if (env->mcg_cap & MCG_LMCE_P) { 2292 has_msr_mcg_ext_ctl = has_msr_feature_control = true; 2293 } 2294 2295 if (!env->user_tsc_khz) { 2296 if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) && 2297 
invtsc_mig_blocker == NULL) { 2298 error_setg(&invtsc_mig_blocker, 2299 "State blocked by non-migratable CPU device" 2300 " (invtsc flag)"); 2301 r = migrate_add_blocker(&invtsc_mig_blocker, &local_err); 2302 if (r < 0) { 2303 error_report_err(local_err); 2304 return r; 2305 } 2306 } 2307 } 2308 2309 if (cpu->vmware_cpuid_freq 2310 /* Guests depend on 0x40000000 to detect this feature, so only expose 2311 * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */ 2312 && cpu->expose_kvm 2313 && kvm_base == KVM_CPUID_SIGNATURE 2314 /* TSC clock must be stable and known for this feature. */ 2315 && tsc_is_stable_and_known(env)) { 2316 2317 c = &cpuid_data.entries[cpuid_i++]; 2318 c->function = KVM_CPUID_SIGNATURE | 0x10; 2319 c->eax = env->tsc_khz; 2320 c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */ 2321 c->ecx = c->edx = 0; 2322 2323 c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0); 2324 c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10); 2325 } 2326 2327 cpuid_data.cpuid.nent = cpuid_i; 2328 2329 cpuid_data.cpuid.padding = 0; 2330 r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data); 2331 if (r) { 2332 goto fail; 2333 } 2334 kvm_init_xsave(env); 2335 2336 max_nested_state_len = kvm_max_nested_state_length(); 2337 if (max_nested_state_len > 0) { 2338 assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data)); 2339 2340 if (cpu_has_vmx(env) || cpu_has_svm(env)) { 2341 env->nested_state = g_malloc0(max_nested_state_len); 2342 env->nested_state->size = max_nested_state_len; 2343 2344 kvm_init_nested_state(env); 2345 } 2346 } 2347 2348 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE); 2349 2350 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { 2351 has_msr_tsc_aux = false; 2352 } 2353 2354 kvm_init_msrs(cpu); 2355 2356 return 0; 2357 2358 fail: 2359 migrate_del_blocker(&invtsc_mig_blocker); 2360 2361 return r; 2362 } 2363 2364 int kvm_arch_destroy_vcpu(CPUState *cs) 2365 { 2366 X86CPU *cpu = X86_CPU(cs); 2367 CPUX86State *env = &cpu->env; 2368 2369 g_free(env->xsave_buf); 2370 2371 g_free(cpu->kvm_msr_buf); 2372 cpu->kvm_msr_buf = NULL; 2373 2374 g_free(env->nested_state); 2375 env->nested_state = NULL; 2376 2377 qemu_del_vm_change_state_handler(cpu->vmsentry); 2378 2379 return 0; 2380 } 2381 2382 void kvm_arch_reset_vcpu(X86CPU *cpu) 2383 { 2384 CPUX86State *env = &cpu->env; 2385 2386 env->xcr0 = 1; 2387 if (kvm_irqchip_in_kernel()) { 2388 env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE : 2389 KVM_MP_STATE_UNINITIALIZED; 2390 } else { 2391 env->mp_state = KVM_MP_STATE_RUNNABLE; 2392 } 2393 2394 /* enabled by default */ 2395 env->poll_control_msr = 1; 2396 2397 kvm_init_nested_state(env); 2398 2399 sev_es_set_reset_vector(CPU(cpu)); 2400 } 2401 2402 void kvm_arch_after_reset_vcpu(X86CPU *cpu) 2403 { 2404 CPUX86State *env = &cpu->env; 2405 int i; 2406 2407 /* 2408 * Reset SynIC after all other devices have been reset to let them remove 2409 * their SINT routes first. 
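     * All SINTs are set back to their masked default before the SynIC
     * object itself is reset below.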
2410 */ 2411 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 2412 for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) { 2413 env->msr_hv_synic_sint[i] = HV_SINT_MASKED; 2414 } 2415 2416 hyperv_x86_synic_reset(cpu); 2417 } 2418 } 2419 2420 void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd) 2421 { 2422 g_autofree struct kvm_msrs *msrs = NULL; 2423 2424 msrs = g_malloc0(sizeof(*msrs) + sizeof(msrs->entries[0])); 2425 msrs->entries[0].index = MSR_IA32_TSC; 2426 msrs->entries[0].data = 1; /* match the value in x86_cpu_reset() */ 2427 msrs->nmsrs++; 2428 2429 if (ioctl(kvm_fd, KVM_SET_MSRS, msrs) != 1) { 2430 warn_report("parked vCPU %lu TSC reset failed: %d", 2431 vcpu_id, errno); 2432 } 2433 } 2434 2435 void kvm_arch_do_init_vcpu(X86CPU *cpu) 2436 { 2437 CPUX86State *env = &cpu->env; 2438 2439 /* APs get directly into wait-for-SIPI state. */ 2440 if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) { 2441 env->mp_state = KVM_MP_STATE_INIT_RECEIVED; 2442 } 2443 } 2444 2445 static int kvm_get_supported_feature_msrs(KVMState *s) 2446 { 2447 int ret = 0; 2448 2449 if (kvm_feature_msrs != NULL) { 2450 return 0; 2451 } 2452 2453 if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) { 2454 return 0; 2455 } 2456 2457 struct kvm_msr_list msr_list; 2458 2459 msr_list.nmsrs = 0; 2460 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list); 2461 if (ret < 0 && ret != -E2BIG) { 2462 error_report("Fetch KVM feature MSR list failed: %s", 2463 strerror(-ret)); 2464 return ret; 2465 } 2466 2467 assert(msr_list.nmsrs > 0); 2468 kvm_feature_msrs = g_malloc0(sizeof(msr_list) + 2469 msr_list.nmsrs * sizeof(msr_list.indices[0])); 2470 2471 kvm_feature_msrs->nmsrs = msr_list.nmsrs; 2472 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs); 2473 2474 if (ret < 0) { 2475 error_report("Fetch KVM feature MSR list failed: %s", 2476 strerror(-ret)); 2477 g_free(kvm_feature_msrs); 2478 kvm_feature_msrs = NULL; 2479 return ret; 2480 } 2481 2482 return 0; 2483 } 2484 2485 static int kvm_get_supported_msrs(KVMState *s) 2486 { 2487 int ret = 0; 2488 struct kvm_msr_list msr_list, *kvm_msr_list; 2489 2490 /* 2491 * Obtain MSR list from KVM. These are the MSRs that we must 2492 * save/restore. 2493 */ 2494 msr_list.nmsrs = 0; 2495 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list); 2496 if (ret < 0 && ret != -E2BIG) { 2497 return ret; 2498 } 2499 /* 2500 * Old kernel modules had a bug and could write beyond the provided 2501 * memory. Allocate at least a safe amount of 1K. 
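     * (The first KVM_GET_MSR_INDEX_LIST call above passes nmsrs = 0 on
     * purpose: KVM fails it with -E2BIG but fills in the required entry
     * count, which sizes the real query below.)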
2502 */ 2503 kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) + 2504 msr_list.nmsrs * 2505 sizeof(msr_list.indices[0]))); 2506 2507 kvm_msr_list->nmsrs = msr_list.nmsrs; 2508 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list); 2509 if (ret >= 0) { 2510 int i; 2511 2512 for (i = 0; i < kvm_msr_list->nmsrs; i++) { 2513 switch (kvm_msr_list->indices[i]) { 2514 case MSR_STAR: 2515 has_msr_star = true; 2516 break; 2517 case MSR_VM_HSAVE_PA: 2518 has_msr_hsave_pa = true; 2519 break; 2520 case MSR_TSC_AUX: 2521 has_msr_tsc_aux = true; 2522 break; 2523 case MSR_TSC_ADJUST: 2524 has_msr_tsc_adjust = true; 2525 break; 2526 case MSR_IA32_TSCDEADLINE: 2527 has_msr_tsc_deadline = true; 2528 break; 2529 case MSR_IA32_SMBASE: 2530 has_msr_smbase = true; 2531 break; 2532 case MSR_SMI_COUNT: 2533 has_msr_smi_count = true; 2534 break; 2535 case MSR_IA32_MISC_ENABLE: 2536 has_msr_misc_enable = true; 2537 break; 2538 case MSR_IA32_BNDCFGS: 2539 has_msr_bndcfgs = true; 2540 break; 2541 case MSR_IA32_XSS: 2542 has_msr_xss = true; 2543 break; 2544 case MSR_IA32_UMWAIT_CONTROL: 2545 has_msr_umwait = true; 2546 break; 2547 case HV_X64_MSR_CRASH_CTL: 2548 has_msr_hv_crash = true; 2549 break; 2550 case HV_X64_MSR_RESET: 2551 has_msr_hv_reset = true; 2552 break; 2553 case HV_X64_MSR_VP_INDEX: 2554 has_msr_hv_vpindex = true; 2555 break; 2556 case HV_X64_MSR_VP_RUNTIME: 2557 has_msr_hv_runtime = true; 2558 break; 2559 case HV_X64_MSR_SCONTROL: 2560 has_msr_hv_synic = true; 2561 break; 2562 case HV_X64_MSR_STIMER0_CONFIG: 2563 has_msr_hv_stimer = true; 2564 break; 2565 case HV_X64_MSR_TSC_FREQUENCY: 2566 has_msr_hv_frequencies = true; 2567 break; 2568 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 2569 has_msr_hv_reenlightenment = true; 2570 break; 2571 case HV_X64_MSR_SYNDBG_OPTIONS: 2572 has_msr_hv_syndbg_options = true; 2573 break; 2574 case MSR_IA32_SPEC_CTRL: 2575 has_msr_spec_ctrl = true; 2576 break; 2577 case MSR_AMD64_TSC_RATIO: 2578 has_tsc_scale_msr = true; 2579 break; 2580 case MSR_IA32_TSX_CTRL: 2581 has_msr_tsx_ctrl = true; 2582 break; 2583 case MSR_VIRT_SSBD: 2584 has_msr_virt_ssbd = true; 2585 break; 2586 case MSR_IA32_ARCH_CAPABILITIES: 2587 has_msr_arch_capabs = true; 2588 break; 2589 case MSR_IA32_CORE_CAPABILITY: 2590 has_msr_core_capabs = true; 2591 break; 2592 case MSR_IA32_PERF_CAPABILITIES: 2593 has_msr_perf_capabs = true; 2594 break; 2595 case MSR_IA32_VMX_VMFUNC: 2596 has_msr_vmx_vmfunc = true; 2597 break; 2598 case MSR_IA32_UCODE_REV: 2599 has_msr_ucode_rev = true; 2600 break; 2601 case MSR_IA32_VMX_PROCBASED_CTLS2: 2602 has_msr_vmx_procbased_ctls2 = true; 2603 break; 2604 case MSR_IA32_PKRS: 2605 has_msr_pkrs = true; 2606 break; 2607 case MSR_K7_HWCR: 2608 has_msr_hwcr = true; 2609 } 2610 } 2611 } 2612 2613 g_free(kvm_msr_list); 2614 2615 return ret; 2616 } 2617 2618 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, 2619 uint32_t msr, 2620 uint64_t *val) 2621 { 2622 *val = cpu_x86_get_msr_core_thread_count(cpu); 2623 2624 return true; 2625 } 2626 2627 static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, 2628 uint32_t msr, 2629 uint64_t *val) 2630 { 2631 2632 CPUState *cs = CPU(cpu); 2633 2634 *val = cs->kvm_state->msr_energy.msr_unit; 2635 2636 return true; 2637 } 2638 2639 static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, 2640 uint32_t msr, 2641 uint64_t *val) 2642 { 2643 2644 CPUState *cs = CPU(cpu); 2645 2646 *val = cs->kvm_state->msr_energy.msr_limit; 2647 2648 return true; 2649 } 2650 2651 static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, 2652 uint32_t msr, 2653 uint64_t *val) 2654 { 
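    /* Hand back the MSR_PKG_POWER_INFO value read from the host at init. */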
2655 2656 CPUState *cs = CPU(cpu); 2657 2658 *val = cs->kvm_state->msr_energy.msr_info; 2659 2660 return true; 2661 } 2662 2663 static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, 2664 uint32_t msr, 2665 uint64_t *val) 2666 { 2667 2668 CPUState *cs = CPU(cpu); 2669 *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index]; 2670 2671 return true; 2672 } 2673 2674 static Notifier smram_machine_done; 2675 static KVMMemoryListener smram_listener; 2676 static AddressSpace smram_address_space; 2677 static MemoryRegion smram_as_root; 2678 static MemoryRegion smram_as_mem; 2679 2680 static void register_smram_listener(Notifier *n, void *unused) 2681 { 2682 MemoryRegion *smram = 2683 (MemoryRegion *) object_resolve_path("/machine/smram", NULL); 2684 2685 /* Outer container... */ 2686 memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull); 2687 memory_region_set_enabled(&smram_as_root, true); 2688 2689 /* ... with two regions inside: normal system memory with low 2690 * priority, and... 2691 */ 2692 memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram", 2693 get_system_memory(), 0, ~0ull); 2694 memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0); 2695 memory_region_set_enabled(&smram_as_mem, true); 2696 2697 if (smram) { 2698 /* ... SMRAM with higher priority */ 2699 memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10); 2700 memory_region_set_enabled(smram, true); 2701 } 2702 2703 address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM"); 2704 kvm_memory_listener_register(kvm_state, &smram_listener, 2705 &smram_address_space, 1, "kvm-smram"); 2706 } 2707 2708 static void *kvm_msr_energy_thread(void *data) 2709 { 2710 KVMState *s = data; 2711 struct KVMMsrEnergy *vmsr = &s->msr_energy; 2712 2713 g_autofree vmsr_package_energy_stat *pkg_stat = NULL; 2714 g_autofree vmsr_thread_stat *thd_stat = NULL; 2715 g_autofree CPUState *cpu = NULL; 2716 g_autofree unsigned int *vpkgs_energy_stat = NULL; 2717 unsigned int num_threads = 0; 2718 2719 X86CPUTopoIDs topo_ids; 2720 2721 rcu_register_thread(); 2722 2723 /* Allocate memory for each package energy status */ 2724 pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs); 2725 2726 /* Allocate memory for thread stats */ 2727 thd_stat = g_new0(vmsr_thread_stat, 1); 2728 2729 /* Allocate memory for holding virtual package energy counter */ 2730 vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets); 2731 2732 /* Populate the max tick of each packages */ 2733 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { 2734 /* 2735 * Max numbers of ticks per package 2736 * Time in second * Number of ticks/second * Number of cores/package 2737 * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max 2738 */ 2739 vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000) 2740 * sysconf(_SC_CLK_TCK) 2741 * vmsr->host_topo.pkg_cpu_count[i]; 2742 } 2743 2744 while (true) { 2745 /* Get all qemu threads id */ 2746 g_autofree pid_t *thread_ids 2747 = vmsr_get_thread_ids(vmsr->pid, &num_threads); 2748 2749 if (thread_ids == NULL) { 2750 goto clean; 2751 } 2752 2753 thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads); 2754 /* Unlike g_new0, g_renew0 function doesn't exist yet... 
*/ 2755 memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat)); 2756 2757 /* Populate all the thread stats */ 2758 for (int i = 0; i < num_threads; i++) { 2759 thd_stat[i].utime = g_new0(unsigned long long, 2); 2760 thd_stat[i].stime = g_new0(unsigned long long, 2); 2761 thd_stat[i].thread_id = thread_ids[i]; 2762 vmsr_read_thread_stat(vmsr->pid, 2763 thd_stat[i].thread_id, 2764 &thd_stat[i].utime[0], 2765 &thd_stat[i].stime[0], 2766 &thd_stat[i].cpu_id); 2767 thd_stat[i].pkg_id = 2768 vmsr_get_physical_package_id(thd_stat[i].cpu_id); 2769 } 2770 2771 /* Retrieve all packages power plane energy counter */ 2772 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { 2773 for (int j = 0; j < num_threads; j++) { 2774 /* 2775 * Use the first thread we found that ran on the CPU 2776 * of the package to read the packages energy counter 2777 */ 2778 if (thd_stat[j].pkg_id == i) { 2779 pkg_stat[i].e_start = 2780 vmsr_read_msr(MSR_PKG_ENERGY_STATUS, 2781 thd_stat[j].cpu_id, 2782 thd_stat[j].thread_id, 2783 s->msr_energy.sioc); 2784 break; 2785 } 2786 } 2787 } 2788 2789 /* Sleep a short period while the other threads are working */ 2790 usleep(MSR_ENERGY_THREAD_SLEEP_US); 2791 2792 /* 2793 * Retrieve all packages power plane energy counter 2794 * Calculate the delta of all packages 2795 */ 2796 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { 2797 for (int j = 0; j < num_threads; j++) { 2798 /* 2799 * Use the first thread we found that ran on the CPU 2800 * of the package to read the packages energy counter 2801 */ 2802 if (thd_stat[j].pkg_id == i) { 2803 pkg_stat[i].e_end = 2804 vmsr_read_msr(MSR_PKG_ENERGY_STATUS, 2805 thd_stat[j].cpu_id, 2806 thd_stat[j].thread_id, 2807 s->msr_energy.sioc); 2808 /* 2809 * Prevent the case we have migrate the VM 2810 * during the sleep period or any other cases 2811 * were energy counter might be lower after 2812 * the sleep period. 
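                     * Such samples are simply dropped by treating the
                     * delta as zero.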
2813 */ 2814 if (pkg_stat[i].e_end > pkg_stat[i].e_start) { 2815 pkg_stat[i].e_delta = 2816 pkg_stat[i].e_end - pkg_stat[i].e_start; 2817 } else { 2818 pkg_stat[i].e_delta = 0; 2819 } 2820 break; 2821 } 2822 } 2823 } 2824 2825 /* Delta of ticks spend by each thread between the sample */ 2826 for (int i = 0; i < num_threads; i++) { 2827 vmsr_read_thread_stat(vmsr->pid, 2828 thd_stat[i].thread_id, 2829 &thd_stat[i].utime[1], 2830 &thd_stat[i].stime[1], 2831 &thd_stat[i].cpu_id); 2832 2833 if (vmsr->pid < 0) { 2834 /* 2835 * We don't count the dead thread 2836 * i.e threads that existed before the sleep 2837 * and not anymore 2838 */ 2839 thd_stat[i].delta_ticks = 0; 2840 } else { 2841 vmsr_delta_ticks(thd_stat, i); 2842 } 2843 } 2844 2845 /* 2846 * Identify the vcpu threads 2847 * Calculate the number of vcpu per package 2848 */ 2849 CPU_FOREACH(cpu) { 2850 for (int i = 0; i < num_threads; i++) { 2851 if (cpu->thread_id == thd_stat[i].thread_id) { 2852 thd_stat[i].is_vcpu = true; 2853 thd_stat[i].vcpu_id = cpu->cpu_index; 2854 pkg_stat[thd_stat[i].pkg_id].nb_vcpu++; 2855 thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu); 2856 break; 2857 } 2858 } 2859 } 2860 2861 /* Retrieve the virtual package number of each vCPU */ 2862 for (int i = 0; i < vmsr->guest_cpu_list->len; i++) { 2863 for (int j = 0; j < num_threads; j++) { 2864 if ((thd_stat[j].acpi_id == 2865 vmsr->guest_cpu_list->cpus[i].arch_id) 2866 && (thd_stat[j].is_vcpu == true)) { 2867 x86_topo_ids_from_apicid(thd_stat[j].acpi_id, 2868 &vmsr->guest_topo_info, &topo_ids); 2869 thd_stat[j].vpkg_id = topo_ids.pkg_id; 2870 } 2871 } 2872 } 2873 2874 /* Calculate the total energy of all non-vCPU thread */ 2875 for (int i = 0; i < num_threads; i++) { 2876 if ((thd_stat[i].is_vcpu != true) && 2877 (thd_stat[i].delta_ticks > 0)) { 2878 double temp; 2879 temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, 2880 thd_stat[i].delta_ticks, 2881 vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); 2882 pkg_stat[thd_stat[i].pkg_id].e_ratio 2883 += (uint64_t)lround(temp); 2884 } 2885 } 2886 2887 /* Calculate the ratio per non-vCPU thread of each package */ 2888 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { 2889 if (pkg_stat[i].nb_vcpu > 0) { 2890 pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu; 2891 } 2892 } 2893 2894 /* 2895 * Calculate the energy for each Package: 2896 * Energy Package = sum of each vCPU energy that belongs to the package 2897 */ 2898 for (int i = 0; i < num_threads; i++) { 2899 if ((thd_stat[i].is_vcpu == true) && \ 2900 (thd_stat[i].delta_ticks > 0)) { 2901 double temp; 2902 temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta, 2903 thd_stat[i].delta_ticks, 2904 vmsr->host_topo.maxticks[thd_stat[i].pkg_id]); 2905 vpkgs_energy_stat[thd_stat[i].vpkg_id] += 2906 (uint64_t)lround(temp); 2907 vpkgs_energy_stat[thd_stat[i].vpkg_id] += 2908 pkg_stat[thd_stat[i].pkg_id].e_ratio; 2909 } 2910 } 2911 2912 /* 2913 * Finally populate the vmsr register of each vCPU with the total 2914 * package value to emulate the real hardware where each CPU return the 2915 * value of the package it belongs. 
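         * i.e. every vCPU in the same virtual package reads back the
         * same MSR_PKG_ENERGY_STATUS value.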
2916 */ 2917 for (int i = 0; i < num_threads; i++) { 2918 if ((thd_stat[i].is_vcpu == true) && \ 2919 (thd_stat[i].delta_ticks > 0)) { 2920 vmsr->msr_value[thd_stat[i].vcpu_id] = \ 2921 vpkgs_energy_stat[thd_stat[i].vpkg_id]; 2922 } 2923 } 2924 2925 /* Freeing memory before zeroing the pointer */ 2926 for (int i = 0; i < num_threads; i++) { 2927 g_free(thd_stat[i].utime); 2928 g_free(thd_stat[i].stime); 2929 } 2930 } 2931 2932 clean: 2933 rcu_unregister_thread(); 2934 return NULL; 2935 } 2936 2937 static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms) 2938 { 2939 MachineClass *mc = MACHINE_GET_CLASS(ms); 2940 struct KVMMsrEnergy *r = &s->msr_energy; 2941 2942 /* 2943 * Sanity check 2944 * 1. Host cpu must be Intel cpu 2945 * 2. RAPL must be enabled on the Host 2946 */ 2947 if (!is_host_cpu_intel()) { 2948 error_report("The RAPL feature can only be enabled on hosts " 2949 "with Intel CPU models"); 2950 return -1; 2951 } 2952 2953 if (!is_rapl_enabled()) { 2954 return -1; 2955 } 2956 2957 /* Retrieve the virtual topology */ 2958 vmsr_init_topo_info(&r->guest_topo_info, ms); 2959 2960 /* Retrieve the number of vcpu */ 2961 r->guest_vcpus = ms->smp.cpus; 2962 2963 /* Retrieve the number of virtual sockets */ 2964 r->guest_vsockets = ms->smp.sockets; 2965 2966 /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */ 2967 r->msr_value = g_new0(uint64_t, r->guest_vcpus); 2968 2969 /* Retrieve the CPUArchIDlist */ 2970 r->guest_cpu_list = mc->possible_cpu_arch_ids(ms); 2971 2972 /* Max number of cpus on the Host */ 2973 r->host_topo.maxcpus = vmsr_get_maxcpus(); 2974 if (r->host_topo.maxcpus == 0) { 2975 error_report("host max cpus = 0"); 2976 return -1; 2977 } 2978 2979 /* Max number of packages on the host */ 2980 r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus); 2981 if (r->host_topo.maxpkgs == 0) { 2982 error_report("host max pkgs = 0"); 2983 return -1; 2984 } 2985 2986 /* Allocate memory for each package on the host */ 2987 r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs); 2988 r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs); 2989 2990 vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count, 2991 r->host_topo.maxpkgs); 2992 for (int i = 0; i < r->host_topo.maxpkgs; i++) { 2993 if (r->host_topo.pkg_cpu_count[i] == 0) { 2994 error_report("cpu per packages = 0 on package_%d", i); 2995 return -1; 2996 } 2997 } 2998 2999 /* Get QEMU PID*/ 3000 r->pid = getpid(); 3001 3002 /* Compute the socket path if necessary */ 3003 if (s->msr_energy.socket_path == NULL) { 3004 s->msr_energy.socket_path = vmsr_compute_default_paths(); 3005 } 3006 3007 /* Open socket with vmsr helper */ 3008 s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path); 3009 3010 if (s->msr_energy.sioc == NULL) { 3011 error_report("vmsr socket opening failed"); 3012 return -1; 3013 } 3014 3015 /* Those MSR values should not change */ 3016 r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid, 3017 s->msr_energy.sioc); 3018 r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid, 3019 s->msr_energy.sioc); 3020 r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid, 3021 s->msr_energy.sioc); 3022 if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) { 3023 error_report("can't read any virtual msr"); 3024 return -1; 3025 } 3026 3027 qemu_thread_create(&r->msr_thr, "kvm-msr", 3028 kvm_msr_energy_thread, 3029 s, QEMU_THREAD_JOINABLE); 3030 return 0; 3031 } 3032 3033 int kvm_arch_get_default_type(MachineState *ms) 3034 { 3035 
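    /* x86 machines do not need a special KVM VM type by default. */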
return 0; 3036 } 3037 3038 static int kvm_vm_enable_exception_payload(KVMState *s) 3039 { 3040 int ret = 0; 3041 has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); 3042 if (has_exception_payload) { 3043 ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); 3044 if (ret < 0) { 3045 error_report("kvm: Failed to enable exception payload cap: %s", 3046 strerror(-ret)); 3047 } 3048 } 3049 3050 return ret; 3051 } 3052 3053 static int kvm_vm_enable_triple_fault_event(KVMState *s) 3054 { 3055 int ret = 0; 3056 has_triple_fault_event = \ 3057 kvm_check_extension(s, 3058 KVM_CAP_X86_TRIPLE_FAULT_EVENT); 3059 if (has_triple_fault_event) { 3060 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true); 3061 if (ret < 0) { 3062 error_report("kvm: Failed to enable triple fault event cap: %s", 3063 strerror(-ret)); 3064 } 3065 } 3066 return ret; 3067 } 3068 3069 static int kvm_vm_set_identity_map_addr(KVMState *s, uint64_t identity_base) 3070 { 3071 return kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base); 3072 } 3073 3074 static int kvm_vm_set_nr_mmu_pages(KVMState *s) 3075 { 3076 uint64_t shadow_mem; 3077 int ret = 0; 3078 shadow_mem = object_property_get_int(OBJECT(s), 3079 "kvm-shadow-mem", 3080 &error_abort); 3081 if (shadow_mem != -1) { 3082 shadow_mem /= 4096; 3083 ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem); 3084 } 3085 return ret; 3086 } 3087 3088 static int kvm_vm_set_tss_addr(KVMState *s, uint64_t tss_base) 3089 { 3090 return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, tss_base); 3091 } 3092 3093 static int kvm_vm_enable_disable_exits(KVMState *s) 3094 { 3095 int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS); 3096 3097 if (disable_exits) { 3098 disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | 3099 KVM_X86_DISABLE_EXITS_HLT | 3100 KVM_X86_DISABLE_EXITS_PAUSE | 3101 KVM_X86_DISABLE_EXITS_CSTATE); 3102 } 3103 3104 return kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, 3105 disable_exits); 3106 } 3107 3108 static int kvm_vm_enable_bus_lock_exit(KVMState *s) 3109 { 3110 int ret = 0; 3111 ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); 3112 if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { 3113 error_report("kvm: bus lock detection unsupported"); 3114 return -ENOTSUP; 3115 } 3116 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, 3117 KVM_BUS_LOCK_DETECTION_EXIT); 3118 if (ret < 0) { 3119 error_report("kvm: Failed to enable bus lock detection cap: %s", 3120 strerror(-ret)); 3121 } 3122 3123 return ret; 3124 } 3125 3126 static int kvm_vm_enable_notify_vmexit(KVMState *s) 3127 { 3128 int ret = 0; 3129 if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE) { 3130 uint64_t notify_window_flags = 3131 ((uint64_t)s->notify_window << 32) | 3132 KVM_X86_NOTIFY_VMEXIT_ENABLED | 3133 KVM_X86_NOTIFY_VMEXIT_USER; 3134 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, 3135 notify_window_flags); 3136 if (ret < 0) { 3137 error_report("kvm: Failed to enable notify vmexit cap: %s", 3138 strerror(-ret)); 3139 } 3140 } 3141 return ret; 3142 } 3143 3144 static int kvm_vm_enable_userspace_msr(KVMState *s) 3145 { 3146 int ret; 3147 3148 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0, 3149 KVM_MSR_EXIT_REASON_FILTER); 3150 if (ret < 0) { 3151 error_report("Could not enable user space MSRs: %s", 3152 strerror(-ret)); 3153 exit(1); 3154 } 3155 3156 ret = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT, 3157 kvm_rdmsr_core_thread_count, NULL); 3158 if (ret < 0) { 3159 error_report("Could not install MSR_CORE_THREAD_COUNT handler: 
%s", 3160 strerror(-ret)); 3161 exit(1); 3162 } 3163 3164 return 0; 3165 } 3166 3167 static int kvm_vm_enable_energy_msrs(KVMState *s) 3168 { 3169 int ret; 3170 3171 if (s->msr_energy.enable == true) { 3172 ret = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT, 3173 kvm_rdmsr_rapl_power_unit, NULL); 3174 if (ret < 0) { 3175 error_report("Could not install MSR_RAPL_POWER_UNIT handler: %s", 3176 strerror(-ret)); 3177 return ret; 3178 } 3179 3180 ret = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT, 3181 kvm_rdmsr_pkg_power_limit, NULL); 3182 if (ret < 0) { 3183 error_report("Could not install MSR_PKG_POWER_LIMIT handler: %s", 3184 strerror(-ret)); 3185 return ret; 3186 } 3187 3188 ret = kvm_filter_msr(s, MSR_PKG_POWER_INFO, 3189 kvm_rdmsr_pkg_power_info, NULL); 3190 if (ret < 0) { 3191 error_report("Could not install MSR_PKG_POWER_INFO handler: %s", 3192 strerror(-ret)); 3193 return ret; 3194 } 3195 ret = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS, 3196 kvm_rdmsr_pkg_energy_status, NULL); 3197 if (ret < 0) { 3198 error_report("Could not install MSR_PKG_ENERGY_STATUS handler: %s", 3199 strerror(-ret)); 3200 return ret; 3201 } 3202 } 3203 return 0; 3204 } 3205 3206 int kvm_arch_init(MachineState *ms, KVMState *s) 3207 { 3208 int ret; 3209 struct utsname utsname; 3210 Error *local_err = NULL; 3211 3212 /* 3213 * Initialize confidential guest (SEV/TDX) context, if required 3214 */ 3215 if (ms->cgs) { 3216 ret = confidential_guest_kvm_init(ms->cgs, &local_err); 3217 if (ret < 0) { 3218 error_report_err(local_err); 3219 return ret; 3220 } 3221 } 3222 3223 has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS); 3224 has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0; 3225 3226 hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); 3227 3228 ret = kvm_vm_enable_exception_payload(s); 3229 if (ret < 0) { 3230 return ret; 3231 } 3232 3233 ret = kvm_vm_enable_triple_fault_event(s); 3234 if (ret < 0) { 3235 return ret; 3236 } 3237 3238 if (s->xen_version) { 3239 #ifdef CONFIG_XEN_EMU 3240 if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) { 3241 error_report("kvm: Xen support only available in PC machine"); 3242 return -ENOTSUP; 3243 } 3244 /* hyperv_enabled() doesn't work yet. */ 3245 uint32_t msr = XEN_HYPERCALL_MSR; 3246 ret = kvm_xen_init(s, msr); 3247 if (ret < 0) { 3248 return ret; 3249 } 3250 #else 3251 error_report("kvm: Xen support not enabled in qemu"); 3252 return -ENOTSUP; 3253 #endif 3254 } 3255 3256 ret = kvm_get_supported_msrs(s); 3257 if (ret < 0) { 3258 return ret; 3259 } 3260 3261 ret = kvm_get_supported_feature_msrs(s); 3262 if (ret < 0) { 3263 return ret; 3264 } 3265 3266 uname(&utsname); 3267 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0; 3268 3269 ret = kvm_vm_set_identity_map_addr(s, KVM_IDENTITY_BASE); 3270 if (ret < 0) { 3271 return ret; 3272 } 3273 3274 /* Set TSS base one page after EPT identity map. */ 3275 ret = kvm_vm_set_tss_addr(s, KVM_IDENTITY_BASE + 0x1000); 3276 if (ret < 0) { 3277 return ret; 3278 } 3279 3280 /* Tell fw_cfg to notify the BIOS to reserve the range. 
*/ 3281 e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED); 3282 3283 ret = kvm_vm_set_nr_mmu_pages(s); 3284 if (ret < 0) { 3285 return ret; 3286 } 3287 3288 if (kvm_check_extension(s, KVM_CAP_X86_SMM) && 3289 object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) && 3290 x86_machine_is_smm_enabled(X86_MACHINE(ms))) { 3291 smram_machine_done.notify = register_smram_listener; 3292 qemu_add_machine_init_done_notifier(&smram_machine_done); 3293 } 3294 3295 if (enable_cpu_pm) { 3296 ret = kvm_vm_enable_disable_exits(s); 3297 if (ret < 0) { 3298 error_report("kvm: guest stopping CPU not supported: %s", 3299 strerror(-ret)); 3300 return ret; 3301 } 3302 } 3303 3304 if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) { 3305 X86MachineState *x86ms = X86_MACHINE(ms); 3306 3307 if (x86ms->bus_lock_ratelimit > 0) { 3308 ret = kvm_vm_enable_bus_lock_exit(s); 3309 if (ret < 0) { 3310 return ret; 3311 } 3312 ratelimit_init(&bus_lock_ratelimit_ctrl); 3313 ratelimit_set_speed(&bus_lock_ratelimit_ctrl, 3314 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME); 3315 } 3316 } 3317 3318 if (kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { 3319 ret = kvm_vm_enable_notify_vmexit(s); 3320 if (ret < 0) { 3321 return ret; 3322 } 3323 } 3324 3325 if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) { 3326 ret = kvm_vm_enable_userspace_msr(s); 3327 if (ret < 0) { 3328 return ret; 3329 } 3330 3331 if (s->msr_energy.enable == true) { 3332 ret = kvm_vm_enable_energy_msrs(s); 3333 if (ret < 0) { 3334 return ret; 3335 } 3336 3337 ret = kvm_msr_energy_thread_init(s, ms); 3338 if (ret < 0) { 3339 error_report("kvm : error RAPL feature requirement not met"); 3340 return ret; 3341 } 3342 } 3343 } 3344 3345 return 0; 3346 } 3347 3348 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 3349 { 3350 lhs->selector = rhs->selector; 3351 lhs->base = rhs->base; 3352 lhs->limit = rhs->limit; 3353 lhs->type = 3; 3354 lhs->present = 1; 3355 lhs->dpl = 3; 3356 lhs->db = 0; 3357 lhs->s = 1; 3358 lhs->l = 0; 3359 lhs->g = 0; 3360 lhs->avl = 0; 3361 lhs->unusable = 0; 3362 } 3363 3364 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 3365 { 3366 unsigned flags = rhs->flags; 3367 lhs->selector = rhs->selector; 3368 lhs->base = rhs->base; 3369 lhs->limit = rhs->limit; 3370 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; 3371 lhs->present = (flags & DESC_P_MASK) != 0; 3372 lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3; 3373 lhs->db = (flags >> DESC_B_SHIFT) & 1; 3374 lhs->s = (flags & DESC_S_MASK) != 0; 3375 lhs->l = (flags >> DESC_L_SHIFT) & 1; 3376 lhs->g = (flags & DESC_G_MASK) != 0; 3377 lhs->avl = (flags & DESC_AVL_MASK) != 0; 3378 lhs->unusable = !lhs->present; 3379 lhs->padding = 0; 3380 } 3381 3382 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) 3383 { 3384 lhs->selector = rhs->selector; 3385 lhs->base = rhs->base; 3386 lhs->limit = rhs->limit; 3387 lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | 3388 ((rhs->present && !rhs->unusable) * DESC_P_MASK) | 3389 (rhs->dpl << DESC_DPL_SHIFT) | 3390 (rhs->db << DESC_B_SHIFT) | 3391 (rhs->s * DESC_S_MASK) | 3392 (rhs->l << DESC_L_SHIFT) | 3393 (rhs->g * DESC_G_MASK) | 3394 (rhs->avl * DESC_AVL_MASK); 3395 } 3396 3397 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) 3398 { 3399 if (set) { 3400 *kvm_reg = *qemu_reg; 3401 } else { 3402 *qemu_reg = *kvm_reg; 3403 } 3404 } 3405 3406 static int kvm_getput_regs(X86CPU *cpu, int set) 3407 { 3408 CPUX86State *env = &cpu->env; 3409 struct kvm_regs regs; 3410 int 
ret = 0; 3411 3412 if (!set) { 3413 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, ®s); 3414 if (ret < 0) { 3415 return ret; 3416 } 3417 } 3418 3419 kvm_getput_reg(®s.rax, &env->regs[R_EAX], set); 3420 kvm_getput_reg(®s.rbx, &env->regs[R_EBX], set); 3421 kvm_getput_reg(®s.rcx, &env->regs[R_ECX], set); 3422 kvm_getput_reg(®s.rdx, &env->regs[R_EDX], set); 3423 kvm_getput_reg(®s.rsi, &env->regs[R_ESI], set); 3424 kvm_getput_reg(®s.rdi, &env->regs[R_EDI], set); 3425 kvm_getput_reg(®s.rsp, &env->regs[R_ESP], set); 3426 kvm_getput_reg(®s.rbp, &env->regs[R_EBP], set); 3427 #ifdef TARGET_X86_64 3428 kvm_getput_reg(®s.r8, &env->regs[8], set); 3429 kvm_getput_reg(®s.r9, &env->regs[9], set); 3430 kvm_getput_reg(®s.r10, &env->regs[10], set); 3431 kvm_getput_reg(®s.r11, &env->regs[11], set); 3432 kvm_getput_reg(®s.r12, &env->regs[12], set); 3433 kvm_getput_reg(®s.r13, &env->regs[13], set); 3434 kvm_getput_reg(®s.r14, &env->regs[14], set); 3435 kvm_getput_reg(®s.r15, &env->regs[15], set); 3436 #endif 3437 3438 kvm_getput_reg(®s.rflags, &env->eflags, set); 3439 kvm_getput_reg(®s.rip, &env->eip, set); 3440 3441 if (set) { 3442 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, ®s); 3443 } 3444 3445 return ret; 3446 } 3447 3448 static int kvm_put_xsave(X86CPU *cpu) 3449 { 3450 CPUX86State *env = &cpu->env; 3451 void *xsave = env->xsave_buf; 3452 3453 x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len); 3454 3455 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave); 3456 } 3457 3458 static int kvm_put_xcrs(X86CPU *cpu) 3459 { 3460 CPUX86State *env = &cpu->env; 3461 struct kvm_xcrs xcrs = {}; 3462 3463 if (!has_xcrs) { 3464 return 0; 3465 } 3466 3467 xcrs.nr_xcrs = 1; 3468 xcrs.flags = 0; 3469 xcrs.xcrs[0].xcr = 0; 3470 xcrs.xcrs[0].value = env->xcr0; 3471 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs); 3472 } 3473 3474 static int kvm_put_sregs(X86CPU *cpu) 3475 { 3476 CPUX86State *env = &cpu->env; 3477 struct kvm_sregs sregs; 3478 3479 /* 3480 * The interrupt_bitmap is ignored because KVM_SET_SREGS is 3481 * always followed by KVM_SET_VCPU_EVENTS. 
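     * KVM_SET_VCPU_EVENTS carries the authoritative pending-interrupt
     * state, so zeroing the bitmap here cannot drop an interrupt.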
3482 */ 3483 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap)); 3484 3485 if ((env->eflags & VM_MASK)) { 3486 set_v8086_seg(&sregs.cs, &env->segs[R_CS]); 3487 set_v8086_seg(&sregs.ds, &env->segs[R_DS]); 3488 set_v8086_seg(&sregs.es, &env->segs[R_ES]); 3489 set_v8086_seg(&sregs.fs, &env->segs[R_FS]); 3490 set_v8086_seg(&sregs.gs, &env->segs[R_GS]); 3491 set_v8086_seg(&sregs.ss, &env->segs[R_SS]); 3492 } else { 3493 set_seg(&sregs.cs, &env->segs[R_CS]); 3494 set_seg(&sregs.ds, &env->segs[R_DS]); 3495 set_seg(&sregs.es, &env->segs[R_ES]); 3496 set_seg(&sregs.fs, &env->segs[R_FS]); 3497 set_seg(&sregs.gs, &env->segs[R_GS]); 3498 set_seg(&sregs.ss, &env->segs[R_SS]); 3499 } 3500 3501 set_seg(&sregs.tr, &env->tr); 3502 set_seg(&sregs.ldt, &env->ldt); 3503 3504 sregs.idt.limit = env->idt.limit; 3505 sregs.idt.base = env->idt.base; 3506 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding); 3507 sregs.gdt.limit = env->gdt.limit; 3508 sregs.gdt.base = env->gdt.base; 3509 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding); 3510 3511 sregs.cr0 = env->cr[0]; 3512 sregs.cr2 = env->cr[2]; 3513 sregs.cr3 = env->cr[3]; 3514 sregs.cr4 = env->cr[4]; 3515 3516 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state); 3517 sregs.apic_base = cpu_get_apic_base(cpu->apic_state); 3518 3519 sregs.efer = env->efer; 3520 3521 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs); 3522 } 3523 3524 static int kvm_put_sregs2(X86CPU *cpu) 3525 { 3526 CPUX86State *env = &cpu->env; 3527 struct kvm_sregs2 sregs; 3528 int i; 3529 3530 sregs.flags = 0; 3531 3532 if ((env->eflags & VM_MASK)) { 3533 set_v8086_seg(&sregs.cs, &env->segs[R_CS]); 3534 set_v8086_seg(&sregs.ds, &env->segs[R_DS]); 3535 set_v8086_seg(&sregs.es, &env->segs[R_ES]); 3536 set_v8086_seg(&sregs.fs, &env->segs[R_FS]); 3537 set_v8086_seg(&sregs.gs, &env->segs[R_GS]); 3538 set_v8086_seg(&sregs.ss, &env->segs[R_SS]); 3539 } else { 3540 set_seg(&sregs.cs, &env->segs[R_CS]); 3541 set_seg(&sregs.ds, &env->segs[R_DS]); 3542 set_seg(&sregs.es, &env->segs[R_ES]); 3543 set_seg(&sregs.fs, &env->segs[R_FS]); 3544 set_seg(&sregs.gs, &env->segs[R_GS]); 3545 set_seg(&sregs.ss, &env->segs[R_SS]); 3546 } 3547 3548 set_seg(&sregs.tr, &env->tr); 3549 set_seg(&sregs.ldt, &env->ldt); 3550 3551 sregs.idt.limit = env->idt.limit; 3552 sregs.idt.base = env->idt.base; 3553 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding); 3554 sregs.gdt.limit = env->gdt.limit; 3555 sregs.gdt.base = env->gdt.base; 3556 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding); 3557 3558 sregs.cr0 = env->cr[0]; 3559 sregs.cr2 = env->cr[2]; 3560 sregs.cr3 = env->cr[3]; 3561 sregs.cr4 = env->cr[4]; 3562 3563 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state); 3564 sregs.apic_base = cpu_get_apic_base(cpu->apic_state); 3565 3566 sregs.efer = env->efer; 3567 3568 if (env->pdptrs_valid) { 3569 for (i = 0; i < 4; i++) { 3570 sregs.pdptrs[i] = env->pdptrs[i]; 3571 } 3572 sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; 3573 } 3574 3575 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs); 3576 } 3577 3578 3579 static void kvm_msr_buf_reset(X86CPU *cpu) 3580 { 3581 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE); 3582 } 3583 3584 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value) 3585 { 3586 struct kvm_msrs *msrs = cpu->kvm_msr_buf; 3587 void *limit = ((void *)msrs) + MSR_BUF_SIZE; 3588 struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs]; 3589 3590 assert((void *)(entry + 1) <= limit); 3591 3592 entry->index = index; 3593 entry->reserved = 0; 3594 entry->data = value; 3595 
msrs->nmsrs++; 3596 } 3597 3598 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value) 3599 { 3600 kvm_msr_buf_reset(cpu); 3601 kvm_msr_entry_add(cpu, index, value); 3602 3603 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 3604 } 3605 3606 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value) 3607 { 3608 int ret; 3609 struct { 3610 struct kvm_msrs info; 3611 struct kvm_msr_entry entries[1]; 3612 } msr_data = { 3613 .info.nmsrs = 1, 3614 .entries[0].index = index, 3615 }; 3616 3617 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); 3618 if (ret < 0) { 3619 return ret; 3620 } 3621 assert(ret == 1); 3622 *value = msr_data.entries[0].data; 3623 return ret; 3624 } 3625 void kvm_put_apicbase(X86CPU *cpu, uint64_t value) 3626 { 3627 int ret; 3628 3629 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value); 3630 assert(ret == 1); 3631 } 3632 3633 static int kvm_put_tscdeadline_msr(X86CPU *cpu) 3634 { 3635 CPUX86State *env = &cpu->env; 3636 int ret; 3637 3638 if (!has_msr_tsc_deadline) { 3639 return 0; 3640 } 3641 3642 ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline); 3643 if (ret < 0) { 3644 return ret; 3645 } 3646 3647 assert(ret == 1); 3648 return 0; 3649 } 3650 3651 /* 3652 * Provide a separate write service for the feature control MSR in order to 3653 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done 3654 * before writing any other state because forcibly leaving nested mode 3655 * invalidates the VCPU state. 3656 */ 3657 static int kvm_put_msr_feature_control(X86CPU *cpu) 3658 { 3659 int ret; 3660 3661 if (!has_msr_feature_control) { 3662 return 0; 3663 } 3664 3665 ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL, 3666 cpu->env.msr_ia32_feature_control); 3667 if (ret < 0) { 3668 return ret; 3669 } 3670 3671 assert(ret == 1); 3672 return 0; 3673 } 3674 3675 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features) 3676 { 3677 uint32_t default1, can_be_one, can_be_zero; 3678 uint32_t must_be_one; 3679 3680 switch (index) { 3681 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 3682 default1 = 0x00000016; 3683 break; 3684 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 3685 default1 = 0x0401e172; 3686 break; 3687 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 3688 default1 = 0x000011ff; 3689 break; 3690 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 3691 default1 = 0x00036dff; 3692 break; 3693 case MSR_IA32_VMX_PROCBASED_CTLS2: 3694 default1 = 0; 3695 break; 3696 default: 3697 abort(); 3698 } 3699 3700 /* If a feature bit is set, the control can be either set or clear. 3701 * Otherwise the value is limited to either 0 or 1 by default1. 3702 */ 3703 can_be_one = features | default1; 3704 can_be_zero = features | ~default1; 3705 must_be_one = ~can_be_zero; 3706 3707 /* 3708 * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one). 3709 * Bit 32:63 -> 1 if the control bit can be one. 3710 */ 3711 return must_be_one | (((uint64_t)can_be_one) << 32); 3712 } 3713 3714 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) 3715 { 3716 uint64_t kvm_vmx_basic = 3717 kvm_arch_get_supported_msr_feature(kvm_state, 3718 MSR_IA32_VMX_BASIC); 3719 3720 if (!kvm_vmx_basic) { 3721 /* If the kernel doesn't support VMX feature (kvm_intel.nested=0), 3722 * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail. 
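         * Returning early here skips the whole VMX MSR block; the guest
         * is then simply left without nested VMX controls.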
3723 */ 3724 return; 3725 } 3726 3727 uint64_t kvm_vmx_misc = 3728 kvm_arch_get_supported_msr_feature(kvm_state, 3729 MSR_IA32_VMX_MISC); 3730 uint64_t kvm_vmx_ept_vpid = 3731 kvm_arch_get_supported_msr_feature(kvm_state, 3732 MSR_IA32_VMX_EPT_VPID_CAP); 3733 3734 /* 3735 * If the guest is 64-bit, a value of 1 is allowed for the host address 3736 * space size vmexit control. 3737 */ 3738 uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM 3739 ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0; 3740 3741 /* 3742 * Bits 0-30, 32-44 and 50-53 come from the host. KVM should 3743 * not change them for backwards compatibility. 3744 */ 3745 uint64_t fixed_vmx_basic = kvm_vmx_basic & 3746 (MSR_VMX_BASIC_VMCS_REVISION_MASK | 3747 MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK | 3748 MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK); 3749 3750 /* 3751 * Same for bits 0-4 and 25-27. Bits 16-24 (CR3 target count) can 3752 * change in the future but are always zero for now, clear them to be 3753 * future proof. Bits 32-63 in theory could change, though KVM does 3754 * not support dual-monitor treatment and probably never will; mask 3755 * them out as well. 3756 */ 3757 uint64_t fixed_vmx_misc = kvm_vmx_misc & 3758 (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK | 3759 MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK); 3760 3761 /* 3762 * EPT memory types should not change either, so we do not bother 3763 * adding features for them. 3764 */ 3765 uint64_t fixed_vmx_ept_mask = 3766 (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ? 3767 MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0); 3768 uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask; 3769 3770 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 3771 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 3772 f[FEAT_VMX_PROCBASED_CTLS])); 3773 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, 3774 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS, 3775 f[FEAT_VMX_PINBASED_CTLS])); 3776 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, 3777 make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS, 3778 f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit); 3779 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, 3780 make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS, 3781 f[FEAT_VMX_ENTRY_CTLS])); 3782 kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2, 3783 make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2, 3784 f[FEAT_VMX_SECONDARY_CTLS])); 3785 kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP, 3786 f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid); 3787 kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC, 3788 f[FEAT_VMX_BASIC] | fixed_vmx_basic); 3789 kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC, 3790 f[FEAT_VMX_MISC] | fixed_vmx_misc); 3791 if (has_msr_vmx_vmfunc) { 3792 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]); 3793 } 3794 3795 /* 3796 * Just to be safe, write these with constant values. The CRn_FIXED1 3797 * MSRs are generated by KVM based on the vCPU's CPUID. 3798 */ 3799 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0, 3800 CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK); 3801 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, 3802 CR4_VMXE_MASK); 3803 3804 if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 3805 /* FRED injected-event data (0x2052). */ 3806 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52); 3807 } else if (f[FEAT_VMX_EXIT_CTLS] & 3808 VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) { 3809 /* Secondary VM-exit controls (0x2044). 
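         * (Each value programmed into VMX_VMCS_ENUM here is bits 9:1 of
         * the named VMCS field encoding, i.e. the highest field index
         * the guest may expect to exist.)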
*/ 3810 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44); 3811 } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { 3812 /* TSC multiplier (0x2032). */ 3813 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32); 3814 } else { 3815 /* Preemption timer (0x482E). */ 3816 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E); 3817 } 3818 } 3819 3820 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f) 3821 { 3822 uint64_t kvm_perf_cap = 3823 kvm_arch_get_supported_msr_feature(kvm_state, 3824 MSR_IA32_PERF_CAPABILITIES); 3825 3826 if (kvm_perf_cap) { 3827 kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES, 3828 kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]); 3829 } 3830 } 3831 3832 static int kvm_buf_set_msrs(X86CPU *cpu) 3833 { 3834 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 3835 if (ret < 0) { 3836 return ret; 3837 } 3838 3839 if (ret < cpu->kvm_msr_buf->nmsrs) { 3840 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 3841 error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, 3842 (uint32_t)e->index, (uint64_t)e->data); 3843 } 3844 3845 assert(ret == cpu->kvm_msr_buf->nmsrs); 3846 return 0; 3847 } 3848 3849 static void kvm_init_msrs(X86CPU *cpu) 3850 { 3851 CPUX86State *env = &cpu->env; 3852 3853 kvm_msr_buf_reset(cpu); 3854 if (has_msr_arch_capabs) { 3855 kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, 3856 env->features[FEAT_ARCH_CAPABILITIES]); 3857 } 3858 3859 if (has_msr_core_capabs) { 3860 kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, 3861 env->features[FEAT_CORE_CAPABILITY]); 3862 } 3863 3864 if (has_msr_perf_capabs && cpu->enable_pmu) { 3865 kvm_msr_entry_add_perf(cpu, env->features); 3866 } 3867 3868 if (has_msr_ucode_rev) { 3869 kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); 3870 } 3871 3872 /* 3873 * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but 3874 * all kernels with MSR features should have them. 
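     * Gating on kvm_feature_msrs therefore avoids trying to write VMX
     * capability MSRs on kernels that cannot report them.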
3875 */ 3876 if (kvm_feature_msrs && cpu_has_vmx(env)) { 3877 kvm_msr_entry_add_vmx(cpu, env->features); 3878 } 3879 3880 assert(kvm_buf_set_msrs(cpu) == 0); 3881 } 3882 3883 static int kvm_put_msrs(X86CPU *cpu, int level) 3884 { 3885 CPUX86State *env = &cpu->env; 3886 int i; 3887 3888 kvm_msr_buf_reset(cpu); 3889 3890 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs); 3891 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp); 3892 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip); 3893 kvm_msr_entry_add(cpu, MSR_PAT, env->pat); 3894 if (has_msr_star) { 3895 kvm_msr_entry_add(cpu, MSR_STAR, env->star); 3896 } 3897 if (has_msr_hsave_pa) { 3898 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave); 3899 } 3900 if (has_msr_tsc_aux) { 3901 kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux); 3902 } 3903 if (has_msr_tsc_adjust) { 3904 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust); 3905 } 3906 if (has_msr_misc_enable) { 3907 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 3908 env->msr_ia32_misc_enable); 3909 } 3910 if (has_msr_smbase) { 3911 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase); 3912 } 3913 if (has_msr_smi_count) { 3914 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count); 3915 } 3916 if (has_msr_pkrs) { 3917 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs); 3918 } 3919 if (has_msr_bndcfgs) { 3920 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs); 3921 } 3922 if (has_msr_xss) { 3923 kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss); 3924 } 3925 if (has_msr_umwait) { 3926 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait); 3927 } 3928 if (has_msr_spec_ctrl) { 3929 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl); 3930 } 3931 if (has_tsc_scale_msr) { 3932 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr); 3933 } 3934 3935 if (has_msr_tsx_ctrl) { 3936 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl); 3937 } 3938 if (has_msr_virt_ssbd) { 3939 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd); 3940 } 3941 if (has_msr_hwcr) { 3942 kvm_msr_entry_add(cpu, MSR_K7_HWCR, env->msr_hwcr); 3943 } 3944 3945 #ifdef TARGET_X86_64 3946 if (lm_capable_kernel) { 3947 kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar); 3948 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase); 3949 kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask); 3950 kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar); 3951 if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 3952 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0); 3953 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1); 3954 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2); 3955 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3); 3956 kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls); 3957 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1); 3958 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2); 3959 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3); 3960 kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config); 3961 } 3962 } 3963 #endif 3964 3965 /* 3966 * The following MSRs have side effects on the guest or are too heavy 3967 * for normal writeback. Limit them to reset or full state updates. 
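     * That is, they are only written when level >= KVM_PUT_RESET_STATE
     * rather than on every vCPU state synchronization.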
3968 */ 3969 if (level >= KVM_PUT_RESET_STATE) { 3970 kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc); 3971 if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { 3972 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr); 3973 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr); 3974 } 3975 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { 3976 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr); 3977 } 3978 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { 3979 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr); 3980 } 3981 if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { 3982 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr); 3983 } 3984 if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { 3985 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr); 3986 } 3987 3988 if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { 3989 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr); 3990 } 3991 3992 if (has_architectural_pmu_version > 0) { 3993 if (has_architectural_pmu_version > 1) { 3994 /* Stop the counter. */ 3995 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 3996 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 3997 } 3998 3999 /* Set the counter values. */ 4000 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 4001 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4002 env->msr_fixed_counters[i]); 4003 } 4004 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 4005 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 4006 env->msr_gp_counters[i]); 4007 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 4008 env->msr_gp_evtsel[i]); 4009 } 4010 if (has_architectural_pmu_version > 1) { 4011 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 4012 env->msr_global_status); 4013 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4014 env->msr_global_ovf_ctrl); 4015 4016 /* Now start the PMU. 
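 *
 * (MSR_CORE_PERF_GLOBAL_CTRL is the architectural on/off switch: bit n
 * enables general-purpose counter n and bit 32+m enables fixed counter m,
 * so a guest with 4 GP and 3 fixed counters fully enabled stores
 * 0x70000000f here.  Stopping everything above, then reloading the counter
 * values, then restoring the control MSRs keeps the PMU from counting
 * while its state is only half written.)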
*/ 4017 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 4018 env->msr_fixed_ctr_ctrl); 4019 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 4020 env->msr_global_ctrl); 4021 } 4022 } 4023 /* 4024 * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add, 4025 * only sync them to KVM on the first cpu 4026 */ 4027 if (current_cpu == first_cpu) { 4028 if (has_msr_hv_hypercall) { 4029 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 4030 env->msr_hv_guest_os_id); 4031 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 4032 env->msr_hv_hypercall); 4033 } 4034 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 4035 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 4036 env->msr_hv_tsc); 4037 } 4038 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 4039 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 4040 env->msr_hv_reenlightenment_control); 4041 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 4042 env->msr_hv_tsc_emulation_control); 4043 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 4044 env->msr_hv_tsc_emulation_status); 4045 } 4046 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) && 4047 has_msr_hv_syndbg_options) { 4048 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 4049 hyperv_syndbg_query_options()); 4050 } 4051 } 4052 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 4053 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 4054 env->msr_hv_vapic); 4055 } 4056 if (has_msr_hv_crash) { 4057 int j; 4058 4059 for (j = 0; j < HV_CRASH_PARAMS; j++) 4060 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 4061 env->msr_hv_crash_params[j]); 4062 4063 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY); 4064 } 4065 if (has_msr_hv_runtime) { 4066 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime); 4067 } 4068 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) 4069 && hv_vpindex_settable) { 4070 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX, 4071 hyperv_vp_index(CPU(cpu))); 4072 } 4073 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 4074 int j; 4075 4076 kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION); 4077 4078 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 4079 env->msr_hv_synic_control); 4080 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 4081 env->msr_hv_synic_evt_page); 4082 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 4083 env->msr_hv_synic_msg_page); 4084 4085 for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) { 4086 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j, 4087 env->msr_hv_synic_sint[j]); 4088 } 4089 } 4090 if (has_msr_hv_stimer) { 4091 int j; 4092 4093 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) { 4094 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2, 4095 env->msr_hv_stimer_config[j]); 4096 } 4097 4098 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) { 4099 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2, 4100 env->msr_hv_stimer_count[j]); 4101 } 4102 } 4103 if (env->features[FEAT_1_EDX] & CPUID_MTRR) { 4104 uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits); 4105 4106 kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype); 4107 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]); 4108 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]); 4109 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]); 4110 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]); 4111 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]); 4112 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]); 4113 
kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]); 4114 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]); 4115 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]); 4116 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]); 4117 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]); 4118 for (i = 0; i < MSR_MTRRcap_VCNT; i++) { 4119 /* The CPU GPs if we write to a bit above the physical limit of 4120 * the host CPU (and KVM emulates that) 4121 */ 4122 uint64_t mask = env->mtrr_var[i].mask; 4123 mask &= phys_mask; 4124 4125 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 4126 env->mtrr_var[i].base); 4127 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask); 4128 } 4129 } 4130 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) { 4131 int addr_num = kvm_arch_get_supported_cpuid(kvm_state, 4132 0x14, 1, R_EAX) & 0x7; 4133 4134 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 4135 env->msr_rtit_ctrl); 4136 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 4137 env->msr_rtit_status); 4138 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 4139 env->msr_rtit_output_base); 4140 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 4141 env->msr_rtit_output_mask); 4142 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 4143 env->msr_rtit_cr3_match); 4144 for (i = 0; i < addr_num; i++) { 4145 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 4146 env->msr_rtit_addrs[i]); 4147 } 4148 } 4149 4150 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { 4151 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 4152 env->msr_ia32_sgxlepubkeyhash[0]); 4153 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 4154 env->msr_ia32_sgxlepubkeyhash[1]); 4155 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 4156 env->msr_ia32_sgxlepubkeyhash[2]); 4157 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 4158 env->msr_ia32_sgxlepubkeyhash[3]); 4159 } 4160 4161 if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { 4162 kvm_msr_entry_add(cpu, MSR_IA32_XFD, 4163 env->msr_xfd); 4164 kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 4165 env->msr_xfd_err); 4166 } 4167 4168 if (kvm_enabled() && cpu->enable_pmu && 4169 (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) { 4170 uint64_t depth; 4171 int ret; 4172 4173 /* 4174 * Only migrate Arch LBR states when the host Arch LBR depth 4175 * equals that of source guest's, this is to avoid mismatch 4176 * of guest/host config for the msr hence avoid unexpected 4177 * misbehavior. 4178 */ 4179 ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth); 4180 4181 if (ret == 1 && !!depth && depth == env->msr_lbr_depth) { 4182 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl); 4183 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth); 4184 4185 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) { 4186 if (!env->lbr_records[i].from) { 4187 continue; 4188 } 4189 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 4190 env->lbr_records[i].from); 4191 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 4192 env->lbr_records[i].to); 4193 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 4194 env->lbr_records[i].info); 4195 } 4196 } 4197 } 4198 4199 /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see 4200 * kvm_put_msr_feature_control. 
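 *
 * (MSR_IA32_FEATURE_CONTROL is special because of its architectural
 * layout: bit 0 is a lock bit and bits 1/2 gate VMXON inside/outside SMX.
 * Once the lock bit is written the register is frozen until reset, and KVM
 * wants to see it before any nested VMX state is restored, so
 * kvm_arch_put_registers() pushes it ahead of kvm_put_nested_state()
 * instead of batching it here.)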
*/ 4201 } 4202 4203 if (env->mcg_cap) { 4204 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status); 4205 kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl); 4206 if (has_msr_mcg_ext_ctl) { 4207 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl); 4208 } 4209 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 4210 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]); 4211 } 4212 } 4213 4214 return kvm_buf_set_msrs(cpu); 4215 } 4216 4217 4218 static int kvm_get_xsave(X86CPU *cpu) 4219 { 4220 CPUX86State *env = &cpu->env; 4221 void *xsave = env->xsave_buf; 4222 unsigned long type; 4223 int ret; 4224 4225 type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE; 4226 ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave); 4227 if (ret < 0) { 4228 return ret; 4229 } 4230 x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len); 4231 4232 return 0; 4233 } 4234 4235 static int kvm_get_xcrs(X86CPU *cpu) 4236 { 4237 CPUX86State *env = &cpu->env; 4238 int i, ret; 4239 struct kvm_xcrs xcrs; 4240 4241 if (!has_xcrs) { 4242 return 0; 4243 } 4244 4245 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs); 4246 if (ret < 0) { 4247 return ret; 4248 } 4249 4250 for (i = 0; i < xcrs.nr_xcrs; i++) { 4251 /* Only support xcr0 now */ 4252 if (xcrs.xcrs[i].xcr == 0) { 4253 env->xcr0 = xcrs.xcrs[i].value; 4254 break; 4255 } 4256 } 4257 return 0; 4258 } 4259 4260 static int kvm_get_sregs(X86CPU *cpu) 4261 { 4262 CPUX86State *env = &cpu->env; 4263 struct kvm_sregs sregs; 4264 int ret; 4265 4266 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs); 4267 if (ret < 0) { 4268 return ret; 4269 } 4270 4271 /* 4272 * The interrupt_bitmap is ignored because KVM_GET_SREGS is 4273 * always preceded by KVM_GET_VCPU_EVENTS. 4274 */ 4275 4276 get_seg(&env->segs[R_CS], &sregs.cs); 4277 get_seg(&env->segs[R_DS], &sregs.ds); 4278 get_seg(&env->segs[R_ES], &sregs.es); 4279 get_seg(&env->segs[R_FS], &sregs.fs); 4280 get_seg(&env->segs[R_GS], &sregs.gs); 4281 get_seg(&env->segs[R_SS], &sregs.ss); 4282 4283 get_seg(&env->tr, &sregs.tr); 4284 get_seg(&env->ldt, &sregs.ldt); 4285 4286 env->idt.limit = sregs.idt.limit; 4287 env->idt.base = sregs.idt.base; 4288 env->gdt.limit = sregs.gdt.limit; 4289 env->gdt.base = sregs.gdt.base; 4290 4291 env->cr[0] = sregs.cr0; 4292 env->cr[2] = sregs.cr2; 4293 env->cr[3] = sregs.cr3; 4294 env->cr[4] = sregs.cr4; 4295 4296 env->efer = sregs.efer; 4297 if (sev_es_enabled() && env->efer & MSR_EFER_LME && 4298 env->cr[0] & CR0_PG_MASK) { 4299 env->efer |= MSR_EFER_LMA; 4300 } 4301 4302 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */ 4303 x86_update_hflags(env); 4304 4305 return 0; 4306 } 4307 4308 static int kvm_get_sregs2(X86CPU *cpu) 4309 { 4310 CPUX86State *env = &cpu->env; 4311 struct kvm_sregs2 sregs; 4312 int i, ret; 4313 4314 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs); 4315 if (ret < 0) { 4316 return ret; 4317 } 4318 4319 get_seg(&env->segs[R_CS], &sregs.cs); 4320 get_seg(&env->segs[R_DS], &sregs.ds); 4321 get_seg(&env->segs[R_ES], &sregs.es); 4322 get_seg(&env->segs[R_FS], &sregs.fs); 4323 get_seg(&env->segs[R_GS], &sregs.gs); 4324 get_seg(&env->segs[R_SS], &sregs.ss); 4325 4326 get_seg(&env->tr, &sregs.tr); 4327 get_seg(&env->ldt, &sregs.ldt); 4328 4329 env->idt.limit = sregs.idt.limit; 4330 env->idt.base = sregs.idt.base; 4331 env->gdt.limit = sregs.gdt.limit; 4332 env->gdt.base = sregs.gdt.base; 4333 4334 env->cr[0] = sregs.cr0; 4335 env->cr[2] = sregs.cr2; 4336 env->cr[3] = sregs.cr3; 4337 env->cr[4] = sregs.cr4; 4338 4339 env->efer = sregs.efer; 4340 if 
(sev_es_enabled() && env->efer & MSR_EFER_LME && 4341 env->cr[0] & CR0_PG_MASK) { 4342 env->efer |= MSR_EFER_LMA; 4343 } 4344 4345 env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; 4346 4347 if (env->pdptrs_valid) { 4348 for (i = 0; i < 4; i++) { 4349 env->pdptrs[i] = sregs.pdptrs[i]; 4350 } 4351 } 4352 4353 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */ 4354 x86_update_hflags(env); 4355 4356 return 0; 4357 } 4358 4359 static int kvm_get_msrs(X86CPU *cpu) 4360 { 4361 CPUX86State *env = &cpu->env; 4362 struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries; 4363 int ret, i; 4364 uint64_t mtrr_top_bits; 4365 4366 kvm_msr_buf_reset(cpu); 4367 4368 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0); 4369 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0); 4370 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0); 4371 kvm_msr_entry_add(cpu, MSR_PAT, 0); 4372 if (has_msr_star) { 4373 kvm_msr_entry_add(cpu, MSR_STAR, 0); 4374 } 4375 if (has_msr_hsave_pa) { 4376 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0); 4377 } 4378 if (has_msr_tsc_aux) { 4379 kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0); 4380 } 4381 if (has_msr_tsc_adjust) { 4382 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0); 4383 } 4384 if (has_msr_tsc_deadline) { 4385 kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0); 4386 } 4387 if (has_msr_misc_enable) { 4388 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0); 4389 } 4390 if (has_msr_smbase) { 4391 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0); 4392 } 4393 if (has_msr_smi_count) { 4394 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0); 4395 } 4396 if (has_msr_feature_control) { 4397 kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0); 4398 } 4399 if (has_msr_pkrs) { 4400 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0); 4401 } 4402 if (has_msr_bndcfgs) { 4403 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0); 4404 } 4405 if (has_msr_xss) { 4406 kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0); 4407 } 4408 if (has_msr_umwait) { 4409 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0); 4410 } 4411 if (has_msr_spec_ctrl) { 4412 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0); 4413 } 4414 if (has_tsc_scale_msr) { 4415 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0); 4416 } 4417 4418 if (has_msr_tsx_ctrl) { 4419 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0); 4420 } 4421 if (has_msr_virt_ssbd) { 4422 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0); 4423 } 4424 if (!env->tsc_valid) { 4425 kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0); 4426 env->tsc_valid = !runstate_is_running(); 4427 } 4428 if (has_msr_hwcr) { 4429 kvm_msr_entry_add(cpu, MSR_K7_HWCR, 0); 4430 } 4431 4432 #ifdef TARGET_X86_64 4433 if (lm_capable_kernel) { 4434 kvm_msr_entry_add(cpu, MSR_CSTAR, 0); 4435 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0); 4436 kvm_msr_entry_add(cpu, MSR_FMASK, 0); 4437 kvm_msr_entry_add(cpu, MSR_LSTAR, 0); 4438 if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 4439 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0); 4440 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0); 4441 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0); 4442 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0); 4443 kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0); 4444 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0); 4445 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0); 4446 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0); 4447 kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0); 4448 } 4449 } 4450 #endif 4451 if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { 4452 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); 4453 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0); 4454 } 
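    /*
     * Every entry queued in this function carries a placeholder value; only
     * the index matters, and the data field is filled in by the KVM_GET_MSRS
     * ioctl further down, before the big switch copies it back into
     * CPUX86State.  The record being filled is the plain UAPI layout from
     * <linux/kvm.h>:
     *
     *     struct kvm_msr_entry {
     *         __u32 index;
     *         __u32 reserved;
     *         __u64 data;
     *     };
     */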
4455 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { 4456 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0); 4457 } 4458 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { 4459 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0); 4460 } 4461 if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { 4462 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0); 4463 } 4464 if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { 4465 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0); 4466 } 4467 if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { 4468 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1); 4469 } 4470 if (has_architectural_pmu_version > 0) { 4471 if (has_architectural_pmu_version > 1) { 4472 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 4473 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 4474 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0); 4475 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0); 4476 } 4477 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 4478 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0); 4479 } 4480 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 4481 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0); 4482 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0); 4483 } 4484 } 4485 4486 if (env->mcg_cap) { 4487 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0); 4488 kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0); 4489 if (has_msr_mcg_ext_ctl) { 4490 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0); 4491 } 4492 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 4493 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0); 4494 } 4495 } 4496 4497 if (has_msr_hv_hypercall) { 4498 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0); 4499 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0); 4500 } 4501 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 4502 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0); 4503 } 4504 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 4505 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0); 4506 } 4507 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 4508 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); 4509 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0); 4510 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0); 4511 } 4512 if (has_msr_hv_syndbg_options) { 4513 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0); 4514 } 4515 if (has_msr_hv_crash) { 4516 int j; 4517 4518 for (j = 0; j < HV_CRASH_PARAMS; j++) { 4519 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0); 4520 } 4521 } 4522 if (has_msr_hv_runtime) { 4523 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0); 4524 } 4525 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 4526 uint32_t msr; 4527 4528 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0); 4529 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0); 4530 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0); 4531 for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) { 4532 kvm_msr_entry_add(cpu, msr, 0); 4533 } 4534 } 4535 if (has_msr_hv_stimer) { 4536 uint32_t msr; 4537 4538 for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT; 4539 msr++) { 4540 kvm_msr_entry_add(cpu, msr, 0); 4541 } 4542 } 4543 if (env->features[FEAT_1_EDX] & CPUID_MTRR) { 4544 kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0); 4545 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0); 4546 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0); 4547 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0); 4548 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0); 4549 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0); 4550 kvm_msr_entry_add(cpu, 
MSR_MTRRfix4K_D0000, 0); 4551 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0); 4552 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0); 4553 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0); 4554 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0); 4555 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0); 4556 for (i = 0; i < MSR_MTRRcap_VCNT; i++) { 4557 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0); 4558 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0); 4559 } 4560 } 4561 4562 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) { 4563 int addr_num = 4564 kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7; 4565 4566 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0); 4567 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0); 4568 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0); 4569 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0); 4570 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0); 4571 for (i = 0; i < addr_num; i++) { 4572 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0); 4573 } 4574 } 4575 4576 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { 4577 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0); 4578 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0); 4579 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0); 4580 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0); 4581 } 4582 4583 if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { 4584 kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0); 4585 kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0); 4586 } 4587 4588 if (kvm_enabled() && cpu->enable_pmu && 4589 (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) { 4590 uint64_t depth; 4591 4592 ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth); 4593 if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) { 4594 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0); 4595 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0); 4596 4597 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) { 4598 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0); 4599 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0); 4600 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0); 4601 } 4602 } 4603 } 4604 4605 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); 4606 if (ret < 0) { 4607 return ret; 4608 } 4609 4610 if (ret < cpu->kvm_msr_buf->nmsrs) { 4611 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 4612 error_report("error: failed to get MSR 0x%" PRIx32, 4613 (uint32_t)e->index); 4614 } 4615 4616 assert(ret == cpu->kvm_msr_buf->nmsrs); 4617 /* 4618 * MTRR masks: Each mask consists of 5 parts: 4619 * a 10..0: must be zero 4620 * b 11 : valid bit 4621 * c n-1..12: actual mask bits 4622 * d 51..n: reserved, must be zero 4623 * e 63..52: reserved, must be zero 4624 * 4625 * 'n' is the number of physical bits supported by the CPU and is 4626 * apparently always <= 52. We know our 'n' but don't know what 4627 * the destination's 'n' is; it might be smaller, in which case 4628 * it masks (c) on loading. It might be larger, in which case 4629 * we fill 'd' so that d..c is consistent irrespective of the 'n' 4630 * we're migrating to.
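 *
 * Worked example, assuming a source host with phys_bits = 46: the code
 * below computes
 *
 *     mtrr_top_bits = MAKE_64BIT_MASK(46, 52 - 46) = 0x000fc00000000000
 *
 * and ORs it into every variable-range mask read back from KVM, so bits
 * 51..12 stay well defined no matter how many physical bits the
 * destination host implements.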
4631 */ 4632 4633 if (cpu->fill_mtrr_mask) { 4634 QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52); 4635 assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS); 4636 mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits); 4637 } else { 4638 mtrr_top_bits = 0; 4639 } 4640 4641 for (i = 0; i < ret; i++) { 4642 uint32_t index = msrs[i].index; 4643 switch (index) { 4644 case MSR_IA32_SYSENTER_CS: 4645 env->sysenter_cs = msrs[i].data; 4646 break; 4647 case MSR_IA32_SYSENTER_ESP: 4648 env->sysenter_esp = msrs[i].data; 4649 break; 4650 case MSR_IA32_SYSENTER_EIP: 4651 env->sysenter_eip = msrs[i].data; 4652 break; 4653 case MSR_PAT: 4654 env->pat = msrs[i].data; 4655 break; 4656 case MSR_STAR: 4657 env->star = msrs[i].data; 4658 break; 4659 #ifdef TARGET_X86_64 4660 case MSR_CSTAR: 4661 env->cstar = msrs[i].data; 4662 break; 4663 case MSR_KERNELGSBASE: 4664 env->kernelgsbase = msrs[i].data; 4665 break; 4666 case MSR_FMASK: 4667 env->fmask = msrs[i].data; 4668 break; 4669 case MSR_LSTAR: 4670 env->lstar = msrs[i].data; 4671 break; 4672 case MSR_IA32_FRED_RSP0: 4673 env->fred_rsp0 = msrs[i].data; 4674 break; 4675 case MSR_IA32_FRED_RSP1: 4676 env->fred_rsp1 = msrs[i].data; 4677 break; 4678 case MSR_IA32_FRED_RSP2: 4679 env->fred_rsp2 = msrs[i].data; 4680 break; 4681 case MSR_IA32_FRED_RSP3: 4682 env->fred_rsp3 = msrs[i].data; 4683 break; 4684 case MSR_IA32_FRED_STKLVLS: 4685 env->fred_stklvls = msrs[i].data; 4686 break; 4687 case MSR_IA32_FRED_SSP1: 4688 env->fred_ssp1 = msrs[i].data; 4689 break; 4690 case MSR_IA32_FRED_SSP2: 4691 env->fred_ssp2 = msrs[i].data; 4692 break; 4693 case MSR_IA32_FRED_SSP3: 4694 env->fred_ssp3 = msrs[i].data; 4695 break; 4696 case MSR_IA32_FRED_CONFIG: 4697 env->fred_config = msrs[i].data; 4698 break; 4699 #endif 4700 case MSR_IA32_TSC: 4701 env->tsc = msrs[i].data; 4702 break; 4703 case MSR_TSC_AUX: 4704 env->tsc_aux = msrs[i].data; 4705 break; 4706 case MSR_TSC_ADJUST: 4707 env->tsc_adjust = msrs[i].data; 4708 break; 4709 case MSR_IA32_TSCDEADLINE: 4710 env->tsc_deadline = msrs[i].data; 4711 break; 4712 case MSR_VM_HSAVE_PA: 4713 env->vm_hsave = msrs[i].data; 4714 break; 4715 case MSR_KVM_SYSTEM_TIME: 4716 env->system_time_msr = msrs[i].data; 4717 break; 4718 case MSR_KVM_WALL_CLOCK: 4719 env->wall_clock_msr = msrs[i].data; 4720 break; 4721 case MSR_MCG_STATUS: 4722 env->mcg_status = msrs[i].data; 4723 break; 4724 case MSR_MCG_CTL: 4725 env->mcg_ctl = msrs[i].data; 4726 break; 4727 case MSR_MCG_EXT_CTL: 4728 env->mcg_ext_ctl = msrs[i].data; 4729 break; 4730 case MSR_IA32_MISC_ENABLE: 4731 env->msr_ia32_misc_enable = msrs[i].data; 4732 break; 4733 case MSR_IA32_SMBASE: 4734 env->smbase = msrs[i].data; 4735 break; 4736 case MSR_SMI_COUNT: 4737 env->msr_smi_count = msrs[i].data; 4738 break; 4739 case MSR_IA32_FEATURE_CONTROL: 4740 env->msr_ia32_feature_control = msrs[i].data; 4741 break; 4742 case MSR_IA32_BNDCFGS: 4743 env->msr_bndcfgs = msrs[i].data; 4744 break; 4745 case MSR_IA32_XSS: 4746 env->xss = msrs[i].data; 4747 break; 4748 case MSR_IA32_UMWAIT_CONTROL: 4749 env->umwait = msrs[i].data; 4750 break; 4751 case MSR_IA32_PKRS: 4752 env->pkrs = msrs[i].data; 4753 break; 4754 default: 4755 if (msrs[i].index >= MSR_MC0_CTL && 4756 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) { 4757 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data; 4758 } 4759 break; 4760 case MSR_KVM_ASYNC_PF_EN: 4761 env->async_pf_en_msr = msrs[i].data; 4762 break; 4763 case MSR_KVM_ASYNC_PF_INT: 4764 env->async_pf_int_msr = msrs[i].data; 4765 break; 4766 case 
MSR_KVM_PV_EOI_EN: 4767 env->pv_eoi_en_msr = msrs[i].data; 4768 break; 4769 case MSR_KVM_STEAL_TIME: 4770 env->steal_time_msr = msrs[i].data; 4771 break; 4772 case MSR_KVM_POLL_CONTROL: { 4773 env->poll_control_msr = msrs[i].data; 4774 break; 4775 } 4776 case MSR_CORE_PERF_FIXED_CTR_CTRL: 4777 env->msr_fixed_ctr_ctrl = msrs[i].data; 4778 break; 4779 case MSR_CORE_PERF_GLOBAL_CTRL: 4780 env->msr_global_ctrl = msrs[i].data; 4781 break; 4782 case MSR_CORE_PERF_GLOBAL_STATUS: 4783 env->msr_global_status = msrs[i].data; 4784 break; 4785 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 4786 env->msr_global_ovf_ctrl = msrs[i].data; 4787 break; 4788 case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1: 4789 env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data; 4790 break; 4791 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1: 4792 env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data; 4793 break; 4794 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1: 4795 env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data; 4796 break; 4797 case HV_X64_MSR_HYPERCALL: 4798 env->msr_hv_hypercall = msrs[i].data; 4799 break; 4800 case HV_X64_MSR_GUEST_OS_ID: 4801 env->msr_hv_guest_os_id = msrs[i].data; 4802 break; 4803 case HV_X64_MSR_APIC_ASSIST_PAGE: 4804 env->msr_hv_vapic = msrs[i].data; 4805 break; 4806 case HV_X64_MSR_REFERENCE_TSC: 4807 env->msr_hv_tsc = msrs[i].data; 4808 break; 4809 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 4810 env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data; 4811 break; 4812 case HV_X64_MSR_VP_RUNTIME: 4813 env->msr_hv_runtime = msrs[i].data; 4814 break; 4815 case HV_X64_MSR_SCONTROL: 4816 env->msr_hv_synic_control = msrs[i].data; 4817 break; 4818 case HV_X64_MSR_SIEFP: 4819 env->msr_hv_synic_evt_page = msrs[i].data; 4820 break; 4821 case HV_X64_MSR_SIMP: 4822 env->msr_hv_synic_msg_page = msrs[i].data; 4823 break; 4824 case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: 4825 env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data; 4826 break; 4827 case HV_X64_MSR_STIMER0_CONFIG: 4828 case HV_X64_MSR_STIMER1_CONFIG: 4829 case HV_X64_MSR_STIMER2_CONFIG: 4830 case HV_X64_MSR_STIMER3_CONFIG: 4831 env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] = 4832 msrs[i].data; 4833 break; 4834 case HV_X64_MSR_STIMER0_COUNT: 4835 case HV_X64_MSR_STIMER1_COUNT: 4836 case HV_X64_MSR_STIMER2_COUNT: 4837 case HV_X64_MSR_STIMER3_COUNT: 4838 env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] = 4839 msrs[i].data; 4840 break; 4841 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 4842 env->msr_hv_reenlightenment_control = msrs[i].data; 4843 break; 4844 case HV_X64_MSR_TSC_EMULATION_CONTROL: 4845 env->msr_hv_tsc_emulation_control = msrs[i].data; 4846 break; 4847 case HV_X64_MSR_TSC_EMULATION_STATUS: 4848 env->msr_hv_tsc_emulation_status = msrs[i].data; 4849 break; 4850 case HV_X64_MSR_SYNDBG_OPTIONS: 4851 env->msr_hv_syndbg_options = msrs[i].data; 4852 break; 4853 case MSR_MTRRdefType: 4854 env->mtrr_deftype = msrs[i].data; 4855 break; 4856 case MSR_MTRRfix64K_00000: 4857 env->mtrr_fixed[0] = msrs[i].data; 4858 break; 4859 case MSR_MTRRfix16K_80000: 4860 env->mtrr_fixed[1] = msrs[i].data; 4861 break; 4862 case MSR_MTRRfix16K_A0000: 4863 env->mtrr_fixed[2] = msrs[i].data; 4864 break; 4865 case MSR_MTRRfix4K_C0000: 4866 env->mtrr_fixed[3] = msrs[i].data; 4867 break; 4868 case MSR_MTRRfix4K_C8000: 4869 env->mtrr_fixed[4] = msrs[i].data; 4870 break; 4871 case MSR_MTRRfix4K_D0000: 4872 env->mtrr_fixed[5] = msrs[i].data; 4873 break; 4874 case MSR_MTRRfix4K_D8000: 4875 env->mtrr_fixed[6] = msrs[i].data; 4876 break; 4877 case MSR_MTRRfix4K_E0000: 4878 env->mtrr_fixed[7] = msrs[i].data; 4879 break; 4880 case MSR_MTRRfix4K_E8000: 4881 env->mtrr_fixed[8] = msrs[i].data; 4882 break; 4883 case MSR_MTRRfix4K_F0000: 4884 env->mtrr_fixed[9] = msrs[i].data; 4885 break; 4886 case MSR_MTRRfix4K_F8000: 4887 env->mtrr_fixed[10] = msrs[i].data; 4888 break; 4889 case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1): 4890 if (index & 1) { 4891 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data | 4892 mtrr_top_bits; 4893 } else { 4894 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data; 4895 } 4896 break; 4897 case MSR_IA32_SPEC_CTRL: 4898 env->spec_ctrl = msrs[i].data; 4899 break; 4900 case MSR_AMD64_TSC_RATIO: 4901 env->amd_tsc_scale_msr = msrs[i].data; 4902 break; 4903 case MSR_IA32_TSX_CTRL: 4904 env->tsx_ctrl = msrs[i].data; 4905 break; 4906 case MSR_VIRT_SSBD: 4907 env->virt_ssbd = msrs[i].data; 4908 break; 4909 case MSR_IA32_RTIT_CTL: 4910 env->msr_rtit_ctrl = msrs[i].data; 4911 break; 4912 case MSR_IA32_RTIT_STATUS: 4913 env->msr_rtit_status = msrs[i].data; 4914 break; 4915 case MSR_IA32_RTIT_OUTPUT_BASE: 4916 env->msr_rtit_output_base = msrs[i].data; 4917 break; 4918 case MSR_IA32_RTIT_OUTPUT_MASK: 4919 env->msr_rtit_output_mask = msrs[i].data; 4920 break; 4921 case MSR_IA32_RTIT_CR3_MATCH: 4922 env->msr_rtit_cr3_match = msrs[i].data; 4923 break; 4924 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 4925 env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data; 4926 break; 4927 case MSR_IA32_SGXLEPUBKEYHASH0 ... 
MSR_IA32_SGXLEPUBKEYHASH3: 4928 env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] = 4929 msrs[i].data; 4930 break; 4931 case MSR_IA32_XFD: 4932 env->msr_xfd = msrs[i].data; 4933 break; 4934 case MSR_IA32_XFD_ERR: 4935 env->msr_xfd_err = msrs[i].data; 4936 break; 4937 case MSR_ARCH_LBR_CTL: 4938 env->msr_lbr_ctl = msrs[i].data; 4939 break; 4940 case MSR_ARCH_LBR_DEPTH: 4941 env->msr_lbr_depth = msrs[i].data; 4942 break; 4943 case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31: 4944 env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data; 4945 break; 4946 case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31: 4947 env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data; 4948 break; 4949 case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31: 4950 env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data; 4951 break; 4952 case MSR_K7_HWCR: 4953 env->msr_hwcr = msrs[i].data; 4954 break; 4955 } 4956 } 4957 4958 return 0; 4959 } 4960 4961 static int kvm_put_mp_state(X86CPU *cpu) 4962 { 4963 struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state }; 4964 4965 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state); 4966 } 4967 4968 static int kvm_get_mp_state(X86CPU *cpu) 4969 { 4970 CPUState *cs = CPU(cpu); 4971 CPUX86State *env = &cpu->env; 4972 struct kvm_mp_state mp_state; 4973 int ret; 4974 4975 ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); 4976 if (ret < 0) { 4977 return ret; 4978 } 4979 env->mp_state = mp_state.mp_state; 4980 if (kvm_irqchip_in_kernel()) { 4981 cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED); 4982 } 4983 return 0; 4984 } 4985 4986 static int kvm_get_apic(X86CPU *cpu) 4987 { 4988 DeviceState *apic = cpu->apic_state; 4989 struct kvm_lapic_state kapic; 4990 int ret; 4991 4992 if (apic && kvm_irqchip_in_kernel()) { 4993 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic); 4994 if (ret < 0) { 4995 return ret; 4996 } 4997 4998 kvm_get_apic_state(apic, &kapic); 4999 } 5000 return 0; 5001 } 5002 5003 static int kvm_put_vcpu_events(X86CPU *cpu, int level) 5004 { 5005 CPUState *cs = CPU(cpu); 5006 CPUX86State *env = &cpu->env; 5007 struct kvm_vcpu_events events = {}; 5008 5009 events.flags = 0; 5010 5011 if (has_exception_payload) { 5012 events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD; 5013 events.exception.pending = env->exception_pending; 5014 events.exception_has_payload = env->exception_has_payload; 5015 events.exception_payload = env->exception_payload; 5016 } 5017 events.exception.nr = env->exception_nr; 5018 events.exception.injected = env->exception_injected; 5019 events.exception.has_error_code = env->has_error_code; 5020 events.exception.error_code = env->error_code; 5021 5022 events.interrupt.injected = (env->interrupt_injected >= 0); 5023 events.interrupt.nr = env->interrupt_injected; 5024 events.interrupt.soft = env->soft_interrupt; 5025 5026 events.nmi.injected = env->nmi_injected; 5027 events.nmi.pending = env->nmi_pending; 5028 events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK); 5029 5030 events.sipi_vector = env->sipi_vector; 5031 5032 if (has_msr_smbase) { 5033 events.flags |= KVM_VCPUEVENT_VALID_SMM; 5034 events.smi.smm = !!(env->hflags & HF_SMM_MASK); 5035 events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK); 5036 if (kvm_irqchip_in_kernel()) { 5037 /* As soon as these are moved to the kernel, remove them 5038 * from cs->interrupt_request. 
5039 */ 5040 events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI; 5041 events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT; 5042 cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI); 5043 } else { 5044 /* Keep these in cs->interrupt_request. */ 5045 events.smi.pending = 0; 5046 events.smi.latched_init = 0; 5047 } 5048 } 5049 5050 if (level >= KVM_PUT_RESET_STATE) { 5051 events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; 5052 if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 5053 events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR; 5054 } 5055 } 5056 5057 if (has_triple_fault_event) { 5058 events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; 5059 events.triple_fault.pending = env->triple_fault_pending; 5060 } 5061 5062 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); 5063 } 5064 5065 static int kvm_get_vcpu_events(X86CPU *cpu) 5066 { 5067 CPUX86State *env = &cpu->env; 5068 struct kvm_vcpu_events events; 5069 int ret; 5070 5071 memset(&events, 0, sizeof(events)); 5072 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events); 5073 if (ret < 0) { 5074 return ret; 5075 } 5076 5077 if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) { 5078 env->exception_pending = events.exception.pending; 5079 env->exception_has_payload = events.exception_has_payload; 5080 env->exception_payload = events.exception_payload; 5081 } else { 5082 env->exception_pending = 0; 5083 env->exception_has_payload = false; 5084 } 5085 env->exception_injected = events.exception.injected; 5086 env->exception_nr = 5087 (env->exception_pending || env->exception_injected) ? 5088 events.exception.nr : -1; 5089 env->has_error_code = events.exception.has_error_code; 5090 env->error_code = events.exception.error_code; 5091 5092 env->interrupt_injected = 5093 events.interrupt.injected ? 
events.interrupt.nr : -1; 5094 env->soft_interrupt = events.interrupt.soft; 5095 5096 env->nmi_injected = events.nmi.injected; 5097 env->nmi_pending = events.nmi.pending; 5098 if (events.nmi.masked) { 5099 env->hflags2 |= HF2_NMI_MASK; 5100 } else { 5101 env->hflags2 &= ~HF2_NMI_MASK; 5102 } 5103 5104 if (events.flags & KVM_VCPUEVENT_VALID_SMM) { 5105 if (events.smi.smm) { 5106 env->hflags |= HF_SMM_MASK; 5107 } else { 5108 env->hflags &= ~HF_SMM_MASK; 5109 } 5110 if (events.smi.pending) { 5111 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 5112 } else { 5113 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 5114 } 5115 if (events.smi.smm_inside_nmi) { 5116 env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK; 5117 } else { 5118 env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; 5119 } 5120 if (events.smi.latched_init) { 5121 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 5122 } else { 5123 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 5124 } 5125 } 5126 5127 if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { 5128 env->triple_fault_pending = events.triple_fault.pending; 5129 } 5130 5131 env->sipi_vector = events.sipi_vector; 5132 5133 return 0; 5134 } 5135 5136 static int kvm_put_debugregs(X86CPU *cpu) 5137 { 5138 CPUX86State *env = &cpu->env; 5139 struct kvm_debugregs dbgregs; 5140 int i; 5141 5142 memset(&dbgregs, 0, sizeof(dbgregs)); 5143 for (i = 0; i < 4; i++) { 5144 dbgregs.db[i] = env->dr[i]; 5145 } 5146 dbgregs.dr6 = env->dr[6]; 5147 dbgregs.dr7 = env->dr[7]; 5148 dbgregs.flags = 0; 5149 5150 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs); 5151 } 5152 5153 static int kvm_get_debugregs(X86CPU *cpu) 5154 { 5155 CPUX86State *env = &cpu->env; 5156 struct kvm_debugregs dbgregs; 5157 int i, ret; 5158 5159 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs); 5160 if (ret < 0) { 5161 return ret; 5162 } 5163 for (i = 0; i < 4; i++) { 5164 env->dr[i] = dbgregs.db[i]; 5165 } 5166 env->dr[4] = env->dr[6] = dbgregs.dr6; 5167 env->dr[5] = env->dr[7] = dbgregs.dr7; 5168 5169 return 0; 5170 } 5171 5172 static int kvm_put_nested_state(X86CPU *cpu) 5173 { 5174 CPUX86State *env = &cpu->env; 5175 int max_nested_state_len = kvm_max_nested_state_length(); 5176 5177 if (!env->nested_state) { 5178 return 0; 5179 } 5180 5181 /* 5182 * Copy flags that are affected by reset from env->hflags and env->hflags2. 5183 */ 5184 if (env->hflags & HF_GUEST_MASK) { 5185 env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE; 5186 } else { 5187 env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE; 5188 } 5189 5190 /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */ 5191 if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) { 5192 env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET; 5193 } else { 5194 env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET; 5195 } 5196 5197 assert(env->nested_state->size <= max_nested_state_len); 5198 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state); 5199 } 5200 5201 static int kvm_get_nested_state(X86CPU *cpu) 5202 { 5203 CPUX86State *env = &cpu->env; 5204 int max_nested_state_len = kvm_max_nested_state_length(); 5205 int ret; 5206 5207 if (!env->nested_state) { 5208 return 0; 5209 } 5210 5211 /* 5212 * It is possible that migration restored a smaller size into 5213 * nested_state->hdr.size than what our kernel support. 5214 * We preserve migration origin nested_state->hdr.size for 5215 * call to KVM_SET_NESTED_STATE but wish that our next call 5216 * to KVM_GET_NESTED_STATE will use max size our kernel support. 
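 *
 * max_nested_state_len is what KVM_CHECK_EXTENSION(KVM_CAP_NESTED_STATE)
 * reported at vCPU setup time, and the buffer was sized accordingly, so
 * bumping ->size back up here never overruns it.  Roughly sketched, the
 * allocation this relies on looks like:
 *
 *     max_nested_state_len = kvm_max_nested_state_length();
 *     env->nested_state = g_malloc0(max_nested_state_len);
 *     env->nested_state->size = max_nested_state_len;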
5217 */ 5218 env->nested_state->size = max_nested_state_len; 5219 5220 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state); 5221 if (ret < 0) { 5222 return ret; 5223 } 5224 5225 /* 5226 * Copy flags that are affected by reset to env->hflags and env->hflags2. 5227 */ 5228 if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) { 5229 env->hflags |= HF_GUEST_MASK; 5230 } else { 5231 env->hflags &= ~HF_GUEST_MASK; 5232 } 5233 5234 /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */ 5235 if (cpu_has_svm(env)) { 5236 if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) { 5237 env->hflags2 |= HF2_GIF_MASK; 5238 } else { 5239 env->hflags2 &= ~HF2_GIF_MASK; 5240 } 5241 } 5242 5243 return ret; 5244 } 5245 5246 int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp) 5247 { 5248 X86CPU *x86_cpu = X86_CPU(cpu); 5249 int ret; 5250 5251 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 5252 5253 /* 5254 * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX 5255 * root operation upon vCPU reset. kvm_put_msr_feature_control() should also 5256 * precede kvm_put_nested_state() when 'real' nested state is set. 5257 */ 5258 if (level >= KVM_PUT_RESET_STATE) { 5259 ret = kvm_put_msr_feature_control(x86_cpu); 5260 if (ret < 0) { 5261 error_setg_errno(errp, -ret, "Failed to set feature control MSR"); 5262 return ret; 5263 } 5264 } 5265 5266 /* must be before kvm_put_nested_state so that EFER.SVME is set */ 5267 ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu); 5268 if (ret < 0) { 5269 error_setg_errno(errp, -ret, "Failed to set special registers"); 5270 return ret; 5271 } 5272 5273 if (level >= KVM_PUT_RESET_STATE) { 5274 ret = kvm_put_nested_state(x86_cpu); 5275 if (ret < 0) { 5276 error_setg_errno(errp, -ret, "Failed to set nested state"); 5277 return ret; 5278 } 5279 } 5280 5281 if (level == KVM_PUT_FULL_STATE) { 5282 /* We don't check for kvm_arch_set_tsc_khz() errors here, 5283 * because TSC frequency mismatch shouldn't abort migration, 5284 * unless the user explicitly asked for a more strict TSC 5285 * setting (e.g. using an explicit "tsc-freq" option). 
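 *
 * (The heavy lifting is the KVM_SET_TSC_KHZ vCPU ioctl, gated on
 * KVM_CAP_TSC_CONTROL; kvm_arch_set_tsc_khz() treats a failure as fatal
 * only when the host's current TSC frequency cannot satisfy the value the
 * user pinned, which is why silently dropping its return code here is
 * fine for the default best-effort behaviour.)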
5286 */ 5287 kvm_arch_set_tsc_khz(cpu); 5288 } 5289 5290 #ifdef CONFIG_XEN_EMU 5291 if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) { 5292 ret = kvm_put_xen_state(cpu); 5293 if (ret < 0) { 5294 error_setg_errno(errp, -ret, "Failed to set Xen state"); 5295 return ret; 5296 } 5297 } 5298 #endif 5299 5300 ret = kvm_getput_regs(x86_cpu, 1); 5301 if (ret < 0) { 5302 error_setg_errno(errp, -ret, "Failed to set general purpose registers"); 5303 return ret; 5304 } 5305 ret = kvm_put_xsave(x86_cpu); 5306 if (ret < 0) { 5307 error_setg_errno(errp, -ret, "Failed to set XSAVE"); 5308 return ret; 5309 } 5310 ret = kvm_put_xcrs(x86_cpu); 5311 if (ret < 0) { 5312 error_setg_errno(errp, -ret, "Failed to set XCRs"); 5313 return ret; 5314 } 5315 ret = kvm_put_msrs(x86_cpu, level); 5316 if (ret < 0) { 5317 error_setg_errno(errp, -ret, "Failed to set MSRs"); 5318 return ret; 5319 } 5320 ret = kvm_put_vcpu_events(x86_cpu, level); 5321 if (ret < 0) { 5322 error_setg_errno(errp, -ret, "Failed to set vCPU events"); 5323 return ret; 5324 } 5325 if (level >= KVM_PUT_RESET_STATE) { 5326 ret = kvm_put_mp_state(x86_cpu); 5327 if (ret < 0) { 5328 error_setg_errno(errp, -ret, "Failed to set MP state"); 5329 return ret; 5330 } 5331 } 5332 5333 ret = kvm_put_tscdeadline_msr(x86_cpu); 5334 if (ret < 0) { 5335 error_setg_errno(errp, -ret, "Failed to set TSC deadline MSR"); 5336 return ret; 5337 } 5338 ret = kvm_put_debugregs(x86_cpu); 5339 if (ret < 0) { 5340 error_setg_errno(errp, -ret, "Failed to set debug registers"); 5341 return ret; 5342 } 5343 return 0; 5344 } 5345 5346 int kvm_arch_get_registers(CPUState *cs, Error **errp) 5347 { 5348 X86CPU *cpu = X86_CPU(cs); 5349 int ret; 5350 5351 assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs)); 5352 5353 ret = kvm_get_vcpu_events(cpu); 5354 if (ret < 0) { 5355 error_setg_errno(errp, -ret, "Failed to get vCPU events"); 5356 goto out; 5357 } 5358 /* 5359 * KVM_GET_MPSTATE can modify CS and RIP, call it before 5360 * KVM_GET_REGS and KVM_GET_SREGS. 5361 */ 5362 ret = kvm_get_mp_state(cpu); 5363 if (ret < 0) { 5364 error_setg_errno(errp, -ret, "Failed to get MP state"); 5365 goto out; 5366 } 5367 ret = kvm_getput_regs(cpu, 0); 5368 if (ret < 0) { 5369 error_setg_errno(errp, -ret, "Failed to get general purpose registers"); 5370 goto out; 5371 } 5372 ret = kvm_get_xsave(cpu); 5373 if (ret < 0) { 5374 error_setg_errno(errp, -ret, "Failed to get XSAVE"); 5375 goto out; 5376 } 5377 ret = kvm_get_xcrs(cpu); 5378 if (ret < 0) { 5379 error_setg_errno(errp, -ret, "Failed to get XCRs"); 5380 goto out; 5381 } 5382 ret = has_sregs2 ? 
kvm_get_sregs2(cpu) : kvm_get_sregs(cpu); 5383 if (ret < 0) { 5384 error_setg_errno(errp, -ret, "Failed to get special registers"); 5385 goto out; 5386 } 5387 ret = kvm_get_msrs(cpu); 5388 if (ret < 0) { 5389 error_setg_errno(errp, -ret, "Failed to get MSRs"); 5390 goto out; 5391 } 5392 ret = kvm_get_apic(cpu); 5393 if (ret < 0) { 5394 error_setg_errno(errp, -ret, "Failed to get APIC"); 5395 goto out; 5396 } 5397 ret = kvm_get_debugregs(cpu); 5398 if (ret < 0) { 5399 error_setg_errno(errp, -ret, "Failed to get debug registers"); 5400 goto out; 5401 } 5402 ret = kvm_get_nested_state(cpu); 5403 if (ret < 0) { 5404 error_setg_errno(errp, -ret, "Failed to get nested state"); 5405 goto out; 5406 } 5407 #ifdef CONFIG_XEN_EMU 5408 if (xen_mode == XEN_EMULATE) { 5409 ret = kvm_get_xen_state(cs); 5410 if (ret < 0) { 5411 error_setg_errno(errp, -ret, "Failed to get Xen state"); 5412 goto out; 5413 } 5414 } 5415 #endif 5416 ret = 0; 5417 out: 5418 cpu_sync_bndcs_hflags(&cpu->env); 5419 return ret; 5420 } 5421 5422 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) 5423 { 5424 X86CPU *x86_cpu = X86_CPU(cpu); 5425 CPUX86State *env = &x86_cpu->env; 5426 int ret; 5427 5428 /* Inject NMI */ 5429 if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 5430 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 5431 bql_lock(); 5432 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 5433 bql_unlock(); 5434 DPRINTF("injected NMI\n"); 5435 ret = kvm_vcpu_ioctl(cpu, KVM_NMI); 5436 if (ret < 0) { 5437 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", 5438 strerror(-ret)); 5439 } 5440 } 5441 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 5442 bql_lock(); 5443 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 5444 bql_unlock(); 5445 DPRINTF("injected SMI\n"); 5446 ret = kvm_vcpu_ioctl(cpu, KVM_SMI); 5447 if (ret < 0) { 5448 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n", 5449 strerror(-ret)); 5450 } 5451 } 5452 } 5453 5454 if (!kvm_pic_in_kernel()) { 5455 bql_lock(); 5456 } 5457 5458 /* Force the VCPU out of its inner loop to process any INIT requests 5459 * or (for userspace APIC, but it is cheap to combine the checks here) 5460 * pending TPR access reports. 5461 */ 5462 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 5463 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 5464 !(env->hflags & HF_SMM_MASK)) { 5465 cpu->exit_request = 1; 5466 } 5467 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 5468 cpu->exit_request = 1; 5469 } 5470 } 5471 5472 if (!kvm_pic_in_kernel()) { 5473 /* Try to inject an interrupt if the guest can accept it */ 5474 if (run->ready_for_interrupt_injection && 5475 (cpu->interrupt_request & CPU_INTERRUPT_HARD) && 5476 (env->eflags & IF_MASK)) { 5477 int irq; 5478 5479 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 5480 irq = cpu_get_pic_interrupt(env); 5481 if (irq >= 0) { 5482 struct kvm_interrupt intr; 5483 5484 intr.irq = irq; 5485 DPRINTF("injected interrupt %d\n", irq); 5486 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr); 5487 if (ret < 0) { 5488 fprintf(stderr, 5489 "KVM: injection failed, interrupt lost (%s)\n", 5490 strerror(-ret)); 5491 } 5492 } 5493 } 5494 5495 /* If we have an interrupt but the guest is not ready to receive an 5496 * interrupt, request an interrupt window exit. This will 5497 * cause a return to userspace as soon as the guest is ready to 5498 * receive interrupts. 
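 *
 * request_interrupt_window and ready_for_interrupt_injection live in the
 * shared struct kvm_run page, so no extra ioctl is needed: KVM comes back
 * to userspace with KVM_EXIT_IRQ_WINDOW_OPEN once the guest can accept
 * the interrupt, and the next pass through this function delivers it via
 * KVM_INTERRUPT as above.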
*/ 5499 if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 5500 run->request_interrupt_window = 1; 5501 } else { 5502 run->request_interrupt_window = 0; 5503 } 5504 5505 DPRINTF("setting tpr\n"); 5506 run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state); 5507 5508 bql_unlock(); 5509 } 5510 } 5511 5512 static void kvm_rate_limit_on_bus_lock(void) 5513 { 5514 uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1); 5515 5516 if (delay_ns) { 5517 g_usleep(delay_ns / SCALE_US); 5518 } 5519 } 5520 5521 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run) 5522 { 5523 X86CPU *x86_cpu = X86_CPU(cpu); 5524 CPUX86State *env = &x86_cpu->env; 5525 5526 if (run->flags & KVM_RUN_X86_SMM) { 5527 env->hflags |= HF_SMM_MASK; 5528 } else { 5529 env->hflags &= ~HF_SMM_MASK; 5530 } 5531 if (run->if_flag) { 5532 env->eflags |= IF_MASK; 5533 } else { 5534 env->eflags &= ~IF_MASK; 5535 } 5536 if (run->flags & KVM_RUN_X86_BUS_LOCK) { 5537 kvm_rate_limit_on_bus_lock(); 5538 } 5539 5540 #ifdef CONFIG_XEN_EMU 5541 /* 5542 * If the callback is asserted as a GSI (or PCI INTx) then check if 5543 * vcpu_info->evtchn_upcall_pending has been cleared, and deassert 5544 * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC 5545 * EOI and only resample then, exactly how the VFIO eventfd pairs 5546 * are designed to work for level triggered interrupts. 5547 */ 5548 if (x86_cpu->env.xen_callback_asserted) { 5549 kvm_xen_maybe_deassert_callback(cpu); 5550 } 5551 #endif 5552 5553 /* We need to protect the apic state against concurrent accesses from 5554 * different threads in case the userspace irqchip is used. */ 5555 if (!kvm_irqchip_in_kernel()) { 5556 bql_lock(); 5557 } 5558 cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8); 5559 cpu_set_apic_base(x86_cpu->apic_state, run->apic_base); 5560 if (!kvm_irqchip_in_kernel()) { 5561 bql_unlock(); 5562 } 5563 return cpu_get_mem_attrs(env); 5564 } 5565 5566 int kvm_arch_process_async_events(CPUState *cs) 5567 { 5568 X86CPU *cpu = X86_CPU(cs); 5569 CPUX86State *env = &cpu->env; 5570 5571 if (cs->interrupt_request & CPU_INTERRUPT_MCE) { 5572 /* We must not raise CPU_INTERRUPT_MCE if it's not supported. 
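 *
 * env->mcg_cap doubles as the support flag and the geometry: its low byte
 * is the bank count, so e.g. mcg_cap = 0x0a means 10 banks and therefore
 * 10 * 4 = 40 bank MSRs (CTL/STATUS/ADDR/MISC per bank) starting at
 * MSR_MC0_CTL, which is exactly how the (mcg_cap & 0xff) * 4 loops in
 * kvm_put_msrs() and kvm_get_msrs() size the transfer.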
*/ 5573 assert(env->mcg_cap); 5574 5575 cs->interrupt_request &= ~CPU_INTERRUPT_MCE; 5576 5577 kvm_cpu_synchronize_state(cs); 5578 5579 if (env->exception_nr == EXCP08_DBLE) { 5580 /* this means triple fault */ 5581 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 5582 cs->exit_request = 1; 5583 return 0; 5584 } 5585 kvm_queue_exception(env, EXCP12_MCHK, 0, 0); 5586 env->has_error_code = 0; 5587 5588 cs->halted = 0; 5589 if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) { 5590 env->mp_state = KVM_MP_STATE_RUNNABLE; 5591 } 5592 } 5593 5594 if ((cs->interrupt_request & CPU_INTERRUPT_INIT) && 5595 !(env->hflags & HF_SMM_MASK)) { 5596 kvm_cpu_synchronize_state(cs); 5597 do_cpu_init(cpu); 5598 } 5599 5600 if (kvm_irqchip_in_kernel()) { 5601 return 0; 5602 } 5603 5604 if (cs->interrupt_request & CPU_INTERRUPT_POLL) { 5605 cs->interrupt_request &= ~CPU_INTERRUPT_POLL; 5606 apic_poll_irq(cpu->apic_state); 5607 } 5608 if (((cs->interrupt_request & CPU_INTERRUPT_HARD) && 5609 (env->eflags & IF_MASK)) || 5610 (cs->interrupt_request & CPU_INTERRUPT_NMI)) { 5611 cs->halted = 0; 5612 } 5613 if (cs->interrupt_request & CPU_INTERRUPT_SIPI) { 5614 kvm_cpu_synchronize_state(cs); 5615 do_cpu_sipi(cpu); 5616 } 5617 if (cs->interrupt_request & CPU_INTERRUPT_TPR) { 5618 cs->interrupt_request &= ~CPU_INTERRUPT_TPR; 5619 kvm_cpu_synchronize_state(cs); 5620 apic_handle_tpr_access_report(cpu->apic_state, env->eip, 5621 env->tpr_access_type); 5622 } 5623 5624 return cs->halted; 5625 } 5626 5627 static int kvm_handle_halt(X86CPU *cpu) 5628 { 5629 CPUState *cs = CPU(cpu); 5630 CPUX86State *env = &cpu->env; 5631 5632 if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) && 5633 (env->eflags & IF_MASK)) && 5634 !(cs->interrupt_request & CPU_INTERRUPT_NMI)) { 5635 cs->halted = 1; 5636 return EXCP_HLT; 5637 } 5638 5639 return 0; 5640 } 5641 5642 static int kvm_handle_tpr_access(X86CPU *cpu) 5643 { 5644 CPUState *cs = CPU(cpu); 5645 struct kvm_run *run = cs->kvm_run; 5646 5647 apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip, 5648 run->tpr_access.is_write ? 
TPR_ACCESS_WRITE 5649 : TPR_ACCESS_READ); 5650 return 1; 5651 } 5652 5653 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 5654 { 5655 static const uint8_t int3 = 0xcc; 5656 5657 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) || 5658 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) { 5659 return -EINVAL; 5660 } 5661 return 0; 5662 } 5663 5664 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 5665 { 5666 uint8_t int3; 5667 5668 if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) { 5669 return -EINVAL; 5670 } 5671 if (int3 != 0xcc) { 5672 return 0; 5673 } 5674 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) { 5675 return -EINVAL; 5676 } 5677 return 0; 5678 } 5679 5680 static struct { 5681 target_ulong addr; 5682 int len; 5683 int type; 5684 } hw_breakpoint[4]; 5685 5686 static int nb_hw_breakpoint; 5687 5688 static int find_hw_breakpoint(target_ulong addr, int len, int type) 5689 { 5690 int n; 5691 5692 for (n = 0; n < nb_hw_breakpoint; n++) { 5693 if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type && 5694 (hw_breakpoint[n].len == len || len == -1)) { 5695 return n; 5696 } 5697 } 5698 return -1; 5699 } 5700 5701 int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type) 5702 { 5703 switch (type) { 5704 case GDB_BREAKPOINT_HW: 5705 len = 1; 5706 break; 5707 case GDB_WATCHPOINT_WRITE: 5708 case GDB_WATCHPOINT_ACCESS: 5709 switch (len) { 5710 case 1: 5711 break; 5712 case 2: 5713 case 4: 5714 case 8: 5715 if (addr & (len - 1)) { 5716 return -EINVAL; 5717 } 5718 break; 5719 default: 5720 return -EINVAL; 5721 } 5722 break; 5723 default: 5724 return -ENOSYS; 5725 } 5726 5727 if (nb_hw_breakpoint == 4) { 5728 return -ENOBUFS; 5729 } 5730 if (find_hw_breakpoint(addr, len, type) >= 0) { 5731 return -EEXIST; 5732 } 5733 hw_breakpoint[nb_hw_breakpoint].addr = addr; 5734 hw_breakpoint[nb_hw_breakpoint].len = len; 5735 hw_breakpoint[nb_hw_breakpoint].type = type; 5736 nb_hw_breakpoint++; 5737 5738 return 0; 5739 } 5740 5741 int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type) 5742 { 5743 int n; 5744 5745 n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 
1 : len, type); 5746 if (n < 0) { 5747 return -ENOENT; 5748 } 5749 nb_hw_breakpoint--; 5750 hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint]; 5751 5752 return 0; 5753 } 5754 5755 void kvm_arch_remove_all_hw_breakpoints(void) 5756 { 5757 nb_hw_breakpoint = 0; 5758 } 5759 5760 static CPUWatchpoint hw_watchpoint; 5761 5762 static int kvm_handle_debug(X86CPU *cpu, 5763 struct kvm_debug_exit_arch *arch_info) 5764 { 5765 CPUState *cs = CPU(cpu); 5766 CPUX86State *env = &cpu->env; 5767 int ret = 0; 5768 int n; 5769 5770 if (arch_info->exception == EXCP01_DB) { 5771 if (arch_info->dr6 & DR6_BS) { 5772 if (cs->singlestep_enabled) { 5773 ret = EXCP_DEBUG; 5774 } 5775 } else { 5776 for (n = 0; n < 4; n++) { 5777 if (arch_info->dr6 & (1 << n)) { 5778 switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) { 5779 case 0x0: 5780 ret = EXCP_DEBUG; 5781 break; 5782 case 0x1: 5783 ret = EXCP_DEBUG; 5784 cs->watchpoint_hit = &hw_watchpoint; 5785 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 5786 hw_watchpoint.flags = BP_MEM_WRITE; 5787 break; 5788 case 0x3: 5789 ret = EXCP_DEBUG; 5790 cs->watchpoint_hit = &hw_watchpoint; 5791 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 5792 hw_watchpoint.flags = BP_MEM_ACCESS; 5793 break; 5794 } 5795 } 5796 } 5797 } 5798 } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) { 5799 ret = EXCP_DEBUG; 5800 } 5801 if (ret == 0) { 5802 cpu_synchronize_state(cs); 5803 assert(env->exception_nr == -1); 5804 5805 /* pass to guest */ 5806 kvm_queue_exception(env, arch_info->exception, 5807 arch_info->exception == EXCP01_DB, 5808 arch_info->dr6); 5809 env->has_error_code = 0; 5810 } 5811 5812 return ret; 5813 } 5814 5815 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) 5816 { 5817 const uint8_t type_code[] = { 5818 [GDB_BREAKPOINT_HW] = 0x0, 5819 [GDB_WATCHPOINT_WRITE] = 0x1, 5820 [GDB_WATCHPOINT_ACCESS] = 0x3 5821 }; 5822 const uint8_t len_code[] = { 5823 [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2 5824 }; 5825 int n; 5826 5827 if (kvm_sw_breakpoints_active(cpu)) { 5828 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; 5829 } 5830 if (nb_hw_breakpoint > 0) { 5831 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; 5832 dbg->arch.debugreg[7] = 0x0600; 5833 for (n = 0; n < nb_hw_breakpoint; n++) { 5834 dbg->arch.debugreg[n] = hw_breakpoint[n].addr; 5835 dbg->arch.debugreg[7] |= (2 << (n * 2)) | 5836 (type_code[hw_breakpoint[n].type] << (16 + n*4)) | 5837 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4)); 5838 } 5839 } 5840 } 5841 5842 static int kvm_install_msr_filters(KVMState *s) 5843 { 5844 uint64_t zero = 0; 5845 struct kvm_msr_filter filter = { 5846 .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, 5847 }; 5848 int i, j = 0; 5849 5850 QEMU_BUILD_BUG_ON(ARRAY_SIZE(msr_handlers) != ARRAY_SIZE(filter.ranges)); 5851 for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { 5852 KVMMSRHandlers *handler = &msr_handlers[i]; 5853 if (handler->msr) { 5854 struct kvm_msr_filter_range *range = &filter.ranges[j++]; 5855 5856 *range = (struct kvm_msr_filter_range) { 5857 .flags = 0, 5858 .nmsrs = 1, 5859 .base = handler->msr, 5860 .bitmap = (__u8 *)&zero, 5861 }; 5862 5863 if (handler->rdmsr) { 5864 range->flags |= KVM_MSR_FILTER_READ; 5865 } 5866 5867 if (handler->wrmsr) { 5868 range->flags |= KVM_MSR_FILTER_WRITE; 5869 } 5870 } 5871 } 5872 5873 return kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter); 5874 } 5875 5876 static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, 5877 QEMUWRMSRHandler *wrmsr) 5878 { 5879 int i, ret; 5880 5881 for (i 
= 0; i < ARRAY_SIZE(msr_handlers); i++) { 5882 if (!msr_handlers[i].msr) { 5883 msr_handlers[i] = (KVMMSRHandlers) { 5884 .msr = msr, 5885 .rdmsr = rdmsr, 5886 .wrmsr = wrmsr, 5887 }; 5888 5889 ret = kvm_install_msr_filters(s); 5890 if (ret) { 5891 msr_handlers[i] = (KVMMSRHandlers) { }; 5892 return ret; 5893 } 5894 5895 return 0; 5896 } 5897 } 5898 5899 return -EINVAL; 5900 } 5901 5902 static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run) 5903 { 5904 int i; 5905 bool r; 5906 5907 for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { 5908 KVMMSRHandlers *handler = &msr_handlers[i]; 5909 if (run->msr.index == handler->msr) { 5910 if (handler->rdmsr) { 5911 r = handler->rdmsr(cpu, handler->msr, 5912 (uint64_t *)&run->msr.data); 5913 run->msr.error = r ? 0 : 1; 5914 return 0; 5915 } 5916 } 5917 } 5918 5919 g_assert_not_reached(); 5920 } 5921 5922 static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run) 5923 { 5924 int i; 5925 bool r; 5926 5927 for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { 5928 KVMMSRHandlers *handler = &msr_handlers[i]; 5929 if (run->msr.index == handler->msr) { 5930 if (handler->wrmsr) { 5931 r = handler->wrmsr(cpu, handler->msr, run->msr.data); 5932 run->msr.error = r ? 0 : 1; 5933 return 0; 5934 } 5935 } 5936 } 5937 5938 g_assert_not_reached(); 5939 } 5940 5941 static bool has_sgx_provisioning; 5942 5943 static bool __kvm_enable_sgx_provisioning(KVMState *s) 5944 { 5945 int fd, ret; 5946 5947 if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) { 5948 return false; 5949 } 5950 5951 fd = qemu_open_old("/dev/sgx_provision", O_RDONLY); 5952 if (fd < 0) { 5953 return false; 5954 } 5955 5956 ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd); 5957 if (ret) { 5958 error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret)); 5959 exit(1); 5960 } 5961 close(fd); 5962 return true; 5963 } 5964 5965 bool kvm_enable_sgx_provisioning(KVMState *s) 5966 { 5967 return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning); 5968 } 5969 5970 static bool host_supports_vmx(void) 5971 { 5972 uint32_t ecx, unused; 5973 5974 host_cpuid(1, 0, &unused, &unused, &ecx, &unused); 5975 return ecx & CPUID_EXT_VMX; 5976 } 5977 5978 /* 5979 * Currently the handling here only supports use of KVM_HC_MAP_GPA_RANGE 5980 * to service guest-initiated memory attribute update requests so that 5981 * KVM_SET_MEMORY_ATTRIBUTES can update whether or not a page should be 5982 * backed by the private memory pool provided by guest_memfd, and as such 5983 * is only applicable to guest_memfd-backed guests (e.g. SNP/TDX). 5984 * 5985 * Other use-cases for KVM_HC_MAP_GPA_RANGE, such as for SEV live 5986 * migration, are not implemented here currently. 5987 * 5988 * For the guest_memfd use-case, these exits will generally be synthesized 5989 * by KVM based on platform-specific hypercalls, like GHCB requests in the 5990 * case of SEV-SNP, and not issued directly within the guest through the 5991 * KVM_HC_MAP_GPA_RANGE hypercall. So in this case, KVM_HC_MAP_GPA_RANGE is 5992 * not actually advertised to guests via the KVM CPUID feature bit, as 5993 * opposed to SEV live migration where it would be. Since it is unlikely the 5994 * SEV live migration use-case would be useful for guest-memfd backed guests, 5995 * because private/shared page tracking is already provided through other 5996 * means, these two use-cases should be treated as mutually exclusive.
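 * The exit payload itself is simple: run->hypercall.args[0] holds the starting GPA, args[1] the number of pages and args[2] the requested attributes (e.g. KVM_MAP_GPA_RANGE_ENCRYPTED to make the range private), which kvm_handle_hc_map_gpa_range() below translates into a kvm_convert_memory() call.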
5997 */ 5998 static int kvm_handle_hc_map_gpa_range(struct kvm_run *run) 5999 { 6000 uint64_t gpa, size, attributes; 6001 6002 if (!machine_require_guest_memfd(current_machine)) 6003 return -EINVAL; 6004 6005 gpa = run->hypercall.args[0]; 6006 size = run->hypercall.args[1] * TARGET_PAGE_SIZE; 6007 attributes = run->hypercall.args[2]; 6008 6009 trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags); 6010 6011 return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED); 6012 } 6013 6014 static int kvm_handle_hypercall(struct kvm_run *run) 6015 { 6016 if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) 6017 return kvm_handle_hc_map_gpa_range(run); 6018 6019 return -EINVAL; 6020 } 6021 6022 #define VMX_INVALID_GUEST_STATE 0x80000021 6023 6024 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) 6025 { 6026 X86CPU *cpu = X86_CPU(cs); 6027 uint64_t code; 6028 int ret; 6029 bool ctx_invalid; 6030 KVMState *state; 6031 6032 switch (run->exit_reason) { 6033 case KVM_EXIT_HLT: 6034 DPRINTF("handle_hlt\n"); 6035 bql_lock(); 6036 ret = kvm_handle_halt(cpu); 6037 bql_unlock(); 6038 break; 6039 case KVM_EXIT_SET_TPR: 6040 ret = 0; 6041 break; 6042 case KVM_EXIT_TPR_ACCESS: 6043 bql_lock(); 6044 ret = kvm_handle_tpr_access(cpu); 6045 bql_unlock(); 6046 break; 6047 case KVM_EXIT_FAIL_ENTRY: 6048 code = run->fail_entry.hardware_entry_failure_reason; 6049 fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n", 6050 code); 6051 if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) { 6052 fprintf(stderr, 6053 "\nIf you're running a guest on an Intel machine without " 6054 "unrestricted mode\n" 6055 "support, the failure is most likely due to the guest " 6056 "entering an invalid\n" 6057 "state for Intel VT. For example, the guest may be running " 6058 "in big real mode\n" 6059 "which is not supported on older Intel processors." 6060 "\n\n"); 6061 } 6062 ret = -1; 6063 break; 6064 case KVM_EXIT_EXCEPTION: 6065 fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n", 6066 run->ex.exception, run->ex.error_code); 6067 ret = -1; 6068 break; 6069 case KVM_EXIT_DEBUG: 6070 DPRINTF("kvm_exit_debug\n"); 6071 bql_lock(); 6072 ret = kvm_handle_debug(cpu, &run->debug.arch); 6073 bql_unlock(); 6074 break; 6075 case KVM_EXIT_HYPERV: 6076 ret = kvm_hv_handle_exit(cpu, &run->hyperv); 6077 break; 6078 case KVM_EXIT_IOAPIC_EOI: 6079 ioapic_eoi_broadcast(run->eoi.vector); 6080 ret = 0; 6081 break; 6082 case KVM_EXIT_X86_BUS_LOCK: 6083 /* already handled in kvm_arch_post_run */ 6084 ret = 0; 6085 break; 6086 case KVM_EXIT_NOTIFY: 6087 ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID); 6088 state = KVM_STATE(current_accel()); 6089 if (ctx_invalid || 6090 state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) { 6091 warn_report("KVM internal error: Encountered a notify exit " 6092 "with invalid context in guest."); 6093 ret = -1; 6094 } else { 6095 warn_report_once("KVM: Encountered a notify exit with valid " 6096 "context in guest. 
" 6097 "The guest could be misbehaving."); 6098 ret = 0; 6099 } 6100 break; 6101 case KVM_EXIT_X86_RDMSR: 6102 /* We only enable MSR filtering, any other exit is bogus */ 6103 assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER); 6104 ret = kvm_handle_rdmsr(cpu, run); 6105 break; 6106 case KVM_EXIT_X86_WRMSR: 6107 /* We only enable MSR filtering, any other exit is bogus */ 6108 assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER); 6109 ret = kvm_handle_wrmsr(cpu, run); 6110 break; 6111 #ifdef CONFIG_XEN_EMU 6112 case KVM_EXIT_XEN: 6113 ret = kvm_xen_handle_exit(cpu, &run->xen); 6114 break; 6115 #endif 6116 case KVM_EXIT_HYPERCALL: 6117 ret = kvm_handle_hypercall(run); 6118 break; 6119 default: 6120 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); 6121 ret = -1; 6122 break; 6123 } 6124 6125 return ret; 6126 } 6127 6128 bool kvm_arch_stop_on_emulation_error(CPUState *cs) 6129 { 6130 X86CPU *cpu = X86_CPU(cs); 6131 CPUX86State *env = &cpu->env; 6132 6133 kvm_cpu_synchronize_state(cs); 6134 return !(env->cr[0] & CR0_PE_MASK) || 6135 ((env->segs[R_CS].selector & 3) != 3); 6136 } 6137 6138 void kvm_arch_init_irq_routing(KVMState *s) 6139 { 6140 /* We know at this point that we're using the in-kernel 6141 * irqchip, so we can use irqfds, and on x86 we know 6142 * we can use msi via irqfd and GSI routing. 6143 */ 6144 kvm_msi_via_irqfd_allowed = true; 6145 kvm_gsi_routing_allowed = true; 6146 6147 if (kvm_irqchip_is_split()) { 6148 KVMRouteChange c = kvm_irqchip_begin_route_changes(s); 6149 int i; 6150 6151 /* If the ioapic is in QEMU and the lapics are in KVM, reserve 6152 MSI routes for signaling interrupts to the local apics. */ 6153 for (i = 0; i < IOAPIC_NUM_PINS; i++) { 6154 if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) { 6155 error_report("Could not enable split IRQ mode."); 6156 exit(1); 6157 } 6158 } 6159 kvm_irqchip_commit_route_changes(&c); 6160 } 6161 } 6162 6163 int kvm_arch_irqchip_create(KVMState *s) 6164 { 6165 int ret; 6166 if (kvm_kernel_irqchip_split()) { 6167 ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24); 6168 if (ret) { 6169 error_report("Could not enable split irqchip mode: %s", 6170 strerror(-ret)); 6171 exit(1); 6172 } else { 6173 DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n"); 6174 kvm_split_irqchip = true; 6175 return 1; 6176 } 6177 } else { 6178 return 0; 6179 } 6180 } 6181 6182 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address) 6183 { 6184 CPUX86State *env; 6185 uint64_t ext_id; 6186 6187 if (!first_cpu) { 6188 return address; 6189 } 6190 env = &X86_CPU(first_cpu)->env; 6191 if (!(env->features[FEAT_KVM] & CPUID_KVM_MSI_EXT_DEST_ID)) { 6192 return address; 6193 } 6194 6195 /* 6196 * If the remappable format bit is set, or the upper bits are 6197 * already set in address_hi, or the low extended bits aren't 6198 * there anyway, do nothing. 
6199 */ 6200 ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT); 6201 if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) { 6202 return address; 6203 } 6204 6205 address &= ~ext_id; 6206 address |= ext_id << 35; 6207 return address; 6208 } 6209 6210 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, 6211 uint64_t address, uint32_t data, PCIDevice *dev) 6212 { 6213 X86IOMMUState *iommu = x86_iommu_get_default(); 6214 6215 if (iommu) { 6216 X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu); 6217 6218 if (class->int_remap) { 6219 int ret; 6220 MSIMessage src, dst; 6221 6222 src.address = route->u.msi.address_hi; 6223 src.address <<= VTD_MSI_ADDR_HI_SHIFT; 6224 src.address |= route->u.msi.address_lo; 6225 src.data = route->u.msi.data; 6226 6227 ret = class->int_remap(iommu, &src, &dst, dev ? \ 6228 pci_requester_id(dev) : \ 6229 X86_IOMMU_SID_INVALID); 6230 if (ret) { 6231 trace_kvm_x86_fixup_msi_error(route->gsi); 6232 return 1; 6233 } 6234 6235 /* 6236 * Handled untranslated compatibility format interrupt with 6237 * extended destination ID in the low bits 11-5. */ 6238 dst.address = kvm_swizzle_msi_ext_dest_id(dst.address); 6239 6240 route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; 6241 route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; 6242 route->u.msi.data = dst.data; 6243 return 0; 6244 } 6245 } 6246 6247 #ifdef CONFIG_XEN_EMU 6248 if (xen_mode == XEN_EMULATE) { 6249 int handled = xen_evtchn_translate_pirq_msi(route, address, data); 6250 6251 /* 6252 * If it was a PIRQ and successfully routed (handled == 0) or it was 6253 * an error (handled < 0), return. If it wasn't a PIRQ, keep going. 6254 */ 6255 if (handled <= 0) { 6256 return handled; 6257 } 6258 } 6259 #endif 6260 6261 address = kvm_swizzle_msi_ext_dest_id(address); 6262 route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT; 6263 route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK; 6264 return 0; 6265 } 6266 6267 typedef struct MSIRouteEntry MSIRouteEntry; 6268 6269 struct MSIRouteEntry { 6270 PCIDevice *dev; /* Device pointer */ 6271 int vector; /* MSI/MSIX vector index */ 6272 int virq; /* Virtual IRQ index */ 6273 QLIST_ENTRY(MSIRouteEntry) list; 6274 }; 6275 6276 /* List of used GSI routes */ 6277 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \ 6278 QLIST_HEAD_INITIALIZER(msi_route_list); 6279 6280 void kvm_update_msi_routes_all(void *private, bool global, 6281 uint32_t index, uint32_t mask) 6282 { 6283 int cnt = 0, vector; 6284 MSIRouteEntry *entry; 6285 MSIMessage msg; 6286 PCIDevice *dev; 6287 6288 /* TODO: explicit route update */ 6289 QLIST_FOREACH(entry, &msi_route_list, list) { 6290 cnt++; 6291 vector = entry->vector; 6292 dev = entry->dev; 6293 if (msix_enabled(dev) && !msix_is_masked(dev, vector)) { 6294 msg = msix_get_message(dev, vector); 6295 } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) { 6296 msg = msi_get_message(dev, vector); 6297 } else { 6298 /* 6299 * Either MSI/MSIX is disabled for the device, or the 6300 * specific message was masked out. Skip this one. 
6301 */ 6302 continue; 6303 } 6304 kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev); 6305 } 6306 kvm_irqchip_commit_routes(kvm_state); 6307 trace_kvm_x86_update_msi_routes(cnt); 6308 } 6309 6310 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, 6311 int vector, PCIDevice *dev) 6312 { 6313 static bool notify_list_inited = false; 6314 MSIRouteEntry *entry; 6315 6316 if (!dev) { 6317 /* These are (possibly) IOAPIC routes only used for split 6318 * kernel irqchip mode, while we only track 6319 * PCI devices here. */ 6320 return 0; 6321 } 6322 6323 entry = g_new0(MSIRouteEntry, 1); 6324 entry->dev = dev; 6325 entry->vector = vector; 6326 entry->virq = route->gsi; 6327 QLIST_INSERT_HEAD(&msi_route_list, entry, list); 6328 6329 trace_kvm_x86_add_msi_route(route->gsi); 6330 6331 if (!notify_list_inited) { 6332 /* The first time we add a route, register with the 6333 * IOMMU's IEC notify list if needed. */ 6334 X86IOMMUState *iommu = x86_iommu_get_default(); 6335 if (iommu) { 6336 x86_iommu_iec_register_notifier(iommu, 6337 kvm_update_msi_routes_all, 6338 NULL); 6339 } 6340 notify_list_inited = true; 6341 } 6342 return 0; 6343 } 6344 6345 int kvm_arch_release_virq_post(int virq) 6346 { 6347 MSIRouteEntry *entry, *next; 6348 QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) { 6349 if (entry->virq == virq) { 6350 trace_kvm_x86_remove_msi_route(virq); 6351 QLIST_REMOVE(entry, list); 6352 g_free(entry); 6353 break; 6354 } 6355 } 6356 return 0; 6357 } 6358 6359 int kvm_arch_msi_data_to_gsi(uint32_t data) 6360 { 6361 abort(); 6362 } 6363 6364 bool kvm_has_waitpkg(void) 6365 { 6366 return has_msr_umwait; 6367 } 6368 6369 #define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 6370 6371 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask) 6372 { 6373 KVMState *s = kvm_state; 6374 uint64_t supported; 6375 6376 mask &= XSTATE_DYNAMIC_MASK; 6377 if (!mask) { 6378 return; 6379 } 6380 /* 6381 * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0]. 6382 * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned 6383 * about them already because they are not supported features. 6384 */ 6385 supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX); 6386 supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32; 6387 mask &= supported; 6388 6389 while (mask) { 6390 int bit = ctz64(mask); 6391 int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit); 6392 if (rc) { 6393 /* 6394 * Older kernel versions (<5.17) do not support 6395 * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return 6396 * any dynamic features from kvm_arch_get_supported_cpuid. 
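 * (For example, AMX tile data is dynamic XSAVE state component 18, so enabling an AMX-capable CPU model ends up requesting ARCH_REQ_XCOMP_GUEST_PERM for bit 18 through this loop.)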
6397 */ 6398 warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure " 6399 "for feature bit %d", bit); 6400 } 6401 mask &= ~BIT_ULL(bit); 6402 } 6403 } 6404 6405 static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp) 6406 { 6407 KVMState *s = KVM_STATE(obj); 6408 return s->notify_vmexit; 6409 } 6410 6411 static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp) 6412 { 6413 KVMState *s = KVM_STATE(obj); 6414 6415 if (s->fd != -1) { 6416 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 6417 return; 6418 } 6419 6420 s->notify_vmexit = value; 6421 } 6422 6423 static void kvm_arch_get_notify_window(Object *obj, Visitor *v, 6424 const char *name, void *opaque, 6425 Error **errp) 6426 { 6427 KVMState *s = KVM_STATE(obj); 6428 uint32_t value = s->notify_window; 6429 6430 visit_type_uint32(v, name, &value, errp); 6431 } 6432 6433 static void kvm_arch_set_notify_window(Object *obj, Visitor *v, 6434 const char *name, void *opaque, 6435 Error **errp) 6436 { 6437 KVMState *s = KVM_STATE(obj); 6438 uint32_t value; 6439 6440 if (s->fd != -1) { 6441 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 6442 return; 6443 } 6444 6445 if (!visit_type_uint32(v, name, &value, errp)) { 6446 return; 6447 } 6448 6449 s->notify_window = value; 6450 } 6451 6452 static void kvm_arch_get_xen_version(Object *obj, Visitor *v, 6453 const char *name, void *opaque, 6454 Error **errp) 6455 { 6456 KVMState *s = KVM_STATE(obj); 6457 uint32_t value = s->xen_version; 6458 6459 visit_type_uint32(v, name, &value, errp); 6460 } 6461 6462 static void kvm_arch_set_xen_version(Object *obj, Visitor *v, 6463 const char *name, void *opaque, 6464 Error **errp) 6465 { 6466 KVMState *s = KVM_STATE(obj); 6467 Error *error = NULL; 6468 uint32_t value; 6469 6470 visit_type_uint32(v, name, &value, &error); 6471 if (error) { 6472 error_propagate(errp, error); 6473 return; 6474 } 6475 6476 s->xen_version = value; 6477 if (value && xen_mode == XEN_DISABLED) { 6478 xen_mode = XEN_EMULATE; 6479 } 6480 } 6481 6482 static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v, 6483 const char *name, void *opaque, 6484 Error **errp) 6485 { 6486 KVMState *s = KVM_STATE(obj); 6487 uint16_t value = s->xen_gnttab_max_frames; 6488 6489 visit_type_uint16(v, name, &value, errp); 6490 } 6491 6492 static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v, 6493 const char *name, void *opaque, 6494 Error **errp) 6495 { 6496 KVMState *s = KVM_STATE(obj); 6497 Error *error = NULL; 6498 uint16_t value; 6499 6500 visit_type_uint16(v, name, &value, &error); 6501 if (error) { 6502 error_propagate(errp, error); 6503 return; 6504 } 6505 6506 s->xen_gnttab_max_frames = value; 6507 } 6508 6509 static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v, 6510 const char *name, void *opaque, 6511 Error **errp) 6512 { 6513 KVMState *s = KVM_STATE(obj); 6514 uint16_t value = s->xen_evtchn_max_pirq; 6515 6516 visit_type_uint16(v, name, &value, errp); 6517 } 6518 6519 static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v, 6520 const char *name, void *opaque, 6521 Error **errp) 6522 { 6523 KVMState *s = KVM_STATE(obj); 6524 Error *error = NULL; 6525 uint16_t value; 6526 6527 visit_type_uint16(v, name, &value, &error); 6528 if (error) { 6529 error_propagate(errp, error); 6530 return; 6531 } 6532 6533 s->xen_evtchn_max_pirq = value; 6534 } 6535 6536 void kvm_arch_accel_class_init(ObjectClass *oc) 6537 { 6538 
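 /* These properties are selected from the command line as -accel kvm suboptions, e.g. (illustrative): -accel kvm,notify-vmexit=internal-error,notify-window=0 */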
object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption", 6539 &NotifyVmexitOption_lookup, 6540 kvm_arch_get_notify_vmexit, 6541 kvm_arch_set_notify_vmexit); 6542 object_class_property_set_description(oc, "notify-vmexit", 6543 "Enable notify VM exit"); 6544 6545 object_class_property_add(oc, "notify-window", "uint32", 6546 kvm_arch_get_notify_window, 6547 kvm_arch_set_notify_window, 6548 NULL, NULL); 6549 object_class_property_set_description(oc, "notify-window", 6550 "Clock cycles without an event window " 6551 "after which a notification VM exit occurs"); 6552 6553 object_class_property_add(oc, "xen-version", "uint32", 6554 kvm_arch_get_xen_version, 6555 kvm_arch_set_xen_version, 6556 NULL, NULL); 6557 object_class_property_set_description(oc, "xen-version", 6558 "Xen version to be emulated " 6559 "(in XENVER_version form " 6560 "e.g. 0x4000a for 4.10)"); 6561 6562 object_class_property_add(oc, "xen-gnttab-max-frames", "uint16", 6563 kvm_arch_get_xen_gnttab_max_frames, 6564 kvm_arch_set_xen_gnttab_max_frames, 6565 NULL, NULL); 6566 object_class_property_set_description(oc, "xen-gnttab-max-frames", 6567 "Maximum number of grant table frames"); 6568 6569 object_class_property_add(oc, "xen-evtchn-max-pirq", "uint16", 6570 kvm_arch_get_xen_evtchn_max_pirq, 6571 kvm_arch_set_xen_evtchn_max_pirq, 6572 NULL, NULL); 6573 object_class_property_set_description(oc, "xen-evtchn-max-pirq", 6574 "Maximum number of Xen PIRQs"); 6575 } 6576 6577 void kvm_set_max_apic_id(uint32_t max_apic_id) 6578 { 6579 kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id); 6580 } 6581
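 /* Illustrative example of using the accelerator properties registered above, e.g. to enable Xen emulation: qemu-system-x86_64 -accel kvm,kernel-irqchip=split,xen-version=0x4000a (optionally with xen-gnttab-max-frames= and xen-evtchn-max-pirq= overrides). */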