/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include <math.h>
#include <sys/ioctl.h>
#include <sys/utsname.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/time.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include "standard-headers/asm-x86/kvm_para.h"
#include "hw/xen/interface/arch-x86/cpuid.h"

#include "cpu.h"
#include "host-cpu.h"
#include "vmsr_energy.h"
#include "system/system.h"
#include "system/hw_accel.h"
#include "system/kvm_int.h"
#include "system/runstate.h"
#include "kvm_i386.h"
#include "../confidential-guest.h"
#include "sev.h"
#include "xen-emu.h"
#include "hyperv.h"
#include "hyperv-proto.h"

#include "gdbstub/enums.h"
#include "qemu/host-utils.h"
#include "qemu/main-loop.h"
#include "qemu/ratelimit.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/memalign.h"
#include "hw/i386/x86.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/pc.h"
#include "hw/i386/apic.h"
#include "hw/i386/apic_internal.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/intel_iommu.h"
#include "hw/i386/topology.h"
#include "hw/i386/x86-iommu.h"
#include "hw/i386/e820_memory_layout.h"

#include "hw/xen/xen.h"

#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "migration/blocker.h"
#include "exec/memattrs.h"
#include "exec/target_page.h"
#include "trace.h"

#include CONFIG_DEVICES

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
 * In order to use vm86 mode, an EPT identity map and a TSS are needed.
 * Since these must be part of guest physical memory, we need to allocate
 * them, both by setting their start addresses in the kernel and by
 * creating a corresponding e820 entry. We need 4 pages before the BIOS,
 * so this value allows up to 16M BIOSes.
 */
#define KVM_IDENTITY_BASE 0xfeffc000
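/*
 * Illustrative arithmetic for the constant above: 0xfeffc000 plus four
 * 4 KiB pages is 0xff000000, and 4 GiB - 0xff000000 = 16 MiB, which is
 * where the "up to 16M BIOSes" figure in the comment comes from.
 */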

/* From arch/x86/kvm/lapic.h */
#define KVM_APIC_BUS_CYCLE_NS   1
#define KVM_APIC_BUS_FREQUENCY  (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)

/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
 * 255 kvm_msr_entry structs */
#define MSR_BUF_SIZE 4096
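/*
 * Sanity check of the two definitions above (illustrative arithmetic only):
 * with KVM_APIC_BUS_CYCLE_NS == 1 the bus frequency evaluates to
 * 1000000000 Hz, i.e. 1 GHz.  For the MSR buffer, assuming the usual
 * 16-byte struct kvm_msr_entry layout, 8 + 255 * 16 = 4088 bytes, which
 * fits in the 4096-byte buffer with 8 bytes to spare.
 */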

typedef bool QEMURDMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t *val);
typedef bool QEMUWRMSRHandler(X86CPU *cpu, uint32_t msr, uint64_t val);
typedef struct {
    uint32_t msr;
    QEMURDMSRHandler *rdmsr;
    QEMUWRMSRHandler *wrmsr;
} KVMMSRHandlers;

static void kvm_init_msrs(X86CPU *cpu);
static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
                          QEMUWRMSRHandler *wrmsr);

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_INFO(SIGNAL_MSI),
    KVM_CAP_INFO(IRQ_ROUTING),
    KVM_CAP_INFO(DEBUGREGS),
    KVM_CAP_INFO(XSAVE),
    KVM_CAP_INFO(VCPU_EVENTS),
    KVM_CAP_INFO(X86_ROBUST_SINGLESTEP),
    KVM_CAP_INFO(MCE),
    KVM_CAP_INFO(ADJUST_CLOCK),
    KVM_CAP_INFO(SET_IDENTITY_MAP_ADDR),
    KVM_CAP_LAST_INFO
};

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_misc_enable;
static bool has_msr_smbase;
static bool has_msr_bndcfgs;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_crash;
static bool has_msr_hv_reset;
static bool has_msr_hv_vpindex;
static bool hv_vpindex_settable;
static bool has_msr_hv_runtime;
static bool has_msr_hv_synic;
static bool has_msr_hv_stimer;
static bool has_msr_hv_frequencies;
static bool has_msr_hv_reenlightenment;
static bool has_msr_hv_syndbg_options;
static bool has_msr_xss;
static bool has_msr_umwait;
static bool has_msr_spec_ctrl;
static bool has_tsc_scale_msr;
static bool has_msr_tsx_ctrl;
static bool has_msr_virt_ssbd;
static bool has_msr_smi_count;
static bool has_msr_arch_capabs;
static bool has_msr_core_capabs;
static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
static bool has_msr_pkrs;
static bool has_msr_hwcr;

static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;

static int has_xsave2;
static int has_xcrs;
static int has_sregs2;
static int has_exception_payload;
static int has_triple_fault_event;

static bool has_msr_mcg_ext_ctl;

static struct kvm_cpuid2 *cpuid_cache;
static struct kvm_cpuid2 *hv_cpuid_cache;
static struct kvm_msr_list *kvm_feature_msrs;

static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES];

#define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
static RateLimit bus_lock_ratelimit_ctrl;

static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);

static const char *vm_type_name[] = {
    [KVM_X86_DEFAULT_VM] = "default",
    [KVM_X86_SEV_VM] = "SEV",
    [KVM_X86_SEV_ES_VM] = "SEV-ES",
    [KVM_X86_SNP_VM] = "SEV-SNP",
};

bool kvm_is_vm_type_supported(int type)
{
    uint32_t machine_types;

    /*
     * old KVM doesn't support KVM_CAP_VM_TYPES but KVM_X86_DEFAULT_VM
     * is always supported
     */
    if (type == KVM_X86_DEFAULT_VM) {
        return true;
    }

    machine_types = kvm_check_extension(KVM_STATE(current_machine->accelerator),
                                        KVM_CAP_VM_TYPES);
    return !!(machine_types & BIT(type));
}

int kvm_get_vm_type(MachineState *ms)
{
    int kvm_type = KVM_X86_DEFAULT_VM;

    if (ms->cgs) {
        if (!object_dynamic_cast(OBJECT(ms->cgs), TYPE_X86_CONFIDENTIAL_GUEST)) {
            error_report("configuration type %s not supported for x86 guests",
                         object_get_typename(OBJECT(ms->cgs)));
            exit(1);
        }
        kvm_type = x86_confidential_guest_kvm_type(
            X86_CONFIDENTIAL_GUEST(ms->cgs));
    }

    if (!kvm_is_vm_type_supported(kvm_type)) {
        error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]);
        exit(1);
    }

    return kvm_type;
}

bool kvm_enable_hypercall(uint64_t enable_mask)
{
    KVMState *s = KVM_STATE(current_accel());

    return !kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, enable_mask);
}

bool kvm_has_smm(void)
{
    return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
}

bool kvm_has_adjust_clock_stable(void)
{
    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);

    return (ret & KVM_CLOCK_TSC_STABLE);
}

bool kvm_has_exception_payload(void)
{
    return has_exception_payload;
}

static bool kvm_x2apic_api_set_flags(uint64_t flags)
{
    KVMState *s = KVM_STATE(current_accel());

    return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
}

#define MEMORIZE(fn, _result) \
    ({ \
        static bool _memorized; \
        \
        if (_memorized) { \
            return _result; \
        } \
        _memorized = true; \
        _result = fn; \
    })

static bool has_x2apic_api;

bool kvm_has_x2apic_api(void)
{
    return has_x2apic_api;
}

bool kvm_enable_x2apic(void)
{
    return MEMORIZE(
        kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
                                 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
        has_x2apic_api);
}

bool kvm_hv_vpindex_settable(void)
{
    return hv_vpindex_settable;
}

static int kvm_get_tsc(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t value;
    int ret;

    if (env->tsc_valid) {
        return 0;
    }

    env->tsc_valid = !runstate_is_running();

    ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
    if (ret < 0) {
        return ret;
    }

    env->tsc = value;
    return 0;
}

static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_get_tsc(cpu);
}

void kvm_synchronize_all_tsc(void)
{
    CPUState *cpu;

    if (kvm_enabled()) {
        CPU_FOREACH(cpu) {
            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
        }
    }
}

static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;

    if (cpuid_cache != NULL) {
        return cpuid_cache;
    }
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    cpuid_cache = cpuid;
    return cpuid;
}

static bool host_tsx_broken(void)
{
    int family, model, stepping;
    char vendor[CPUID_VENDOR_SZ + 1];

    host_cpu_vendor_fms(vendor, &family, &model, &stepping);

    /* Check if we are running on a Haswell host known to have broken TSX */
    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
           (family == 6) &&
           ((model == 63 && stepping < 4) ||
            model == 60 || model == 69 || model == 70);
}

/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx, unused;
    uint64_t bitmask;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
        /* KVM never reports CPUID_HT but QEMU can support when vcpus > 1 */
        ret |= CPUID_HT;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
            kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }

        if (enable_cpu_pm) {
            int disable_exits = kvm_check_extension(s,
                                                    KVM_CAP_X86_DISABLE_EXITS);

            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
                ret |= CPUID_EXT_MONITOR;
            }
        }
    } else if (function == 6 && reg == R_EAX) {
        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
    } else if (function == 7 && index == 0 && reg == R_EBX) {
        /* Not new instructions, just an optimization. */
        uint32_t ebx;
        host_cpuid(7, 0, &unused, &ebx, &unused, &unused);
        ret |= ebx & CPUID_7_0_EBX_ERMS;

        if (host_tsx_broken()) {
            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
        }
    } else if (function == 7 && index == 0 && reg == R_EDX) {
        /* Not new instructions, just an optimization. */
        uint32_t edx;
        host_cpuid(7, 0, &unused, &unused, &unused, &edx);
        ret |= edx & CPUID_7_0_EDX_FSRM;

        /*
         * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
         * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
         * returned by KVM_GET_MSR_INDEX_LIST.
         */
        if (!has_msr_arch_capabs) {
            ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
        }
    } else if (function == 7 && index == 1 && reg == R_EAX) {
        /* Not new instructions, just an optimization. */
        uint32_t eax;
        host_cpuid(7, 1, &eax, &unused, &unused, &unused);
        ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC);
    } else if (function == 7 && index == 2 && reg == R_EDX) {
        uint32_t edx;
        host_cpuid(7, 2, &unused, &unused, &unused, &edx);
        ret |= edx & CPUID_7_2_EDX_MCDT_NO;
    } else if (function == 0xd && index == 0 &&
               (reg == R_EAX || reg == R_EDX)) {
        /*
         * The value returned by KVM_GET_SUPPORTED_CPUID does not include
         * features that still have to be enabled with the arch_prctl
         * system call. QEMU needs the full value, which is retrieved
         * with KVM_GET_DEVICE_ATTR.
         */
        struct kvm_device_attr attr = {
            .group = 0,
            .attr = KVM_X86_XCOMP_GUEST_SUPP,
            .addr = (unsigned long) &bitmask
        };

        bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
        if (!sys_attr) {
            return ret;
        }

        int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
        if (rc < 0) {
            if (rc != -ENXIO) {
                warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
                            "error: %d", rc);
            }
            return ret;
        }
        ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
    } else if (function == 0x80000001 && reg == R_ECX) {
        /*
         * It's safe to enable TOPOEXT even if it's not returned by
         * GET_SUPPORTED_CPUID. Unconditionally enabling TOPOEXT here allows
         * us to keep CPU models including TOPOEXT runnable on older kernels.
         */
        ret |= CPUID_EXT3_TOPOEXT;
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    } else if (function == 0x80000007 && reg == R_EBX) {
        ret |= CPUID_8000_0007_EBX_OVERFLOW_RECOV | CPUID_8000_0007_EBX_SUCCOR;
    } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
        /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
         * be enabled without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_KVM_PV_UNHALT;
        }
        if (kvm_irqchip_is_split()) {
            ret |= CPUID_KVM_MSI_EXT_DEST_ID;
        }
    } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
        ret |= CPUID_KVM_HINTS_REALTIME;
    }

    if (current_machine->cgs) {
        ret = x86_confidential_guest_mask_cpuid_features(
            X86_CONFIDENTIAL_GUEST(current_machine->cgs),
            function, index, reg, ret);
    }
    return ret;
}

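/*
 * Illustrative use of the helper above (a sketch, not code from this file):
 * callers typically mask a single feature bit out of the fixed-up register
 * value, e.g.
 *
 *     if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & CPUID_EXT_X2APIC) {
 *         ... x2APIC may be exposed to the guest ...
 *     }
 *
 * Whether the bit is set depends both on KVM_GET_SUPPORTED_CPUID and on the
 * fixups applied above (for instance CPUID_EXT_X2APIC is cleared when the
 * irqchip is not in the kernel).
 */
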
uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
{
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[1];
    } msr_data = {};
    uint64_t value;
    uint32_t ret, can_be_one, must_be_one;

    if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
        return 0;
    }

    /* Check if requested MSR is supported feature MSR */
    int i;
    for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
        if (kvm_feature_msrs->indices[i] == index) {
            break;
        }
    if (i == kvm_feature_msrs->nmsrs) {
        return 0; /* if the feature MSR is not supported, simply return 0 */
    }

    msr_data.info.nmsrs = 1;
    msr_data.entries[0].index = index;

    ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
    if (ret != 1) {
        error_report("KVM get MSR (index=0x%x) feature failed, %s",
                     index, strerror(-ret));
        exit(1);
    }

    value = msr_data.entries[0].data;
    switch (index) {
    case MSR_IA32_VMX_PROCBASED_CTLS2:
        if (!has_msr_vmx_procbased_ctls2) {
            /* KVM forgot to add these bits for some time, do this ourselves. */
            if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
                CPUID_XSAVE_XSAVES) {
                value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
            }
            if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
                CPUID_EXT_RDRAND) {
                value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
            }
            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
                CPUID_7_0_EBX_INVPCID) {
                value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
            }
            if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
                CPUID_7_0_EBX_RDSEED) {
                value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
            }
            if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
                CPUID_EXT2_RDTSCP) {
                value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
            }
        }
        /* fall through */
    case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
    case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
    case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
    case MSR_IA32_VMX_TRUE_EXIT_CTLS:
        /*
         * Return true for bits that can be one, but do not have to be one.
         * The SDM tells us which bits could have a "must be one" setting,
         * so we can do the opposite transformation in make_vmx_msr_value.
         */
        must_be_one = (uint32_t)value;
        can_be_one = (uint32_t)(value >> 32);
        return can_be_one & ~must_be_one;

    default:
        return value;
    }
}

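/*
 * Worked example for the TRUE_*_CTLS transformation above (illustrative
 * numbers only): if KVM reports value = 0x0000001600000016, then
 * must_be_one = 0x00000016 (low 32 bits) and can_be_one = 0x00000016
 * (high 32 bits), so the function returns 0x16 & ~0x16 = 0, i.e. no
 * optional bits.  If instead the high half were 0x0000001e, the result
 * would be 0x1e & ~0x16 = 0x08: bit 3 may be either 0 or 1 while the
 * other reported bits must remain 1.
 */
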
static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
                                     int *max_banks)
{
    *max_banks = kvm_check_extension(s, KVM_CAP_MCE);
    return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
}

static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
{
    CPUState *cs = CPU(cpu);
    CPUX86State *env = &cpu->env;
    uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_MISCV |
                      MCI_STATUS_ADDRV;
    uint64_t mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
    int flags = 0;

    if (!IS_AMD_CPU(env)) {
        status |= MCI_STATUS_S | MCI_STATUS_UC;
        if (code == BUS_MCEERR_AR) {
            status |= MCI_STATUS_AR | 0x134;
            mcg_status |= MCG_STATUS_EIPV;
        } else {
            status |= 0xc0;
        }
    } else {
        if (code == BUS_MCEERR_AR) {
            status |= MCI_STATUS_UC | MCI_STATUS_POISON;
            mcg_status |= MCG_STATUS_EIPV;
        } else {
            /* Setting the POISON bit for deferred errors indicates to the
             * guest kernel that the address provided by the MCE is valid
             * and usable which will ensure that the guest kernel will send
             * a SIGBUS_AO signal to the guest process. This allows for
             * more desirable behavior in the case that the guest process
             * with poisoned memory has set the MCE_KILL_EARLY prctl flag
             * which indicates that the process would prefer to handle or
             * shutdown due to the poisoned memory condition before the
             * memory has been accessed.
             *
             * While the POISON bit would not be set in a deferred error
             * sent from hardware, the bit is not meaningful for deferred
             * errors and can be reused in this scenario.
             */
            status |= MCI_STATUS_DEFERRED | MCI_STATUS_POISON;
        }
    }

    flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
    /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
     * guest kernel back into env->mcg_ext_ctl.
     */
    cpu_synchronize_state(cs);
    if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
        mcg_status |= MCG_STATUS_LMCE;
        flags = 0;
    }

    cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
                       (MCM_ADDR_PHYS << 6) | 0xc, flags);
}

static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
{
    MemoryFailureFlags mff = {.action_required = ar, .recursive = false};

    qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
                                   &mff);
}

static void hardware_memory_error(void *host_addr)
{
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
    error_report("QEMU got Hardware memory error at addr %p", host_addr);
    exit(1);
}

void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
{
    X86CPU *cpu = X86_CPU(c);
    CPUX86State *env = &cpu->env;
    ram_addr_t ram_addr;
    hwaddr paddr;

    /* If we get an action required MCE, it has been injected by KVM
     * while the VM was running. An action optional MCE instead should
     * be coming from the main thread, which qemu_init_sigbus identifies
     * as the "early kill" thread.
     */
    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);

    if ((env->mcg_cap & MCG_SER_P) && addr) {
        ram_addr = qemu_ram_addr_from_host(addr);
        if (ram_addr != RAM_ADDR_INVALID &&
            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
            kvm_hwpoison_page_add(ram_addr);
            kvm_mce_inject(cpu, paddr, code);

            /*
             * Use different logging severity based on error type.
             * If there is additional MCE reporting on the hypervisor, QEMU VA
             * could be another source to identify the PA and MCE details.
             */
            if (code == BUS_MCEERR_AR) {
                error_report("Guest MCE Memory Error at QEMU addr %p and "
                             "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                             addr, paddr, "BUS_MCEERR_AR");
            } else {
                warn_report("Guest MCE Memory Error at QEMU addr %p and "
                            "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                            addr, paddr, "BUS_MCEERR_AO");
            }

            return;
        }

        if (code == BUS_MCEERR_AO) {
            warn_report("Hardware memory error at addr %p of type %s "
                        "for memory used by QEMU itself instead of guest system!",
                        addr, "BUS_MCEERR_AO");
        }
    }

    if (code == BUS_MCEERR_AR) {
        hardware_memory_error(addr);
    }

    /* Hope we are lucky for AO MCE, just notify an event */
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
}

static void kvm_queue_exception(CPUX86State *env,
                                int32_t exception_nr,
                                uint8_t exception_has_payload,
                                uint64_t exception_payload)
{
    assert(env->exception_nr == -1);
    assert(!env->exception_pending);
    assert(!env->exception_injected);
    assert(!env->exception_has_payload);

    env->exception_nr = exception_nr;

    if (has_exception_payload) {
        env->exception_pending = 1;

        env->exception_has_payload = exception_has_payload;
        env->exception_payload = exception_payload;
    } else {
        env->exception_injected = 1;

        if (exception_nr == EXCP01_DB) {
            assert(exception_has_payload);
            env->dr[6] = exception_payload;
        } else if (exception_nr == EXCP0E_PAGE) {
            assert(exception_has_payload);
            env->cr[2] = exception_payload;
        } else {
            assert(!exception_has_payload);
        }
    }
}

static void cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

unsigned long kvm_arch_vcpu_id(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    return cpu->apic_id;
}

#ifndef KVM_CPUID_SIGNATURE_NEXT
#define KVM_CPUID_SIGNATURE_NEXT 0x40000100
#endif

static bool hyperv_enabled(X86CPU *cpu)
{
    return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
        ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
         cpu->hyperv_features || cpu->hyperv_passthrough);
}

/*
 * Check whether target_freq is within conservative
 * ntp correctable bounds (250ppm) of freq
 */
static inline bool freq_within_bounds(int freq, int target_freq)
{
    int max_freq = freq + (freq * 250 / 1000000);
    int min_freq = freq - (freq * 250 / 1000000);

    if (target_freq >= min_freq && target_freq <= max_freq) {
        return true;
    }

    return false;
}

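/*
 * Example of the bound above (illustrative numbers): for a host frequency
 * of 2,000,000 kHz the 250 ppm window is 2000000 * 250 / 1000000 = 500 kHz,
 * so any requested TSC frequency in [1,999,500 .. 2,000,500] kHz is treated
 * as NTP-correctable and does not require hardware TSC scaling.
 */
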
static int kvm_arch_set_tsc_khz(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int r, cur_freq;
    bool set_ioctl = false;

    if (!env->tsc_khz) {
        return 0;
    }

    cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
               kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;

    /*
     * If TSC scaling is supported, attempt to set TSC frequency.
     */
    if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
        set_ioctl = true;
    }

    /*
     * If desired TSC frequency is within bounds of NTP correction,
     * attempt to set TSC frequency.
     */
    if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
        set_ioctl = true;
    }

    r = set_ioctl ?
        kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
        -ENOTSUP;

    if (r < 0) {
        /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
         * TSC frequency doesn't match the one we want.
         */
        cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
                   kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
                   -ENOTSUP;
        if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
            warn_report("TSC frequency mismatch between "
                        "VM (%" PRId64 " kHz) and host (%d kHz), "
                        "and TSC scaling unavailable",
                        env->tsc_khz, cur_freq);
            return r;
        }
    }

    return 0;
}

static bool tsc_is_stable_and_known(CPUX86State *env)
{
    if (!env->tsc_khz) {
        return false;
    }
    return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
        || env->user_tsc_khz;
}

#define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)

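/*
 * The low byte of an eVMCS version word is the minimum supported version
 * and the high byte the maximum (see evmcs_version_supported() below),
 * so ((1 << 8) | 1) == 0x0101 advertises exactly the range [1..1].
 */
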
static struct {
    const char *desc;
    struct {
        uint32_t func;
        int reg;
        uint32_t bits;
    } flags[2];
    uint64_t dependencies;
    bool skip_passthrough;
} kvm_hyperv_properties[] = {
    [HYPERV_FEAT_RELAXED] = {
        .desc = "relaxed timing (hv-relaxed)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_RELAXED_TIMING_RECOMMENDED}
        }
    },
    [HYPERV_FEAT_VAPIC] = {
        .desc = "virtual APIC (hv-vapic)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_APIC_ACCESS_AVAILABLE}
        }
    },
    [HYPERV_FEAT_TIME] = {
        .desc = "clocksources (hv-time)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
        }
    },
    [HYPERV_FEAT_CRASH] = {
        .desc = "crash MSRs (hv-crash)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
        }
    },
    [HYPERV_FEAT_RESET] = {
        .desc = "reset MSR (hv-reset)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_RESET_AVAILABLE}
        }
    },
    [HYPERV_FEAT_VPINDEX] = {
        .desc = "VP_INDEX MSR (hv-vpindex)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_VP_INDEX_AVAILABLE}
        }
    },
    [HYPERV_FEAT_RUNTIME] = {
        .desc = "VP_RUNTIME MSR (hv-runtime)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_VP_RUNTIME_AVAILABLE}
        }
    },
    [HYPERV_FEAT_SYNIC] = {
        .desc = "synthetic interrupt controller (hv-synic)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_SYNIC_AVAILABLE}
        }
    },
    [HYPERV_FEAT_STIMER] = {
        .desc = "synthetic timers (hv-stimer)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_SYNTIMERS_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
    },
    [HYPERV_FEAT_FREQUENCIES] = {
        .desc = "frequency MSRs (hv-frequencies)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_ACCESS_FREQUENCY_MSRS},
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_FREQUENCY_MSRS_AVAILABLE}
        }
    },
    [HYPERV_FEAT_REENLIGHTENMENT] = {
        .desc = "reenlightenment MSRs (hv-reenlightenment)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EAX,
             .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
        }
    },
    [HYPERV_FEAT_TLBFLUSH] = {
        .desc = "paravirtualized TLB flush (hv-tlbflush)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
                     HV_EX_PROCESSOR_MASKS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    },
    [HYPERV_FEAT_EVMCS] = {
        .desc = "enlightened VMCS (hv-evmcs)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VAPIC)
    },
    [HYPERV_FEAT_IPI] = {
        .desc = "paravirtualized IPI (hv-ipi)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_CLUSTER_IPI_RECOMMENDED |
                     HV_EX_PROCESSOR_MASKS_RECOMMENDED}
        },
        .dependencies = BIT(HYPERV_FEAT_VPINDEX)
    },
    [HYPERV_FEAT_STIMER_DIRECT] = {
        .desc = "direct mode synthetic timers (hv-stimer-direct)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_STIMER)
    },
    [HYPERV_FEAT_AVIC] = {
        .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
        .flags = {
            {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
             .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
        }
    },
    [HYPERV_FEAT_SYNDBG] = {
        .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED),
        .skip_passthrough = true,
    },
    [HYPERV_FEAT_MSR_BITMAP] = {
        .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
        .flags = {
            {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
             .bits = HV_NESTED_MSR_BITMAP}
        }
    },
    [HYPERV_FEAT_XMM_INPUT] = {
        .desc = "XMM fast hypercall input (hv-xmm-input)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
        }
    },
    [HYPERV_FEAT_TLBFLUSH_EXT] = {
        .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
        .flags = {
            {.func = HV_CPUID_FEATURES, .reg = R_EDX,
             .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
        },
        .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
    },
    [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
        .desc = "direct TLB flush (hv-tlbflush-direct)",
        .flags = {
            {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
             .bits = HV_NESTED_DIRECT_FLUSH}
        },
        .dependencies = BIT(HYPERV_FEAT_VAPIC)
    },
};

static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
                                           bool do_sys_ioctl)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = g_malloc0(size);
    cpuid->nent = max;

    if (do_sys_ioctl) {
        r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
    } else {
        r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
    }
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}

/*
 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
{
    struct kvm_cpuid2 *cpuid;
    /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
    int max = 11;
    int i;
    bool do_sys_ioctl;

    do_sys_ioctl =
        kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;

    /*
     * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
     * unsupported, kvm_hyperv_expand_features() checks for that.
     */
    assert(do_sys_ioctl || cs->kvm_state);

    /*
     * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
     * -E2BIG, however, it doesn't report back the right size. Keep increasing
     * it and re-trying until we succeed.
     */
    while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
        max++;
    }

    /*
     * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
     * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
     * information early, just check for the capability and set the bit
     * manually.
     */
    if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
                KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
        for (i = 0; i < cpuid->nent; i++) {
            if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
                cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
            }
        }
    }

    return cpuid;
}

/*
 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
 * leaves from KVM_CAP_HYPERV* and present MSRs data.
 */
static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    struct kvm_cpuid2 *cpuid;
    struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;

    /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
    cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
    cpuid->nent = 2;

    /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
    entry_feat = &cpuid->entries[0];
    entry_feat->function = HV_CPUID_FEATURES;

    entry_recomm = &cpuid->entries[1];
    entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
    entry_recomm->ebx = cpu->hyperv_spinlock_attempts;

    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
        entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
        entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
        entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
        entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
        entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
    }

    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
        entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
        entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
    }

    if (has_msr_hv_frequencies) {
        entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
        entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
    }

    if (has_msr_hv_crash) {
        entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
    }

    if (has_msr_hv_reenlightenment) {
        entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
    }

    if (has_msr_hv_reset) {
        entry_feat->eax |= HV_RESET_AVAILABLE;
    }

    if (has_msr_hv_vpindex) {
        entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
    }

    if (has_msr_hv_runtime) {
        entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
    }

    if (has_msr_hv_synic) {
        unsigned int cap = cpu->hyperv_synic_kvm_only ?
            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;

        if (kvm_check_extension(cs->kvm_state, cap) > 0) {
            entry_feat->eax |= HV_SYNIC_AVAILABLE;
        }
    }

    if (has_msr_hv_stimer) {
        entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
    }

    if (has_msr_hv_syndbg_options) {
        entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
        entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
        entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_TLBFLUSH) > 0) {
        entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
        entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_SEND_IPI) > 0) {
        entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
    }

    return cpuid;
}

static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
{
    struct kvm_cpuid_entry2 *entry;
    struct kvm_cpuid2 *cpuid;

    if (hv_cpuid_cache) {
        cpuid = hv_cpuid_cache;
    } else {
        if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
            cpuid = get_supported_hv_cpuid(cs);
        } else {
            /*
             * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
             * before KVM context is created but this is only done when
             * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
             * KVM_CAP_HYPERV_CPUID.
             */
            assert(cs->kvm_state);

            cpuid = get_supported_hv_cpuid_legacy(cs);
        }
        hv_cpuid_cache = cpuid;
    }

    if (!cpuid) {
        return 0;
    }

    entry = cpuid_find_entry(cpuid, func, 0);
    if (!entry) {
        return 0;
    }

    return cpuid_entry_get_reg(entry, reg);
}

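/*
 * Illustrative sketch (not code from this file): with the table above,
 * checking host support for e.g. HYPERV_FEAT_VPINDEX boils down to
 *
 *     hv_cpuid_get_host(cs, HV_CPUID_FEATURES, R_EAX) & HV_VP_INDEX_AVAILABLE
 *
 * which is what hyperv_feature_supported() below does for every flag
 * listed in kvm_hyperv_properties[].
 */
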
static bool hyperv_feature_supported(CPUState *cs, int feature)
{
    uint32_t func, bits;
    int i, reg;

    /*
     * kvm_hyperv_properties needs to define at least one CPUID flag which
     * must be used to detect the feature, it's hard to say whether it is
     * supported or not otherwise.
     */
    assert(kvm_hyperv_properties[feature].flags[0].func);

    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {

        func = kvm_hyperv_properties[feature].flags[i].func;
        reg = kvm_hyperv_properties[feature].flags[i].reg;
        bits = kvm_hyperv_properties[feature].flags[i].bits;

        if (!func) {
            continue;
        }

        if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
            return false;
        }
    }

    return true;
}

/* Checks that all feature dependencies are enabled */
static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
{
    uint64_t deps;
    int dep_feat;

    deps = kvm_hyperv_properties[feature].dependencies;
    while (deps) {
        dep_feat = ctz64(deps);
        if (!(hyperv_feat_enabled(cpu, dep_feat))) {
            error_setg(errp, "Hyper-V %s requires Hyper-V %s",
                       kvm_hyperv_properties[feature].desc,
                       kvm_hyperv_properties[dep_feat].desc);
            return false;
        }
        deps &= ~(1ull << dep_feat);
    }

    return true;
}

static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
{
    X86CPU *cpu = X86_CPU(cs);
    uint32_t r = 0;
    int i, j;

    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
        if (!hyperv_feat_enabled(cpu, i)) {
            continue;
        }

        for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
            if (kvm_hyperv_properties[i].flags[j].func != func) {
                continue;
            }
            if (kvm_hyperv_properties[i].flags[j].reg != reg) {
                continue;
            }

            r |= kvm_hyperv_properties[i].flags[j].bits;
        }
    }

    /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
    if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
            r |= DEFAULT_EVMCS_VERSION;
        }
    }

    return r;
}

/*
 * Expand Hyper-V CPU features. In particular, check that all the requested
 * features are supported by the host and the sanity of the configuration
 * (that all the required dependencies are included). Also, this takes care
 * of 'hv_passthrough' mode and fills the environment with all supported
 * Hyper-V features.
 */
bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
{
    CPUState *cs = CPU(cpu);
    Error *local_err = NULL;
    int feat;

    if (!hyperv_enabled(cpu))
        return true;

    /*
     * When kvm_hyperv_expand_features is called at CPU feature expansion
     * time per-CPU kvm_state is not available yet so we can only proceed
     * when KVM_CAP_SYS_HYPERV_CPUID is supported.
     */
    if (!cs->kvm_state &&
        !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
        return true;

    if (cpu->hyperv_passthrough) {
        cpu->hyperv_vendor_id[0] =
            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
        cpu->hyperv_vendor_id[1] =
            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
        cpu->hyperv_vendor_id[2] =
            hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
        cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
                                       sizeof(cpu->hyperv_vendor_id) + 1);
        memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
               sizeof(cpu->hyperv_vendor_id));
        cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;

        cpu->hyperv_interface_id[0] =
            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
        cpu->hyperv_interface_id[1] =
            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
        cpu->hyperv_interface_id[2] =
            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
        cpu->hyperv_interface_id[3] =
            hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);

        cpu->hyperv_ver_id_build =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
        cpu->hyperv_ver_id_major =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
        cpu->hyperv_ver_id_minor =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
        cpu->hyperv_ver_id_sp =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
        cpu->hyperv_ver_id_sb =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
        cpu->hyperv_ver_id_sn =
            hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;

        cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
                                            R_EAX);
        cpu->hyperv_limits[0] =
            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
        cpu->hyperv_limits[1] =
            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
        cpu->hyperv_limits[2] =
            hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);

        cpu->hyperv_spinlock_attempts =
            hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);

        /*
         * Mark feature as enabled in 'cpu->hyperv_features' as
         * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
         */
        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
            if (hyperv_feature_supported(cs, feat) &&
                !kvm_hyperv_properties[feat].skip_passthrough) {
                cpu->hyperv_features |= BIT(feat);
            }
        }
    } else {
        /* Check features availability and dependencies */
        for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
            /* If the feature was not requested skip it. */
            if (!hyperv_feat_enabled(cpu, feat)) {
                continue;
            }

            /* Check if the feature is supported by KVM */
            if (!hyperv_feature_supported(cs, feat)) {
                error_setg(errp, "Hyper-V %s is not supported by kernel",
                           kvm_hyperv_properties[feat].desc);
                return false;
            }

            /* Check dependencies */
            if (!hv_feature_check_deps(cpu, feat, &local_err)) {
                error_propagate(errp, local_err);
                return false;
            }
        }
    }

    /* Additional dependencies not covered by kvm_hyperv_properties[] */
    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
        !cpu->hyperv_synic_kvm_only &&
        !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
        error_setg(errp, "Hyper-V %s requires Hyper-V %s",
                   kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
                   kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
        return false;
    }

    return true;
}

/*
 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
 */
static int hyperv_fill_cpuids(CPUState *cs,
                              struct kvm_cpuid_entry2 *cpuid_ent)
{
    X86CPU *cpu = X86_CPU(cs);
    struct kvm_cpuid_entry2 *c;
    uint32_t signature[3];
    uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
    uint32_t nested_eax =
        hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);

    max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
        HV_CPUID_IMPLEMENT_LIMITS;

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
        max_cpuid_leaf =
            MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
    }

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
    c->eax = max_cpuid_leaf;
    c->ebx = cpu->hyperv_vendor_id[0];
    c->ecx = cpu->hyperv_vendor_id[1];
    c->edx = cpu->hyperv_vendor_id[2];

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_INTERFACE;
    c->eax = cpu->hyperv_interface_id[0];
    c->ebx = cpu->hyperv_interface_id[1];
    c->ecx = cpu->hyperv_interface_id[2];
    c->edx = cpu->hyperv_interface_id[3];

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_VERSION;
    c->eax = cpu->hyperv_ver_id_build;
    c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
        cpu->hyperv_ver_id_minor;
    c->ecx = cpu->hyperv_ver_id_sp;
    c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
        (cpu->hyperv_ver_id_sn & 0xffffff);

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_FEATURES;
    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
    c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
    c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);

    /* Unconditionally required with any Hyper-V enlightenment */
    c->eax |= HV_HYPERCALL_AVAILABLE;

    /* SynIC and Vmbus devices require messages/signals hypercalls */
    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
        !cpu->hyperv_synic_kvm_only) {
        c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
    }

    /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
    c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_ENLIGHTMENT_INFO;
    c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
    c->ebx = cpu->hyperv_spinlock_attempts;

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
        !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
        c->eax |= HV_APIC_ACCESS_RECOMMENDED;
    }

    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
        c->eax |= HV_NO_NONARCH_CORESHARING;
    } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
        c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
            HV_NO_NONARCH_CORESHARING;
    }

    c = &cpuid_ent[cpuid_i++];
    c->function = HV_CPUID_IMPLEMENT_LIMITS;
    c->eax = cpu->hv_max_vps;
    c->ebx = cpu->hyperv_limits[0];
    c->ecx = cpu->hyperv_limits[1];
    c->edx = cpu->hyperv_limits[2];

    if (nested_eax) {
        uint32_t function;

        /* Create zeroed 0x40000006..0x40000009 leaves */
        for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
             function < HV_CPUID_NESTED_FEATURES; function++) {
            c = &cpuid_ent[cpuid_i++];
            c->function = function;
        }

        c = &cpuid_ent[cpuid_i++];
        c->function = HV_CPUID_NESTED_FEATURES;
        c->eax = nested_eax;
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
        c = &cpuid_ent[cpuid_i++];
        c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
        c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
            HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
        memcpy(signature, "Microsoft VS", 12);
        c->eax = 0;
        c->ebx = signature[0];
        c->ecx = signature[1];
        c->edx = signature[2];

        c = &cpuid_ent[cpuid_i++];
        c->function = HV_CPUID_SYNDBG_INTERFACE;
        memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
        c->eax = signature[0];
        c->ebx = 0;
        c->ecx = 0;
        c->edx = 0;

        c = &cpuid_ent[cpuid_i++];
        c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
        c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
        c->ebx = 0;
        c->ecx = 0;
        c->edx = 0;
    }

    return cpuid_i;
}

static Error *hv_passthrough_mig_blocker;
static Error *hv_no_nonarch_cs_mig_blocker;

/* Checks that the exposed eVMCS version range is supported by KVM */
static bool evmcs_version_supported(uint16_t evmcs_version,
                                    uint16_t supported_evmcs_version)
{
    uint8_t min_version = evmcs_version & 0xff;
    uint8_t max_version = evmcs_version >> 8;
    uint8_t min_supported_version = supported_evmcs_version & 0xff;
    uint8_t max_supported_version = supported_evmcs_version >> 8;

    return (min_version >= min_supported_version) &&
        (max_version <= max_supported_version);
}

static int hyperv_init_vcpu(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    Error *local_err = NULL;
    int ret;

    if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
        error_setg(&hv_passthrough_mig_blocker,
                   "'hv-passthrough' CPU flag prevents migration, use explicit"
                   " set of hv-* flags instead");
        ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
        hv_no_nonarch_cs_mig_blocker == NULL) {
        error_setg(&hv_no_nonarch_cs_mig_blocker,
                   "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
                   " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
                   " make sure SMT is disabled and/or that vCPUs are properly"
                   " pinned)");
        ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            return ret;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
        /*
         * the kernel doesn't support setting vp_index; assert that its value
         * is in sync
         */
        uint64_t value;

        ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
        if (ret < 0) {
            return ret;
        }

        if (value != hyperv_vp_index(CPU(cpu))) {
            error_report("kernel's vp_index != QEMU's vp_index");
            return -ENXIO;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
        uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
        ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
        if (ret < 0) {
            error_report("failed to turn on HyperV SynIC in KVM: %s",
                         strerror(-ret));
            return ret;
        }

        if (!cpu->hyperv_synic_kvm_only) {
            ret = hyperv_x86_synic_add(cpu);
            if (ret < 0) {
                error_report("failed to create HyperV SynIC: %s",
                             strerror(-ret));
                return ret;
            }
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
        uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
        uint16_t supported_evmcs_version;

        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
                                  (uintptr_t)&supported_evmcs_version);

        /*
         * KVM is required to support EVMCS ver.1. as that's what 'hv-evmcs'
         * option sets. Note: we hardcode the maximum supported eVMCS version
         * to '1' as well so 'hv-evmcs' feature is migratable even when (and if)
         * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have
         * to be added.
         */
        if (ret < 0) {
            error_report("Hyper-V %s is not supported by kernel",
                         kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
            return ret;
        }

        if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
            error_report("eVMCS version range [%d..%d] is not supported by "
                         "kernel (supported: [%d..%d])", evmcs_version & 0xff,
                         evmcs_version >> 8, supported_evmcs_version & 0xff,
                         supported_evmcs_version >> 8);
            return -ENOTSUP;
        }
    }

    if (cpu->hyperv_enforce_cpuid) {
        ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
        if (ret < 0) {
            error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
                         strerror(-ret));
            return ret;
        }
    }

    /* Skip SynIC and VP_INDEX since they are hard deps already */
    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_STIMER) &&
        hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
        hyperv_feat_enabled(cpu, HYPERV_FEAT_RUNTIME)) {
        hyperv_x86_set_vmbus_recommended_features_enabled();
    }

    return 0;
}

static Error *invtsc_mig_blocker;

#define KVM_MAX_CPUID_ENTRIES  100

static void kvm_init_xsave(CPUX86State *env)
{
    if (has_xsave2) {
        env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
    } else {
        env->xsave_buf_len = sizeof(struct kvm_xsave);
    }

    env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
    memset(env->xsave_buf, 0, env->xsave_buf_len);
    /*
     * The allocated storage must be large enough for all of the
     * possible XSAVE state components.
     */
    assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
           env->xsave_buf_len);
}

static void kvm_init_nested_state(CPUX86State *env)
{
    struct kvm_vmx_nested_state_hdr *vmx_hdr;
    uint32_t size;

    if (!env->nested_state) {
        return;
    }

    size = env->nested_state->size;

    memset(env->nested_state, 0, size);
    env->nested_state->size = size;

    if (cpu_has_vmx(env)) {
        env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
        vmx_hdr = &env->nested_state->hdr.vmx;
        vmx_hdr->vmxon_pa = -1ull;
        vmx_hdr->vmcs12_pa = -1ull;
    } else if (cpu_has_svm(env)) {
        env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
    }
}

static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
                                    struct kvm_cpuid_entry2 *entries,
                                    uint32_t cpuid_i)
{
    uint32_t limit, i, j;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;

    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        j = 0;
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            goto full;
        }
        c = &entries[cpuid_i++];
        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax & 0xff;
            if (times > 1) {
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                           KVM_CPUID_FLAG_STATE_READ_NEXT;
            }

            for (j = 1; j < times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 0x1f:
            if (!x86_has_extended_topo(env->avail_cpu_topo)) {
                cpuid_i--;
                break;
            }
            /* fallthrough */
        case 4:
        case 0xb:
        case 0xd:
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0x1f && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    if (j < 63) {
                        continue;
                    } else {
                        cpuid_i--;
                        break;
                    }
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
            }
            break;
        case 0x12:
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (j > 1 && (c->eax & 0xf) != 1) {
                    break;
                }

                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
            }
            break;
        case 0x7:
        case 0x14:
        case 0x1d:
        case 0x1e:
        case 0x24: {
            uint32_t times;

            c->function = i;
            c->index = 0;
            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax;

            for (j = 1; j <= times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
                c->function = i;
                c->index = j;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
&c->ebx, &c->ecx, &c->edx); 1943 } 1944 break; 1945 } 1946 default: 1947 c->function = i; 1948 c->flags = 0; 1949 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1950 if (!c->eax && !c->ebx && !c->ecx && !c->edx) { 1951 /* 1952 * KVM already returns all zeroes if a CPUID entry is missing, 1953 * so we can omit it and avoid hitting KVM's 80-entry limit. 1954 */ 1955 cpuid_i--; 1956 } 1957 break; 1958 } 1959 } 1960 1961 if (limit >= 0x0a) { 1962 uint32_t eax, edx; 1963 1964 cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx); 1965 1966 has_architectural_pmu_version = eax & 0xff; 1967 if (has_architectural_pmu_version > 0) { 1968 num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8; 1969 1970 /* Shouldn't be more than 32, since that's the number of bits 1971 * available in EBX to tell us _which_ counters are available. 1972 * Play it safe. 1973 */ 1974 if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) { 1975 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS; 1976 } 1977 1978 if (has_architectural_pmu_version > 1) { 1979 num_architectural_pmu_fixed_counters = edx & 0x1f; 1980 1981 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) { 1982 num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS; 1983 } 1984 } 1985 } 1986 } 1987 1988 cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused); 1989 1990 for (i = 0x80000000; i <= limit; i++) { 1991 j = 0; 1992 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1993 goto full; 1994 } 1995 c = &entries[cpuid_i++]; 1996 1997 switch (i) { 1998 case 0x8000001d: 1999 /* Query for all AMD cache information leaves */ 2000 for (j = 0; ; j++) { 2001 c->function = i; 2002 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2003 c->index = j; 2004 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); 2005 2006 if (c->eax == 0) { 2007 break; 2008 } 2009 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 2010 goto full; 2011 } 2012 c = &entries[cpuid_i++]; 2013 } 2014 break; 2015 default: 2016 c->function = i; 2017 c->flags = 0; 2018 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 2019 if (!c->eax && !c->ebx && !c->ecx && !c->edx) { 2020 /* 2021 * KVM already returns all zeroes if a CPUID entry is missing, 2022 * so we can omit it and avoid hitting KVM's 80-entry limit. 2023 */ 2024 cpuid_i--; 2025 } 2026 break; 2027 } 2028 } 2029 2030 /* Call Centaur's CPUID instructions they are supported. */ 2031 if (env->cpuid_xlevel2 > 0) { 2032 cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused); 2033 2034 for (i = 0xC0000000; i <= limit; i++) { 2035 j = 0; 2036 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 2037 goto full; 2038 } 2039 c = &entries[cpuid_i++]; 2040 2041 c->function = i; 2042 c->flags = 0; 2043 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 2044 } 2045 } 2046 2047 return cpuid_i; 2048 2049 full: 2050 fprintf(stderr, "cpuid_data is full, no space for " 2051 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j); 2052 abort(); 2053 } 2054 2055 int kvm_arch_init_vcpu(CPUState *cs) 2056 { 2057 struct { 2058 struct kvm_cpuid2 cpuid; 2059 struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES]; 2060 } cpuid_data; 2061 /* 2062 * The kernel defines these structs with padding fields so there 2063 * should be no extra padding in our cpuid_data struct. 
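 * The QEMU_BUILD_BUG_ON() that follows verifies this assumption at
 * compile time.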
2064 */ 2065 QEMU_BUILD_BUG_ON(sizeof(cpuid_data) != 2066 sizeof(struct kvm_cpuid2) + 2067 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 2068 2069 X86CPU *cpu = X86_CPU(cs); 2070 CPUX86State *env = &cpu->env; 2071 uint32_t cpuid_i; 2072 struct kvm_cpuid_entry2 *c; 2073 uint32_t signature[3]; 2074 int kvm_base = KVM_CPUID_SIGNATURE; 2075 int max_nested_state_len; 2076 int r; 2077 Error *local_err = NULL; 2078 2079 memset(&cpuid_data, 0, sizeof(cpuid_data)); 2080 2081 cpuid_i = 0; 2082 2083 has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2); 2084 2085 r = kvm_arch_set_tsc_khz(cs); 2086 if (r < 0) { 2087 return r; 2088 } 2089 2090 /* vcpu's TSC frequency is either specified by user, or following 2091 * the value used by KVM if the former is not present. In the 2092 * latter case, we query it from KVM and record in env->tsc_khz, 2093 * so that vcpu's TSC frequency can be migrated later via this field. 2094 */ 2095 if (!env->tsc_khz) { 2096 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 2097 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : 2098 -ENOTSUP; 2099 if (r > 0) { 2100 env->tsc_khz = r; 2101 } 2102 } 2103 2104 env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY; 2105 2106 /* 2107 * kvm_hyperv_expand_features() is called here for the second time in case 2108 * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle 2109 * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to 2110 * check which Hyper-V enlightenments are supported and which are not, we 2111 * can still proceed and check/expand Hyper-V enlightenments here so legacy 2112 * behavior is preserved. 2113 */ 2114 if (!kvm_hyperv_expand_features(cpu, &local_err)) { 2115 error_report_err(local_err); 2116 return -ENOSYS; 2117 } 2118 2119 if (hyperv_enabled(cpu)) { 2120 r = hyperv_init_vcpu(cpu); 2121 if (r) { 2122 return r; 2123 } 2124 2125 cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries); 2126 kvm_base = KVM_CPUID_SIGNATURE_NEXT; 2127 has_msr_hv_hypercall = true; 2128 } 2129 2130 if (cs->kvm_state->xen_version) { 2131 #ifdef CONFIG_XEN_EMU 2132 struct kvm_cpuid_entry2 *xen_max_leaf; 2133 2134 memcpy(signature, "XenVMMXenVMM", 12); 2135 2136 xen_max_leaf = c = &cpuid_data.entries[cpuid_i++]; 2137 c->function = kvm_base + XEN_CPUID_SIGNATURE; 2138 c->eax = kvm_base + XEN_CPUID_TIME; 2139 c->ebx = signature[0]; 2140 c->ecx = signature[1]; 2141 c->edx = signature[2]; 2142 2143 c = &cpuid_data.entries[cpuid_i++]; 2144 c->function = kvm_base + XEN_CPUID_VENDOR; 2145 c->eax = cs->kvm_state->xen_version; 2146 c->ebx = 0; 2147 c->ecx = 0; 2148 c->edx = 0; 2149 2150 c = &cpuid_data.entries[cpuid_i++]; 2151 c->function = kvm_base + XEN_CPUID_HVM_MSR; 2152 /* Number of hypercall-transfer pages */ 2153 c->eax = 1; 2154 /* Hypercall MSR base address */ 2155 if (hyperv_enabled(cpu)) { 2156 c->ebx = XEN_HYPERCALL_MSR_HYPERV; 2157 kvm_xen_init(cs->kvm_state, c->ebx); 2158 } else { 2159 c->ebx = XEN_HYPERCALL_MSR; 2160 } 2161 c->ecx = 0; 2162 c->edx = 0; 2163 2164 c = &cpuid_data.entries[cpuid_i++]; 2165 c->function = kvm_base + XEN_CPUID_TIME; 2166 c->eax = ((!!tsc_is_stable_and_known(env) << 1) | 2167 (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2)); 2168 /* default=0 (emulate if necessary) */ 2169 c->ebx = 0; 2170 /* guest tsc frequency */ 2171 c->ecx = env->user_tsc_khz; 2172 /* guest tsc incarnation (migration count) */ 2173 c->edx = 0; 2174 2175 c = &cpuid_data.entries[cpuid_i++]; 2176 c->function = kvm_base + XEN_CPUID_HVM; 2177 xen_max_leaf->eax = kvm_base + 
XEN_CPUID_HVM; 2178 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) { 2179 c->function = kvm_base + XEN_CPUID_HVM; 2180 2181 if (cpu->xen_vapic) { 2182 c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT; 2183 c->eax |= XEN_HVM_CPUID_X2APIC_VIRT; 2184 } 2185 2186 c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS; 2187 2188 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) { 2189 c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT; 2190 c->ebx = cs->cpu_index; 2191 } 2192 2193 if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) { 2194 c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR; 2195 } 2196 } 2197 2198 r = kvm_xen_init_vcpu(cs); 2199 if (r) { 2200 return r; 2201 } 2202 2203 kvm_base += 0x100; 2204 #else /* CONFIG_XEN_EMU */ 2205 /* This should never happen as kvm_arch_init() would have died first. */ 2206 fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n"); 2207 abort(); 2208 #endif 2209 } else if (cpu->expose_kvm) { 2210 memcpy(signature, "KVMKVMKVM\0\0\0", 12); 2211 c = &cpuid_data.entries[cpuid_i++]; 2212 c->function = KVM_CPUID_SIGNATURE | kvm_base; 2213 c->eax = KVM_CPUID_FEATURES | kvm_base; 2214 c->ebx = signature[0]; 2215 c->ecx = signature[1]; 2216 c->edx = signature[2]; 2217 2218 c = &cpuid_data.entries[cpuid_i++]; 2219 c->function = KVM_CPUID_FEATURES | kvm_base; 2220 c->eax = env->features[FEAT_KVM]; 2221 c->edx = env->features[FEAT_KVM_HINTS]; 2222 } 2223 2224 if (cpu->kvm_pv_enforce_cpuid) { 2225 r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1); 2226 if (r < 0) { 2227 fprintf(stderr, 2228 "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s", 2229 strerror(-r)); 2230 abort(); 2231 } 2232 } 2233 2234 cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i); 2235 cpuid_data.cpuid.nent = cpuid_i; 2236 2237 if (((env->cpuid_version >> 8)&0xF) >= 6 2238 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) == 2239 (CPUID_MCE | CPUID_MCA)) { 2240 uint64_t mcg_cap, unsupported_caps; 2241 int banks; 2242 int ret; 2243 2244 ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks); 2245 if (ret < 0) { 2246 fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret)); 2247 return ret; 2248 } 2249 2250 if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) { 2251 error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)", 2252 (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks); 2253 return -ENOTSUP; 2254 } 2255 2256 unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK); 2257 if (unsupported_caps) { 2258 if (unsupported_caps & MCG_LMCE_P) { 2259 error_report("kvm: LMCE not supported"); 2260 return -ENOTSUP; 2261 } 2262 warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64, 2263 unsupported_caps); 2264 } 2265 2266 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK; 2267 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap); 2268 if (ret < 0) { 2269 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); 2270 return ret; 2271 } 2272 } 2273 2274 cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env); 2275 2276 c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0); 2277 if (c) { 2278 has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) || 2279 !!(c->ecx & CPUID_EXT_SMX); 2280 } 2281 2282 c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0); 2283 if (c && (c->ebx & CPUID_7_0_EBX_SGX)) { 2284 has_msr_feature_control = true; 2285 } 2286 2287 if (env->mcg_cap & MCG_LMCE_P) { 2288 has_msr_mcg_ext_ctl = has_msr_feature_control = true; 2289 } 2290 2291 if (!env->user_tsc_khz) { 2292 if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) && 2293 
invtsc_mig_blocker == NULL) { 2294 error_setg(&invtsc_mig_blocker, 2295 "State blocked by non-migratable CPU device" 2296 " (invtsc flag)"); 2297 r = migrate_add_blocker(&invtsc_mig_blocker, &local_err); 2298 if (r < 0) { 2299 error_report_err(local_err); 2300 return r; 2301 } 2302 } 2303 } 2304 2305 if (cpu->vmware_cpuid_freq 2306 /* Guests depend on 0x40000000 to detect this feature, so only expose 2307 * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */ 2308 && cpu->expose_kvm 2309 && kvm_base == KVM_CPUID_SIGNATURE 2310 /* TSC clock must be stable and known for this feature. */ 2311 && tsc_is_stable_and_known(env)) { 2312 2313 c = &cpuid_data.entries[cpuid_i++]; 2314 c->function = KVM_CPUID_SIGNATURE | 0x10; 2315 c->eax = env->tsc_khz; 2316 c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */ 2317 c->ecx = c->edx = 0; 2318 2319 c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0); 2320 c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10); 2321 } 2322 2323 cpuid_data.cpuid.nent = cpuid_i; 2324 2325 cpuid_data.cpuid.padding = 0; 2326 r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data); 2327 if (r) { 2328 goto fail; 2329 } 2330 kvm_init_xsave(env); 2331 2332 max_nested_state_len = kvm_max_nested_state_length(); 2333 if (max_nested_state_len > 0) { 2334 assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data)); 2335 2336 if (cpu_has_vmx(env) || cpu_has_svm(env)) { 2337 env->nested_state = g_malloc0(max_nested_state_len); 2338 env->nested_state->size = max_nested_state_len; 2339 2340 kvm_init_nested_state(env); 2341 } 2342 } 2343 2344 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE); 2345 2346 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { 2347 has_msr_tsc_aux = false; 2348 } 2349 2350 kvm_init_msrs(cpu); 2351 2352 return 0; 2353 2354 fail: 2355 migrate_del_blocker(&invtsc_mig_blocker); 2356 2357 return r; 2358 } 2359 2360 int kvm_arch_destroy_vcpu(CPUState *cs) 2361 { 2362 X86CPU *cpu = X86_CPU(cs); 2363 CPUX86State *env = &cpu->env; 2364 2365 g_free(env->xsave_buf); 2366 2367 g_free(cpu->kvm_msr_buf); 2368 cpu->kvm_msr_buf = NULL; 2369 2370 g_free(env->nested_state); 2371 env->nested_state = NULL; 2372 2373 qemu_del_vm_change_state_handler(cpu->vmsentry); 2374 2375 return 0; 2376 } 2377 2378 void kvm_arch_reset_vcpu(X86CPU *cpu) 2379 { 2380 CPUX86State *env = &cpu->env; 2381 2382 env->xcr0 = 1; 2383 if (kvm_irqchip_in_kernel()) { 2384 env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE : 2385 KVM_MP_STATE_UNINITIALIZED; 2386 } else { 2387 env->mp_state = KVM_MP_STATE_RUNNABLE; 2388 } 2389 2390 /* enabled by default */ 2391 env->poll_control_msr = 1; 2392 2393 kvm_init_nested_state(env); 2394 2395 sev_es_set_reset_vector(CPU(cpu)); 2396 } 2397 2398 void kvm_arch_after_reset_vcpu(X86CPU *cpu) 2399 { 2400 CPUX86State *env = &cpu->env; 2401 int i; 2402 2403 /* 2404 * Reset SynIC after all other devices have been reset to let them remove 2405 * their SINT routes first. 
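 * Every SINT is re-masked (HV_SINT_MASKED) here before
 * hyperv_x86_synic_reset() is called.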
2406 */ 2407 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 2408 for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) { 2409 env->msr_hv_synic_sint[i] = HV_SINT_MASKED; 2410 } 2411 2412 hyperv_x86_synic_reset(cpu); 2413 } 2414 } 2415 2416 void kvm_arch_reset_parked_vcpu(unsigned long vcpu_id, int kvm_fd) 2417 { 2418 g_autofree struct kvm_msrs *msrs = NULL; 2419 2420 msrs = g_malloc0(sizeof(*msrs) + sizeof(msrs->entries[0])); 2421 msrs->entries[0].index = MSR_IA32_TSC; 2422 msrs->entries[0].data = 1; /* match the value in x86_cpu_reset() */ 2423 msrs->nmsrs++; 2424 2425 if (ioctl(kvm_fd, KVM_SET_MSRS, msrs) != 1) { 2426 warn_report("parked vCPU %lu TSC reset failed: %d", 2427 vcpu_id, errno); 2428 } 2429 } 2430 2431 void kvm_arch_do_init_vcpu(X86CPU *cpu) 2432 { 2433 CPUX86State *env = &cpu->env; 2434 2435 /* APs get directly into wait-for-SIPI state. */ 2436 if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) { 2437 env->mp_state = KVM_MP_STATE_INIT_RECEIVED; 2438 } 2439 } 2440 2441 static int kvm_get_supported_feature_msrs(KVMState *s) 2442 { 2443 int ret = 0; 2444 2445 if (kvm_feature_msrs != NULL) { 2446 return 0; 2447 } 2448 2449 if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) { 2450 return 0; 2451 } 2452 2453 struct kvm_msr_list msr_list; 2454 2455 msr_list.nmsrs = 0; 2456 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list); 2457 if (ret < 0 && ret != -E2BIG) { 2458 error_report("Fetch KVM feature MSR list failed: %s", 2459 strerror(-ret)); 2460 return ret; 2461 } 2462 2463 assert(msr_list.nmsrs > 0); 2464 kvm_feature_msrs = g_malloc0(sizeof(msr_list) + 2465 msr_list.nmsrs * sizeof(msr_list.indices[0])); 2466 2467 kvm_feature_msrs->nmsrs = msr_list.nmsrs; 2468 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs); 2469 2470 if (ret < 0) { 2471 error_report("Fetch KVM feature MSR list failed: %s", 2472 strerror(-ret)); 2473 g_free(kvm_feature_msrs); 2474 kvm_feature_msrs = NULL; 2475 return ret; 2476 } 2477 2478 return 0; 2479 } 2480 2481 static int kvm_get_supported_msrs(KVMState *s) 2482 { 2483 int ret = 0; 2484 struct kvm_msr_list msr_list, *kvm_msr_list; 2485 2486 /* 2487 * Obtain MSR list from KVM. These are the MSRs that we must 2488 * save/restore. 2489 */ 2490 msr_list.nmsrs = 0; 2491 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list); 2492 if (ret < 0 && ret != -E2BIG) { 2493 return ret; 2494 } 2495 /* 2496 * Old kernel modules had a bug and could write beyond the provided 2497 * memory. Allocate at least a safe amount of 1K. 
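 * The first KVM_GET_MSR_INDEX_LIST call above returned -E2BIG and filled in
 * the required nmsrs count; the second call below retrieves the actual
 * indices.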
2498 */ 2499 kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) + 2500 msr_list.nmsrs * 2501 sizeof(msr_list.indices[0]))); 2502 2503 kvm_msr_list->nmsrs = msr_list.nmsrs; 2504 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list); 2505 if (ret >= 0) { 2506 int i; 2507 2508 for (i = 0; i < kvm_msr_list->nmsrs; i++) { 2509 switch (kvm_msr_list->indices[i]) { 2510 case MSR_STAR: 2511 has_msr_star = true; 2512 break; 2513 case MSR_VM_HSAVE_PA: 2514 has_msr_hsave_pa = true; 2515 break; 2516 case MSR_TSC_AUX: 2517 has_msr_tsc_aux = true; 2518 break; 2519 case MSR_TSC_ADJUST: 2520 has_msr_tsc_adjust = true; 2521 break; 2522 case MSR_IA32_TSCDEADLINE: 2523 has_msr_tsc_deadline = true; 2524 break; 2525 case MSR_IA32_SMBASE: 2526 has_msr_smbase = true; 2527 break; 2528 case MSR_SMI_COUNT: 2529 has_msr_smi_count = true; 2530 break; 2531 case MSR_IA32_MISC_ENABLE: 2532 has_msr_misc_enable = true; 2533 break; 2534 case MSR_IA32_BNDCFGS: 2535 has_msr_bndcfgs = true; 2536 break; 2537 case MSR_IA32_XSS: 2538 has_msr_xss = true; 2539 break; 2540 case MSR_IA32_UMWAIT_CONTROL: 2541 has_msr_umwait = true; 2542 break; 2543 case HV_X64_MSR_CRASH_CTL: 2544 has_msr_hv_crash = true; 2545 break; 2546 case HV_X64_MSR_RESET: 2547 has_msr_hv_reset = true; 2548 break; 2549 case HV_X64_MSR_VP_INDEX: 2550 has_msr_hv_vpindex = true; 2551 break; 2552 case HV_X64_MSR_VP_RUNTIME: 2553 has_msr_hv_runtime = true; 2554 break; 2555 case HV_X64_MSR_SCONTROL: 2556 has_msr_hv_synic = true; 2557 break; 2558 case HV_X64_MSR_STIMER0_CONFIG: 2559 has_msr_hv_stimer = true; 2560 break; 2561 case HV_X64_MSR_TSC_FREQUENCY: 2562 has_msr_hv_frequencies = true; 2563 break; 2564 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 2565 has_msr_hv_reenlightenment = true; 2566 break; 2567 case HV_X64_MSR_SYNDBG_OPTIONS: 2568 has_msr_hv_syndbg_options = true; 2569 break; 2570 case MSR_IA32_SPEC_CTRL: 2571 has_msr_spec_ctrl = true; 2572 break; 2573 case MSR_AMD64_TSC_RATIO: 2574 has_tsc_scale_msr = true; 2575 break; 2576 case MSR_IA32_TSX_CTRL: 2577 has_msr_tsx_ctrl = true; 2578 break; 2579 case MSR_VIRT_SSBD: 2580 has_msr_virt_ssbd = true; 2581 break; 2582 case MSR_IA32_ARCH_CAPABILITIES: 2583 has_msr_arch_capabs = true; 2584 break; 2585 case MSR_IA32_CORE_CAPABILITY: 2586 has_msr_core_capabs = true; 2587 break; 2588 case MSR_IA32_PERF_CAPABILITIES: 2589 has_msr_perf_capabs = true; 2590 break; 2591 case MSR_IA32_VMX_VMFUNC: 2592 has_msr_vmx_vmfunc = true; 2593 break; 2594 case MSR_IA32_UCODE_REV: 2595 has_msr_ucode_rev = true; 2596 break; 2597 case MSR_IA32_VMX_PROCBASED_CTLS2: 2598 has_msr_vmx_procbased_ctls2 = true; 2599 break; 2600 case MSR_IA32_PKRS: 2601 has_msr_pkrs = true; 2602 break; 2603 case MSR_K7_HWCR: 2604 has_msr_hwcr = true; 2605 } 2606 } 2607 } 2608 2609 g_free(kvm_msr_list); 2610 2611 return ret; 2612 } 2613 2614 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, 2615 uint32_t msr, 2616 uint64_t *val) 2617 { 2618 *val = cpu_x86_get_msr_core_thread_count(cpu); 2619 2620 return true; 2621 } 2622 2623 static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, 2624 uint32_t msr, 2625 uint64_t *val) 2626 { 2627 2628 CPUState *cs = CPU(cpu); 2629 2630 *val = cs->kvm_state->msr_energy.msr_unit; 2631 2632 return true; 2633 } 2634 2635 static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, 2636 uint32_t msr, 2637 uint64_t *val) 2638 { 2639 2640 CPUState *cs = CPU(cpu); 2641 2642 *val = cs->kvm_state->msr_energy.msr_limit; 2643 2644 return true; 2645 } 2646 2647 static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, 2648 uint32_t msr, 2649 uint64_t *val) 2650 { 
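    /* Report the MSR_PKG_POWER_INFO value sampled from the host once at
     * initialization time (see kvm_msr_energy_thread_init()). */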
2651 2652 CPUState *cs = CPU(cpu); 2653 2654 *val = cs->kvm_state->msr_energy.msr_info; 2655 2656 return true; 2657 } 2658 2659 static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, 2660 uint32_t msr, 2661 uint64_t *val) 2662 { 2663 2664 CPUState *cs = CPU(cpu); 2665 *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index]; 2666 2667 return true; 2668 } 2669 2670 static Notifier smram_machine_done; 2671 static KVMMemoryListener smram_listener; 2672 static AddressSpace smram_address_space; 2673 static MemoryRegion smram_as_root; 2674 static MemoryRegion smram_as_mem; 2675 2676 static void register_smram_listener(Notifier *n, void *unused) 2677 { 2678 MemoryRegion *smram = 2679 (MemoryRegion *) object_resolve_path("/machine/smram", NULL); 2680 2681 /* Outer container... */ 2682 memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull); 2683 memory_region_set_enabled(&smram_as_root, true); 2684 2685 /* ... with two regions inside: normal system memory with low 2686 * priority, and... 2687 */ 2688 memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram", 2689 get_system_memory(), 0, ~0ull); 2690 memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0); 2691 memory_region_set_enabled(&smram_as_mem, true); 2692 2693 if (smram) { 2694 /* ... SMRAM with higher priority */ 2695 memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10); 2696 memory_region_set_enabled(smram, true); 2697 } 2698 2699 address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM"); 2700 kvm_memory_listener_register(kvm_state, &smram_listener, 2701 &smram_address_space, 1, "kvm-smram"); 2702 } 2703 2704 static void *kvm_msr_energy_thread(void *data) 2705 { 2706 KVMState *s = data; 2707 struct KVMMsrEnergy *vmsr = &s->msr_energy; 2708 2709 g_autofree vmsr_package_energy_stat *pkg_stat = NULL; 2710 g_autofree vmsr_thread_stat *thd_stat = NULL; 2711 g_autofree CPUState *cpu = NULL; 2712 g_autofree unsigned int *vpkgs_energy_stat = NULL; 2713 unsigned int num_threads = 0; 2714 2715 X86CPUTopoIDs topo_ids; 2716 2717 rcu_register_thread(); 2718 2719 /* Allocate memory for each package energy status */ 2720 pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs); 2721 2722 /* Allocate memory for thread stats */ 2723 thd_stat = g_new0(vmsr_thread_stat, 1); 2724 2725 /* Allocate memory for holding virtual package energy counter */ 2726 vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets); 2727 2728 /* Populate the max tick of each packages */ 2729 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) { 2730 /* 2731 * Max numbers of ticks per package 2732 * Time in second * Number of ticks/second * Number of cores/package 2733 * ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max 2734 */ 2735 vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000) 2736 * sysconf(_SC_CLK_TCK) 2737 * vmsr->host_topo.pkg_cpu_count[i]; 2738 } 2739 2740 while (true) { 2741 /* Get all qemu threads id */ 2742 g_autofree pid_t *thread_ids 2743 = vmsr_get_thread_ids(vmsr->pid, &num_threads); 2744 2745 if (thread_ids == NULL) { 2746 goto clean; 2747 } 2748 2749 thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads); 2750 /* Unlike g_new0, g_renew0 function doesn't exist yet... 
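 * so the reallocated array is cleared by hand right below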
*/
2751 memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat));
2752
2753 /* Populate all the thread stats */
2754 for (int i = 0; i < num_threads; i++) {
2755 thd_stat[i].utime = g_new0(unsigned long long, 2);
2756 thd_stat[i].stime = g_new0(unsigned long long, 2);
2757 thd_stat[i].thread_id = thread_ids[i];
2758 vmsr_read_thread_stat(vmsr->pid,
2759 thd_stat[i].thread_id,
2760 &thd_stat[i].utime[0],
2761 &thd_stat[i].stime[0],
2762 &thd_stat[i].cpu_id);
2763 thd_stat[i].pkg_id =
2764 vmsr_get_physical_package_id(thd_stat[i].cpu_id);
2765 }
2766
2767 /* Retrieve each package's power plane energy counter */
2768 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
2769 for (int j = 0; j < num_threads; j++) {
2770 /*
2771 * Use the first thread we found that ran on a CPU
2772 * of the package to read the package's energy counter
2773 */
2774 if (thd_stat[j].pkg_id == i) {
2775 pkg_stat[i].e_start =
2776 vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
2777 thd_stat[j].cpu_id,
2778 thd_stat[j].thread_id,
2779 s->msr_energy.sioc);
2780 break;
2781 }
2782 }
2783 }
2784
2785 /* Sleep a short period while the other threads are working */
2786 usleep(MSR_ENERGY_THREAD_SLEEP_US);
2787
2788 /*
2789 * Retrieve each package's power plane energy counter again
2790 * and calculate the delta for each package
2791 */
2792 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
2793 for (int j = 0; j < num_threads; j++) {
2794 /*
2795 * Use the first thread we found that ran on a CPU
2796 * of the package to read the package's energy counter
2797 */
2798 if (thd_stat[j].pkg_id == i) {
2799 pkg_stat[i].e_end =
2800 vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
2801 thd_stat[j].cpu_id,
2802 thd_stat[j].thread_id,
2803 s->msr_energy.sioc);
2804 /*
2805 * Guard against the case where the VM was migrated
2806 * during the sleep period, or any other case
2807 * where the energy counter might be lower after
2808 * the sleep period.
*/
2810 if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
2811 pkg_stat[i].e_delta =
2812 pkg_stat[i].e_end - pkg_stat[i].e_start;
2813 } else {
2814 pkg_stat[i].e_delta = 0;
2815 }
2816 break;
2817 }
2818 }
2819 }
2820
2821 /* Delta of ticks spent by each thread between the two samples */
2822 for (int i = 0; i < num_threads; i++) {
2823 vmsr_read_thread_stat(vmsr->pid,
2824 thd_stat[i].thread_id,
2825 &thd_stat[i].utime[1],
2826 &thd_stat[i].stime[1],
2827 &thd_stat[i].cpu_id);
2828
2829 if (vmsr->pid < 0) {
2830 /*
2831 * Don't count dead threads,
2832 * i.e. threads that existed before the sleep
2833 * but no longer do
2834 */
2835 thd_stat[i].delta_ticks = 0;
2836 } else {
2837 vmsr_delta_ticks(thd_stat, i);
2838 }
2839 }
2840
2841 /*
2842 * Identify the vCPU threads and
2843 * calculate the number of vCPUs per package
2844 */
2845 CPU_FOREACH(cpu) {
2846 for (int i = 0; i < num_threads; i++) {
2847 if (cpu->thread_id == thd_stat[i].thread_id) {
2848 thd_stat[i].is_vcpu = true;
2849 thd_stat[i].vcpu_id = cpu->cpu_index;
2850 pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
2851 thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
2852 break;
2853 }
2854 }
2855 }
2856
2857 /* Retrieve the virtual package number of each vCPU */
2858 for (int i = 0; i < vmsr->guest_cpu_list->len; i++) {
2859 for (int j = 0; j < num_threads; j++) {
2860 if ((thd_stat[j].acpi_id ==
2861 vmsr->guest_cpu_list->cpus[i].arch_id)
2862 && (thd_stat[j].is_vcpu == true)) {
2863 x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
2864 &vmsr->guest_topo_info, &topo_ids);
2865 thd_stat[j].vpkg_id = topo_ids.pkg_id;
2866 }
2867 }
2868 }
2869
2870 /* Calculate the total energy of all non-vCPU threads */
2871 for (int i = 0; i < num_threads; i++) {
2872 if ((thd_stat[i].is_vcpu != true) &&
2873 (thd_stat[i].delta_ticks > 0)) {
2874 double temp;
2875 temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
2876 thd_stat[i].delta_ticks,
2877 vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
2878 pkg_stat[thd_stat[i].pkg_id].e_ratio
2879 += (uint64_t)lround(temp);
2880 }
2881 }
2882
2883 /* Calculate the per-vCPU share of each package's non-vCPU energy */
2884 for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
2885 if (pkg_stat[i].nb_vcpu > 0) {
2886 pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
2887 }
2888 }
2889
2890 /*
2891 * Calculate the energy for each package:
2892 * package energy = sum of the energy of each vCPU that belongs to the package
2893 */
2894 for (int i = 0; i < num_threads; i++) {
2895 if ((thd_stat[i].is_vcpu == true) && \
2896 (thd_stat[i].delta_ticks > 0)) {
2897 double temp;
2898 temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
2899 thd_stat[i].delta_ticks,
2900 vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
2901 vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
2902 (uint64_t)lround(temp);
2903 vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
2904 pkg_stat[thd_stat[i].pkg_id].e_ratio;
2905 }
2906 }
2907
2908 /*
2909 * Finally populate the vmsr register of each vCPU with the total
2910 * package value to emulate the real hardware, where each CPU returns the
2911 * value of the package it belongs to.
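 * Two vCPUs in the same virtual package therefore read the same counter
 * value, just as two cores in one physical package do on real hardware.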
2912 */ 2913 for (int i = 0; i < num_threads; i++) { 2914 if ((thd_stat[i].is_vcpu == true) && \ 2915 (thd_stat[i].delta_ticks > 0)) { 2916 vmsr->msr_value[thd_stat[i].vcpu_id] = \ 2917 vpkgs_energy_stat[thd_stat[i].vpkg_id]; 2918 } 2919 } 2920 2921 /* Freeing memory before zeroing the pointer */ 2922 for (int i = 0; i < num_threads; i++) { 2923 g_free(thd_stat[i].utime); 2924 g_free(thd_stat[i].stime); 2925 } 2926 } 2927 2928 clean: 2929 rcu_unregister_thread(); 2930 return NULL; 2931 } 2932 2933 static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms) 2934 { 2935 MachineClass *mc = MACHINE_GET_CLASS(ms); 2936 struct KVMMsrEnergy *r = &s->msr_energy; 2937 2938 /* 2939 * Sanity check 2940 * 1. Host cpu must be Intel cpu 2941 * 2. RAPL must be enabled on the Host 2942 */ 2943 if (!is_host_cpu_intel()) { 2944 error_report("The RAPL feature can only be enabled on hosts " 2945 "with Intel CPU models"); 2946 return -1; 2947 } 2948 2949 if (!is_rapl_enabled()) { 2950 return -1; 2951 } 2952 2953 /* Retrieve the virtual topology */ 2954 vmsr_init_topo_info(&r->guest_topo_info, ms); 2955 2956 /* Retrieve the number of vcpu */ 2957 r->guest_vcpus = ms->smp.cpus; 2958 2959 /* Retrieve the number of virtual sockets */ 2960 r->guest_vsockets = ms->smp.sockets; 2961 2962 /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */ 2963 r->msr_value = g_new0(uint64_t, r->guest_vcpus); 2964 2965 /* Retrieve the CPUArchIDlist */ 2966 r->guest_cpu_list = mc->possible_cpu_arch_ids(ms); 2967 2968 /* Max number of cpus on the Host */ 2969 r->host_topo.maxcpus = vmsr_get_maxcpus(); 2970 if (r->host_topo.maxcpus == 0) { 2971 error_report("host max cpus = 0"); 2972 return -1; 2973 } 2974 2975 /* Max number of packages on the host */ 2976 r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus); 2977 if (r->host_topo.maxpkgs == 0) { 2978 error_report("host max pkgs = 0"); 2979 return -1; 2980 } 2981 2982 /* Allocate memory for each package on the host */ 2983 r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs); 2984 r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs); 2985 2986 vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count, 2987 r->host_topo.maxpkgs); 2988 for (int i = 0; i < r->host_topo.maxpkgs; i++) { 2989 if (r->host_topo.pkg_cpu_count[i] == 0) { 2990 error_report("cpu per packages = 0 on package_%d", i); 2991 return -1; 2992 } 2993 } 2994 2995 /* Get QEMU PID*/ 2996 r->pid = getpid(); 2997 2998 /* Compute the socket path if necessary */ 2999 if (s->msr_energy.socket_path == NULL) { 3000 s->msr_energy.socket_path = vmsr_compute_default_paths(); 3001 } 3002 3003 /* Open socket with vmsr helper */ 3004 s->msr_energy.sioc = vmsr_open_socket(s->msr_energy.socket_path); 3005 3006 if (s->msr_energy.sioc == NULL) { 3007 error_report("vmsr socket opening failed"); 3008 return -1; 3009 } 3010 3011 /* Those MSR values should not change */ 3012 r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid, 3013 s->msr_energy.sioc); 3014 r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid, 3015 s->msr_energy.sioc); 3016 r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid, 3017 s->msr_energy.sioc); 3018 if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) { 3019 error_report("can't read any virtual msr"); 3020 return -1; 3021 } 3022 3023 qemu_thread_create(&r->msr_thr, "kvm-msr", 3024 kvm_msr_energy_thread, 3025 s, QEMU_THREAD_JOINABLE); 3026 return 0; 3027 } 3028 3029 int kvm_arch_get_default_type(MachineState *ms) 3030 { 3031 
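    /* 0 is KVM_X86_DEFAULT_VM. */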
return 0; 3032 } 3033 3034 static int kvm_vm_enable_exception_payload(KVMState *s) 3035 { 3036 int ret = 0; 3037 has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); 3038 if (has_exception_payload) { 3039 ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); 3040 if (ret < 0) { 3041 error_report("kvm: Failed to enable exception payload cap: %s", 3042 strerror(-ret)); 3043 } 3044 } 3045 3046 return ret; 3047 } 3048 3049 static int kvm_vm_enable_triple_fault_event(KVMState *s) 3050 { 3051 int ret = 0; 3052 has_triple_fault_event = \ 3053 kvm_check_extension(s, 3054 KVM_CAP_X86_TRIPLE_FAULT_EVENT); 3055 if (has_triple_fault_event) { 3056 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true); 3057 if (ret < 0) { 3058 error_report("kvm: Failed to enable triple fault event cap: %s", 3059 strerror(-ret)); 3060 } 3061 } 3062 return ret; 3063 } 3064 3065 static int kvm_vm_set_identity_map_addr(KVMState *s, uint64_t identity_base) 3066 { 3067 return kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base); 3068 } 3069 3070 static int kvm_vm_set_nr_mmu_pages(KVMState *s) 3071 { 3072 uint64_t shadow_mem; 3073 int ret = 0; 3074 shadow_mem = object_property_get_int(OBJECT(s), 3075 "kvm-shadow-mem", 3076 &error_abort); 3077 if (shadow_mem != -1) { 3078 shadow_mem /= 4096; 3079 ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem); 3080 } 3081 return ret; 3082 } 3083 3084 static int kvm_vm_set_tss_addr(KVMState *s, uint64_t tss_base) 3085 { 3086 return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, tss_base); 3087 } 3088 3089 static int kvm_vm_enable_disable_exits(KVMState *s) 3090 { 3091 int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS); 3092 3093 if (disable_exits) { 3094 disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | 3095 KVM_X86_DISABLE_EXITS_HLT | 3096 KVM_X86_DISABLE_EXITS_PAUSE | 3097 KVM_X86_DISABLE_EXITS_CSTATE); 3098 } 3099 3100 return kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, 3101 disable_exits); 3102 } 3103 3104 static int kvm_vm_enable_bus_lock_exit(KVMState *s) 3105 { 3106 int ret = 0; 3107 ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); 3108 if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { 3109 error_report("kvm: bus lock detection unsupported"); 3110 return -ENOTSUP; 3111 } 3112 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, 3113 KVM_BUS_LOCK_DETECTION_EXIT); 3114 if (ret < 0) { 3115 error_report("kvm: Failed to enable bus lock detection cap: %s", 3116 strerror(-ret)); 3117 } 3118 3119 return ret; 3120 } 3121 3122 static int kvm_vm_enable_notify_vmexit(KVMState *s) 3123 { 3124 int ret = 0; 3125 if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE) { 3126 uint64_t notify_window_flags = 3127 ((uint64_t)s->notify_window << 32) | 3128 KVM_X86_NOTIFY_VMEXIT_ENABLED | 3129 KVM_X86_NOTIFY_VMEXIT_USER; 3130 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, 3131 notify_window_flags); 3132 if (ret < 0) { 3133 error_report("kvm: Failed to enable notify vmexit cap: %s", 3134 strerror(-ret)); 3135 } 3136 } 3137 return ret; 3138 } 3139 3140 static int kvm_vm_enable_userspace_msr(KVMState *s) 3141 { 3142 int ret; 3143 3144 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0, 3145 KVM_MSR_EXIT_REASON_FILTER); 3146 if (ret < 0) { 3147 error_report("Could not enable user space MSRs: %s", 3148 strerror(-ret)); 3149 exit(1); 3150 } 3151 3152 ret = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT, 3153 kvm_rdmsr_core_thread_count, NULL); 3154 if (ret < 0) { 3155 error_report("Could not install MSR_CORE_THREAD_COUNT handler: 
%s", 3156 strerror(-ret)); 3157 exit(1); 3158 } 3159 3160 return 0; 3161 } 3162 3163 static int kvm_vm_enable_energy_msrs(KVMState *s) 3164 { 3165 int ret; 3166 3167 if (s->msr_energy.enable == true) { 3168 ret = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT, 3169 kvm_rdmsr_rapl_power_unit, NULL); 3170 if (ret < 0) { 3171 error_report("Could not install MSR_RAPL_POWER_UNIT handler: %s", 3172 strerror(-ret)); 3173 return ret; 3174 } 3175 3176 ret = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT, 3177 kvm_rdmsr_pkg_power_limit, NULL); 3178 if (ret < 0) { 3179 error_report("Could not install MSR_PKG_POWER_LIMIT handler: %s", 3180 strerror(-ret)); 3181 return ret; 3182 } 3183 3184 ret = kvm_filter_msr(s, MSR_PKG_POWER_INFO, 3185 kvm_rdmsr_pkg_power_info, NULL); 3186 if (ret < 0) { 3187 error_report("Could not install MSR_PKG_POWER_INFO handler: %s", 3188 strerror(-ret)); 3189 return ret; 3190 } 3191 ret = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS, 3192 kvm_rdmsr_pkg_energy_status, NULL); 3193 if (ret < 0) { 3194 error_report("Could not install MSR_PKG_ENERGY_STATUS handler: %s", 3195 strerror(-ret)); 3196 return ret; 3197 } 3198 } 3199 return 0; 3200 } 3201 3202 int kvm_arch_init(MachineState *ms, KVMState *s) 3203 { 3204 int ret; 3205 struct utsname utsname; 3206 Error *local_err = NULL; 3207 3208 /* 3209 * Initialize SEV context, if required 3210 * 3211 * If no memory encryption is requested (ms->cgs == NULL) this is 3212 * a no-op. 3213 * 3214 * It's also a no-op if a non-SEV confidential guest support 3215 * mechanism is selected. SEV is the only mechanism available to 3216 * select on x86 at present, so this doesn't arise, but if new 3217 * mechanisms are supported in future (e.g. TDX), they'll need 3218 * their own initialization either here or elsewhere. 3219 */ 3220 if (ms->cgs) { 3221 ret = confidential_guest_kvm_init(ms->cgs, &local_err); 3222 if (ret < 0) { 3223 error_report_err(local_err); 3224 return ret; 3225 } 3226 } 3227 3228 has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS); 3229 has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0; 3230 3231 hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); 3232 3233 ret = kvm_vm_enable_exception_payload(s); 3234 if (ret < 0) { 3235 return ret; 3236 } 3237 3238 ret = kvm_vm_enable_triple_fault_event(s); 3239 if (ret < 0) { 3240 return ret; 3241 } 3242 3243 if (s->xen_version) { 3244 #ifdef CONFIG_XEN_EMU 3245 if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) { 3246 error_report("kvm: Xen support only available in PC machine"); 3247 return -ENOTSUP; 3248 } 3249 /* hyperv_enabled() doesn't work yet. */ 3250 uint32_t msr = XEN_HYPERCALL_MSR; 3251 ret = kvm_xen_init(s, msr); 3252 if (ret < 0) { 3253 return ret; 3254 } 3255 #else 3256 error_report("kvm: Xen support not enabled in qemu"); 3257 return -ENOTSUP; 3258 #endif 3259 } 3260 3261 ret = kvm_get_supported_msrs(s); 3262 if (ret < 0) { 3263 return ret; 3264 } 3265 3266 ret = kvm_get_supported_feature_msrs(s); 3267 if (ret < 0) { 3268 return ret; 3269 } 3270 3271 uname(&utsname); 3272 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0; 3273 3274 ret = kvm_vm_set_identity_map_addr(s, KVM_IDENTITY_BASE); 3275 if (ret < 0) { 3276 return ret; 3277 } 3278 3279 /* Set TSS base one page after EPT identity map. */ 3280 ret = kvm_vm_set_tss_addr(s, KVM_IDENTITY_BASE + 0x1000); 3281 if (ret < 0) { 3282 return ret; 3283 } 3284 3285 /* Tell fw_cfg to notify the BIOS to reserve the range. 
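 * The reserved range is 0x4000 bytes: one page for the EPT identity map at
 * KVM_IDENTITY_BASE plus the three pages KVM uses for the TSS just above it.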
*/ 3286 e820_add_entry(KVM_IDENTITY_BASE, 0x4000, E820_RESERVED); 3287 3288 ret = kvm_vm_set_nr_mmu_pages(s); 3289 if (ret < 0) { 3290 return ret; 3291 } 3292 3293 if (kvm_check_extension(s, KVM_CAP_X86_SMM) && 3294 object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) && 3295 x86_machine_is_smm_enabled(X86_MACHINE(ms))) { 3296 smram_machine_done.notify = register_smram_listener; 3297 qemu_add_machine_init_done_notifier(&smram_machine_done); 3298 } 3299 3300 if (enable_cpu_pm) { 3301 ret = kvm_vm_enable_disable_exits(s); 3302 if (ret < 0) { 3303 error_report("kvm: guest stopping CPU not supported: %s", 3304 strerror(-ret)); 3305 return ret; 3306 } 3307 } 3308 3309 if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) { 3310 X86MachineState *x86ms = X86_MACHINE(ms); 3311 3312 if (x86ms->bus_lock_ratelimit > 0) { 3313 ret = kvm_vm_enable_bus_lock_exit(s); 3314 if (ret < 0) { 3315 return ret; 3316 } 3317 ratelimit_init(&bus_lock_ratelimit_ctrl); 3318 ratelimit_set_speed(&bus_lock_ratelimit_ctrl, 3319 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME); 3320 } 3321 } 3322 3323 if (kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { 3324 ret = kvm_vm_enable_notify_vmexit(s); 3325 if (ret < 0) { 3326 return ret; 3327 } 3328 } 3329 3330 if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) { 3331 ret = kvm_vm_enable_userspace_msr(s); 3332 if (ret < 0) { 3333 return ret; 3334 } 3335 3336 if (s->msr_energy.enable == true) { 3337 ret = kvm_vm_enable_energy_msrs(s); 3338 if (ret < 0) { 3339 return ret; 3340 } 3341 3342 ret = kvm_msr_energy_thread_init(s, ms); 3343 if (ret < 0) { 3344 error_report("kvm : error RAPL feature requirement not met"); 3345 return ret; 3346 } 3347 } 3348 } 3349 3350 return 0; 3351 } 3352 3353 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 3354 { 3355 lhs->selector = rhs->selector; 3356 lhs->base = rhs->base; 3357 lhs->limit = rhs->limit; 3358 lhs->type = 3; 3359 lhs->present = 1; 3360 lhs->dpl = 3; 3361 lhs->db = 0; 3362 lhs->s = 1; 3363 lhs->l = 0; 3364 lhs->g = 0; 3365 lhs->avl = 0; 3366 lhs->unusable = 0; 3367 } 3368 3369 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 3370 { 3371 unsigned flags = rhs->flags; 3372 lhs->selector = rhs->selector; 3373 lhs->base = rhs->base; 3374 lhs->limit = rhs->limit; 3375 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; 3376 lhs->present = (flags & DESC_P_MASK) != 0; 3377 lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3; 3378 lhs->db = (flags >> DESC_B_SHIFT) & 1; 3379 lhs->s = (flags & DESC_S_MASK) != 0; 3380 lhs->l = (flags >> DESC_L_SHIFT) & 1; 3381 lhs->g = (flags & DESC_G_MASK) != 0; 3382 lhs->avl = (flags & DESC_AVL_MASK) != 0; 3383 lhs->unusable = !lhs->present; 3384 lhs->padding = 0; 3385 } 3386 3387 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) 3388 { 3389 lhs->selector = rhs->selector; 3390 lhs->base = rhs->base; 3391 lhs->limit = rhs->limit; 3392 lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | 3393 ((rhs->present && !rhs->unusable) * DESC_P_MASK) | 3394 (rhs->dpl << DESC_DPL_SHIFT) | 3395 (rhs->db << DESC_B_SHIFT) | 3396 (rhs->s * DESC_S_MASK) | 3397 (rhs->l << DESC_L_SHIFT) | 3398 (rhs->g * DESC_G_MASK) | 3399 (rhs->avl * DESC_AVL_MASK); 3400 } 3401 3402 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) 3403 { 3404 if (set) { 3405 *kvm_reg = *qemu_reg; 3406 } else { 3407 *qemu_reg = *kvm_reg; 3408 } 3409 } 3410 3411 static int kvm_getput_regs(X86CPU *cpu, int set) 3412 { 3413 CPUX86State *env = &cpu->env; 3414 struct kvm_regs regs; 3415 int 
ret = 0;
3416
3417 if (!set) {
3418 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
3419 if (ret < 0) {
3420 return ret;
3421 }
3422 }
3423
3424 kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
3425 kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
3426 kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
3427 kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
3428 kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
3429 kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
3430 kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
3431 kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
3432 #ifdef TARGET_X86_64
3433 kvm_getput_reg(&regs.r8, &env->regs[8], set);
3434 kvm_getput_reg(&regs.r9, &env->regs[9], set);
3435 kvm_getput_reg(&regs.r10, &env->regs[10], set);
3436 kvm_getput_reg(&regs.r11, &env->regs[11], set);
3437 kvm_getput_reg(&regs.r12, &env->regs[12], set);
3438 kvm_getput_reg(&regs.r13, &env->regs[13], set);
3439 kvm_getput_reg(&regs.r14, &env->regs[14], set);
3440 kvm_getput_reg(&regs.r15, &env->regs[15], set);
3441 #endif
3442
3443 kvm_getput_reg(&regs.rflags, &env->eflags, set);
3444 kvm_getput_reg(&regs.rip, &env->eip, set);
3445
3446 if (set) {
3447 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
3448 }
3449
3450 return ret;
3451 }
3452
3453 static int kvm_put_xsave(X86CPU *cpu)
3454 {
3455 CPUX86State *env = &cpu->env;
3456 void *xsave = env->xsave_buf;
3457
3458 x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
3459
3460 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
3461 }
3462
3463 static int kvm_put_xcrs(X86CPU *cpu)
3464 {
3465 CPUX86State *env = &cpu->env;
3466 struct kvm_xcrs xcrs = {};
3467
3468 if (!has_xcrs) {
3469 return 0;
3470 }
3471
3472 xcrs.nr_xcrs = 1;
3473 xcrs.flags = 0;
3474 xcrs.xcrs[0].xcr = 0;
3475 xcrs.xcrs[0].value = env->xcr0;
3476 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
3477 }
3478
3479 static int kvm_put_sregs(X86CPU *cpu)
3480 {
3481 CPUX86State *env = &cpu->env;
3482 struct kvm_sregs sregs;
3483
3484 /*
3485 * The interrupt_bitmap is ignored because KVM_SET_SREGS is
3486 * always followed by KVM_SET_VCPU_EVENTS.
3487 */ 3488 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap)); 3489 3490 if ((env->eflags & VM_MASK)) { 3491 set_v8086_seg(&sregs.cs, &env->segs[R_CS]); 3492 set_v8086_seg(&sregs.ds, &env->segs[R_DS]); 3493 set_v8086_seg(&sregs.es, &env->segs[R_ES]); 3494 set_v8086_seg(&sregs.fs, &env->segs[R_FS]); 3495 set_v8086_seg(&sregs.gs, &env->segs[R_GS]); 3496 set_v8086_seg(&sregs.ss, &env->segs[R_SS]); 3497 } else { 3498 set_seg(&sregs.cs, &env->segs[R_CS]); 3499 set_seg(&sregs.ds, &env->segs[R_DS]); 3500 set_seg(&sregs.es, &env->segs[R_ES]); 3501 set_seg(&sregs.fs, &env->segs[R_FS]); 3502 set_seg(&sregs.gs, &env->segs[R_GS]); 3503 set_seg(&sregs.ss, &env->segs[R_SS]); 3504 } 3505 3506 set_seg(&sregs.tr, &env->tr); 3507 set_seg(&sregs.ldt, &env->ldt); 3508 3509 sregs.idt.limit = env->idt.limit; 3510 sregs.idt.base = env->idt.base; 3511 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding); 3512 sregs.gdt.limit = env->gdt.limit; 3513 sregs.gdt.base = env->gdt.base; 3514 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding); 3515 3516 sregs.cr0 = env->cr[0]; 3517 sregs.cr2 = env->cr[2]; 3518 sregs.cr3 = env->cr[3]; 3519 sregs.cr4 = env->cr[4]; 3520 3521 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state); 3522 sregs.apic_base = cpu_get_apic_base(cpu->apic_state); 3523 3524 sregs.efer = env->efer; 3525 3526 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs); 3527 } 3528 3529 static int kvm_put_sregs2(X86CPU *cpu) 3530 { 3531 CPUX86State *env = &cpu->env; 3532 struct kvm_sregs2 sregs; 3533 int i; 3534 3535 sregs.flags = 0; 3536 3537 if ((env->eflags & VM_MASK)) { 3538 set_v8086_seg(&sregs.cs, &env->segs[R_CS]); 3539 set_v8086_seg(&sregs.ds, &env->segs[R_DS]); 3540 set_v8086_seg(&sregs.es, &env->segs[R_ES]); 3541 set_v8086_seg(&sregs.fs, &env->segs[R_FS]); 3542 set_v8086_seg(&sregs.gs, &env->segs[R_GS]); 3543 set_v8086_seg(&sregs.ss, &env->segs[R_SS]); 3544 } else { 3545 set_seg(&sregs.cs, &env->segs[R_CS]); 3546 set_seg(&sregs.ds, &env->segs[R_DS]); 3547 set_seg(&sregs.es, &env->segs[R_ES]); 3548 set_seg(&sregs.fs, &env->segs[R_FS]); 3549 set_seg(&sregs.gs, &env->segs[R_GS]); 3550 set_seg(&sregs.ss, &env->segs[R_SS]); 3551 } 3552 3553 set_seg(&sregs.tr, &env->tr); 3554 set_seg(&sregs.ldt, &env->ldt); 3555 3556 sregs.idt.limit = env->idt.limit; 3557 sregs.idt.base = env->idt.base; 3558 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding); 3559 sregs.gdt.limit = env->gdt.limit; 3560 sregs.gdt.base = env->gdt.base; 3561 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding); 3562 3563 sregs.cr0 = env->cr[0]; 3564 sregs.cr2 = env->cr[2]; 3565 sregs.cr3 = env->cr[3]; 3566 sregs.cr4 = env->cr[4]; 3567 3568 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state); 3569 sregs.apic_base = cpu_get_apic_base(cpu->apic_state); 3570 3571 sregs.efer = env->efer; 3572 3573 if (env->pdptrs_valid) { 3574 for (i = 0; i < 4; i++) { 3575 sregs.pdptrs[i] = env->pdptrs[i]; 3576 } 3577 sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; 3578 } 3579 3580 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs); 3581 } 3582 3583 3584 static void kvm_msr_buf_reset(X86CPU *cpu) 3585 { 3586 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE); 3587 } 3588 3589 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value) 3590 { 3591 struct kvm_msrs *msrs = cpu->kvm_msr_buf; 3592 void *limit = ((void *)msrs) + MSR_BUF_SIZE; 3593 struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs]; 3594 3595 assert((void *)(entry + 1) <= limit); 3596 3597 entry->index = index; 3598 entry->reserved = 0; 3599 entry->data = value; 3600 
msrs->nmsrs++; 3601 } 3602 3603 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value) 3604 { 3605 kvm_msr_buf_reset(cpu); 3606 kvm_msr_entry_add(cpu, index, value); 3607 3608 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 3609 } 3610 3611 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value) 3612 { 3613 int ret; 3614 struct { 3615 struct kvm_msrs info; 3616 struct kvm_msr_entry entries[1]; 3617 } msr_data = { 3618 .info.nmsrs = 1, 3619 .entries[0].index = index, 3620 }; 3621 3622 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data); 3623 if (ret < 0) { 3624 return ret; 3625 } 3626 assert(ret == 1); 3627 *value = msr_data.entries[0].data; 3628 return ret; 3629 } 3630 void kvm_put_apicbase(X86CPU *cpu, uint64_t value) 3631 { 3632 int ret; 3633 3634 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value); 3635 assert(ret == 1); 3636 } 3637 3638 static int kvm_put_tscdeadline_msr(X86CPU *cpu) 3639 { 3640 CPUX86State *env = &cpu->env; 3641 int ret; 3642 3643 if (!has_msr_tsc_deadline) { 3644 return 0; 3645 } 3646 3647 ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline); 3648 if (ret < 0) { 3649 return ret; 3650 } 3651 3652 assert(ret == 1); 3653 return 0; 3654 } 3655 3656 /* 3657 * Provide a separate write service for the feature control MSR in order to 3658 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done 3659 * before writing any other state because forcibly leaving nested mode 3660 * invalidates the VCPU state. 3661 */ 3662 static int kvm_put_msr_feature_control(X86CPU *cpu) 3663 { 3664 int ret; 3665 3666 if (!has_msr_feature_control) { 3667 return 0; 3668 } 3669 3670 ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL, 3671 cpu->env.msr_ia32_feature_control); 3672 if (ret < 0) { 3673 return ret; 3674 } 3675 3676 assert(ret == 1); 3677 return 0; 3678 } 3679 3680 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features) 3681 { 3682 uint32_t default1, can_be_one, can_be_zero; 3683 uint32_t must_be_one; 3684 3685 switch (index) { 3686 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 3687 default1 = 0x00000016; 3688 break; 3689 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 3690 default1 = 0x0401e172; 3691 break; 3692 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 3693 default1 = 0x000011ff; 3694 break; 3695 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 3696 default1 = 0x00036dff; 3697 break; 3698 case MSR_IA32_VMX_PROCBASED_CTLS2: 3699 default1 = 0; 3700 break; 3701 default: 3702 abort(); 3703 } 3704 3705 /* If a feature bit is set, the control can be either set or clear. 3706 * Otherwise the value is limited to either 0 or 1 by default1. 3707 */ 3708 can_be_one = features | default1; 3709 can_be_zero = features | ~default1; 3710 must_be_one = ~can_be_zero; 3711 3712 /* 3713 * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one). 3714 * Bit 32:63 -> 1 if the control bit can be one. 3715 */ 3716 return must_be_one | (((uint64_t)can_be_one) << 32); 3717 } 3718 3719 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) 3720 { 3721 uint64_t kvm_vmx_basic = 3722 kvm_arch_get_supported_msr_feature(kvm_state, 3723 MSR_IA32_VMX_BASIC); 3724 3725 if (!kvm_vmx_basic) { 3726 /* If the kernel doesn't support VMX feature (kvm_intel.nested=0), 3727 * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail. 
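 * Returning early simply leaves the VMX capability MSRs out of the MSR
 * buffer.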
3728 */ 3729 return; 3730 } 3731 3732 uint64_t kvm_vmx_misc = 3733 kvm_arch_get_supported_msr_feature(kvm_state, 3734 MSR_IA32_VMX_MISC); 3735 uint64_t kvm_vmx_ept_vpid = 3736 kvm_arch_get_supported_msr_feature(kvm_state, 3737 MSR_IA32_VMX_EPT_VPID_CAP); 3738 3739 /* 3740 * If the guest is 64-bit, a value of 1 is allowed for the host address 3741 * space size vmexit control. 3742 */ 3743 uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM 3744 ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0; 3745 3746 /* 3747 * Bits 0-30, 32-44 and 50-53 come from the host. KVM should 3748 * not change them for backwards compatibility. 3749 */ 3750 uint64_t fixed_vmx_basic = kvm_vmx_basic & 3751 (MSR_VMX_BASIC_VMCS_REVISION_MASK | 3752 MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK | 3753 MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK); 3754 3755 /* 3756 * Same for bits 0-4 and 25-27. Bits 16-24 (CR3 target count) can 3757 * change in the future but are always zero for now, clear them to be 3758 * future proof. Bits 32-63 in theory could change, though KVM does 3759 * not support dual-monitor treatment and probably never will; mask 3760 * them out as well. 3761 */ 3762 uint64_t fixed_vmx_misc = kvm_vmx_misc & 3763 (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK | 3764 MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK); 3765 3766 /* 3767 * EPT memory types should not change either, so we do not bother 3768 * adding features for them. 3769 */ 3770 uint64_t fixed_vmx_ept_mask = 3771 (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ? 3772 MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0); 3773 uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask; 3774 3775 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 3776 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 3777 f[FEAT_VMX_PROCBASED_CTLS])); 3778 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, 3779 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS, 3780 f[FEAT_VMX_PINBASED_CTLS])); 3781 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, 3782 make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS, 3783 f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit); 3784 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, 3785 make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS, 3786 f[FEAT_VMX_ENTRY_CTLS])); 3787 kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2, 3788 make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2, 3789 f[FEAT_VMX_SECONDARY_CTLS])); 3790 kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP, 3791 f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid); 3792 kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC, 3793 f[FEAT_VMX_BASIC] | fixed_vmx_basic); 3794 kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC, 3795 f[FEAT_VMX_MISC] | fixed_vmx_misc); 3796 if (has_msr_vmx_vmfunc) { 3797 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]); 3798 } 3799 3800 /* 3801 * Just to be safe, write these with constant values. The CRn_FIXED1 3802 * MSRs are generated by KVM based on the vCPU's CPUID. 3803 */ 3804 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0, 3805 CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK); 3806 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, 3807 CR4_VMXE_MASK); 3808 3809 if (f[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 3810 /* FRED injected-event data (0x2052). */ 3811 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x52); 3812 } else if (f[FEAT_VMX_EXIT_CTLS] & 3813 VMX_VM_EXIT_ACTIVATE_SECONDARY_CONTROLS) { 3814 /* Secondary VM-exit controls (0x2044). 
*/ 3815 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x44); 3816 } else if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) { 3817 /* TSC multiplier (0x2032). */ 3818 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32); 3819 } else { 3820 /* Preemption timer (0x482E). */ 3821 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E); 3822 } 3823 } 3824 3825 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f) 3826 { 3827 uint64_t kvm_perf_cap = 3828 kvm_arch_get_supported_msr_feature(kvm_state, 3829 MSR_IA32_PERF_CAPABILITIES); 3830 3831 if (kvm_perf_cap) { 3832 kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES, 3833 kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]); 3834 } 3835 } 3836 3837 static int kvm_buf_set_msrs(X86CPU *cpu) 3838 { 3839 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 3840 if (ret < 0) { 3841 return ret; 3842 } 3843 3844 if (ret < cpu->kvm_msr_buf->nmsrs) { 3845 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 3846 error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, 3847 (uint32_t)e->index, (uint64_t)e->data); 3848 } 3849 3850 assert(ret == cpu->kvm_msr_buf->nmsrs); 3851 return 0; 3852 } 3853 3854 static void kvm_init_msrs(X86CPU *cpu) 3855 { 3856 CPUX86State *env = &cpu->env; 3857 3858 kvm_msr_buf_reset(cpu); 3859 if (has_msr_arch_capabs) { 3860 kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, 3861 env->features[FEAT_ARCH_CAPABILITIES]); 3862 } 3863 3864 if (has_msr_core_capabs) { 3865 kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, 3866 env->features[FEAT_CORE_CAPABILITY]); 3867 } 3868 3869 if (has_msr_perf_capabs && cpu->enable_pmu) { 3870 kvm_msr_entry_add_perf(cpu, env->features); 3871 } 3872 3873 if (has_msr_ucode_rev) { 3874 kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); 3875 } 3876 3877 /* 3878 * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but 3879 * all kernels with MSR features should have them. 
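 * kvm_feature_msrs is only non-NULL when KVM_CAP_GET_MSR_FEATURES is
 * available (see kvm_get_supported_feature_msrs()), hence the check below.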
3880 */ 3881 if (kvm_feature_msrs && cpu_has_vmx(env)) { 3882 kvm_msr_entry_add_vmx(cpu, env->features); 3883 } 3884 3885 assert(kvm_buf_set_msrs(cpu) == 0); 3886 } 3887 3888 static int kvm_put_msrs(X86CPU *cpu, int level) 3889 { 3890 CPUX86State *env = &cpu->env; 3891 int i; 3892 3893 kvm_msr_buf_reset(cpu); 3894 3895 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs); 3896 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp); 3897 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip); 3898 kvm_msr_entry_add(cpu, MSR_PAT, env->pat); 3899 if (has_msr_star) { 3900 kvm_msr_entry_add(cpu, MSR_STAR, env->star); 3901 } 3902 if (has_msr_hsave_pa) { 3903 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave); 3904 } 3905 if (has_msr_tsc_aux) { 3906 kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux); 3907 } 3908 if (has_msr_tsc_adjust) { 3909 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust); 3910 } 3911 if (has_msr_misc_enable) { 3912 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 3913 env->msr_ia32_misc_enable); 3914 } 3915 if (has_msr_smbase) { 3916 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase); 3917 } 3918 if (has_msr_smi_count) { 3919 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count); 3920 } 3921 if (has_msr_pkrs) { 3922 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs); 3923 } 3924 if (has_msr_bndcfgs) { 3925 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs); 3926 } 3927 if (has_msr_xss) { 3928 kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss); 3929 } 3930 if (has_msr_umwait) { 3931 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait); 3932 } 3933 if (has_msr_spec_ctrl) { 3934 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl); 3935 } 3936 if (has_tsc_scale_msr) { 3937 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr); 3938 } 3939 3940 if (has_msr_tsx_ctrl) { 3941 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl); 3942 } 3943 if (has_msr_virt_ssbd) { 3944 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd); 3945 } 3946 if (has_msr_hwcr) { 3947 kvm_msr_entry_add(cpu, MSR_K7_HWCR, env->msr_hwcr); 3948 } 3949 3950 #ifdef TARGET_X86_64 3951 if (lm_capable_kernel) { 3952 kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar); 3953 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase); 3954 kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask); 3955 kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar); 3956 if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 3957 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0); 3958 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1); 3959 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2); 3960 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3); 3961 kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls); 3962 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1); 3963 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2); 3964 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3); 3965 kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config); 3966 } 3967 } 3968 #endif 3969 3970 /* 3971 * The following MSRs have side effects on the guest or are too heavy 3972 * for normal writeback. Limit them to reset or full state updates. 
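 * 'level' is KVM_PUT_RUNTIME_STATE for ordinary register syncs; only
 * KVM_PUT_RESET_STATE and KVM_PUT_FULL_STATE reach the block below.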
3973 */ 3974 if (level >= KVM_PUT_RESET_STATE) { 3975 kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc); 3976 if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { 3977 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr); 3978 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr); 3979 } 3980 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { 3981 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr); 3982 } 3983 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { 3984 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr); 3985 } 3986 if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { 3987 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr); 3988 } 3989 if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { 3990 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr); 3991 } 3992 3993 if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { 3994 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr); 3995 } 3996 3997 if (has_architectural_pmu_version > 0) { 3998 if (has_architectural_pmu_version > 1) { 3999 /* Stop the counter. */ 4000 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 4001 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 4002 } 4003 4004 /* Set the counter values. */ 4005 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 4006 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4007 env->msr_fixed_counters[i]); 4008 } 4009 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 4010 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 4011 env->msr_gp_counters[i]); 4012 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 4013 env->msr_gp_evtsel[i]); 4014 } 4015 if (has_architectural_pmu_version > 1) { 4016 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 4017 env->msr_global_status); 4018 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4019 env->msr_global_ovf_ctrl); 4020 4021 /* Now start the PMU. 
*/ 4022 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 4023 env->msr_fixed_ctr_ctrl); 4024 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 4025 env->msr_global_ctrl); 4026 } 4027 } 4028 /* 4029 * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add, 4030 * only sync them to KVM on the first cpu 4031 */ 4032 if (current_cpu == first_cpu) { 4033 if (has_msr_hv_hypercall) { 4034 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 4035 env->msr_hv_guest_os_id); 4036 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 4037 env->msr_hv_hypercall); 4038 } 4039 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 4040 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 4041 env->msr_hv_tsc); 4042 } 4043 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 4044 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 4045 env->msr_hv_reenlightenment_control); 4046 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 4047 env->msr_hv_tsc_emulation_control); 4048 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 4049 env->msr_hv_tsc_emulation_status); 4050 } 4051 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) && 4052 has_msr_hv_syndbg_options) { 4053 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 4054 hyperv_syndbg_query_options()); 4055 } 4056 } 4057 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 4058 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 4059 env->msr_hv_vapic); 4060 } 4061 if (has_msr_hv_crash) { 4062 int j; 4063 4064 for (j = 0; j < HV_CRASH_PARAMS; j++) 4065 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 4066 env->msr_hv_crash_params[j]); 4067 4068 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY); 4069 } 4070 if (has_msr_hv_runtime) { 4071 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime); 4072 } 4073 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) 4074 && hv_vpindex_settable) { 4075 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX, 4076 hyperv_vp_index(CPU(cpu))); 4077 } 4078 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 4079 int j; 4080 4081 kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION); 4082 4083 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 4084 env->msr_hv_synic_control); 4085 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 4086 env->msr_hv_synic_evt_page); 4087 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 4088 env->msr_hv_synic_msg_page); 4089 4090 for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) { 4091 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j, 4092 env->msr_hv_synic_sint[j]); 4093 } 4094 } 4095 if (has_msr_hv_stimer) { 4096 int j; 4097 4098 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) { 4099 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2, 4100 env->msr_hv_stimer_config[j]); 4101 } 4102 4103 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) { 4104 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2, 4105 env->msr_hv_stimer_count[j]); 4106 } 4107 } 4108 if (env->features[FEAT_1_EDX] & CPUID_MTRR) { 4109 uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits); 4110 4111 kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype); 4112 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]); 4113 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]); 4114 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]); 4115 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]); 4116 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]); 4117 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]); 4118 
kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]); 4119 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]); 4120 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]); 4121 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]); 4122 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]); 4123 for (i = 0; i < MSR_MTRRcap_VCNT; i++) { 4124 /* The CPU GPs if we write to a bit above the physical limit of 4125 * the host CPU (and KVM emulates that) 4126 */ 4127 uint64_t mask = env->mtrr_var[i].mask; 4128 mask &= phys_mask; 4129 4130 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 4131 env->mtrr_var[i].base); 4132 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask); 4133 } 4134 } 4135 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) { 4136 int addr_num = kvm_arch_get_supported_cpuid(kvm_state, 4137 0x14, 1, R_EAX) & 0x7; 4138 4139 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 4140 env->msr_rtit_ctrl); 4141 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 4142 env->msr_rtit_status); 4143 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 4144 env->msr_rtit_output_base); 4145 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 4146 env->msr_rtit_output_mask); 4147 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 4148 env->msr_rtit_cr3_match); 4149 for (i = 0; i < addr_num; i++) { 4150 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 4151 env->msr_rtit_addrs[i]); 4152 } 4153 } 4154 4155 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { 4156 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 4157 env->msr_ia32_sgxlepubkeyhash[0]); 4158 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 4159 env->msr_ia32_sgxlepubkeyhash[1]); 4160 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 4161 env->msr_ia32_sgxlepubkeyhash[2]); 4162 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 4163 env->msr_ia32_sgxlepubkeyhash[3]); 4164 } 4165 4166 if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { 4167 kvm_msr_entry_add(cpu, MSR_IA32_XFD, 4168 env->msr_xfd); 4169 kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 4170 env->msr_xfd_err); 4171 } 4172 4173 if (kvm_enabled() && cpu->enable_pmu && 4174 (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) { 4175 uint64_t depth; 4176 int ret; 4177 4178 /* 4179 * Only migrate Arch LBR state when the host Arch LBR depth 4180 * equals that of the source guest; this avoids a guest/host 4181 * mismatch in the MSR configuration and hence unexpected 4182 * misbehavior. 4183 */ 4184 ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth); 4185 4186 if (ret == 1 && !!depth && depth == env->msr_lbr_depth) { 4187 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl); 4188 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth); 4189 4190 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) { 4191 if (!env->lbr_records[i].from) { 4192 continue; 4193 } 4194 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 4195 env->lbr_records[i].from); 4196 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 4197 env->lbr_records[i].to); 4198 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 4199 env->lbr_records[i].info); 4200 } 4201 } 4202 } 4203 4204 /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see 4205 * kvm_put_msr_feature_control.
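 * It is written from kvm_arch_put_registers() before the nested state
 * and before the bulk MSR update, so that the vCPU leaves VMX root
 * operation on reset.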
*/ 4206 } 4207 4208 if (env->mcg_cap) { 4209 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status); 4210 kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl); 4211 if (has_msr_mcg_ext_ctl) { 4212 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl); 4213 } 4214 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 4215 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]); 4216 } 4217 } 4218 4219 return kvm_buf_set_msrs(cpu); 4220 } 4221 4222 4223 static int kvm_get_xsave(X86CPU *cpu) 4224 { 4225 CPUX86State *env = &cpu->env; 4226 void *xsave = env->xsave_buf; 4227 unsigned long type; 4228 int ret; 4229 4230 type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE; 4231 ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave); 4232 if (ret < 0) { 4233 return ret; 4234 } 4235 x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len); 4236 4237 return 0; 4238 } 4239 4240 static int kvm_get_xcrs(X86CPU *cpu) 4241 { 4242 CPUX86State *env = &cpu->env; 4243 int i, ret; 4244 struct kvm_xcrs xcrs; 4245 4246 if (!has_xcrs) { 4247 return 0; 4248 } 4249 4250 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs); 4251 if (ret < 0) { 4252 return ret; 4253 } 4254 4255 for (i = 0; i < xcrs.nr_xcrs; i++) { 4256 /* Only support xcr0 now */ 4257 if (xcrs.xcrs[i].xcr == 0) { 4258 env->xcr0 = xcrs.xcrs[i].value; 4259 break; 4260 } 4261 } 4262 return 0; 4263 } 4264 4265 static int kvm_get_sregs(X86CPU *cpu) 4266 { 4267 CPUX86State *env = &cpu->env; 4268 struct kvm_sregs sregs; 4269 int ret; 4270 4271 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs); 4272 if (ret < 0) { 4273 return ret; 4274 } 4275 4276 /* 4277 * The interrupt_bitmap is ignored because KVM_GET_SREGS is 4278 * always preceded by KVM_GET_VCPU_EVENTS. 4279 */ 4280 4281 get_seg(&env->segs[R_CS], &sregs.cs); 4282 get_seg(&env->segs[R_DS], &sregs.ds); 4283 get_seg(&env->segs[R_ES], &sregs.es); 4284 get_seg(&env->segs[R_FS], &sregs.fs); 4285 get_seg(&env->segs[R_GS], &sregs.gs); 4286 get_seg(&env->segs[R_SS], &sregs.ss); 4287 4288 get_seg(&env->tr, &sregs.tr); 4289 get_seg(&env->ldt, &sregs.ldt); 4290 4291 env->idt.limit = sregs.idt.limit; 4292 env->idt.base = sregs.idt.base; 4293 env->gdt.limit = sregs.gdt.limit; 4294 env->gdt.base = sregs.gdt.base; 4295 4296 env->cr[0] = sregs.cr0; 4297 env->cr[2] = sregs.cr2; 4298 env->cr[3] = sregs.cr3; 4299 env->cr[4] = sregs.cr4; 4300 4301 env->efer = sregs.efer; 4302 if (sev_es_enabled() && env->efer & MSR_EFER_LME && 4303 env->cr[0] & CR0_PG_MASK) { 4304 env->efer |= MSR_EFER_LMA; 4305 } 4306 4307 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */ 4308 x86_update_hflags(env); 4309 4310 return 0; 4311 } 4312 4313 static int kvm_get_sregs2(X86CPU *cpu) 4314 { 4315 CPUX86State *env = &cpu->env; 4316 struct kvm_sregs2 sregs; 4317 int i, ret; 4318 4319 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs); 4320 if (ret < 0) { 4321 return ret; 4322 } 4323 4324 get_seg(&env->segs[R_CS], &sregs.cs); 4325 get_seg(&env->segs[R_DS], &sregs.ds); 4326 get_seg(&env->segs[R_ES], &sregs.es); 4327 get_seg(&env->segs[R_FS], &sregs.fs); 4328 get_seg(&env->segs[R_GS], &sregs.gs); 4329 get_seg(&env->segs[R_SS], &sregs.ss); 4330 4331 get_seg(&env->tr, &sregs.tr); 4332 get_seg(&env->ldt, &sregs.ldt); 4333 4334 env->idt.limit = sregs.idt.limit; 4335 env->idt.base = sregs.idt.base; 4336 env->gdt.limit = sregs.gdt.limit; 4337 env->gdt.base = sregs.gdt.base; 4338 4339 env->cr[0] = sregs.cr0; 4340 env->cr[2] = sregs.cr2; 4341 env->cr[3] = sregs.cr3; 4342 env->cr[4] = sregs.cr4; 4343 4344 env->efer = sregs.efer; 4345 if 
(sev_es_enabled() && env->efer & MSR_EFER_LME && 4346 env->cr[0] & CR0_PG_MASK) { 4347 env->efer |= MSR_EFER_LMA; 4348 } 4349 4350 env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; 4351 4352 if (env->pdptrs_valid) { 4353 for (i = 0; i < 4; i++) { 4354 env->pdptrs[i] = sregs.pdptrs[i]; 4355 } 4356 } 4357 4358 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */ 4359 x86_update_hflags(env); 4360 4361 return 0; 4362 } 4363 4364 static int kvm_get_msrs(X86CPU *cpu) 4365 { 4366 CPUX86State *env = &cpu->env; 4367 struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries; 4368 int ret, i; 4369 uint64_t mtrr_top_bits; 4370 4371 kvm_msr_buf_reset(cpu); 4372 4373 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0); 4374 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0); 4375 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0); 4376 kvm_msr_entry_add(cpu, MSR_PAT, 0); 4377 if (has_msr_star) { 4378 kvm_msr_entry_add(cpu, MSR_STAR, 0); 4379 } 4380 if (has_msr_hsave_pa) { 4381 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0); 4382 } 4383 if (has_msr_tsc_aux) { 4384 kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0); 4385 } 4386 if (has_msr_tsc_adjust) { 4387 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0); 4388 } 4389 if (has_msr_tsc_deadline) { 4390 kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0); 4391 } 4392 if (has_msr_misc_enable) { 4393 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0); 4394 } 4395 if (has_msr_smbase) { 4396 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0); 4397 } 4398 if (has_msr_smi_count) { 4399 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0); 4400 } 4401 if (has_msr_feature_control) { 4402 kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0); 4403 } 4404 if (has_msr_pkrs) { 4405 kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0); 4406 } 4407 if (has_msr_bndcfgs) { 4408 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0); 4409 } 4410 if (has_msr_xss) { 4411 kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0); 4412 } 4413 if (has_msr_umwait) { 4414 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0); 4415 } 4416 if (has_msr_spec_ctrl) { 4417 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0); 4418 } 4419 if (has_tsc_scale_msr) { 4420 kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0); 4421 } 4422 4423 if (has_msr_tsx_ctrl) { 4424 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0); 4425 } 4426 if (has_msr_virt_ssbd) { 4427 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0); 4428 } 4429 if (!env->tsc_valid) { 4430 kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0); 4431 env->tsc_valid = !runstate_is_running(); 4432 } 4433 if (has_msr_hwcr) { 4434 kvm_msr_entry_add(cpu, MSR_K7_HWCR, 0); 4435 } 4436 4437 #ifdef TARGET_X86_64 4438 if (lm_capable_kernel) { 4439 kvm_msr_entry_add(cpu, MSR_CSTAR, 0); 4440 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0); 4441 kvm_msr_entry_add(cpu, MSR_FMASK, 0); 4442 kvm_msr_entry_add(cpu, MSR_LSTAR, 0); 4443 if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) { 4444 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0); 4445 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0); 4446 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0); 4447 kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0); 4448 kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0); 4449 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0); 4450 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0); 4451 kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0); 4452 kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0); 4453 } 4454 } 4455 #endif 4456 if (env->features[FEAT_KVM] & (CPUID_KVM_CLOCK | CPUID_KVM_CLOCK2)) { 4457 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); 4458 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0); 4459 } 
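/*
 * Note: every MSR queued in this function is added with a placeholder
 * value (mostly 0); the KVM_GET_MSRS call at the end overwrites the
 * buffer with the real values, which the switch below then copies
 * into env.
 */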
4460 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF_INT) { 4461 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0); 4462 } 4463 if (env->features[FEAT_KVM] & CPUID_KVM_ASYNCPF) { 4464 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0); 4465 } 4466 if (env->features[FEAT_KVM] & CPUID_KVM_PV_EOI) { 4467 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0); 4468 } 4469 if (env->features[FEAT_KVM] & CPUID_KVM_STEAL_TIME) { 4470 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0); 4471 } 4472 if (env->features[FEAT_KVM] & CPUID_KVM_POLL_CONTROL) { 4473 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1); 4474 } 4475 if (has_architectural_pmu_version > 0) { 4476 if (has_architectural_pmu_version > 1) { 4477 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 4478 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 4479 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0); 4480 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0); 4481 } 4482 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 4483 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0); 4484 } 4485 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 4486 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0); 4487 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0); 4488 } 4489 } 4490 4491 if (env->mcg_cap) { 4492 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0); 4493 kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0); 4494 if (has_msr_mcg_ext_ctl) { 4495 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0); 4496 } 4497 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 4498 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0); 4499 } 4500 } 4501 4502 if (has_msr_hv_hypercall) { 4503 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0); 4504 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0); 4505 } 4506 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 4507 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0); 4508 } 4509 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 4510 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0); 4511 } 4512 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 4513 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); 4514 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0); 4515 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0); 4516 } 4517 if (has_msr_hv_syndbg_options) { 4518 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0); 4519 } 4520 if (has_msr_hv_crash) { 4521 int j; 4522 4523 for (j = 0; j < HV_CRASH_PARAMS; j++) { 4524 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0); 4525 } 4526 } 4527 if (has_msr_hv_runtime) { 4528 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0); 4529 } 4530 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 4531 uint32_t msr; 4532 4533 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0); 4534 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0); 4535 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0); 4536 for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) { 4537 kvm_msr_entry_add(cpu, msr, 0); 4538 } 4539 } 4540 if (has_msr_hv_stimer) { 4541 uint32_t msr; 4542 4543 for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT; 4544 msr++) { 4545 kvm_msr_entry_add(cpu, msr, 0); 4546 } 4547 } 4548 if (env->features[FEAT_1_EDX] & CPUID_MTRR) { 4549 kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0); 4550 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0); 4551 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0); 4552 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0); 4553 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0); 4554 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0); 4555 kvm_msr_entry_add(cpu, 
MSR_MTRRfix4K_D0000, 0); 4556 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0); 4557 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0); 4558 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0); 4559 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0); 4560 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0); 4561 for (i = 0; i < MSR_MTRRcap_VCNT; i++) { 4562 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0); 4563 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0); 4564 } 4565 } 4566 4567 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) { 4568 int addr_num = 4569 kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7; 4570 4571 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0); 4572 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0); 4573 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0); 4574 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0); 4575 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0); 4576 for (i = 0; i < addr_num; i++) { 4577 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0); 4578 } 4579 } 4580 4581 if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) { 4582 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0); 4583 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0); 4584 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0); 4585 kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0); 4586 } 4587 4588 if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) { 4589 kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0); 4590 kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0); 4591 } 4592 4593 if (kvm_enabled() && cpu->enable_pmu && 4594 (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) { 4595 uint64_t depth; 4596 4597 ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth); 4598 if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) { 4599 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0); 4600 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0); 4601 4602 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) { 4603 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0); 4604 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0); 4605 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0); 4606 } 4607 } 4608 } 4609 4610 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); 4611 if (ret < 0) { 4612 return ret; 4613 } 4614 4615 if (ret < cpu->kvm_msr_buf->nmsrs) { 4616 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 4617 error_report("error: failed to get MSR 0x%" PRIx32, 4618 (uint32_t)e->index); 4619 } 4620 4621 assert(ret == cpu->kvm_msr_buf->nmsrs); 4622 /* 4623 * MTRR masks: Each mask consists of 5 parts 4624 * a 10..0: must be zero 4625 * b 11 : valid bit 4626 * c n-1..12: actual mask bits 4627 * d 51..n: reserved must be zero 4628 * e 63..52: reserved must be zero 4629 * 4630 * 'n' is the number of physical bits supported by the CPU and is 4631 * apparently always <= 52. We know our 'n' but don't know what 4632 * the destination's 'n' is; it might be smaller, in which case 4633 * it masks (c) on loading. It might be larger, in which case 4634 * we fill 'd' so that d..c is consistent irrespective of the 'n' 4635 * we're migrating to.
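 *
 * Worked example (illustrative values): with cpu->phys_bits == 40,
 * mtrr_top_bits below covers bits 51..40. A 1 GiB variable range whose
 * mask is read back from KVM as 0x000000ffc0000800 (valid bit 11 plus
 * mask bits 39..30) is therefore stored as 0x000fffffc0000800, keeping
 * d..c consistent whatever the destination's physical address width is.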
4636 */ 4637 4638 if (cpu->fill_mtrr_mask) { 4639 QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52); 4640 assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS); 4641 mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits); 4642 } else { 4643 mtrr_top_bits = 0; 4644 } 4645 4646 for (i = 0; i < ret; i++) { 4647 uint32_t index = msrs[i].index; 4648 switch (index) { 4649 case MSR_IA32_SYSENTER_CS: 4650 env->sysenter_cs = msrs[i].data; 4651 break; 4652 case MSR_IA32_SYSENTER_ESP: 4653 env->sysenter_esp = msrs[i].data; 4654 break; 4655 case MSR_IA32_SYSENTER_EIP: 4656 env->sysenter_eip = msrs[i].data; 4657 break; 4658 case MSR_PAT: 4659 env->pat = msrs[i].data; 4660 break; 4661 case MSR_STAR: 4662 env->star = msrs[i].data; 4663 break; 4664 #ifdef TARGET_X86_64 4665 case MSR_CSTAR: 4666 env->cstar = msrs[i].data; 4667 break; 4668 case MSR_KERNELGSBASE: 4669 env->kernelgsbase = msrs[i].data; 4670 break; 4671 case MSR_FMASK: 4672 env->fmask = msrs[i].data; 4673 break; 4674 case MSR_LSTAR: 4675 env->lstar = msrs[i].data; 4676 break; 4677 case MSR_IA32_FRED_RSP0: 4678 env->fred_rsp0 = msrs[i].data; 4679 break; 4680 case MSR_IA32_FRED_RSP1: 4681 env->fred_rsp1 = msrs[i].data; 4682 break; 4683 case MSR_IA32_FRED_RSP2: 4684 env->fred_rsp2 = msrs[i].data; 4685 break; 4686 case MSR_IA32_FRED_RSP3: 4687 env->fred_rsp3 = msrs[i].data; 4688 break; 4689 case MSR_IA32_FRED_STKLVLS: 4690 env->fred_stklvls = msrs[i].data; 4691 break; 4692 case MSR_IA32_FRED_SSP1: 4693 env->fred_ssp1 = msrs[i].data; 4694 break; 4695 case MSR_IA32_FRED_SSP2: 4696 env->fred_ssp2 = msrs[i].data; 4697 break; 4698 case MSR_IA32_FRED_SSP3: 4699 env->fred_ssp3 = msrs[i].data; 4700 break; 4701 case MSR_IA32_FRED_CONFIG: 4702 env->fred_config = msrs[i].data; 4703 break; 4704 #endif 4705 case MSR_IA32_TSC: 4706 env->tsc = msrs[i].data; 4707 break; 4708 case MSR_TSC_AUX: 4709 env->tsc_aux = msrs[i].data; 4710 break; 4711 case MSR_TSC_ADJUST: 4712 env->tsc_adjust = msrs[i].data; 4713 break; 4714 case MSR_IA32_TSCDEADLINE: 4715 env->tsc_deadline = msrs[i].data; 4716 break; 4717 case MSR_VM_HSAVE_PA: 4718 env->vm_hsave = msrs[i].data; 4719 break; 4720 case MSR_KVM_SYSTEM_TIME: 4721 env->system_time_msr = msrs[i].data; 4722 break; 4723 case MSR_KVM_WALL_CLOCK: 4724 env->wall_clock_msr = msrs[i].data; 4725 break; 4726 case MSR_MCG_STATUS: 4727 env->mcg_status = msrs[i].data; 4728 break; 4729 case MSR_MCG_CTL: 4730 env->mcg_ctl = msrs[i].data; 4731 break; 4732 case MSR_MCG_EXT_CTL: 4733 env->mcg_ext_ctl = msrs[i].data; 4734 break; 4735 case MSR_IA32_MISC_ENABLE: 4736 env->msr_ia32_misc_enable = msrs[i].data; 4737 break; 4738 case MSR_IA32_SMBASE: 4739 env->smbase = msrs[i].data; 4740 break; 4741 case MSR_SMI_COUNT: 4742 env->msr_smi_count = msrs[i].data; 4743 break; 4744 case MSR_IA32_FEATURE_CONTROL: 4745 env->msr_ia32_feature_control = msrs[i].data; 4746 break; 4747 case MSR_IA32_BNDCFGS: 4748 env->msr_bndcfgs = msrs[i].data; 4749 break; 4750 case MSR_IA32_XSS: 4751 env->xss = msrs[i].data; 4752 break; 4753 case MSR_IA32_UMWAIT_CONTROL: 4754 env->umwait = msrs[i].data; 4755 break; 4756 case MSR_IA32_PKRS: 4757 env->pkrs = msrs[i].data; 4758 break; 4759 default: 4760 if (msrs[i].index >= MSR_MC0_CTL && 4761 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) { 4762 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data; 4763 } 4764 break; 4765 case MSR_KVM_ASYNC_PF_EN: 4766 env->async_pf_en_msr = msrs[i].data; 4767 break; 4768 case MSR_KVM_ASYNC_PF_INT: 4769 env->async_pf_int_msr = msrs[i].data; 4770 break; 4771 case 
MSR_KVM_PV_EOI_EN: 4772 env->pv_eoi_en_msr = msrs[i].data; 4773 break; 4774 case MSR_KVM_STEAL_TIME: 4775 env->steal_time_msr = msrs[i].data; 4776 break; 4777 case MSR_KVM_POLL_CONTROL: { 4778 env->poll_control_msr = msrs[i].data; 4779 break; 4780 } 4781 case MSR_CORE_PERF_FIXED_CTR_CTRL: 4782 env->msr_fixed_ctr_ctrl = msrs[i].data; 4783 break; 4784 case MSR_CORE_PERF_GLOBAL_CTRL: 4785 env->msr_global_ctrl = msrs[i].data; 4786 break; 4787 case MSR_CORE_PERF_GLOBAL_STATUS: 4788 env->msr_global_status = msrs[i].data; 4789 break; 4790 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 4791 env->msr_global_ovf_ctrl = msrs[i].data; 4792 break; 4793 case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1: 4794 env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data; 4795 break; 4796 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1: 4797 env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data; 4798 break; 4799 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1: 4800 env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data; 4801 break; 4802 case HV_X64_MSR_HYPERCALL: 4803 env->msr_hv_hypercall = msrs[i].data; 4804 break; 4805 case HV_X64_MSR_GUEST_OS_ID: 4806 env->msr_hv_guest_os_id = msrs[i].data; 4807 break; 4808 case HV_X64_MSR_APIC_ASSIST_PAGE: 4809 env->msr_hv_vapic = msrs[i].data; 4810 break; 4811 case HV_X64_MSR_REFERENCE_TSC: 4812 env->msr_hv_tsc = msrs[i].data; 4813 break; 4814 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 4815 env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data; 4816 break; 4817 case HV_X64_MSR_VP_RUNTIME: 4818 env->msr_hv_runtime = msrs[i].data; 4819 break; 4820 case HV_X64_MSR_SCONTROL: 4821 env->msr_hv_synic_control = msrs[i].data; 4822 break; 4823 case HV_X64_MSR_SIEFP: 4824 env->msr_hv_synic_evt_page = msrs[i].data; 4825 break; 4826 case HV_X64_MSR_SIMP: 4827 env->msr_hv_synic_msg_page = msrs[i].data; 4828 break; 4829 case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: 4830 env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data; 4831 break; 4832 case HV_X64_MSR_STIMER0_CONFIG: 4833 case HV_X64_MSR_STIMER1_CONFIG: 4834 case HV_X64_MSR_STIMER2_CONFIG: 4835 case HV_X64_MSR_STIMER3_CONFIG: 4836 env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] = 4837 msrs[i].data; 4838 break; 4839 case HV_X64_MSR_STIMER0_COUNT: 4840 case HV_X64_MSR_STIMER1_COUNT: 4841 case HV_X64_MSR_STIMER2_COUNT: 4842 case HV_X64_MSR_STIMER3_COUNT: 4843 env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] = 4844 msrs[i].data; 4845 break; 4846 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 4847 env->msr_hv_reenlightenment_control = msrs[i].data; 4848 break; 4849 case HV_X64_MSR_TSC_EMULATION_CONTROL: 4850 env->msr_hv_tsc_emulation_control = msrs[i].data; 4851 break; 4852 case HV_X64_MSR_TSC_EMULATION_STATUS: 4853 env->msr_hv_tsc_emulation_status = msrs[i].data; 4854 break; 4855 case HV_X64_MSR_SYNDBG_OPTIONS: 4856 env->msr_hv_syndbg_options = msrs[i].data; 4857 break; 4858 case MSR_MTRRdefType: 4859 env->mtrr_deftype = msrs[i].data; 4860 break; 4861 case MSR_MTRRfix64K_00000: 4862 env->mtrr_fixed[0] = msrs[i].data; 4863 break; 4864 case MSR_MTRRfix16K_80000: 4865 env->mtrr_fixed[1] = msrs[i].data; 4866 break; 4867 case MSR_MTRRfix16K_A0000: 4868 env->mtrr_fixed[2] = msrs[i].data; 4869 break; 4870 case MSR_MTRRfix4K_C0000: 4871 env->mtrr_fixed[3] = msrs[i].data; 4872 break; 4873 case MSR_MTRRfix4K_C8000: 4874 env->mtrr_fixed[4] = msrs[i].data; 4875 break; 4876 case MSR_MTRRfix4K_D0000: 4877 env->mtrr_fixed[5] = msrs[i].data; 4878 break; 4879 case MSR_MTRRfix4K_D8000: 4880 env->mtrr_fixed[6] = msrs[i].data; 4881 break; 4882 case MSR_MTRRfix4K_E0000: 4883 env->mtrr_fixed[7] = msrs[i].data; 4884 break; 4885 case MSR_MTRRfix4K_E8000: 4886 env->mtrr_fixed[8] = msrs[i].data; 4887 break; 4888 case MSR_MTRRfix4K_F0000: 4889 env->mtrr_fixed[9] = msrs[i].data; 4890 break; 4891 case MSR_MTRRfix4K_F8000: 4892 env->mtrr_fixed[10] = msrs[i].data; 4893 break; 4894 case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1): 4895 if (index & 1) { 4896 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data | 4897 mtrr_top_bits; 4898 } else { 4899 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data; 4900 } 4901 break; 4902 case MSR_IA32_SPEC_CTRL: 4903 env->spec_ctrl = msrs[i].data; 4904 break; 4905 case MSR_AMD64_TSC_RATIO: 4906 env->amd_tsc_scale_msr = msrs[i].data; 4907 break; 4908 case MSR_IA32_TSX_CTRL: 4909 env->tsx_ctrl = msrs[i].data; 4910 break; 4911 case MSR_VIRT_SSBD: 4912 env->virt_ssbd = msrs[i].data; 4913 break; 4914 case MSR_IA32_RTIT_CTL: 4915 env->msr_rtit_ctrl = msrs[i].data; 4916 break; 4917 case MSR_IA32_RTIT_STATUS: 4918 env->msr_rtit_status = msrs[i].data; 4919 break; 4920 case MSR_IA32_RTIT_OUTPUT_BASE: 4921 env->msr_rtit_output_base = msrs[i].data; 4922 break; 4923 case MSR_IA32_RTIT_OUTPUT_MASK: 4924 env->msr_rtit_output_mask = msrs[i].data; 4925 break; 4926 case MSR_IA32_RTIT_CR3_MATCH: 4927 env->msr_rtit_cr3_match = msrs[i].data; 4928 break; 4929 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 4930 env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data; 4931 break; 4932 case MSR_IA32_SGXLEPUBKEYHASH0 ... 
MSR_IA32_SGXLEPUBKEYHASH3: 4933 env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] = 4934 msrs[i].data; 4935 break; 4936 case MSR_IA32_XFD: 4937 env->msr_xfd = msrs[i].data; 4938 break; 4939 case MSR_IA32_XFD_ERR: 4940 env->msr_xfd_err = msrs[i].data; 4941 break; 4942 case MSR_ARCH_LBR_CTL: 4943 env->msr_lbr_ctl = msrs[i].data; 4944 break; 4945 case MSR_ARCH_LBR_DEPTH: 4946 env->msr_lbr_depth = msrs[i].data; 4947 break; 4948 case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31: 4949 env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data; 4950 break; 4951 case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31: 4952 env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data; 4953 break; 4954 case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31: 4955 env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data; 4956 break; 4957 case MSR_K7_HWCR: 4958 env->msr_hwcr = msrs[i].data; 4959 break; 4960 } 4961 } 4962 4963 return 0; 4964 } 4965 4966 static int kvm_put_mp_state(X86CPU *cpu) 4967 { 4968 struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state }; 4969 4970 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state); 4971 } 4972 4973 static int kvm_get_mp_state(X86CPU *cpu) 4974 { 4975 CPUState *cs = CPU(cpu); 4976 CPUX86State *env = &cpu->env; 4977 struct kvm_mp_state mp_state; 4978 int ret; 4979 4980 ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); 4981 if (ret < 0) { 4982 return ret; 4983 } 4984 env->mp_state = mp_state.mp_state; 4985 if (kvm_irqchip_in_kernel()) { 4986 cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED); 4987 } 4988 return 0; 4989 } 4990 4991 static int kvm_get_apic(X86CPU *cpu) 4992 { 4993 DeviceState *apic = cpu->apic_state; 4994 struct kvm_lapic_state kapic; 4995 int ret; 4996 4997 if (apic && kvm_irqchip_in_kernel()) { 4998 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic); 4999 if (ret < 0) { 5000 return ret; 5001 } 5002 5003 kvm_get_apic_state(apic, &kapic); 5004 } 5005 return 0; 5006 } 5007 5008 static int kvm_put_vcpu_events(X86CPU *cpu, int level) 5009 { 5010 CPUState *cs = CPU(cpu); 5011 CPUX86State *env = &cpu->env; 5012 struct kvm_vcpu_events events = {}; 5013 5014 events.flags = 0; 5015 5016 if (has_exception_payload) { 5017 events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD; 5018 events.exception.pending = env->exception_pending; 5019 events.exception_has_payload = env->exception_has_payload; 5020 events.exception_payload = env->exception_payload; 5021 } 5022 events.exception.nr = env->exception_nr; 5023 events.exception.injected = env->exception_injected; 5024 events.exception.has_error_code = env->has_error_code; 5025 events.exception.error_code = env->error_code; 5026 5027 events.interrupt.injected = (env->interrupt_injected >= 0); 5028 events.interrupt.nr = env->interrupt_injected; 5029 events.interrupt.soft = env->soft_interrupt; 5030 5031 events.nmi.injected = env->nmi_injected; 5032 events.nmi.pending = env->nmi_pending; 5033 events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK); 5034 5035 events.sipi_vector = env->sipi_vector; 5036 5037 if (has_msr_smbase) { 5038 events.flags |= KVM_VCPUEVENT_VALID_SMM; 5039 events.smi.smm = !!(env->hflags & HF_SMM_MASK); 5040 events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK); 5041 if (kvm_irqchip_in_kernel()) { 5042 /* As soon as these are moved to the kernel, remove them 5043 * from cs->interrupt_request. 
5044 */ 5045 events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI; 5046 events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT; 5047 cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI); 5048 } else { 5049 /* Keep these in cs->interrupt_request. */ 5050 events.smi.pending = 0; 5051 events.smi.latched_init = 0; 5052 } 5053 } 5054 5055 if (level >= KVM_PUT_RESET_STATE) { 5056 events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; 5057 if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 5058 events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR; 5059 } 5060 } 5061 5062 if (has_triple_fault_event) { 5063 events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; 5064 events.triple_fault.pending = env->triple_fault_pending; 5065 } 5066 5067 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); 5068 } 5069 5070 static int kvm_get_vcpu_events(X86CPU *cpu) 5071 { 5072 CPUX86State *env = &cpu->env; 5073 struct kvm_vcpu_events events; 5074 int ret; 5075 5076 memset(&events, 0, sizeof(events)); 5077 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events); 5078 if (ret < 0) { 5079 return ret; 5080 } 5081 5082 if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) { 5083 env->exception_pending = events.exception.pending; 5084 env->exception_has_payload = events.exception_has_payload; 5085 env->exception_payload = events.exception_payload; 5086 } else { 5087 env->exception_pending = 0; 5088 env->exception_has_payload = false; 5089 } 5090 env->exception_injected = events.exception.injected; 5091 env->exception_nr = 5092 (env->exception_pending || env->exception_injected) ? 5093 events.exception.nr : -1; 5094 env->has_error_code = events.exception.has_error_code; 5095 env->error_code = events.exception.error_code; 5096 5097 env->interrupt_injected = 5098 events.interrupt.injected ? 
events.interrupt.nr : -1; 5099 env->soft_interrupt = events.interrupt.soft; 5100 5101 env->nmi_injected = events.nmi.injected; 5102 env->nmi_pending = events.nmi.pending; 5103 if (events.nmi.masked) { 5104 env->hflags2 |= HF2_NMI_MASK; 5105 } else { 5106 env->hflags2 &= ~HF2_NMI_MASK; 5107 } 5108 5109 if (events.flags & KVM_VCPUEVENT_VALID_SMM) { 5110 if (events.smi.smm) { 5111 env->hflags |= HF_SMM_MASK; 5112 } else { 5113 env->hflags &= ~HF_SMM_MASK; 5114 } 5115 if (events.smi.pending) { 5116 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 5117 } else { 5118 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 5119 } 5120 if (events.smi.smm_inside_nmi) { 5121 env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK; 5122 } else { 5123 env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; 5124 } 5125 if (events.smi.latched_init) { 5126 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 5127 } else { 5128 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 5129 } 5130 } 5131 5132 if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { 5133 env->triple_fault_pending = events.triple_fault.pending; 5134 } 5135 5136 env->sipi_vector = events.sipi_vector; 5137 5138 return 0; 5139 } 5140 5141 static int kvm_put_debugregs(X86CPU *cpu) 5142 { 5143 CPUX86State *env = &cpu->env; 5144 struct kvm_debugregs dbgregs; 5145 int i; 5146 5147 memset(&dbgregs, 0, sizeof(dbgregs)); 5148 for (i = 0; i < 4; i++) { 5149 dbgregs.db[i] = env->dr[i]; 5150 } 5151 dbgregs.dr6 = env->dr[6]; 5152 dbgregs.dr7 = env->dr[7]; 5153 dbgregs.flags = 0; 5154 5155 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs); 5156 } 5157 5158 static int kvm_get_debugregs(X86CPU *cpu) 5159 { 5160 CPUX86State *env = &cpu->env; 5161 struct kvm_debugregs dbgregs; 5162 int i, ret; 5163 5164 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs); 5165 if (ret < 0) { 5166 return ret; 5167 } 5168 for (i = 0; i < 4; i++) { 5169 env->dr[i] = dbgregs.db[i]; 5170 } 5171 env->dr[4] = env->dr[6] = dbgregs.dr6; 5172 env->dr[5] = env->dr[7] = dbgregs.dr7; 5173 5174 return 0; 5175 } 5176 5177 static int kvm_put_nested_state(X86CPU *cpu) 5178 { 5179 CPUX86State *env = &cpu->env; 5180 int max_nested_state_len = kvm_max_nested_state_length(); 5181 5182 if (!env->nested_state) { 5183 return 0; 5184 } 5185 5186 /* 5187 * Copy flags that are affected by reset from env->hflags and env->hflags2. 5188 */ 5189 if (env->hflags & HF_GUEST_MASK) { 5190 env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE; 5191 } else { 5192 env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE; 5193 } 5194 5195 /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */ 5196 if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) { 5197 env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET; 5198 } else { 5199 env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET; 5200 } 5201 5202 assert(env->nested_state->size <= max_nested_state_len); 5203 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state); 5204 } 5205 5206 static int kvm_get_nested_state(X86CPU *cpu) 5207 { 5208 CPUX86State *env = &cpu->env; 5209 int max_nested_state_len = kvm_max_nested_state_length(); 5210 int ret; 5211 5212 if (!env->nested_state) { 5213 return 0; 5214 } 5215 5216 /* 5217 * It is possible that migration restored a smaller size into 5218 * nested_state->hdr.size than what our kernel supports. 5219 * We preserve the migration-origin nested_state->hdr.size for the 5220 * call to KVM_SET_NESTED_STATE, but we want our next call to 5221 * KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
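 * For example (illustrative sizes): a stream recorded with a 4 KiB
 * kvm_nested_state keeps hdr.size == 4096 for KVM_SET_NESTED_STATE,
 * while env->nested_state->size is raised to max_nested_state_len below
 * so that KVM_GET_NESTED_STATE may fill the larger buffer offered by
 * this kernel.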
5222 */ 5223 env->nested_state->size = max_nested_state_len; 5224 5225 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state); 5226 if (ret < 0) { 5227 return ret; 5228 } 5229 5230 /* 5231 * Copy flags that are affected by reset to env->hflags and env->hflags2. 5232 */ 5233 if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) { 5234 env->hflags |= HF_GUEST_MASK; 5235 } else { 5236 env->hflags &= ~HF_GUEST_MASK; 5237 } 5238 5239 /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */ 5240 if (cpu_has_svm(env)) { 5241 if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) { 5242 env->hflags2 |= HF2_GIF_MASK; 5243 } else { 5244 env->hflags2 &= ~HF2_GIF_MASK; 5245 } 5246 } 5247 5248 return ret; 5249 } 5250 5251 int kvm_arch_put_registers(CPUState *cpu, int level, Error **errp) 5252 { 5253 X86CPU *x86_cpu = X86_CPU(cpu); 5254 int ret; 5255 5256 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 5257 5258 /* 5259 * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX 5260 * root operation upon vCPU reset. kvm_put_msr_feature_control() should also 5261 * precede kvm_put_nested_state() when 'real' nested state is set. 5262 */ 5263 if (level >= KVM_PUT_RESET_STATE) { 5264 ret = kvm_put_msr_feature_control(x86_cpu); 5265 if (ret < 0) { 5266 error_setg_errno(errp, -ret, "Failed to set feature control MSR"); 5267 return ret; 5268 } 5269 } 5270 5271 /* must be before kvm_put_nested_state so that EFER.SVME is set */ 5272 ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu); 5273 if (ret < 0) { 5274 error_setg_errno(errp, -ret, "Failed to set special registers"); 5275 return ret; 5276 } 5277 5278 if (level >= KVM_PUT_RESET_STATE) { 5279 ret = kvm_put_nested_state(x86_cpu); 5280 if (ret < 0) { 5281 error_setg_errno(errp, -ret, "Failed to set nested state"); 5282 return ret; 5283 } 5284 } 5285 5286 if (level == KVM_PUT_FULL_STATE) { 5287 /* We don't check for kvm_arch_set_tsc_khz() errors here, 5288 * because TSC frequency mismatch shouldn't abort migration, 5289 * unless the user explicitly asked for a more strict TSC 5290 * setting (e.g. using an explicit "tsc-freq" option). 
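 * (A tsc-freq the host cannot honour is normally rejected earlier,
 * when the vCPU is created, so the call below is best-effort.)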
5291 */ 5292 kvm_arch_set_tsc_khz(cpu); 5293 } 5294 5295 #ifdef CONFIG_XEN_EMU 5296 if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) { 5297 ret = kvm_put_xen_state(cpu); 5298 if (ret < 0) { 5299 error_setg_errno(errp, -ret, "Failed to set Xen state"); 5300 return ret; 5301 } 5302 } 5303 #endif 5304 5305 ret = kvm_getput_regs(x86_cpu, 1); 5306 if (ret < 0) { 5307 error_setg_errno(errp, -ret, "Failed to set general purpose registers"); 5308 return ret; 5309 } 5310 ret = kvm_put_xsave(x86_cpu); 5311 if (ret < 0) { 5312 error_setg_errno(errp, -ret, "Failed to set XSAVE"); 5313 return ret; 5314 } 5315 ret = kvm_put_xcrs(x86_cpu); 5316 if (ret < 0) { 5317 error_setg_errno(errp, -ret, "Failed to set XCRs"); 5318 return ret; 5319 } 5320 ret = kvm_put_msrs(x86_cpu, level); 5321 if (ret < 0) { 5322 error_setg_errno(errp, -ret, "Failed to set MSRs"); 5323 return ret; 5324 } 5325 ret = kvm_put_vcpu_events(x86_cpu, level); 5326 if (ret < 0) { 5327 error_setg_errno(errp, -ret, "Failed to set vCPU events"); 5328 return ret; 5329 } 5330 if (level >= KVM_PUT_RESET_STATE) { 5331 ret = kvm_put_mp_state(x86_cpu); 5332 if (ret < 0) { 5333 error_setg_errno(errp, -ret, "Failed to set MP state"); 5334 return ret; 5335 } 5336 } 5337 5338 ret = kvm_put_tscdeadline_msr(x86_cpu); 5339 if (ret < 0) { 5340 error_setg_errno(errp, -ret, "Failed to set TSC deadline MSR"); 5341 return ret; 5342 } 5343 ret = kvm_put_debugregs(x86_cpu); 5344 if (ret < 0) { 5345 error_setg_errno(errp, -ret, "Failed to set debug registers"); 5346 return ret; 5347 } 5348 return 0; 5349 } 5350 5351 int kvm_arch_get_registers(CPUState *cs, Error **errp) 5352 { 5353 X86CPU *cpu = X86_CPU(cs); 5354 int ret; 5355 5356 assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs)); 5357 5358 ret = kvm_get_vcpu_events(cpu); 5359 if (ret < 0) { 5360 error_setg_errno(errp, -ret, "Failed to get vCPU events"); 5361 goto out; 5362 } 5363 /* 5364 * KVM_GET_MPSTATE can modify CS and RIP, call it before 5365 * KVM_GET_REGS and KVM_GET_SREGS. 5366 */ 5367 ret = kvm_get_mp_state(cpu); 5368 if (ret < 0) { 5369 error_setg_errno(errp, -ret, "Failed to get MP state"); 5370 goto out; 5371 } 5372 ret = kvm_getput_regs(cpu, 0); 5373 if (ret < 0) { 5374 error_setg_errno(errp, -ret, "Failed to get general purpose registers"); 5375 goto out; 5376 } 5377 ret = kvm_get_xsave(cpu); 5378 if (ret < 0) { 5379 error_setg_errno(errp, -ret, "Failed to get XSAVE"); 5380 goto out; 5381 } 5382 ret = kvm_get_xcrs(cpu); 5383 if (ret < 0) { 5384 error_setg_errno(errp, -ret, "Failed to get XCRs"); 5385 goto out; 5386 } 5387 ret = has_sregs2 ? 
kvm_get_sregs2(cpu) : kvm_get_sregs(cpu); 5388 if (ret < 0) { 5389 error_setg_errno(errp, -ret, "Failed to get special registers"); 5390 goto out; 5391 } 5392 ret = kvm_get_msrs(cpu); 5393 if (ret < 0) { 5394 error_setg_errno(errp, -ret, "Failed to get MSRs"); 5395 goto out; 5396 } 5397 ret = kvm_get_apic(cpu); 5398 if (ret < 0) { 5399 error_setg_errno(errp, -ret, "Failed to get APIC"); 5400 goto out; 5401 } 5402 ret = kvm_get_debugregs(cpu); 5403 if (ret < 0) { 5404 error_setg_errno(errp, -ret, "Failed to get debug registers"); 5405 goto out; 5406 } 5407 ret = kvm_get_nested_state(cpu); 5408 if (ret < 0) { 5409 error_setg_errno(errp, -ret, "Failed to get nested state"); 5410 goto out; 5411 } 5412 #ifdef CONFIG_XEN_EMU 5413 if (xen_mode == XEN_EMULATE) { 5414 ret = kvm_get_xen_state(cs); 5415 if (ret < 0) { 5416 error_setg_errno(errp, -ret, "Failed to get Xen state"); 5417 goto out; 5418 } 5419 } 5420 #endif 5421 ret = 0; 5422 out: 5423 cpu_sync_bndcs_hflags(&cpu->env); 5424 return ret; 5425 } 5426 5427 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) 5428 { 5429 X86CPU *x86_cpu = X86_CPU(cpu); 5430 CPUX86State *env = &x86_cpu->env; 5431 int ret; 5432 5433 /* Inject NMI */ 5434 if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 5435 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 5436 bql_lock(); 5437 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 5438 bql_unlock(); 5439 DPRINTF("injected NMI\n"); 5440 ret = kvm_vcpu_ioctl(cpu, KVM_NMI); 5441 if (ret < 0) { 5442 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", 5443 strerror(-ret)); 5444 } 5445 } 5446 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 5447 bql_lock(); 5448 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 5449 bql_unlock(); 5450 DPRINTF("injected SMI\n"); 5451 ret = kvm_vcpu_ioctl(cpu, KVM_SMI); 5452 if (ret < 0) { 5453 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n", 5454 strerror(-ret)); 5455 } 5456 } 5457 } 5458 5459 if (!kvm_pic_in_kernel()) { 5460 bql_lock(); 5461 } 5462 5463 /* Force the VCPU out of its inner loop to process any INIT requests 5464 * or (for userspace APIC, but it is cheap to combine the checks here) 5465 * pending TPR access reports. 5466 */ 5467 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 5468 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 5469 !(env->hflags & HF_SMM_MASK)) { 5470 cpu->exit_request = 1; 5471 } 5472 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 5473 cpu->exit_request = 1; 5474 } 5475 } 5476 5477 if (!kvm_pic_in_kernel()) { 5478 /* Try to inject an interrupt if the guest can accept it */ 5479 if (run->ready_for_interrupt_injection && 5480 (cpu->interrupt_request & CPU_INTERRUPT_HARD) && 5481 (env->eflags & IF_MASK)) { 5482 int irq; 5483 5484 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 5485 irq = cpu_get_pic_interrupt(env); 5486 if (irq >= 0) { 5487 struct kvm_interrupt intr; 5488 5489 intr.irq = irq; 5490 DPRINTF("injected interrupt %d\n", irq); 5491 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr); 5492 if (ret < 0) { 5493 fprintf(stderr, 5494 "KVM: injection failed, interrupt lost (%s)\n", 5495 strerror(-ret)); 5496 } 5497 } 5498 } 5499 5500 /* If we have an interrupt but the guest is not ready to receive an 5501 * interrupt, request an interrupt window exit. This will 5502 * cause a return to userspace as soon as the guest is ready to 5503 * receive interrupts. 
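 * (KVM then returns to userspace with KVM_EXIT_IRQ_WINDOW_OPEN once
 * the guest can take interrupts again, and the pending interrupt is
 * injected on the next pass through this function.)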
*/ 5504 if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 5505 run->request_interrupt_window = 1; 5506 } else { 5507 run->request_interrupt_window = 0; 5508 } 5509 5510 DPRINTF("setting tpr\n"); 5511 run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state); 5512 5513 bql_unlock(); 5514 } 5515 } 5516 5517 static void kvm_rate_limit_on_bus_lock(void) 5518 { 5519 uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1); 5520 5521 if (delay_ns) { 5522 g_usleep(delay_ns / SCALE_US); 5523 } 5524 } 5525 5526 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run) 5527 { 5528 X86CPU *x86_cpu = X86_CPU(cpu); 5529 CPUX86State *env = &x86_cpu->env; 5530 5531 if (run->flags & KVM_RUN_X86_SMM) { 5532 env->hflags |= HF_SMM_MASK; 5533 } else { 5534 env->hflags &= ~HF_SMM_MASK; 5535 } 5536 if (run->if_flag) { 5537 env->eflags |= IF_MASK; 5538 } else { 5539 env->eflags &= ~IF_MASK; 5540 } 5541 if (run->flags & KVM_RUN_X86_BUS_LOCK) { 5542 kvm_rate_limit_on_bus_lock(); 5543 } 5544 5545 #ifdef CONFIG_XEN_EMU 5546 /* 5547 * If the callback is asserted as a GSI (or PCI INTx) then check if 5548 * vcpu_info->evtchn_upcall_pending has been cleared, and deassert 5549 * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC 5550 * EOI and only resample then, exactly how the VFIO eventfd pairs 5551 * are designed to work for level triggered interrupts. 5552 */ 5553 if (x86_cpu->env.xen_callback_asserted) { 5554 kvm_xen_maybe_deassert_callback(cpu); 5555 } 5556 #endif 5557 5558 /* We need to protect the apic state against concurrent accesses from 5559 * different threads in case the userspace irqchip is used. */ 5560 if (!kvm_irqchip_in_kernel()) { 5561 bql_lock(); 5562 } 5563 cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8); 5564 cpu_set_apic_base(x86_cpu->apic_state, run->apic_base); 5565 if (!kvm_irqchip_in_kernel()) { 5566 bql_unlock(); 5567 } 5568 return cpu_get_mem_attrs(env); 5569 } 5570 5571 int kvm_arch_process_async_events(CPUState *cs) 5572 { 5573 X86CPU *cpu = X86_CPU(cs); 5574 CPUX86State *env = &cpu->env; 5575 5576 if (cs->interrupt_request & CPU_INTERRUPT_MCE) { 5577 /* We must not raise CPU_INTERRUPT_MCE if it's not supported. 
*/ 5578 assert(env->mcg_cap); 5579 5580 cs->interrupt_request &= ~CPU_INTERRUPT_MCE; 5581 5582 kvm_cpu_synchronize_state(cs); 5583 5584 if (env->exception_nr == EXCP08_DBLE) { 5585 /* this means triple fault */ 5586 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 5587 cs->exit_request = 1; 5588 return 0; 5589 } 5590 kvm_queue_exception(env, EXCP12_MCHK, 0, 0); 5591 env->has_error_code = 0; 5592 5593 cs->halted = 0; 5594 if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) { 5595 env->mp_state = KVM_MP_STATE_RUNNABLE; 5596 } 5597 } 5598 5599 if ((cs->interrupt_request & CPU_INTERRUPT_INIT) && 5600 !(env->hflags & HF_SMM_MASK)) { 5601 kvm_cpu_synchronize_state(cs); 5602 do_cpu_init(cpu); 5603 } 5604 5605 if (kvm_irqchip_in_kernel()) { 5606 return 0; 5607 } 5608 5609 if (cs->interrupt_request & CPU_INTERRUPT_POLL) { 5610 cs->interrupt_request &= ~CPU_INTERRUPT_POLL; 5611 apic_poll_irq(cpu->apic_state); 5612 } 5613 if (((cs->interrupt_request & CPU_INTERRUPT_HARD) && 5614 (env->eflags & IF_MASK)) || 5615 (cs->interrupt_request & CPU_INTERRUPT_NMI)) { 5616 cs->halted = 0; 5617 } 5618 if (cs->interrupt_request & CPU_INTERRUPT_SIPI) { 5619 kvm_cpu_synchronize_state(cs); 5620 do_cpu_sipi(cpu); 5621 } 5622 if (cs->interrupt_request & CPU_INTERRUPT_TPR) { 5623 cs->interrupt_request &= ~CPU_INTERRUPT_TPR; 5624 kvm_cpu_synchronize_state(cs); 5625 apic_handle_tpr_access_report(cpu->apic_state, env->eip, 5626 env->tpr_access_type); 5627 } 5628 5629 return cs->halted; 5630 } 5631 5632 static int kvm_handle_halt(X86CPU *cpu) 5633 { 5634 CPUState *cs = CPU(cpu); 5635 CPUX86State *env = &cpu->env; 5636 5637 if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) && 5638 (env->eflags & IF_MASK)) && 5639 !(cs->interrupt_request & CPU_INTERRUPT_NMI)) { 5640 cs->halted = 1; 5641 return EXCP_HLT; 5642 } 5643 5644 return 0; 5645 } 5646 5647 static int kvm_handle_tpr_access(X86CPU *cpu) 5648 { 5649 CPUState *cs = CPU(cpu); 5650 struct kvm_run *run = cs->kvm_run; 5651 5652 apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip, 5653 run->tpr_access.is_write ? 
TPR_ACCESS_WRITE 5654 : TPR_ACCESS_READ); 5655 return 1; 5656 } 5657 5658 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 5659 { 5660 static const uint8_t int3 = 0xcc; 5661 5662 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) || 5663 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) { 5664 return -EINVAL; 5665 } 5666 return 0; 5667 } 5668 5669 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 5670 { 5671 uint8_t int3; 5672 5673 if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) { 5674 return -EINVAL; 5675 } 5676 if (int3 != 0xcc) { 5677 return 0; 5678 } 5679 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) { 5680 return -EINVAL; 5681 } 5682 return 0; 5683 } 5684 5685 static struct { 5686 target_ulong addr; 5687 int len; 5688 int type; 5689 } hw_breakpoint[4]; 5690 5691 static int nb_hw_breakpoint; 5692 5693 static int find_hw_breakpoint(target_ulong addr, int len, int type) 5694 { 5695 int n; 5696 5697 for (n = 0; n < nb_hw_breakpoint; n++) { 5698 if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type && 5699 (hw_breakpoint[n].len == len || len == -1)) { 5700 return n; 5701 } 5702 } 5703 return -1; 5704 } 5705 5706 int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type) 5707 { 5708 switch (type) { 5709 case GDB_BREAKPOINT_HW: 5710 len = 1; 5711 break; 5712 case GDB_WATCHPOINT_WRITE: 5713 case GDB_WATCHPOINT_ACCESS: 5714 switch (len) { 5715 case 1: 5716 break; 5717 case 2: 5718 case 4: 5719 case 8: 5720 if (addr & (len - 1)) { 5721 return -EINVAL; 5722 } 5723 break; 5724 default: 5725 return -EINVAL; 5726 } 5727 break; 5728 default: 5729 return -ENOSYS; 5730 } 5731 5732 if (nb_hw_breakpoint == 4) { 5733 return -ENOBUFS; 5734 } 5735 if (find_hw_breakpoint(addr, len, type) >= 0) { 5736 return -EEXIST; 5737 } 5738 hw_breakpoint[nb_hw_breakpoint].addr = addr; 5739 hw_breakpoint[nb_hw_breakpoint].len = len; 5740 hw_breakpoint[nb_hw_breakpoint].type = type; 5741 nb_hw_breakpoint++; 5742 5743 return 0; 5744 } 5745 5746 int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type) 5747 { 5748 int n; 5749 5750 n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 
1 : len, type); 5751 if (n < 0) { 5752 return -ENOENT; 5753 } 5754 nb_hw_breakpoint--; 5755 hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint]; 5756 5757 return 0; 5758 } 5759 5760 void kvm_arch_remove_all_hw_breakpoints(void) 5761 { 5762 nb_hw_breakpoint = 0; 5763 } 5764 5765 static CPUWatchpoint hw_watchpoint; 5766 5767 static int kvm_handle_debug(X86CPU *cpu, 5768 struct kvm_debug_exit_arch *arch_info) 5769 { 5770 CPUState *cs = CPU(cpu); 5771 CPUX86State *env = &cpu->env; 5772 int ret = 0; 5773 int n; 5774 5775 if (arch_info->exception == EXCP01_DB) { 5776 if (arch_info->dr6 & DR6_BS) { 5777 if (cs->singlestep_enabled) { 5778 ret = EXCP_DEBUG; 5779 } 5780 } else { 5781 for (n = 0; n < 4; n++) { 5782 if (arch_info->dr6 & (1 << n)) { 5783 switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) { 5784 case 0x0: 5785 ret = EXCP_DEBUG; 5786 break; 5787 case 0x1: 5788 ret = EXCP_DEBUG; 5789 cs->watchpoint_hit = &hw_watchpoint; 5790 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 5791 hw_watchpoint.flags = BP_MEM_WRITE; 5792 break; 5793 case 0x3: 5794 ret = EXCP_DEBUG; 5795 cs->watchpoint_hit = &hw_watchpoint; 5796 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 5797 hw_watchpoint.flags = BP_MEM_ACCESS; 5798 break; 5799 } 5800 } 5801 } 5802 } 5803 } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) { 5804 ret = EXCP_DEBUG; 5805 } 5806 if (ret == 0) { 5807 cpu_synchronize_state(cs); 5808 assert(env->exception_nr == -1); 5809 5810 /* pass to guest */ 5811 kvm_queue_exception(env, arch_info->exception, 5812 arch_info->exception == EXCP01_DB, 5813 arch_info->dr6); 5814 env->has_error_code = 0; 5815 } 5816 5817 return ret; 5818 } 5819 5820 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) 5821 { 5822 const uint8_t type_code[] = { 5823 [GDB_BREAKPOINT_HW] = 0x0, 5824 [GDB_WATCHPOINT_WRITE] = 0x1, 5825 [GDB_WATCHPOINT_ACCESS] = 0x3 5826 }; 5827 const uint8_t len_code[] = { 5828 [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2 5829 }; 5830 int n; 5831 5832 if (kvm_sw_breakpoints_active(cpu)) { 5833 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; 5834 } 5835 if (nb_hw_breakpoint > 0) { 5836 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; 5837 dbg->arch.debugreg[7] = 0x0600; 5838 for (n = 0; n < nb_hw_breakpoint; n++) { 5839 dbg->arch.debugreg[n] = hw_breakpoint[n].addr; 5840 dbg->arch.debugreg[7] |= (2 << (n * 2)) | 5841 (type_code[hw_breakpoint[n].type] << (16 + n*4)) | 5842 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4)); 5843 } 5844 } 5845 } 5846 5847 static int kvm_install_msr_filters(KVMState *s) 5848 { 5849 uint64_t zero = 0; 5850 struct kvm_msr_filter filter = { 5851 .flags = KVM_MSR_FILTER_DEFAULT_ALLOW, 5852 }; 5853 int i, j = 0; 5854 5855 QEMU_BUILD_BUG_ON(ARRAY_SIZE(msr_handlers) != ARRAY_SIZE(filter.ranges)); 5856 for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) { 5857 KVMMSRHandlers *handler = &msr_handlers[i]; 5858 if (handler->msr) { 5859 struct kvm_msr_filter_range *range = &filter.ranges[j++]; 5860 5861 *range = (struct kvm_msr_filter_range) { 5862 .flags = 0, 5863 .nmsrs = 1, 5864 .base = handler->msr, 5865 .bitmap = (__u8 *)&zero, 5866 }; 5867 5868 if (handler->rdmsr) { 5869 range->flags |= KVM_MSR_FILTER_READ; 5870 } 5871 5872 if (handler->wrmsr) { 5873 range->flags |= KVM_MSR_FILTER_WRITE; 5874 } 5875 } 5876 } 5877 5878 return kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter); 5879 } 5880 5881 static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr, 5882 QEMUWRMSRHandler *wrmsr) 5883 { 5884 int i, ret; 5885 5886 for (i 
void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
{
    const uint8_t type_code[] = {
        [GDB_BREAKPOINT_HW] = 0x0,
        [GDB_WATCHPOINT_WRITE] = 0x1,
        [GDB_WATCHPOINT_ACCESS] = 0x3
    };
    const uint8_t len_code[] = {
        [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
    };
    int n;

    if (kvm_sw_breakpoints_active(cpu)) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
    }
    if (nb_hw_breakpoint > 0) {
        dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
        dbg->arch.debugreg[7] = 0x0600;
        for (n = 0; n < nb_hw_breakpoint; n++) {
            dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
            dbg->arch.debugreg[7] |= (2 << (n * 2)) |
                (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
                ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
        }
    }
}

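/*
 * Convert the registered MSR handlers into a KVM MSR filter.  Each handler
 * becomes a single-MSR range whose all-zeroes bitmap denies guest access,
 * so KVM bounces matching RDMSR/WRMSR instructions to userspace as
 * KVM_EXIT_X86_RDMSR/KVM_EXIT_X86_WRMSR with reason
 * KVM_MSR_EXIT_REASON_FILTER; every other MSR stays allowed because of
 * KVM_MSR_FILTER_DEFAULT_ALLOW.  A handler pair is registered with, for
 * instance (handler names illustrative):
 *
 *     kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
 *                    core_thread_count_rdmsr, core_thread_count_wrmsr);
 */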
static int kvm_install_msr_filters(KVMState *s)
{
    uint64_t zero = 0;
    struct kvm_msr_filter filter = {
        .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
    };
    int i, j = 0;

    QEMU_BUILD_BUG_ON(ARRAY_SIZE(msr_handlers) != ARRAY_SIZE(filter.ranges));
    for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
        KVMMSRHandlers *handler = &msr_handlers[i];
        if (handler->msr) {
            struct kvm_msr_filter_range *range = &filter.ranges[j++];

            *range = (struct kvm_msr_filter_range) {
                .flags = 0,
                .nmsrs = 1,
                .base = handler->msr,
                .bitmap = (__u8 *)&zero,
            };

            if (handler->rdmsr) {
                range->flags |= KVM_MSR_FILTER_READ;
            }

            if (handler->wrmsr) {
                range->flags |= KVM_MSR_FILTER_WRITE;
            }
        }
    }

    return kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
}

static int kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
                          QEMUWRMSRHandler *wrmsr)
{
    int i, ret;

    for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
        if (!msr_handlers[i].msr) {
            msr_handlers[i] = (KVMMSRHandlers) {
                .msr = msr,
                .rdmsr = rdmsr,
                .wrmsr = wrmsr,
            };

            ret = kvm_install_msr_filters(s);
            if (ret) {
                msr_handlers[i] = (KVMMSRHandlers) { };
                return ret;
            }

            return 0;
        }
    }

    return -EINVAL;
}

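/*
 * Service a userspace MSR exit for one of the MSRs registered above.  The
 * handler's result becomes run->msr.error: 0 tells KVM the emulated access
 * succeeded, non-zero asks KVM to fault the access (#GP) in the guest.
 */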
" 6102 "The guest could be misbehaving."); 6103 ret = 0; 6104 } 6105 break; 6106 case KVM_EXIT_X86_RDMSR: 6107 /* We only enable MSR filtering, any other exit is bogus */ 6108 assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER); 6109 ret = kvm_handle_rdmsr(cpu, run); 6110 break; 6111 case KVM_EXIT_X86_WRMSR: 6112 /* We only enable MSR filtering, any other exit is bogus */ 6113 assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER); 6114 ret = kvm_handle_wrmsr(cpu, run); 6115 break; 6116 #ifdef CONFIG_XEN_EMU 6117 case KVM_EXIT_XEN: 6118 ret = kvm_xen_handle_exit(cpu, &run->xen); 6119 break; 6120 #endif 6121 case KVM_EXIT_HYPERCALL: 6122 ret = kvm_handle_hypercall(run); 6123 break; 6124 default: 6125 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); 6126 ret = -1; 6127 break; 6128 } 6129 6130 return ret; 6131 } 6132 6133 bool kvm_arch_stop_on_emulation_error(CPUState *cs) 6134 { 6135 X86CPU *cpu = X86_CPU(cs); 6136 CPUX86State *env = &cpu->env; 6137 6138 kvm_cpu_synchronize_state(cs); 6139 return !(env->cr[0] & CR0_PE_MASK) || 6140 ((env->segs[R_CS].selector & 3) != 3); 6141 } 6142 6143 void kvm_arch_init_irq_routing(KVMState *s) 6144 { 6145 /* We know at this point that we're using the in-kernel 6146 * irqchip, so we can use irqfds, and on x86 we know 6147 * we can use msi via irqfd and GSI routing. 6148 */ 6149 kvm_msi_via_irqfd_allowed = true; 6150 kvm_gsi_routing_allowed = true; 6151 6152 if (kvm_irqchip_is_split()) { 6153 KVMRouteChange c = kvm_irqchip_begin_route_changes(s); 6154 int i; 6155 6156 /* If the ioapic is in QEMU and the lapics are in KVM, reserve 6157 MSI routes for signaling interrupts to the local apics. */ 6158 for (i = 0; i < IOAPIC_NUM_PINS; i++) { 6159 if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) { 6160 error_report("Could not enable split IRQ mode."); 6161 exit(1); 6162 } 6163 } 6164 kvm_irqchip_commit_route_changes(&c); 6165 } 6166 } 6167 6168 int kvm_arch_irqchip_create(KVMState *s) 6169 { 6170 int ret; 6171 if (kvm_kernel_irqchip_split()) { 6172 ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24); 6173 if (ret) { 6174 error_report("Could not enable split irqchip mode: %s", 6175 strerror(-ret)); 6176 exit(1); 6177 } else { 6178 DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n"); 6179 kvm_split_irqchip = true; 6180 return 1; 6181 } 6182 } else { 6183 return 0; 6184 } 6185 } 6186 6187 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address) 6188 { 6189 CPUX86State *env; 6190 uint64_t ext_id; 6191 6192 if (!first_cpu) { 6193 return address; 6194 } 6195 env = &X86_CPU(first_cpu)->env; 6196 if (!(env->features[FEAT_KVM] & CPUID_KVM_MSI_EXT_DEST_ID)) { 6197 return address; 6198 } 6199 6200 /* 6201 * If the remappable format bit is set, or the upper bits are 6202 * already set in address_hi, or the low extended bits aren't 6203 * there anyway, do nothing. 
static bool has_sgx_provisioning;

static bool __kvm_enable_sgx_provisioning(KVMState *s)
{
    int fd, ret;

    if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
        return false;
    }

    fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
    if (fd < 0) {
        return false;
    }

    ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
    if (ret) {
        error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
        exit(1);
    }
    close(fd);
    return true;
}

bool kvm_enable_sgx_provisioning(KVMState *s)
{
    return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
}

static bool host_supports_vmx(void)
{
    uint32_t ecx, unused;

    host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
    return ecx & CPUID_EXT_VMX;
}

/*
 * Currently the handling here only supports use of KVM_HC_MAP_GPA_RANGE
 * to service guest-initiated memory attribute update requests, so that
 * KVM_SET_MEMORY_ATTRIBUTES can update whether or not a page should be
 * backed by the private memory pool provided by guest_memfd.  As such it
 * is only applicable to guest_memfd-backed guests (e.g. SNP/TDX).
 *
 * Other use-cases for KVM_HC_MAP_GPA_RANGE, such as SEV live migration,
 * are not currently implemented here.
 *
 * For the guest_memfd use-case, these exits will generally be synthesized
 * by KVM based on platform-specific hypercalls, like GHCB requests in the
 * case of SEV-SNP, and not issued directly within the guest through the
 * KVM_HC_MAP_GPA_RANGE hypercall.  So in this case, KVM_HC_MAP_GPA_RANGE
 * is not actually advertised to guests via the KVM CPUID feature bit, as
 * opposed to SEV live migration where it would be.  Since the SEV live
 * migration use-case is unlikely to be useful for guest_memfd-backed
 * guests, where private/shared page tracking is already provided through
 * other means, these two use-cases should be treated as mutually
 * exclusive.
 */
static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
{
    uint64_t gpa, size, attributes;

    if (!machine_require_guest_memfd(current_machine)) {
        return -EINVAL;
    }

    gpa = run->hypercall.args[0];
    size = run->hypercall.args[1] * TARGET_PAGE_SIZE;
    attributes = run->hypercall.args[2];

    trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);

    return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
}

static int kvm_handle_hypercall(struct kvm_run *run)
{
    if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
        return kvm_handle_hc_map_gpa_range(run);
    }

    return -EINVAL;
}

#define VMX_INVALID_GUEST_STATE 0x80000021

int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
{
    X86CPU *cpu = X86_CPU(cs);
    uint64_t code;
    int ret;
    bool ctx_invalid;
    KVMState *state;

    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        DPRINTF("handle_hlt\n");
        bql_lock();
        ret = kvm_handle_halt(cpu);
        bql_unlock();
        break;
    case KVM_EXIT_SET_TPR:
        ret = 0;
        break;
    case KVM_EXIT_TPR_ACCESS:
        bql_lock();
        ret = kvm_handle_tpr_access(cpu);
        bql_unlock();
        break;
    case KVM_EXIT_FAIL_ENTRY:
        code = run->fail_entry.hardware_entry_failure_reason;
        fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
                code);
        if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
            fprintf(stderr,
                    "\nIf you're running a guest on an Intel machine without "
                    "unrestricted mode\n"
                    "support, the failure is most likely due to the guest "
                    "entering an invalid\n"
                    "state for Intel VT. For example, the guest may be running "
                    "in big real mode\n"
                    "which is not supported on older Intel processors."
                    "\n\n");
        }
        ret = -1;
        break;
    case KVM_EXIT_EXCEPTION:
        fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
                run->ex.exception, run->ex.error_code);
        ret = -1;
        break;
    case KVM_EXIT_DEBUG:
        DPRINTF("kvm_exit_debug\n");
        bql_lock();
        ret = kvm_handle_debug(cpu, &run->debug.arch);
        bql_unlock();
        break;
    case KVM_EXIT_HYPERV:
        ret = kvm_hv_handle_exit(cpu, &run->hyperv);
        break;
    case KVM_EXIT_IOAPIC_EOI:
        ioapic_eoi_broadcast(run->eoi.vector);
        ret = 0;
        break;
    case KVM_EXIT_X86_BUS_LOCK:
        /* already handled in kvm_arch_post_run */
        ret = 0;
        break;
    case KVM_EXIT_NOTIFY:
        ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
        state = KVM_STATE(current_accel());
        if (ctx_invalid ||
            state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
            warn_report("KVM internal error: Encountered a notify exit "
                        "with invalid context in guest.");
            ret = -1;
        } else {
            warn_report_once("KVM: Encountered a notify exit with valid "
                             "context in guest. "
                             "The guest could be misbehaving.");
            ret = 0;
        }
        break;
    case KVM_EXIT_X86_RDMSR:
        /* We only enable MSR filtering, any other exit is bogus */
        assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
        ret = kvm_handle_rdmsr(cpu, run);
        break;
    case KVM_EXIT_X86_WRMSR:
        /* We only enable MSR filtering, any other exit is bogus */
        assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
        ret = kvm_handle_wrmsr(cpu, run);
        break;
#ifdef CONFIG_XEN_EMU
    case KVM_EXIT_XEN:
        ret = kvm_xen_handle_exit(cpu, &run->xen);
        break;
#endif
    case KVM_EXIT_HYPERCALL:
        ret = kvm_handle_hypercall(run);
        break;
    default:
        fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
        ret = -1;
        break;
    }

    return ret;
}

bool kvm_arch_stop_on_emulation_error(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    kvm_cpu_synchronize_state(cs);
    return !(env->cr[0] & CR0_PE_MASK) ||
           ((env->segs[R_CS].selector & 3) != 3);
}

void kvm_arch_init_irq_routing(KVMState *s)
{
    /* We know at this point that we're using the in-kernel
     * irqchip, so we can use irqfds, and on x86 we know
     * we can use msi via irqfd and GSI routing.
     */
    kvm_msi_via_irqfd_allowed = true;
    kvm_gsi_routing_allowed = true;

    if (kvm_irqchip_is_split()) {
        KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
        int i;

        /* If the ioapic is in QEMU and the lapics are in KVM, reserve
           MSI routes for signaling interrupts to the local apics. */
        for (i = 0; i < IOAPIC_NUM_PINS; i++) {
            if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
                error_report("Could not enable split IRQ mode.");
                exit(1);
            }
        }
        kvm_irqchip_commit_route_changes(&c);
    }
}

int kvm_arch_irqchip_create(KVMState *s)
{
    int ret;
    if (kvm_kernel_irqchip_split()) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
        if (ret) {
            error_report("Could not enable split irqchip mode: %s",
                         strerror(-ret));
            exit(1);
        } else {
            DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
            kvm_split_irqchip = true;
            return 1;
        }
    } else {
        return 0;
    }
}

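/*
 * With KVM_FEATURE_MSI_EXT_DEST_ID, bits 11:5 of the MSI address (just
 * above the "remappable format" indicator in bit 4) carry bits 14:8 of
 * the destination APIC ID.  Shift them up by 35 so they land in the high
 * address dword (address_hi bits 14:8), which is where KVM's MSI routing
 * expects the upper destination ID bits for APIC IDs above 255.
 */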
uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
{
    CPUX86State *env;
    uint64_t ext_id;

    if (!first_cpu) {
        return address;
    }
    env = &X86_CPU(first_cpu)->env;
    if (!(env->features[FEAT_KVM] & CPUID_KVM_MSI_EXT_DEST_ID)) {
        return address;
    }

    /*
     * If the remappable format bit is set, or the upper bits are
     * already set in address_hi, or the low extended bits aren't
     * there anyway, do nothing.
     */
    ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
    if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
        return address;
    }

    address &= ~ext_id;
    address |= ext_id << 35;
    return address;
}

int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
                             uint64_t address, uint32_t data, PCIDevice *dev)
{
    X86IOMMUState *iommu = x86_iommu_get_default();

    if (iommu) {
        X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);

        if (class->int_remap) {
            int ret;
            MSIMessage src, dst;

            src.address = route->u.msi.address_hi;
            src.address <<= VTD_MSI_ADDR_HI_SHIFT;
            src.address |= route->u.msi.address_lo;
            src.data = route->u.msi.data;

            ret = class->int_remap(iommu, &src, &dst, dev ?
                                   pci_requester_id(dev) :
                                   X86_IOMMU_SID_INVALID);
            if (ret) {
                trace_kvm_x86_fixup_msi_error(route->gsi);
                return 1;
            }

            /*
             * Handle an untranslated compatibility-format interrupt with
             * the extended destination ID in the low bits 11:5.
             */
            dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);

            route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
            route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
            route->u.msi.data = dst.data;
            return 0;
        }
    }

#ifdef CONFIG_XEN_EMU
    if (xen_mode == XEN_EMULATE) {
        int handled = xen_evtchn_translate_pirq_msi(route, address, data);

        /*
         * If it was a PIRQ and successfully routed (handled == 0) or it was
         * an error (handled < 0), return. If it wasn't a PIRQ, keep going.
         */
        if (handled <= 0) {
            return handled;
        }
    }
#endif

    address = kvm_swizzle_msi_ext_dest_id(address);
    route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
    route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
    return 0;
}

typedef struct MSIRouteEntry MSIRouteEntry;

struct MSIRouteEntry {
    PCIDevice *dev;             /* Device pointer */
    int vector;                 /* MSI/MSIX vector index */
    int virq;                   /* Virtual IRQ index */
    QLIST_ENTRY(MSIRouteEntry) list;
};

/* List of used GSI routes */
static QLIST_HEAD(, MSIRouteEntry) msi_route_list =
    QLIST_HEAD_INITIALIZER(msi_route_list);

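/*
 * Refresh the KVM routing entries for every MSI route we track.  This is
 * registered in kvm_arch_add_msi_route_post() below as the IOMMU interrupt
 * entry cache (IEC) invalidation notifier, so routes are re-translated
 * after the guest invalidates its interrupt remapping tables.
 */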
void kvm_update_msi_routes_all(void *private, bool global,
                               uint32_t index, uint32_t mask)
{
    int cnt = 0, vector;
    MSIRouteEntry *entry;
    MSIMessage msg;
    PCIDevice *dev;

    /* TODO: explicit route update */
    QLIST_FOREACH(entry, &msi_route_list, list) {
        cnt++;
        vector = entry->vector;
        dev = entry->dev;
        if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
            msg = msix_get_message(dev, vector);
        } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
            msg = msi_get_message(dev, vector);
        } else {
            /*
             * Either MSI/MSI-X is disabled for the device, or the
             * specific message was masked out.  Skip this one.
             */
            continue;
        }
        kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
    }
    kvm_irqchip_commit_routes(kvm_state);
    trace_kvm_x86_update_msi_routes(cnt);
}

int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
                                int vector, PCIDevice *dev)
{
    static bool notify_list_inited = false;
    MSIRouteEntry *entry;

    if (!dev) {
        /*
         * These are (possibly) IOAPIC routes used only in split kernel
         * irqchip mode; we only track routes for PCI devices here.
         */
        return 0;
    }

    entry = g_new0(MSIRouteEntry, 1);
    entry->dev = dev;
    entry->vector = vector;
    entry->virq = route->gsi;
    QLIST_INSERT_HEAD(&msi_route_list, entry, list);

    trace_kvm_x86_add_msi_route(route->gsi);

    if (!notify_list_inited) {
        /*
         * The first time we add a route, register ourselves on the
         * IOMMU's IEC notifier list if there is an IOMMU.
         */
        X86IOMMUState *iommu = x86_iommu_get_default();
        if (iommu) {
            x86_iommu_iec_register_notifier(iommu,
                                            kvm_update_msi_routes_all,
                                            NULL);
        }
        notify_list_inited = true;
    }
    return 0;
}

int kvm_arch_release_virq_post(int virq)
{
    MSIRouteEntry *entry, *next;
    QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
        if (entry->virq == virq) {
            trace_kvm_x86_remove_msi_route(virq);
            QLIST_REMOVE(entry, list);
            g_free(entry);
            break;
        }
    }
    return 0;
}

int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    abort();
}

bool kvm_has_waitpkg(void)
{
    return has_msr_umwait;
}

#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025

void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
{
    KVMState *s = kvm_state;
    uint64_t supported;

    mask &= XSTATE_DYNAMIC_MASK;
    if (!mask) {
        return;
    }
    /*
     * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
     * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has already warned
     * about them because they are not supported features.
     */
    supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
    supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
    mask &= supported;

    while (mask) {
        int bit = ctz64(mask);
        int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
        if (rc) {
            /*
             * Older kernel versions (< 5.17) do not support
             * ARCH_REQ_XCOMP_GUEST_PERM, but they also do not return
             * any dynamic feature from kvm_arch_get_supported_cpuid.
             */
            warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
                        "for feature bit %d", bit);
        }
        mask &= ~BIT_ULL(bit);
    }
}

static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    return s->notify_vmexit;
}

static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
{
    KVMState *s = KVM_STATE(obj);

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    s->notify_vmexit = value;
}

static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value = s->notify_window;

    visit_type_uint32(v, name, &value, errp);
}

static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
                                       const char *name, void *opaque,
                                       Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }

    s->notify_window = value;
}

static void kvm_arch_get_xen_version(Object *obj, Visitor *v,
                                     const char *name, void *opaque,
                                     Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value = s->xen_version;

    visit_type_uint32(v, name, &value, errp);
}

static void kvm_arch_set_xen_version(Object *obj, Visitor *v,
                                     const char *name, void *opaque,
                                     Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    Error *error = NULL;
    uint32_t value;

    visit_type_uint32(v, name, &value, &error);
    if (error) {
        error_propagate(errp, error);
        return;
    }

    s->xen_version = value;
    if (value && xen_mode == XEN_DISABLED) {
        xen_mode = XEN_EMULATE;
    }
}

static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v,
                                               const char *name, void *opaque,
                                               Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint16_t value = s->xen_gnttab_max_frames;

    visit_type_uint16(v, name, &value, errp);
}

static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v,
                                               const char *name, void *opaque,
                                               Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    Error *error = NULL;
    uint16_t value;

    visit_type_uint16(v, name, &value, &error);
    if (error) {
        error_propagate(errp, error);
        return;
    }

    s->xen_gnttab_max_frames = value;
}

static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v,
                                             const char *name, void *opaque,
                                             Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint16_t value = s->xen_evtchn_max_pirq;

    visit_type_uint16(v, name, &value, errp);
}

static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
                                             const char *name, void *opaque,
                                             Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    Error *error = NULL;
    uint16_t value;

    visit_type_uint16(v, name, &value, &error);
    if (error) {
        error_propagate(errp, error);
        return;
    }

    s->xen_evtchn_max_pirq = value;
}

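/*
 * The accelerator properties registered below are set on the command line,
 * for example (illustrative values only):
 *
 *   -accel kvm,notify-vmexit=internal-error,notify-window=600000
 *   -accel kvm,xen-version=0x4000a,kernel-irqchip=split
 *
 * "notify-vmexit" takes the QAPI NotifyVmexitOption values, and setting a
 * non-zero "xen-version" switches xen_mode to XEN_EMULATE as handled in
 * kvm_arch_set_xen_version() above.
 */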
void kvm_arch_accel_class_init(ObjectClass *oc)
{
    object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
                                   &NotifyVmexitOption_lookup,
                                   kvm_arch_get_notify_vmexit,
                                   kvm_arch_set_notify_vmexit);
    object_class_property_set_description(oc, "notify-vmexit",
                                          "Enable notify VM exit");

    object_class_property_add(oc, "notify-window", "uint32",
                              kvm_arch_get_notify_window,
                              kvm_arch_set_notify_window,
                              NULL, NULL);
    object_class_property_set_description(oc, "notify-window",
                                          "Clock cycles without an event window "
                                          "after which a notification VM exit occurs");

    object_class_property_add(oc, "xen-version", "uint32",
                              kvm_arch_get_xen_version,
                              kvm_arch_set_xen_version,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-version",
                                          "Xen version to be emulated "
                                          "(in XENVER_version form "
                                          "e.g. 0x4000a for 4.10)");

    object_class_property_add(oc, "xen-gnttab-max-frames", "uint16",
                              kvm_arch_get_xen_gnttab_max_frames,
                              kvm_arch_set_xen_gnttab_max_frames,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-gnttab-max-frames",
                                          "Maximum number of grant table frames");

    object_class_property_add(oc, "xen-evtchn-max-pirq", "uint16",
                              kvm_arch_get_xen_evtchn_max_pirq,
                              kvm_arch_set_xen_evtchn_max_pirq,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-evtchn-max-pirq",
                                          "Maximum number of Xen PIRQs");
}

void kvm_set_max_apic_id(uint32_t max_apic_id)
{
    kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
}