/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "system/address-spaces.h"
#include "system/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

/*
 * The set of hypervisor registers synchronized with QEMU's CPUX86State by
 * whpx_set_registers() / whpx_get_registers(). The order of this array is a
 * hard contract: both functions walk it with a running index and assert that
 * each position matches the expected WHV_REGISTER_NAME, so entries must not
 * be reordered, inserted, or removed without updating both walkers.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/* One WHV_REGISTER_VALUE slot per entry of whpx_register_names, same order. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1.
Stepping over a PUSHF/SAHF instruction will save the TF flag 158 * along with the other flags, possibly restoring it later. It would 159 * result in another INT1 when the flags are restored, triggering 160 * a stop in gdb that could be cleared by doing another step. 161 * 162 * Stepping over a POPF/LAHF instruction will let it overwrite the 163 * TF flags, ending the stepping mode. 164 * 165 * 2. Stepping over an instruction raising an exception (e.g. INT, DIV, 166 * or anything that could result in a page fault) will save the flags 167 * to the stack, clear the TF flag, and let the guest execute the 168 * handler. Normally, the guest will restore the original flags, 169 * that will continue single-stepping. 170 * 171 * 3. Debuggers running on the guest may wish to set TF to do instruction 172 * stepping. INT1 events generated by it would be intercepted by us, 173 * as long as the gdb is connected to QEMU. 174 * 175 * In practice this means that: 176 * 1. Stepping through flags-modifying instructions may cause gdb to 177 * continue or stop in unexpected places. This will be fully recoverable 178 * and will not crash the target. 179 * 180 * 2. Stepping over an instruction that triggers an exception will step 181 * over the exception handler, not into it. 182 * 183 * 3. Debugging the guest via gdb, while running debugger on the guest 184 * at the same time may lead to unexpected effects. Removing all 185 * breakpoints set via QEMU will prevent any further interference 186 * with the guest-level debuggers. 187 * 188 * The limitations can be addressed as shown below: 189 * 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of 190 * stepping through them. 
The exact semantics of the instructions is 191 * defined in the "Combined Volume Set of Intel 64 and IA-32 192 * Architectures Software Developer's Manuals", however it involves a 193 * fair amount of corner cases due to compatibility with real mode, 194 * virtual 8086 mode, and differences between 64-bit and 32-bit modes. 195 * 196 * 2. We could step into the guest's exception handlers using the following 197 * sequence: 198 * a. Temporarily enable catching of all exception types via 199 * whpx_set_exception_exit_bitmap(). 200 * b. Once an exception is intercepted, read the IDT/GDT and locate 201 * the original handler. 202 * c. Patch the original handler, injecting an INT3 at the beginning. 203 * d. Update the exception exit bitmap to only catch the 204 * WHvX64ExceptionTypeBreakpointTrap exception. 205 * e. Let the affected CPU run in the exclusive mode. 206 * f. Restore the original handler and the exception exit bitmap. 207 * Note that handling all corner cases related to IDT/GDT is harder 208 * than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a 209 * rough idea. 210 * 211 * 3. In order to properly support guest-level debugging in parallel with 212 * the QEMU-level debugging, we would need to be able to pass some INT1 213 * events to the guest. This could be done via the following methods: 214 * a. Using the WHvRegisterPendingEvent register. As of Windows 21H1, 215 * it seems to only work for interrupts and not software 216 * exceptions. 217 * b. Locating and patching the original handler by parsing IDT/GDT. 218 * This involves relatively complex logic outlined in the previous 219 * paragraph. 220 * c. Emulating the exception invocation (i.e. manually updating RIP, 221 * RFLAGS, and pushing the old values to stack). This is even more 222 * complicated than the previous option, since it involves checking 223 * CPL, gate attributes, and doing various adjustments depending 224 * on the current CPU mode, whether the CPL is changing, etc. 
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

/* Per-vCPU accelerator state attached to CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Last CR8 value pushed to / read from the hypervisor (CR8 encoding). */
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;
    /* True when QEMU's register state must be written back before running. */
    bool dirty;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

/* True when the hypervisor partition reports XSAVE support. */
static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

/*
 * Convert a QEMU segment cache entry to the hypervisor's segment register
 * layout. 'v86' and 'r86' select virtual-8086 / real-mode fixups.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

/* Convert a hypervisor segment register back to QEMU's SegmentCache form. */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}

/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

/*
 * Write QEMU's TSC value into the vCPU. Returns 0 on success, -1 on
 * failure (the failure is reported but not fatal to the caller's state).
 */
static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}

/*
 * Push QEMU's CPUX86State into the hypervisor vCPU. Builds a value array
 * that mirrors whpx_register_names element-for-element (the asserts below
 * enforce the ordering contract) and writes it in one hypercall.
 * 'level' gates heavyweight state: the TSC is only written for
 * WHPX_SET_RESET_STATE and above.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}

/*
 * Read the vCPU's TSC into QEMU's CPUX86State. Returns 0 on success,
 * -1 on failure.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}

/*
 * Pull the vCPU state out of the hypervisor into QEMU's CPUX86State.
 * Mirror image of whpx_set_registers(): reads the whole
 * whpx_register_names set in one hypercall, then unpacks it in the same
 * fixed order (enforced by the asserts below).
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        /* CR8 changed while the guest ran; propagate it to the APIC. */
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        /* APIC base MSR changed while the guest ran; sync the APIC model. */
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}

/* Emulator callback: forward a port I/O access to QEMU's I/O address space. */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

/* Emulator callback: forward an MMIO access to QEMU's physical memory. */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

/* Emulator callback: read vCPU registers on behalf of the emulator. */
static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

/* Emulator callback: write vCPU registers on behalf of the emulator. */
static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just successfully wrote the register state. We clear the
     * dirty state so we avoid the double write on resume of the VP.
     */
    cpu->accel->dirty = false;

    return hr;
}

/* Emulator callback: translate a guest virtual address to a physical one. */
static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

/* Callback table handed to the WHV instruction emulator at vCPU init. */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

/*
 * Emulate the instruction that caused an MMIO exit. Returns 0 on success,
 * -1 when the access cannot be parsed or emulated.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Emulate the instruction that caused a port I/O exit. Returns 0 on
 * success, -1 when the access cannot be parsed or emulated.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (exceptions == whpx->exception_exit_bitmap) {
        /* Already configured; avoid the partition property hypercall. */
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        /* Keep the caller's cached exit-context copy of RFLAGS in sync. */
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         "hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             "hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 *    1.
Although unlikely, other parts of QEMU may set multiple breakpoints
 *    at the same location, and later remove them in arbitrary order.
 *    This should not cause memory corruption, and should only remove the
 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 *    physical location. Hence, physically adding/removing a breakpoint can
 *    theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list of
 * high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
 * whpx_apply_breakpoints().
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /* Remember the CPU breakpoint addresses used for this computation. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /*
     * Worst case: every CPU breakpoint is new AND every old non-cleared
     * WHPX breakpoint must be preserved.
     */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    /* Still physically present; no memory write needed. */
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping a track of the failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        /* On failure, the *_PENDING state is kept for a later retry. */
        breakpoints->data[i].state = state;
    }
}

/*
 * This function is called when the a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        /*
         * Compare the current CPU breakpoint addresses against the snapshot
         * taken by the last whpx_translate_cpu_breakpoints() call.
         */
        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     "hr=%08lx.", hr);
        return 1;
    }

    return 0;
}

/*
 * This function is called when the last VCPU has finished running.
1383 * It is used to remove any previously set breakpoints from memory. 1384 */ 1385 static int whpx_last_vcpu_stopping(CPUState *cpu) 1386 { 1387 whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false); 1388 return 0; 1389 } 1390 1391 /* Returns the address of the next instruction that is about to be executed. */ 1392 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid) 1393 { 1394 if (cpu->accel->dirty) { 1395 /* The CPU registers have been modified by other parts of QEMU. */ 1396 return cpu_env(cpu)->eip; 1397 } else if (exit_context_valid) { 1398 /* 1399 * The CPU registers have not been modified by neither other parts 1400 * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters(). 1401 * This is the most common case. 1402 */ 1403 AccelCPUState *vcpu = cpu->accel; 1404 return vcpu->exit_ctx.VpContext.Rip; 1405 } else { 1406 /* 1407 * The CPU registers have been modified by a call to 1408 * WHvSetVirtualProcessorRegisters() and must be re-queried from 1409 * the target. 
1410 */ 1411 WHV_REGISTER_VALUE reg_value; 1412 WHV_REGISTER_NAME reg_name = WHvX64RegisterRip; 1413 HRESULT hr; 1414 struct whpx_state *whpx = &whpx_global; 1415 1416 hr = whp_dispatch.WHvGetVirtualProcessorRegisters( 1417 whpx->partition, 1418 cpu->cpu_index, 1419 ®_name, 1420 1, 1421 ®_value); 1422 1423 if (FAILED(hr)) { 1424 error_report("WHPX: Failed to get PC, hr=%08lx", hr); 1425 return 0; 1426 } 1427 1428 return reg_value.Reg64; 1429 } 1430 } 1431 1432 static int whpx_handle_halt(CPUState *cpu) 1433 { 1434 int ret = 0; 1435 1436 bql_lock(); 1437 if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) && 1438 (cpu_env(cpu)->eflags & IF_MASK)) && 1439 !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) { 1440 cpu->exception_index = EXCP_HLT; 1441 cpu->halted = true; 1442 ret = 1; 1443 } 1444 bql_unlock(); 1445 1446 return ret; 1447 } 1448 1449 static void whpx_vcpu_pre_run(CPUState *cpu) 1450 { 1451 HRESULT hr; 1452 struct whpx_state *whpx = &whpx_global; 1453 AccelCPUState *vcpu = cpu->accel; 1454 X86CPU *x86_cpu = X86_CPU(cpu); 1455 CPUX86State *env = &x86_cpu->env; 1456 int irq; 1457 uint8_t tpr; 1458 WHV_X64_PENDING_INTERRUPTION_REGISTER new_int; 1459 UINT32 reg_count = 0; 1460 WHV_REGISTER_VALUE reg_values[3]; 1461 WHV_REGISTER_NAME reg_names[3]; 1462 1463 memset(&new_int, 0, sizeof(new_int)); 1464 memset(reg_values, 0, sizeof(reg_values)); 1465 1466 bql_lock(); 1467 1468 /* Inject NMI */ 1469 if (!vcpu->interruption_pending && 1470 cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 1471 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 1472 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 1473 vcpu->interruptable = false; 1474 new_int.InterruptionType = WHvX64PendingNmi; 1475 new_int.InterruptionPending = 1; 1476 new_int.InterruptionVector = 2; 1477 } 1478 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 1479 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 1480 } 1481 } 1482 1483 /* 1484 * Force the VCPU out of its inner loop to process any INIT 
requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* The in-hypervisor APIC path delivers PIC interrupts as events. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    /* Commit all the queued register updates in one hypercall. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}

/*
 * Refreshes the cached vcpu state (eflags, TPR, interruptibility) from the
 * exit context after WHvRunVirtualProcessor() returns.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}

/*
 * Processes pending asynchronous interrupt_request events (INIT, APIC poll,
 * etc.) before the vcpu enters its run loop.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}

/*
 * Runs the vcpu until it exits back to QEMU (debug event, interrupt, halt,
 * or a fatal error). Returns non-zero only on a fatal error, i.e. when the
 * virtual processor could not be run at all.
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->accel->dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->accel->dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            /*
             * NOTE(review): 'ret' is not assigned on this path, so the
             * do/while condition below reads its previous-iteration value
             * (indeterminate on the first iteration) — TODO confirm.
             */
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt  hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC Ids are identity mapped since
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d,  hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Skip over the faulting rdmsr/wrmsr instruction. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            /*
             * NOTE(review): like the ApicEoi path, this branch leaves 'ret'
             * unassigned before the loop condition reads it — TODO confirm.
             */
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}

/* Pulls the register state from the hypervisor into QEMU's CPUState. */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->accel->dirty) {
        whpx_get_registers(cpu);
        cpu->accel->dirty = true;
    }
}

/* Pushes QEMU's post-reset register state into the hypervisor. */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->accel->dirty = false;
}

/* Pushes the full initial register state into the hypervisor. */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->accel->dirty = false;
}

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2078 run_on_cpu_data arg) 2079 { 2080 cpu->accel->dirty = true; 2081 } 2082 2083 /* 2084 * CPU support. 2085 */ 2086 2087 void whpx_cpu_synchronize_state(CPUState *cpu) 2088 { 2089 if (!cpu->accel->dirty) { 2090 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL); 2091 } 2092 } 2093 2094 void whpx_cpu_synchronize_post_reset(CPUState *cpu) 2095 { 2096 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2097 } 2098 2099 void whpx_cpu_synchronize_post_init(CPUState *cpu) 2100 { 2101 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2102 } 2103 2104 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu) 2105 { 2106 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2107 } 2108 2109 void whpx_cpu_synchronize_pre_resume(bool step_pending) 2110 { 2111 whpx_global.step_pending = step_pending; 2112 } 2113 2114 /* 2115 * Vcpu support. 2116 */ 2117 2118 static Error *whpx_migration_blocker; 2119 2120 static void whpx_cpu_update_state(void *opaque, bool running, RunState state) 2121 { 2122 CPUX86State *env = opaque; 2123 2124 if (running) { 2125 env->tsc_valid = false; 2126 } 2127 } 2128 2129 int whpx_init_vcpu(CPUState *cpu) 2130 { 2131 HRESULT hr; 2132 struct whpx_state *whpx = &whpx_global; 2133 AccelCPUState *vcpu = NULL; 2134 Error *local_error = NULL; 2135 X86CPU *x86_cpu = X86_CPU(cpu); 2136 CPUX86State *env = &x86_cpu->env; 2137 UINT64 freq = 0; 2138 int ret; 2139 2140 /* Add migration blockers for all unsupported features of the 2141 * Windows Hypervisor Platform 2142 */ 2143 if (whpx_migration_blocker == NULL) { 2144 error_setg(&whpx_migration_blocker, 2145 "State blocked due to non-migratable CPUID feature support," 2146 "dirty memory tracking support, and XSAVE/XRSTOR support"); 2147 2148 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) { 2149 error_report_err(local_error); 2150 ret = -EINVAL; 2151 goto error; 2152 } 2153 } 2154 2155 vcpu = g_new0(AccelCPUState, 1); 2156 2157 hr = 
whp_dispatch.WHvEmulatorCreateEmulator( 2158 &whpx_emu_callbacks, 2159 &vcpu->emulator); 2160 if (FAILED(hr)) { 2161 error_report("WHPX: Failed to setup instruction completion support," 2162 " hr=%08lx", hr); 2163 ret = -EINVAL; 2164 goto error; 2165 } 2166 2167 hr = whp_dispatch.WHvCreateVirtualProcessor( 2168 whpx->partition, cpu->cpu_index, 0); 2169 if (FAILED(hr)) { 2170 error_report("WHPX: Failed to create a virtual processor," 2171 " hr=%08lx", hr); 2172 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2173 ret = -EINVAL; 2174 goto error; 2175 } 2176 2177 /* 2178 * vcpu's TSC frequency is either specified by user, or use the value 2179 * provided by Hyper-V if the former is not present. In the latter case, we 2180 * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC 2181 * frequency can be migrated later via this field. 2182 */ 2183 if (!env->tsc_khz) { 2184 hr = whp_dispatch.WHvGetCapability( 2185 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq), 2186 NULL); 2187 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2188 if (FAILED(hr)) { 2189 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr); 2190 } else { 2191 env->tsc_khz = freq / 1000; /* Hz to KHz */ 2192 } 2193 } 2194 } 2195 2196 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY; 2197 hr = whp_dispatch.WHvGetCapability( 2198 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL); 2199 if (hr != WHV_E_UNKNOWN_CAPABILITY) { 2200 if (FAILED(hr)) { 2201 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr); 2202 } else { 2203 env->apic_bus_freq = freq; 2204 } 2205 } 2206 2207 /* 2208 * If the vmware cpuid frequency leaf option is set, and we have a valid 2209 * tsc value, trap the corresponding cpuid's. 
2210 */ 2211 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) { 2212 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010}; 2213 2214 hr = whp_dispatch.WHvSetPartitionProperty( 2215 whpx->partition, 2216 WHvPartitionPropertyCodeCpuidExitList, 2217 cpuidExitList, 2218 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32)); 2219 2220 if (FAILED(hr)) { 2221 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx", 2222 hr); 2223 ret = -EINVAL; 2224 goto error; 2225 } 2226 } 2227 2228 vcpu->interruptable = true; 2229 vcpu->dirty = true; 2230 cpu->accel = vcpu; 2231 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index); 2232 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env); 2233 2234 return 0; 2235 2236 error: 2237 g_free(vcpu); 2238 2239 return ret; 2240 } 2241 2242 int whpx_vcpu_exec(CPUState *cpu) 2243 { 2244 int ret; 2245 int fatal; 2246 2247 for (;;) { 2248 if (cpu->exception_index >= EXCP_INTERRUPT) { 2249 ret = cpu->exception_index; 2250 cpu->exception_index = -1; 2251 break; 2252 } 2253 2254 fatal = whpx_vcpu_run(cpu); 2255 2256 if (fatal) { 2257 error_report("WHPX: Failed to exec a virtual processor"); 2258 abort(); 2259 } 2260 } 2261 2262 return ret; 2263 } 2264 2265 void whpx_destroy_vcpu(CPUState *cpu) 2266 { 2267 struct whpx_state *whpx = &whpx_global; 2268 AccelCPUState *vcpu = cpu->accel; 2269 2270 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index); 2271 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator); 2272 g_free(cpu->accel); 2273 } 2274 2275 void whpx_vcpu_kick(CPUState *cpu) 2276 { 2277 struct whpx_state *whpx = &whpx_global; 2278 whp_dispatch.WHvCancelRunVirtualProcessor( 2279 whpx->partition, cpu->cpu_index, 0); 2280 } 2281 2282 /* 2283 * Memory support. 
 */

/*
 * Map (add != 0) or unmap a host virtual range into the partition's guest
 * physical address space.  ROM regions are mapped without write permission.
 * Failures are reported but not propagated to the caller.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}

/*
 * Translate a MemoryRegionSection into a (un)map request.  Only RAM-backed
 * regions are considered; the range is trimmed to host-page alignment and
 * dropped entirely if nothing page-aligned remains.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Round the start up to the next host-page boundary. */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Truncate the size down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

/* MemoryListener callback: a section was added to the address space. */
static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

/* MemoryListener callback: a section was removed from the address space. */
static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

/* No transaction batching is needed; mappings are applied immediately. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

/*
 * WHPX has no dirty-page tracking here, so conservatively mark the whole
 * RAM section dirty on every sync (migration will copy everything).
 */
static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};

/* Hook the listener into the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/*
 * Load the functions from the given library, using the given handle. If a
 * handle is provided, it is used, otherwise the library is opened. The
 * handle will be updated on return with the opened one.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
                                  WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
#define WINHV_EMULATION_DLL "WinHvEmulation.dll"
/* Optional entry points: leave the dispatch slot NULL if not exported. */
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \

/* Mandatory entry points: bail out of the function if one is missing. */
#define WHP_LOAD_FIELD(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \
    if (!whp_dispatch.function_name) { \
        error_report("Could not load function %s", #function_name); \
        goto error; \
    } \

#define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}

/*
 * QOM property setter for "kernel-irqchip".  Accepts on/off; "split" is
 * rejected as unsupported by WHPX.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    switch (mode) {
    case ON_OFF_SPLIT_ON:
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
        break;

    case ON_OFF_SPLIT_OFF:
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
        break;

    case ON_OFF_SPLIT_SPLIT:
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
        break;

    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}

/*
 * Partition support
 */

/*
 * Accelerator init: probe the hypervisor, create and configure the WHPX
 * partition.  Returns 0 on success, negative errno on failure (partition
 * deleted on the error path).
 */
static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows version which don't support this property will return with the
     * specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
                     "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    /* Enable in-hypervisor APIC emulation when allowed and supported. */
    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            /* Only fatal when the user explicitly required kernel-irqchip. */
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}

int whpx_enabled(void)
{
    return whpx_allowed;
}

bool whpx_apic_in_platform(void) {
    return whpx_global.apic_in_platform;
}

/* Register the WHPX accelerator class and its QOM properties. */
static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, whpx_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}

static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    /* Turn on kernel-irqchip, by default */
    whpx->kernel_irqchip_allowed = true;
}

static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};

static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}

/*
 * Resolve all WinHvPlatform/WinHvEmulation entry points once.  Returns
 * true on success; on failure both libraries are unloaded.
 */
bool
init_whp_dispatch(void)
{
    /* Idempotent: later callers reuse the already-populated dispatch table. */
    if (whp_dispatch_initialized) {
        return true;
    }

    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
        goto error;
    }

    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
        goto error;
    }

    /*
     * Supplemental entry points are optional (WHP_LOAD_FIELD_OPTIONAL), so
     * this load can only fail if the library itself disappears — assert.
     */
    assert(load_whp_dispatch_fns(&hWinHvPlatform,
        WINHV_PLATFORM_FNS_SUPPLEMENTAL));
    whp_dispatch_initialized = true;

    return true;
error:
    if (hWinHvPlatform) {
        FreeLibrary(hWinHvPlatform);
    }

    if (hWinHvEmulation) {
        FreeLibrary(hWinHvEmulation);
    }

    return false;
}

type_init(whpx_type_init);