/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "system/address-spaces.h"
#include "system/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
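/* i.e. 200 MHz */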

static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
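
/*
 * Note: values[] is indexed in lockstep with whpx_register_names[]; the
 * assert() calls in whpx_set_registers() and whpx_get_registers() below
 * rely on that ordering.
 */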

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1. Stepping over a PUSHF/LAHF instruction will save the TF flag
 *     along with the other flags, possibly restoring it later. It would
 *     result in another INT1 when the flags are restored, triggering
 *     a stop in gdb that could be cleared by doing another step.
 *
 *     Stepping over a POPF/SAHF instruction will let it overwrite the
 *     TF flag, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *     or anything that could result in a page fault) will save the flags
 *     to the stack, clear the TF flag, and let the guest execute the
 *     handler. Normally, the guest will restore the original flags,
 *     which will resume single-stepping.
 *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *     stepping. INT1 events generated by them would be intercepted by us,
 *     as long as gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *     continue or stop in unexpected places. This will be fully recoverable
 *     and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *     over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb, while running a debugger on the guest
 *     at the same time, may lead to unexpected effects. Removing all
 *     breakpoints set via QEMU will prevent any further interference
 *     with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *     stepping through them. The exact semantics of the instructions are
 *     defined in the "Combined Volume Set of Intel 64 and IA-32
 *     Architectures Software Developer's Manuals"; however, this involves a
 *     fair number of corner cases due to compatibility with real mode,
 *     virtual 8086 mode, and differences between 64-bit and 32-bit modes.
 *
 *     2. We could step into the guest's exception handlers using the
 *     following sequence:
 *       a. Temporarily enable catching of all exception types via
 *          whpx_set_exception_exit_bitmap().
 *       b. Once an exception is intercepted, read the IDT/GDT and locate
 *          the original handler.
 *       c. Patch the original handler, injecting an INT3 at the beginning.
 *       d. Update the exception exit bitmap to only catch the
 *          WHvX64ExceptionTypeBreakpointTrap exception.
 *       e. Let the affected CPU run in the exclusive mode.
 *       f. Restore the original handler and the exception exit bitmap.
 *     Note that handling all corner cases related to IDT/GDT is harder
 *     than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *     rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *     the QEMU-level debugging, we would need to be able to pass some INT1
 *     events to the guest. This could be done via the following methods:
 *       a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *          it seems to only work for interrupts and not software
 *          exceptions.
 *       b. Locating and patching the original handler by parsing IDT/GDT.
 *          This involves relatively complex logic outlined in the previous
 *          paragraph.
 *       c. Emulating the exception invocation (i.e. manually updating RIP,
 *          RFLAGS, and pushing the old values to stack). This is even more
 *          complicated than the previous option, since it involves checking
 *          CPL, gate attributes, and doing various adjustments depending
 *          on the current CPU mode, whether the CPL is changing, etc.
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;
    bool dirty;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

static bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}

/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Failing to suspend the partition while setting the TSC is not a
         * fatal error. It just increases the likelihood of TSC variance
         * between vCPUs, and some guest OSes are able to handle that just
         * fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */
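/*
 * For example, CR8 = 0x9 corresponds to APIC.TPR = 0x90; translating back
 * simply drops the low four bits of the TPR.
 */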

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}

static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * The following MSRs have side effects on the guest, or are too heavy
     * for runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
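    /*
     * QEMU's fptags[] stores 1 for an empty x87 register, while the
     * abridged FpTag byte uses 1 for a valid one, hence the negation below.
     */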
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}

static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}

static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}

static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just successfully wrote the register state. We clear the
     * dirty state so we avoid the double write on resume of the VP.
     */
    cpu->accel->dirty = false;

    return hr;
}

static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g.:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (exceptions == whpx->exception_exit_bitmap) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags, hr=%08lx", hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState, hr=%08lx", hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         " hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             " hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
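/* 0xF1 is the single-byte encoding of the INT1 (ICEBP) instruction. */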
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
 *    at the same location, and later remove them in arbitrary order.
 *    This should not cause memory corruption, and should only remove the
 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 *    physical location. Hence, physically adding/removing a breakpoint can
 *    theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list of
 * high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
 * whpx_apply_breakpoints().
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping track of the failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
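/*
 * Roughly, each breakpoint moves through these states:
 *   resuming: WHPX_BP_CLEARED -> WHPX_BP_SET_PENDING -> WHPX_BP_SET
 *   stopping: WHPX_BP_SET -> WHPX_BP_CLEAR_PENDING -> WHPX_BP_CLEARED
 * where a breakpoint remains in a *_PENDING state if the memory write fails.
 */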
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}

/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                    bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}

/*
 * This function is called when the last VCPU has finished running.
 * It is used to remove any previously set breakpoints from memory.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}

/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->accel->dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts of
         * QEMU, nor by this port calling WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}

static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}

static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if it was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}

static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}

static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}

static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->accel->dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->accel->dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not an INIT or SIPI");
                break;
            }
1789
1790 if (delivery_mode == APIC_DM_INIT) {
1791 ipi.Type = WHvX64InterruptTypeInit;
1792 } else {
1793 ipi.Type = WHvX64InterruptTypeSipi;
1794 }
1795
1796 ipi.DestinationMode =
1797 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1798 WHvX64InterruptDestinationModeLogical :
1799 WHvX64InterruptDestinationModePhysical;
1800
1801 ipi.TriggerMode =
1802 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1803 WHvX64InterruptTriggerModeLevel :
1804 WHvX64InterruptTriggerModeEdge;
1805
1806 ipi.Vector = icr & APIC_VECTOR_MASK;
1807 switch (dest_shorthand) {
1808 /* no shorthand. Bits 56-63 contain the destination. */
1809 case 0:
1810 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1811 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1812 &ipi, sizeof(ipi));
1813 if (FAILED(hr)) {
1814 error_report("WHPX: Failed to request interrupt hr=%08lx",
1815 hr);
1816 }
1817
1818 break;
1819
1820 /* self */
1821 case 1:
1822 include_self = true;
1823 break;
1824
1825 /* broadcast, including self */
1826 case 2:
1827 broadcast = true;
1828 include_self = true;
1829 break;
1830
1831 /* broadcast, excluding self */
1832 case 3:
1833 broadcast = true;
1834 break;
1835 }
1836
1837 if (!broadcast && !include_self) {
1838 break;
1839 }
1840
1841 for (i = 0; i <= max_vcpu_index; i++) {
1842 if (i == cpu->cpu_index && !include_self) {
1843 continue;
1844 }
1845
1846 /*
1847 * Assuming that APIC Ids are identity mapped since
1848 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1849 * are not handled yet and the hypervisor doesn't allow the
1850 * guest to modify the APIC ID.
1851 */
1852 ipi.Destination = i;
1853 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1854 &ipi, sizeof(ipi));
1855 if (FAILED(hr)) {
1856 error_report(
1857 "WHPX: Failed to request SIPI for %d, hr=%08lx",
1858 i, hr);
1859 }
1860 }
1861
1862 break;
1863 }
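
        /*
         * Worked example for the case above (illustrative values): with the
         * xAPIC ICR layout assumed by the masks used here, a guest SIPI
         * written as icr == 0xC0620 decodes to
         *
         *     vector         = 0xC0620 & 0xFF        = 0x20 (startup vector)
         *     delivery_mode  = (0xC0620 >> 8) & 0x7  = 6    (APIC_DM_SIPI)
         *     dest_shorthand = (0xC0620 >> 18) & 0x3 = 3    (all but self)
         *
         * which takes the "broadcast, excluding self" path and issues one
         * WHvRequestInterrupt() per vCPU other than the sender.
         */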

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR accesses we ignore writes and
             * return 0 on reads.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ? 1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state"
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
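
        /*
         * Illustrative sketch of the write-back above for an unsupported
         * RDMSR: the exit handler completes the instruction itself, i.e.
         * roughly
         *
         *     Rip <- Rip + InstructionLength;   // skip the 2-byte rdmsr
         *     Rax <- 0;                         // low half of the result
         *     Rdx <- 0;                         // high half of the result
         *
         * so the guest observes a successful read of zero. A WRMSR exit only
         * needs the Rip update (reg_count == 1); the written value is
         * silently dropped.
         */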
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been set up, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                          (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }
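
            /*
             * Plugging in illustrative numbers: with env->tsc_khz == 2800000
             * (a 2.8 GHz TSC) and the 200 MHz HYPERV_APIC_BUS_FREQUENCY set
             * at vCPU init, leaf 0x40000010 returns rax = 2800000 and
             * rbx = 200000000 / 1000 = 200000, both in kHz as the vmware
             * frequency leaf expects.
             */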

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * We just finished stepping over a breakpoint, but gdb does
                 * not expect us to be single-stepping. Don't do anything
                 * special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
                            stepped_over_bp->address,
                            (void *)&whpx_breakpoint_instruction,
                            1,
                            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}
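
/*
 * Note on whpx_vcpu_run()'s return value: it is nonzero only on
 * unrecoverable failures (a failed WHvRunVirtualProcessor() call, or a
 * failed exception-bitmap update while stepping); whpx_vcpu_exec() below
 * treats any nonzero value as fatal and aborts.
 */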

static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->accel->dirty) {
        whpx_get_registers(cpu);
        cpu->accel->dirty = true;
    }
}

static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->accel->dirty = false;
}

static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->accel->dirty = false;
}

static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->accel->dirty = true;
}

/*
 * CPU support.
 */

void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->accel->dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}
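
/*
 * A minimal sketch of the dirty-flag protocol (illustrative, assuming a
 * caller that holds the BQL on a non-vCPU thread):
 *
 *     whpx_cpu_synchronize_state(cpu);     // pull registers, mark dirty
 *     env = &X86_CPU(cpu)->env;
 *     env->eip = new_rip;                  // safe to read/modify the copy
 *
 * The dirty flag then stays set until the next whpx_vcpu_run(), which
 * pushes the (possibly modified) state back via WHPX_SET_RUNTIME_STATE.
 */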

void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}

/*
 * Vcpu support.
 */

static Error *whpx_migration_blocker;

static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (running) {
        env->tsc_valid = false;
    }
}

int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform.
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
                   "State blocked due to non-migratable CPUID feature "
                   "support, dirty memory tracking support, and XSAVE/XRSTOR "
                   "support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
        ret = -EINVAL;
        goto error;
    }

    /*
     * The vCPU's TSC frequency is either specified by the user or, if
     * absent, taken from Hyper-V. In the latter case, we query it from
     * Hyper-V and record it in env->tsc_khz, so that the vCPU's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
            NULL);
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }
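
    /*
     * Illustrative alternative: the user can pin the frequency instead of
     * relying on the Hyper-V capability, e.g.
     *
     *     qemu-system-x86_64 -accel whpx -cpu max,tsc-frequency=2800000000
     *
     * The property value is in Hz and lands in env->tsc_khz before we get
     * here, so the capability query above is skipped.
     */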

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding CPUID leaves.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeCpuidExitList,
            cpuidExitList,
            RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                         hr);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu->interruptable = true;
    vcpu->dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

error:
    g_free(vcpu);

    return ret;
}

int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;
    int fatal;

    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            ret = cpu->exception_index;
            cpu->exception_index = -1;
            break;
        }

        fatal = whpx_vcpu_run(cpu);

        if (fatal) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    return ret;
}

void whpx_destroy_vcpu(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(cpu->accel);
}

void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}

/*
 * Memory support.
 */

static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    /*
    if (add) {
        printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
               (void*)start_pa, (void*)size, host_va,
               (rom ? "ROM" : "RAM"), name);
    } else {
        printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
               (void*)start_pa, (void*)size, host_va, name);
    }
    */

    if (add) {
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}

static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }
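
    /*
     * Worked example of the alignment math above (illustrative numbers):
     * with 4 KiB host pages, a section at start_pa 0x1234 of size 0x3000
     * gives delta = (0x1000 - 0x234) & 0xfff = 0xdcc, so the range is
     * trimmed to start_pa 0x2000 and size (0x3000 - 0xdcc) & ~0xfff =
     * 0x2000: only whole, page-aligned pages are handed to the hypervisor.
     */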

    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}

static void whpx_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}

static void whpx_region_del(MemoryListener *listener,
                            MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}

static void whpx_transaction_begin(MemoryListener *listener)
{
}

static void whpx_transaction_commit(MemoryListener *listener)
{
}

static void whpx_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    memory_region_set_dirty(mr, 0, int128_get64(section->size));
}

static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};

static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}

/*
 * Load the functions from the given library. If a handle is provided, it is
 * used; otherwise the library is opened. The handle will be updated on
 * return with the opened one.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
                                  WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

#define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
#define WINHV_EMULATION_DLL "WinHvEmulation.dll"
#define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \

#define WHP_LOAD_FIELD(return_type, function_name, signature) \
    whp_dispatch.function_name = \
        (function_name ## _t)GetProcAddress(hLib, #function_name); \
    if (!whp_dispatch.function_name) { \
        error_report("Could not load function %s", #function_name); \
        goto error; \
    } \

#define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
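
/*
 * For reference (illustrative expansion): for a mandatory entry such as
 * WHvCreatePartition, WHP_LOAD_FIELD above expands to roughly
 *
 *     whp_dispatch.WHvCreatePartition =
 *         (WHvCreatePartition_t)GetProcAddress(hLib, "WHvCreatePartition");
 *     if (!whp_dispatch.WHvCreatePartition) {
 *         error_report("Could not load function %s", "WHvCreatePartition");
 *         goto error;
 *     }
 *
 * so one missing mandatory export fails the whole load, while the
 * supplemental variant merely leaves the pointer NULL for callers to probe.
 */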

static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    switch (mode) {
    case ON_OFF_SPLIT_ON:
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
        break;

    case ON_OFF_SPLIT_OFF:
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
        break;

    case ON_OFF_SPLIT_SPLIT:
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
        break;

    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
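
/*
 * Usage (illustrative command line): the property is set on the
 * accelerator, e.g.
 *
 *     qemu-system-x86_64 -accel whpx,kernel-irqchip=off
 *
 * "split" is rejected above, and "on" makes the in-hypervisor APIC
 * emulation configured in whpx_accel_init() mandatory instead of
 * best-effort.
 */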

/*
 * Partition support
 */

static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows versions that don't support this property will return this
     * specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHPX doesn't support APIC emulation and the user
     * requires it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
                     "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUID exits, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}

int whpx_enabled(void)
{
    return whpx_allowed;
}

bool whpx_apic_in_platform(void)
{
    return whpx_global.apic_in_platform;
}

static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, whpx_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}

static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(struct whpx_state));
    /* Turn on kernel-irqchip, by default */
    whpx->kernel_irqchip_allowed = true;
}

static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};

static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}

bool init_whp_dispatch(void)
{
    if (whp_dispatch_initialized) {
        return true;
    }

    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
        goto error;
    }

    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
        goto error;
    }

    assert(load_whp_dispatch_fns(&hWinHvPlatform,
                                 WINHV_PLATFORM_FNS_SUPPLEMENTAL));
    whp_dispatch_initialized = true;

    return true;
error:
    if (hWinHvPlatform) {
        FreeLibrary(hWinHvPlatform);
    }

    if (hWinHvEmulation) {
        FreeLibrary(hWinHvEmulation);
    }

    return false;
}

type_init(whpx_type_init);